import numpy as np
import pandas as pd
#Read Data
# NOTE(review): hard-coded absolute Windows paths — consider a configurable
# data directory so the notebook runs on other machines.
train = pd.read_csv('C:/Users/Documents/Kaggle credit/cs-training.csv')
test = pd.read_csv('C:/Users/Documents/Kaggle credit/cs-test.csv')
#Preview data
# Show the first five rows; the rendered table appears as cell output below.
train.head(5)
Unnamed: 0 | SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
1 | 2 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
2 | 3 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
3 | 4 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
4 | 5 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
# Drop the CSV's row-counter column — it carries no predictive signal.
train = train.drop(columns=["Unnamed: 0"])
# Treat missing monthly income and missing dependent counts as zero.
for column in ('MonthlyIncome', 'NumberOfDependents'):
    train[column] = train[column].fillna(0)
# Dependents is a count, so store it as an integer dtype.
train['NumberOfDependents'] = train['NumberOfDependents'].astype(np.int64)
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150000 entries, 0 to 149999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SeriousDlqin2yrs 150000 non-null int64 1 RevolvingUtilizationOfUnsecuredLines 150000 non-null float64 2 age 150000 non-null int64 3 NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64 4 DebtRatio 150000 non-null float64 5 MonthlyIncome 150000 non-null float64 6 NumberOfOpenCreditLinesAndLoans 150000 non-null int64 7 NumberOfTimes90DaysLate 150000 non-null int64 8 NumberRealEstateLoansOrLines 150000 non-null int64 9 NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64 10 NumberOfDependents 150000 non-null int64 dtypes: float64(3), int64(8) memory usage: 12.6 MB
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlations between all numeric columns of the training set.
correlation_matrix = train.corr()
plt.figure(figsize=(18, 15))
# Annotated heatmap; anchoring the colour scale at -1 keeps the full
# correlation range represented even if no strong negatives occur.
sns.heatmap(correlation_matrix, annot=True, vmin=-1.0, cmap='mako')
plt.title("Correlation Heatmap")
plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
def preprocess_inputs(df):
    """Split *df* into standardized features X and the raw target y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SeriousDlqin2yrs' column (the target); every
        other column is treated as a numeric feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        X: features scaled to zero mean / unit variance.
        y: the untouched target column.

    NOTE(review): the StandardScaler is fit here and then discarded, so
    the identical transform cannot be re-applied to held-out data —
    consider returning (or accepting) a fitted scaler for rigor.
    """
    df = df.copy()
    # Separate the target from the feature matrix.
    y = df['SeriousDlqin2yrs'].copy()
    X = df.drop('SeriousDlqin2yrs', axis=1).copy()
    # Standardize every feature column. BUG FIX: pass index=X.index —
    # the bare pd.DataFrame(...) constructor resets the index to a new
    # RangeIndex while y keeps the original one, silently misaligning
    # X and y for any input whose index is not already 0..n-1.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    return X, y
X, y = preprocess_inputs(train)
# Distinct-value count per feature (rendered as cell output below).
# len(unique()) is kept rather than nunique() to preserve NaN semantics.
{feature: len(X[feature].unique()) for feature in X.columns}
{'RevolvingUtilizationOfUnsecuredLines': 125728, 'age': 86, 'NumberOfTime30-59DaysPastDueNotWorse': 16, 'DebtRatio': 114194, 'MonthlyIncome': 13594, 'NumberOfOpenCreditLinesAndLoans': 58, 'NumberOfTimes90DaysLate': 19, 'NumberRealEstateLoansOrLines': 28, 'NumberOfTime60-89DaysPastDueNotWorse': 13, 'NumberOfDependents': 13}
# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=3,
)
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# Map each estimator to its display name. Keying on the model object
# (rather than the name) lets this one dict drive the fitting loop here
# and the scoring/prediction loops in later cells.
# FIX: the original names carried inconsistent leading spaces
# (" Logistic Regression", " Neural Network"), misaligning the printed
# accuracy report.
models = {
    LogisticRegression(): "Logistic Regression",
    SVC(): "Support Vector Machine",
    MLPClassifier(): "Neural Network",
}
# Train every model on the scaled training split.
for model in models.keys():
    model.fit(X_train, y_train)
Wall time: 18min 44s
# Report held-out accuracy for each fitted model.
for classifier, label in models.items():
    accuracy = classifier.score(X_test, y_test) * 100
    print(label + ": {:.2f}%".format(accuracy))
Logistic Regression: 93.46% Support Vector Machine: 93.54% Neural Network: 93.73%
# FIX: the models were trained on 10 standardized features, but `test` is
# the raw CSV frame — it still carries 'Unnamed: 0', the (empty) target
# column, NaNs, and unscaled values, so predicting on it directly fails on
# the feature mismatch. Mirror the training preprocessing first.
test_clean = test.drop(["Unnamed: 0"], axis=1)
test_clean['MonthlyIncome'] = test_clean['MonthlyIncome'].fillna(0)
test_clean['NumberOfDependents'] = (
    test_clean['NumberOfDependents'].fillna(0).astype(np.int64)
)
# NOTE(review): preprocess_inputs refits its scaler on the test data, so
# the scaling is not byte-identical to the training transform — a shared
# fitted scaler would be more rigorous.
X_submission, _ = preprocess_inputs(test_clean)
# Keep the predictions instead of discarding them.
predictions = {name: model.predict(X_submission) for model, name in models.items()}