import numpy as np
import pandas as pd
#Read Data
# NOTE(review): hard-coded absolute Windows paths — consider a configurable
# data directory so the notebook runs on other machines.
train = pd.read_csv('C:/Users/Documents/Kaggle credit/cs-training.csv')
test = pd.read_csv('C:/Users/Documents/Kaggle credit/cs-test.csv')
#Preview data
# Show the first five rows; the rendered table appears as cell output below.
train.head(5)
Unnamed: 0 | SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
1 | 2 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
2 | 3 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
3 | 4 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
4 | 5 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
# Drop the CSV's row-counter column — it carries no predictive signal.
train = train.drop(columns=["Unnamed: 0"])
# Treat missing monthly income and missing dependent counts as zero.
for column in ('MonthlyIncome', 'NumberOfDependents'):
    train[column] = train[column].fillna(0)
# Dependents is a count, so store it as an integer dtype.
train['NumberOfDependents'] = train['NumberOfDependents'].astype(np.int64)
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150000 entries, 0 to 149999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SeriousDlqin2yrs 150000 non-null int64 1 RevolvingUtilizationOfUnsecuredLines 150000 non-null float64 2 age 150000 non-null int64 3 NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64 4 DebtRatio 150000 non-null float64 5 MonthlyIncome 150000 non-null float64 6 NumberOfOpenCreditLinesAndLoans 150000 non-null int64 7 NumberOfTimes90DaysLate 150000 non-null int64 8 NumberRealEstateLoansOrLines 150000 non-null int64 9 NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64 10 NumberOfDependents 150000 non-null int64 dtypes: float64(3), int64(8) memory usage: 12.6 MB
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlations between all numeric columns of the training set.
correlation_matrix = train.corr()
plt.figure(figsize=(18, 15))
# Annotated heatmap; anchoring the colour scale at -1 keeps the full
# correlation range represented even if no strong negatives occur.
sns.heatmap(correlation_matrix, annot=True, vmin=-1.0, cmap='mako')
plt.title("Correlation Heatmap")
plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
def preprocess_inputs(df):
    """Split *df* into standardized features X and the raw target y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SeriousDlqin2yrs' column (the target); every
        other column is treated as a numeric feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        X: features scaled to zero mean / unit variance.
        y: the untouched target column.

    NOTE(review): the StandardScaler is fit here and then discarded, so
    the identical transform cannot be re-applied to held-out data —
    consider returning (or accepting) a fitted scaler for rigor.
    """
    df = df.copy()
    # Separate the target from the feature matrix.
    y = df['SeriousDlqin2yrs'].copy()
    X = df.drop('SeriousDlqin2yrs', axis=1).copy()
    # Standardize every feature column. BUG FIX: pass index=X.index —
    # the bare pd.DataFrame(...) constructor resets the index to a new
    # RangeIndex while y keeps the original one, silently misaligning
    # X and y for any input whose index is not already 0..n-1.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    return X, y
X, y = preprocess_inputs(train)
# Distinct-value count per feature (rendered as cell output below).
# len(unique()) is kept rather than nunique() to preserve NaN semantics.
{feature: len(X[feature].unique()) for feature in X.columns}
{'RevolvingUtilizationOfUnsecuredLines': 125728, 'age': 86, 'NumberOfTime30-59DaysPastDueNotWorse': 16, 'DebtRatio': 114194, 'MonthlyIncome': 13594, 'NumberOfOpenCreditLinesAndLoans': 58, 'NumberOfTimes90DaysLate': 19, 'NumberRealEstateLoansOrLines': 28, 'NumberOfTime60-89DaysPastDueNotWorse': 13, 'NumberOfDependents': 13}
# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=3,
)
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# Map each estimator to its display name. Keying on the model object
# (rather than the name) lets this one dict drive the fitting loop here
# and the scoring/prediction loops in later cells.
# FIX: the original names carried inconsistent leading spaces
# (" Logistic Regression", " Neural Network"), misaligning the printed
# accuracy report.
models = {
    LogisticRegression(): "Logistic Regression",
    SVC(): "Support Vector Machine",
    MLPClassifier(): "Neural Network",
}
# Train every model on the scaled training split.
for model in models.keys():
    model.fit(X_train, y_train)
Wall time: 18min 44s
# Report held-out accuracy for each fitted model.
for classifier, label in models.items():
    accuracy = classifier.score(X_test, y_test) * 100
    print(label + ": {:.2f}%".format(accuracy))
Logistic Regression: 93.46% Support Vector Machine: 93.54% Neural Network: 93.73%
# FIX: the models were trained on 10 standardized features, but `test` is
# the raw CSV frame — it still carries 'Unnamed: 0', the (empty) target
# column, NaNs, and unscaled values, so predicting on it directly fails on
# the feature mismatch. Mirror the training preprocessing first.
test_clean = test.drop(["Unnamed: 0"], axis=1)
test_clean['MonthlyIncome'] = test_clean['MonthlyIncome'].fillna(0)
test_clean['NumberOfDependents'] = (
    test_clean['NumberOfDependents'].fillna(0).astype(np.int64)
)
# NOTE(review): preprocess_inputs refits its scaler on the test data, so
# the scaling is not byte-identical to the training transform — a shared
# fitted scaler would be more rigorous.
X_submission, _ = preprocess_inputs(test_clean)
# Keep the predictions instead of discarding them.
predictions = {name: model.predict(X_submission) for model, name in models.items()}