Modelling Credit Risk

Author

Aleksei Prishchepo

Published

September 17, 2025

1 Load Data

Let’s load the dataset saved at the previous step.

# Load the pre-selected feature set produced at the previous step
dataset <- read.csv("data_selected.csv", row.names=NULL)
# Model-fitting functions below expect categorical predictors as factors
dataset <- dataset |> 
  mutate(across(where(is.character), as.factor))
glimpse(dataset)
Rows: 32,581
Columns: 7
$ loan_status               <fct> Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, …
$ person_home_ownership     <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, RENT, …
$ person_emp_length         <int> 4, 5, 1, 4, 8, 2, 8, 5, 8, 6, 6, 2, 2, 4, 2,…
$ loan_intent               <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, MEDIC…
$ loan_amnt                 <int> 35000, 1000, 5500, 35000, 35000, 2500, 35000…
$ loan_to_income_ratio      <dbl> 0.5932203, 0.1041667, 0.5729167, 0.5343511, …
$ cb_person_default_on_file <fct> Y, N, N, N, Y, N, N, N, N, N, N, N, N, N, N,…

2 Preprocess Data

We will split data into train and test sets.

# Reproducible, stratified 70/30 split on the outcome variable
set.seed(123)
in_train <- createDataPartition(dataset$loan_status, p = 0.7, list = FALSE)
train <- dataset[in_train, ]
test  <- dataset[-in_train, ]

3 Logistic Regression

We’re going to use the variables selected at the previous step.

# Logistic regression on all selected predictors;
# the binomial family models P(loan_status = "Yes") via the logit link
glm_model <- glm(loan_status ~ .,
  family = "binomial", data = train
)
summary(glm_model)

Call:
glm(formula = loan_status ~ ., family = "binomial", data = train)

Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -2.991e+00  7.004e-02 -42.697  < 2e-16 ***
person_home_ownershipOTHER  6.047e-01  3.082e-01   1.962 0.049749 *  
person_home_ownershipOWN   -1.361e+00  1.083e-01 -12.566  < 2e-16 ***
person_home_ownershipRENT   8.121e-01  4.323e-02  18.786  < 2e-16 ***
person_emp_length          -2.009e-02  5.281e-03  -3.804 0.000142 ***
loan_intentEDUCATION       -8.119e-01  6.289e-02 -12.909  < 2e-16 ***
loan_intentHOMEIMPROVEMENT  1.180e-01  6.817e-02   1.731 0.083469 .  
loan_intentMEDICAL         -1.744e-01  5.882e-02  -2.965 0.003023 ** 
loan_intentPERSONAL        -5.341e-01  6.326e-02  -8.443  < 2e-16 ***
loan_intentVENTURE         -1.001e+00  6.762e-02 -14.801  < 2e-16 ***
loan_amnt                  -6.939e-05  4.043e-06 -17.160  < 2e-16 ***
loan_to_income_ratio        1.164e+01  2.389e-01  48.715  < 2e-16 ***
cb_person_default_on_fileY  1.159e+00  4.404e-02  26.324  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 23929  on 22807  degrees of freedom
Residual deviance: 18095  on 22795  degrees of freedom
AIC: 18121

Number of Fisher Scoring iterations: 5
# Predicted default probabilities on the held-out test set
glm_probs <- predict(glm_model, newdata = test, type = "response")

# Hard classification at the conventional 0.5 threshold
glm_pred <- factor(
  ifelse(glm_probs > 0.5, "Yes", "No"),
  levels = c("No", "Yes")
)

# Confusion matrix
confusionMatrix(glm_pred, test$loan_status)
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  7286 1269
       Yes  355  863
                                          
               Accuracy : 0.8338          
                 95% CI : (0.8263, 0.8412)
    No Information Rate : 0.7818          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4238          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9535          
            Specificity : 0.4048          
         Pos Pred Value : 0.8517          
         Neg Pred Value : 0.7085          
             Prevalence : 0.7818          
         Detection Rate : 0.7455          
   Detection Prevalence : 0.8754          
      Balanced Accuracy : 0.6792          
                                          
       'Positive' Class : No              
                                          
# ROC curve: how well do the predicted probabilities rank defaults?
roc_glm <- roc(
  response = test$loan_status,
  predictor = glm_probs,
  levels = c("No", "Yes")
)
plot(roc_glm, col = "blue", main = "ROC Curve - Logistic Regression")

auc(roc_glm)
Area under the curve: 0.8061

4 Random Forest

The next model we’ll try is Random Forest. It is not interpretable except for variable importance, but often performs better than logistic regression.

We will use the full set of variables here, since Random Forest can handle correlated features well.

# Reload the full cleaned dataset for the random forest
dataset <- read.csv("data_cleaned.csv", row.names = NULL)

# Drop variables that are not independent of the outcome
dataset <- dataset |> select(-c(loan_int_rate, loan_grade))

# Encode character columns as factors and recode the 0/1 outcome
# to a "No"/"Yes" factor, all in a single pass
dataset <- dataset |>
  mutate(
    across(where(is.character), as.factor),
    loan_status = factor(
      ifelse(loan_status == 0, "No", "Yes"),
      levels = c("No", "Yes")
    )
  )
glimpse(dataset)
Rows: 32,581
Columns: 23
$ person_age                 <int> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21, 22,…
$ person_income              <int> 59000, 9600, 9600, 65500, 54400, 9900, 7710…
$ person_home_ownership      <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, RENT,…
$ person_emp_length          <int> 4, 5, 1, 4, 8, 2, 8, 5, 8, 6, 6, 2, 2, 4, 2…
$ loan_intent                <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, MEDI…
$ loan_amnt                  <int> 35000, 1000, 5500, 35000, 35000, 2500, 3500…
$ loan_status                <fct> Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
$ cb_person_default_on_file  <fct> Y, N, N, N, Y, N, N, N, N, N, N, N, N, N, N…
$ cb_person_cred_hist_length <int> 3, 2, 3, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, 4, 4…
$ gender                     <fct> Male, Female, Female, Male, Female, Male, F…
$ marital_status             <fct> Married, Divorced, Married, Married, Single…
$ education_level            <fct> High School, Master, Master, Bachelor, Bach…
$ country                    <fct> Canada, Canada, UK, Canada, USA, USA, Canad…
$ state                      <fct> Ontario, Ontario, Wales, BC, New York, Cali…
$ city                       <fct> Toronto, Toronto, Swansea, Vancouver, Buffa…
$ employment_type            <fct> Self-employed, Full-time, Full-time, Part-t…
$ loan_term_months           <int> 36, 36, 36, 12, 36, 36, 36, 36, 24, 24, 60,…
$ loan_to_income_ratio       <dbl> 0.5932203, 0.1041667, 0.5729167, 0.5343511,…
$ other_debt                 <dbl> 8402.454, 1607.803, 2760.506, 7155.286, 156…
$ debt_to_income_ratio       <dbl> 0.7356348, 0.2716461, 0.8604693, 0.6435922,…
$ open_accounts              <int> 14, 10, 14, 15, 4, 10, 15, 6, 10, 11, 15, 1…
$ credit_utilization_ratio   <dbl> 0.49555669, 0.58543602, 0.75073184, 0.37933…
$ past_delinquencies         <int> 0, 3, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 2, 1, 0…
# Fresh reproducible, stratified 70/30 split on the full dataset
set.seed(123)

split_idx <- createDataPartition(dataset$loan_status, p = 0.7, list = FALSE)
train <- dataset[split_idx, ]
test  <- dataset[-split_idx, ]
set.seed(123)
# Fit a random forest via the (x, y) interface.
# Fix: the original call also passed `data = train`, but the default
# randomForest method has no `data` argument (only the formula method
# does), so it was silently ignored — removed to avoid misleading readers.
rf_model <- randomForest(
  x = train |> select(-loan_status),
  y = train$loan_status,
  ntree = 100, importance = TRUE
)

# Predictions: probability of "Yes" (second class level) and hard labels
rf_probs <- predict(rf_model, newdata = test, type = "prob")[, 2]
rf_pred <- predict(rf_model, newdata = test)

4.1 Confusion matrix

confusionMatrix(rf_pred, test$loan_status)
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  7558 1089
       Yes   83 1043
                                          
               Accuracy : 0.8801          
                 95% CI : (0.8735, 0.8865)
    No Information Rate : 0.7818          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5764          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9891          
            Specificity : 0.4892          
         Pos Pred Value : 0.8741          
         Neg Pred Value : 0.9263          
             Prevalence : 0.7818          
         Detection Rate : 0.7734          
   Detection Prevalence : 0.8848          
      Balanced Accuracy : 0.7392          
                                          
       'Positive' Class : No              
                                          

4.2 ROC curve

# Overlay the random-forest ROC curve on the logistic-regression one
roc_rf <- roc(test$loan_status, rf_probs, levels = c("No", "Yes"))
plot(roc_glm, col = "blue")
plot(roc_rf, col = "darkgreen", add = TRUE)
legend(
  "bottomright",
  legend = c("Logistic Regression", "Random Forest"),
  col = c("blue", "darkgreen"),
  lwd = 2
)

auc(roc_rf)
Area under the curve: 0.8519

4.3 Variable importance

# Dot-chart of variable importance (accuracy- and Gini-based measures)
varImpPlot(rf_model)

# Raw importance scores
rf_model$importance
                                      No           Yes MeanDecreaseAccuracy
person_age                  5.074295e-03 -2.743061e-04         3.908376e-03
person_income               4.024869e-02  4.330861e-02         4.091343e-02
person_home_ownership       1.702881e-02  8.993896e-02         3.292097e-02
person_emp_length           1.685146e-03  5.690109e-03         2.560075e-03
loan_intent                 1.281827e-02  1.757426e-02         1.385680e-02
loan_amnt                   2.375752e-02  2.190229e-03         1.905499e-02
cb_person_default_on_file   6.382113e-03  2.083413e-02         9.533927e-03
cb_person_cred_hist_length  1.828719e-03  2.144633e-04         1.475256e-03
gender                     -4.289931e-05 -2.064939e-04        -7.861986e-05
marital_status             -1.747235e-04  4.926996e-05        -1.266530e-04
education_level            -2.358898e-04  2.072778e-04        -1.382721e-04
country                     1.664602e-03 -6.591833e-04         1.158692e-03
state                       7.493608e-03 -5.342603e-03         4.691483e-03
city                        9.038566e-03 -6.901540e-03         5.567463e-03
employment_type            -2.860980e-04 -1.322608e-04        -2.518474e-04
loan_term_months           -2.914796e-05 -8.398471e-05        -4.048451e-05
loan_to_income_ratio        5.924424e-02  1.253260e-01         7.364924e-02
other_debt                  2.027721e-02  6.643592e-03         1.731777e-02
debt_to_income_ratio        2.970312e-02  2.249385e-02         2.813217e-02
open_accounts               1.344340e-04 -4.691147e-04         8.567810e-06
credit_utilization_ratio   -9.932181e-06 -1.519704e-04        -4.057814e-05
past_delinquencies          1.142258e-04  5.381937e-04         2.051077e-04
                           MeanDecreaseGini
person_age                        247.86016
person_income                     851.03608
person_home_ownership             565.29922
person_emp_length                 251.68305
loan_intent                       372.97047
loan_amnt                         414.43148
cb_person_default_on_file         224.17479
cb_person_cred_hist_length        201.08723
gender                             49.96632
marital_status                    119.76651
education_level                   132.87820
country                            53.60312
state                             221.52174
city                              505.38766
employment_type                   124.55949
loan_term_months                  123.07084
loan_to_income_ratio             1302.50085
other_debt                        492.89839
debt_to_income_ratio              796.39787
open_accounts                     241.11040
credit_utilization_ratio          386.77655
past_delinquencies                 95.55954

To simplify the model we can use only the most important variables. Both metrics (Mean Decrease Accuracy and Mean Decrease Gini) show that the least important variables are gender, marital_status, education_level, cb_person_cred_hist_length, country, credit_utilization_ratio, employment_type, loan_term_months, open_accounts, and past_delinquencies. The state variable is shadowed by city, so we can drop it as well.

Let’s drop them, retrain the model, and see if the performance changes.

set.seed(123)

# Fix: the list of columns to drop was duplicated verbatim for train and
# test; define it once so the two sets cannot silently drift apart.
drop_cols <- c(
  "gender", "marital_status", "education_level",
  "cb_person_cred_hist_length", "country", "credit_utilization_ratio",
  "employment_type", "loan_term_months", "open_accounts",
  "past_delinquencies", "state"
)

train <- train |> select(-all_of(drop_cols))
test  <- test |> select(-all_of(drop_cols))

# Refit the random forest on the reduced predictor set.
# (The dead `data = train` argument — ignored by the default (x, y)
# randomForest method — has been removed.)
rf_model <- randomForest(
  x = train |> select(-loan_status),
  y = train$loan_status,
  ntree = 100, importance = TRUE
)

# Predictions
rf_probs <- predict(rf_model, newdata = test, type = "prob")[, 2]
rf_pred <- predict(rf_model, newdata = test)

# Confusion matrix
confusionMatrix(rf_pred, test$loan_status)
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  7490 1000
       Yes  151 1132
                                          
               Accuracy : 0.8822          
                 95% CI : (0.8757, 0.8886)
    No Information Rate : 0.7818          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5969          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9802          
            Specificity : 0.5310          
         Pos Pred Value : 0.8822          
         Neg Pred Value : 0.8823          
             Prevalence : 0.7818          
         Detection Rate : 0.7664          
   Detection Prevalence : 0.8687          
      Balanced Accuracy : 0.7556          
                                          
       'Positive' Class : No              
                                          
# AUC-ROC for the simplified random forest
roc_rf <- roc(test$loan_status, rf_probs, levels = c("No", "Yes"))
auc(roc_rf)
Area under the curve: 0.861

The model performance remains roughly the same or even improves, so we will use the simplified set of variables for the next model as well.

5 XGBoost

The last model we’ll try is XGBoost. It often provides the best performance but lacks interpretability.

# Prepare data for xgboost: numeric 0/1 outcome plus one-hot encoded
# factor predictors.
# Fix: the recode/encode logic was duplicated for train and test; it is
# factored into one helper so both sets are guaranteed to be transformed
# identically. Train and test share factor levels (both derive from the
# same dataset split), so the dummy columns line up between the matrices.
encode_for_xgb <- function(df) {
  # Outcome first, so it is numeric before the factor columns are dropped
  df <- df |> mutate(loan_status = ifelse(loan_status == "Yes", 1, 0))
  dummies <- model.matrix(~ . - 1, data = df |> select(where(is.factor)))
  cbind(df, dummies) |> select(-where(is.factor))
}

x_train <- encode_for_xgb(train)
x_test  <- encode_for_xgb(test)

# DMatrix containers xgboost trains on
dtrain <- xgb.DMatrix(
  data = as.matrix(select(x_train, -loan_status)),
  label = x_train$loan_status
)
dtest <- xgb.DMatrix(
  data = as.matrix(select(x_test, -loan_status)),
  label = x_test$loan_status
)
# Train: binary classification with AUC as the evaluation metric
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc"
)

# Fit 200 boosting rounds, monitoring train and test AUC silently
xgb_model <- xgb.train(
  params = params, data = dtrain, nrounds = 200,
  watchlist = list(train = dtrain, test = dtest),
  verbose = 0
)

# Predictions: P(loan_status = 1) on the test set
xgb_probs <- predict(xgb_model, dtest)
# Fix: pass levels/direction explicitly (0 = control, 1 = case, higher
# score = case), consistent with the other roc() calls; pROC otherwise
# auto-detects these and prints informational messages. With this data
# the explicit values match what auto-detection chooses, so the curve
# and AUC are unchanged.
roc_xgb <- roc(x_test$loan_status, xgb_probs, levels = c(0, 1), direction = "<")


# Compare the ROC curves of all three models on one plot
plot(roc_glm, col = "blue")
plot(roc_rf, col = "darkgreen", add = TRUE)
plot(roc_xgb, col = "red", add = TRUE)
legend("bottomright",
  legend = c("Logistic Regression", "Random Forest", "XGBoost"),
  col = c("blue", "darkgreen", "red"), lwd = 2
)

auc(roc_xgb)
Area under the curve: 0.8963
# Top-15 feature importances (vip package)
vip(xgb_model, num_features = 15)

xgb.importance(model = xgb_model)
                          Feature         Gain        Cover    Frequency
                           <char>        <num>        <num>        <num>
 1:          loan_to_income_ratio 0.2660964674 0.1326040720 0.1215748486
 2:                 person_income 0.2387269615 0.3074887979 0.1883472743
 3:     person_home_ownershipRENT 0.1113860219 0.0098803626 0.0126910874
 4:          debt_to_income_ratio 0.0575864782 0.1368028120 0.1446495529
 5:                    other_debt 0.0571434083 0.1341356638 0.1492644938
 6:    cb_person_default_on_fileY 0.0430258959 0.0223544683 0.0181713297
 7:                    person_age 0.0408989919 0.0386773162 0.0868185751
 8:             person_emp_length 0.0390744738 0.0261730910 0.0827805019
 9:                     loan_amnt 0.0359844789 0.0658562850 0.0773002596
10:    loan_intentHOMEIMPROVEMENT 0.0196661949 0.0091830517 0.0089414479
11:      person_home_ownershipOWN 0.0180182873 0.0257793799 0.0066339775
12:            loan_intentVENTURE 0.0146651882 0.0234550087 0.0089414479
13:            loan_intentMEDICAL 0.0115010192 0.0083771742 0.0132679550
14:          loan_intentEDUCATION 0.0098498229 0.0176463335 0.0096625324
15:           loan_intentPERSONAL 0.0072508573 0.0070573026 0.0095183155
16: person_home_ownershipMORTGAGE 0.0058586324 0.0014337281 0.0093740986
17:             citySan Francisco 0.0019694159 0.0030011330 0.0040380733
18:                 cityEdinburgh 0.0017921655 0.0017065314 0.0051918085
19:                    cityDallas 0.0015850226 0.0011680909 0.0040380733
20:             cityNew York City 0.0015804676 0.0016521800 0.0030285549
21:                   cityToronto 0.0015351409 0.0024112093 0.0034612057
22:                 cityVancouver 0.0015204596 0.0017167117 0.0028843380
23:                   cityGlasgow 0.0013142534 0.0018850666 0.0031727718
24:                  cityVictoria 0.0013031976 0.0033292658 0.0034612057
25:               cityQuebec City 0.0012944066 0.0019869832 0.0028843380
26:                    cityOttawa 0.0012854228 0.0013705663 0.0030285549
27:                   cityHouston 0.0012535462 0.0032237779 0.0023074704
28:                    cityLondon 0.0011198546 0.0014824478 0.0023074704
29:                   citySwansea 0.0011094006 0.0003402817 0.0018748197
30:                cityManchester 0.0009815181 0.0012804384 0.0023074704
31:                  cityMontreal 0.0009800594 0.0025290666 0.0024516873
32:                   cityCardiff 0.0009293275 0.0008997710 0.0023074704
33:               cityLos Angeles 0.0008993730 0.0017949787 0.0027401211
34:    person_home_ownershipOTHER 0.0008137881 0.0013166521 0.0005768676
                          Feature         Gain        Cover    Frequency

6 Precision-Recall Curves

# Logistic regression PR curve
# PRROC convention: scores.class0 = scores of the positive class ("Yes"),
# scores.class1 = scores of the negative class ("No")
pr_glm <- pr.curve(
  scores.class0 = glm_probs[test$loan_status == "Yes"],
  scores.class1 = glm_probs[test$loan_status == "No"], curve = TRUE
)
plot(pr_glm, main = "Precision-Recall Curve (GLM)")
Figure 1
# Random forest PR curve
# (scores.class0 = positive "Yes" scores, scores.class1 = negative "No")
pr_rf <- pr.curve(
  scores.class0 = rf_probs[test$loan_status == "Yes"],
  scores.class1 = rf_probs[test$loan_status == "No"], curve = TRUE
)
plot(pr_rf, main = "Precision-Recall Curve (RF)")
Figure 2
# XGBoost PR curve (outcome here is numeric: 1 = "Yes", 0 = "No")
pr_xgb <- pr.curve(
  scores.class0 = xgb_probs[x_test$loan_status == 1],
  scores.class1 = xgb_probs[x_test$loan_status == 0], curve = TRUE
)
plot(pr_xgb, main = "Precision-Recall Curve (XGBoost)")
Figure 3