<- readxl::read_excel("Credit_Risk_Dataset_Onyx_Data_September_25.xlsx") dataset
Scripts for Power Query
In this document, we will create R scripts that can be used in Power Query within Power BI. The scripts will cover data preparation, encoding categorical variables, and building a logistic regression model with marginal effects and cross-validation.
1 Load data
2 Prepared Data
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(caret)
Loading required package: ggplot2
Loading required package: lattice
# Select variables of interest (see )
<- c(
selected_variables "client_ID", "loan_status", "person_age", "person_income",
"person_home_ownership", "person_emp_length", "loan_intent", "loan_amnt",
"loan_status", "cb_person_default_on_file", "city", "loan_to_income_ratio",
"other_debt", "debt_to_income_ratio", "credit_utilization_ratio",
"loan_int_rate"
)
<- dataset |> select(all_of(selected_variables)) |> data.frame()
dataset
# Replace outliers
<- dataset |>
dataset mutate(
person_age = ifelse(person_age > 100,
median(person_age, na.rm = TRUE),
person_age
),person_emp_length = ifelse(person_emp_length > 70,
median(person_emp_length, na.rm = TRUE),
person_emp_length
)
)
# Impute missing
<- dataset |>
dataset mutate(across(where(is.numeric), ~ ifelse(is.na(.),
median(., na.rm = TRUE), .
)))
# Split into train and test
set.seed(123)
<- createDataPartition(dataset$loan_status, p = 0.7, list = FALSE)
train_index
"is_train"] <- 1
dataset[train_index, -train_index, "is_train"] <- 0
dataset[
<- dataset
output
glimpse(output)
Rows: 32,581
Columns: 16
$ client_ID <chr> "CUST_00001", "CUST_00002", "CUST_00003", "C…
$ loan_status <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
$ person_age <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21, 22, …
$ person_income <dbl> 59000, 9600, 9600, 65500, 54400, 9900, 77100…
$ person_home_ownership <chr> "RENT", "OWN", "MORTGAGE", "RENT", "RENT", "…
$ person_emp_length <dbl> 4, 5, 1, 4, 8, 2, 8, 5, 8, 6, 6, 2, 2, 4, 2,…
$ loan_intent <chr> "PERSONAL", "EDUCATION", "MEDICAL", "MEDICAL…
$ loan_amnt <dbl> 35000, 1000, 5500, 35000, 35000, 2500, 35000…
$ cb_person_default_on_file <chr> "Y", "N", "N", "N", "Y", "N", "N", "N", "N",…
$ city <chr> "Toronto", "Toronto", "Swansea", "Vancouver"…
$ loan_to_income_ratio <dbl> 0.5932203, 0.1041667, 0.5729167, 0.5343511, …
$ other_debt <dbl> 8402.454, 1607.803, 2760.506, 7155.286, 1562…
$ debt_to_income_ratio <dbl> 0.7356348, 0.2716461, 0.8604693, 0.6435922, …
$ credit_utilization_ratio <dbl> 0.49555669, 0.58543602, 0.75073184, 0.379333…
$ loan_int_rate <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14, 12.…
$ is_train <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,…
::write_csv(output, "prepared_dataset.csv") readr
3 Marginal Effects from GLM
<- read.csv("prepared_dataset.csv") dataset
library(dplyr)
library(caret)
# Select variables for GLM (see 01_eda.qmd)
<- c(
continuous_variables "person_emp_length", "loan_amnt", "loan_to_income_ratio"
)<- c(
factor_columns "person_home_ownership", "loan_intent", "cb_person_default_on_file"
)
<- dataset |> select(c(
train "loan_status", all_of(continuous_variables),
all_of(factor_columns)
))
<- train |> mutate(across(where(is.character), as.factor)) |> data.frame()
train
# First, train without standardizing
<- glm(loan_status ~ .,
glm_model family = "binomial", data = train
)
<- predict(glm_model, type = "response")
p_hat
# Extract coefficients
<- coef(glm_model)
betas
# Add betas for omitted levels of factors
for (col in factor_columns) {
<- levels(train[[col]])
levels_col for (lvl in levels_col) {
<- paste0(col, lvl)
var_name if (!(var_name %in% names(betas))) {
<- 0
betas[var_name]
}
}
}
# Compute marginal effects: β_j * p * (1-p)
<- sapply(1:length(betas), function(j) {
AME mean(betas[j] * p_hat * (1 - p_hat), na.rm = TRUE)
})
names(AME) <- names(betas)
<- data.frame(effect_raw = AME, coef = betas) |>
raw_effects ::rownames_to_column(var = "Variable")
tibble
# Now standardize continuous variables and retrain
<- train |> mutate(across(
train .cols = c(continuous_variables),
.fns = ~ (. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)
))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(...)`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(continuous_variables)
# Now:
data %>% select(all_of(continuous_variables))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
<- glm(loan_status ~ .,
glm_model family = "binomial", data = train
)
<- predict(glm_model, type = "response")
p_hat
# Extract coefficients
<- coef(glm_model)
betas
# Add betas for omitted levels of factors
for (col in factor_columns) {
<- levels(train[[col]])
levels_col for (lvl in levels_col) {
<- paste0(col, lvl)
var_name if (!(var_name %in% names(betas))) {
<- 0
betas[var_name]
}
}
}
# Compute marginal effects: β_j * p * (1-p)
<- sapply(1:length(betas), function(j) {
AME mean(betas[j] * p_hat * (1 - p_hat), na.rm = TRUE)
})
names(AME) <- names(betas)
<- data.frame(effect_std = AME) |>
output ::rownames_to_column(var = "Variable")
tibble
<- output |> left_join(raw_effects, by = "Variable")
output
# Separate factor and continuous variables
<- paste(factor_columns, collapse = "|")
reg_templ
<- output |>
output mutate(Variable = gsub("[\\(\\)]", "", Variable))
<- output |> mutate(Factor = gsub(reg_templ, "", Variable))
output <- output |> mutate(Factor = stringr::str_replace(Factor, Variable, ""))
output
<- output |>
output mutate(Variable = gsub(paste0("(", reg_templ, ").+"), "\\1", Variable))
output
Variable effect_std effect_raw coef
1 Intercept -0.226071106 -3.826266e-01 -3.0539654641
2 person_emp_length -0.006671196 -1.675753e-03 -0.0133751627
3 loan_amnt -0.053430210 -8.451357e-06 -0.0000674552
4 loan_to_income_ratio 0.154137503 1.439880e+00 11.4925187838
5 person_home_ownership 0.086267040 8.626704e-02 0.6885475098
6 person_home_ownership -0.170707587 -1.707076e-01 -1.3625167192
7 person_home_ownership 0.106229568 1.062296e-01 0.8478800805
8 loan_intent -0.094288288 -9.428829e-02 -0.7525697667
9 loan_intent 0.017260559 1.726056e-02 0.1377665794
10 loan_intent -0.022701433 -2.270143e-02 -0.1811933670
11 loan_intent -0.065168974 -6.516897e-02 -0.5201515576
12 loan_intent -0.116198538 -1.161985e-01 -0.9274482406
13 cb_person_default_on_file 0.141680158 1.416802e-01 1.1308318911
14 person_home_ownership 0.000000000 0.000000e+00 0.0000000000
15 loan_intent 0.000000000 0.000000e+00 0.0000000000
16 cb_person_default_on_file 0.000000000 0.000000e+00 0.0000000000
Factor
1
2
3
4
5 OTHER
6 OWN
7 RENT
8 EDUCATION
9 HOMEIMPROVEMENT
10 MEDICAL
11 PERSONAL
12 VENTURE
13 Y
14 MORTGAGE
15 DEBTCONSOLIDATION
16 N
summary(glm_model)
Call:
glm(formula = loan_status ~ ., family = "binomial", data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.80441 0.04373 -41.259 < 2e-16 ***
person_emp_length -0.05325 0.01736 -3.067 0.002159 **
loan_amnt -0.42646 0.02133 -19.991 < 2e-16 ***
loan_to_income_ratio 1.23026 0.02121 58.001 < 2e-16 ***
person_home_ownershipOTHER 0.68855 0.25421 2.709 0.006757 **
person_home_ownershipOWN -1.36252 0.09139 -14.908 < 2e-16 ***
person_home_ownershipRENT 0.84788 0.03622 23.409 < 2e-16 ***
loan_intentEDUCATION -0.75257 0.05209 -14.449 < 2e-16 ***
loan_intentHOMEIMPROVEMENT 0.13777 0.05695 2.419 0.015551 *
loan_intentMEDICAL -0.18119 0.04927 -3.678 0.000235 ***
loan_intentPERSONAL -0.52015 0.05301 -9.813 < 2e-16 ***
loan_intentVENTURE -0.92745 0.05620 -16.503 < 2e-16 ***
cb_person_default_on_fileY 1.13083 0.03677 30.756 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 34182 on 32580 degrees of freedom
Residual deviance: 25971 on 32568 degrees of freedom
AIC: 25997
Number of Fisher Scoring iterations: 5
4 Cross-Validation for GLM
<- read.csv("prepared_dataset.csv") dataset
library(dplyr)
library(tidyr)
library(caret)
<- dataset |>
dataset mutate(
loan_status = as.factor(loan_status)
)
<- c(
selected_variables "loan_status", "person_emp_length", "loan_amnt", "loan_to_income_ratio",
"person_home_ownership", "loan_intent", "cb_person_default_on_file"
)
<- dataset |>
train filter(is_train == 1) |>
select(all_of(selected_variables))
<- dataset |>
test filter(is_train == 0) |>
select(all_of(selected_variables))
<- glm(loan_status ~ .,
glm_model family = "binomial", data = train
)
<- predict(glm_model, newdata = test, type = "response")
glm_probs
<- ifelse(glm_probs > 0.5, 1, 0) |> as.factor()
glm_pred
# Confusion matrix
<- confusionMatrix(glm_pred, test$loan_status)
conf_matrix
<- data.frame(Value = conf_matrix$byClass) |>
output ::rownames_to_column(var = "Parameter") |>
tibblepivot_wider(names_from = Parameter, values_from = Value)
output
# A tibble: 1 × 11
Sensitivity Specificity `Pos Pred Value` `Neg Pred Value` Precision Recall
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.957 0.402 0.849 0.727 0.849 0.957
# ℹ 5 more variables: F1 <dbl>, Prevalence <dbl>, `Detection Rate` <dbl>,
# `Detection Prevalence` <dbl>, `Balanced Accuracy` <dbl>