<- read.csv("prepared_dataset.csv") dataset
Synthetic Data For Simulations
In this document we generate synthetic data for simulation purposes. The synthetic data is generated by sampling from the original dataset with some added noise to numeric features. We then use a pre-trained on the original data XGBoost model to predict the probability of default (PD) for each synthetic borrower.
This script will return two tables: the first one will contain synthetic data and the second one will contain metrics of the XGBoost model.
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(caret)
Loading required package: ggplot2
Loading required package: lattice
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.2.1
✔ purrr 1.1.0 ✔ tidyr 1.3.1
✔ readr 2.1.5
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ purrr::lift() masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(xgboost)
Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':
slice
<- c(
numeric_cols "person_age", "person_income", "person_emp_length", "loan_amnt",
"loan_to_income_ratio", "other_debt", "debt_to_income_ratio",
"credit_utilization_ratio", "loan_int_rate"
)<- c(
factor_cols "loan_intent", "person_home_ownership",
"cb_person_default_on_file", "city"
)<- c(
selected_variables "loan_status", "person_age", "person_income", "person_home_ownership",
"person_emp_length", "loan_intent", "loan_amnt", "cb_person_default_on_file",
"city", "loan_to_income_ratio", "other_debt", "debt_to_income_ratio",
"credit_utilization_ratio", "loan_int_rate"
)
# Calculate proportions of categorical variables
<- dataset |>
dataset mutate(across(all_of(factor_cols), as.factor))
<- prop.table(table(dataset$person_home_ownership))
prop_home_ownership <- prop.table(table(dataset$loan_intent))
prop_loan_intent <- prop.table(table(dataset$cb_person_default_on_file))
prop_cb_default <- prop.table(table(dataset$city))
prop_city
# Save levels to ensure consistency later
<- levels(dataset$loan_intent)
loan_intent_levels <- levels(dataset$person_home_ownership)
home_ownership_levels <- levels(dataset$cb_person_default_on_file)
cb_default_levels <- levels(dataset$city)
city_levels
# Select columns
<- dataset |> select(all_of(c("is_train", selected_variables)))
encoded_dataset
# Encode factors
<- model.matrix(~ loan_intent - 1, data = dataset) |>
encoded_loan_intent as.data.frame()
<- model.matrix(~ person_home_ownership - 1,
encoded_home_ownership data = dataset
|> as.data.frame()
) <- model.matrix(~ cb_person_default_on_file - 1,
encoded_cb_default data = dataset
|> as.data.frame()
) <- model.matrix(~ city - 1, data = dataset) |> as.data.frame()
encoded_city
<- cbind(
encoded_dataset
encoded_dataset, encoded_loan_intent, encoded_home_ownership,
encoded_cb_default, encoded_city
)
<- encoded_dataset |> select(-all_of(factor_cols))
encoded_dataset
# Correct column names
colnames(encoded_dataset) <- make.names(colnames(encoded_dataset))
# Prepare data
<- encoded_dataset |>
x_train filter(is_train == 1) |>
select(-is_train)
<- encoded_dataset |>
x_test filter(is_train == 0) |>
select(-is_train)
<- xgb.DMatrix(
dtrain data = as.matrix(select(x_train, -loan_status)),
label = x_train$loan_status
)
<- xgb.DMatrix(
dtest data = as.matrix(select(x_test, -loan_status)),
label = x_test$loan_status
)
# Train
<- list(
params objective = "binary:logistic",
eval_metric = "auc"
)
<- xgb.train(
xgb_model params = params, data = dtrain, nrounds = 200,
watchlist = list(train = dtrain, test = dtest),
verbose = 0
)
# Generate synthetic data
# Parameters
<- 50000
N_synth
# Sampling function
<- function(dataset, N, prop_weights) {
sample_by_purpose <- names(prop_weights)
purposes <- round(N * prop_weights |> unname() / sum(prop_weights))
counts
<- map2_dfr(purposes, counts, function(p, cnt) {
sampled <- dataset |> filter(loan_intent == p)
base if (nrow(base) == 0) {
return(tibble())
}
|> sample_n(size = cnt, replace = TRUE)
base
})
sampled
}# Generate synthetic borrowers
<- sample_by_purpose(dataset, N_synth, prop_loan_intent)
synth
# Add jitter to numeric features
<- synth |>
synth mutate(across(all_of(numeric_cols), ~ .x * exp(rnorm(n(), 0, 0.05))))
# Round where appropriate
$person_age <- round(synth$person_age)
synth$person_emp_length <- round(synth$person_emp_length)
synth$loan_amnt <- round(synth$loan_amnt / 100) * 100
synth$loan_int_rate <- round(synth$loan_int_rate, 2)
synth$person_income <- round(synth$person_income / 100) * 100
synth$credit_utilization_ratio <- round(synth$credit_utilization_ratio, 2)
synth
$loan_to_income_ratio <- synth$loan_amnt / synth$person_income
synth
# Assign home ownership randomly according to weights
$person_home_ownership <- sample(names(prop_home_ownership),
synthnrow(synth),
replace = TRUE, prob = prop_home_ownership
)
# Select variables
<- synth |>
x_final select(all_of(selected_variables)) |>
select(-loan_status)
# Convert to factors
<- x_final |>
x_final mutate(across(where(is.character), as.factor)) |>
data.frame()
# Ensure factor levels match training data
$loan_intent <- factor(x_final$loan_intent, levels = loan_intent_levels)
x_final$person_home_ownership <- factor(x_final$person_home_ownership,
x_finallevels = home_ownership_levels
)$cb_person_default_on_file <- factor(x_final$cb_person_default_on_file,
x_finallevels = cb_default_levels
)$city <- factor(x_final$city, levels = city_levels)
x_final
# Encode factors
<- model.matrix(~ loan_intent - 1, data = x_final) |>
encoded_loan_intent as.data.frame()
<- model.matrix(~ person_home_ownership - 1,
encoded_home_ownership data = x_final
|> as.data.frame()
) <- model.matrix(~ cb_person_default_on_file - 1,
encoded_cb_default data = x_final
|> as.data.frame()
) <- model.matrix(~ city - 1, data = x_final) |> as.data.frame()
encoded_city
<- cbind(
x_final
x_final, encoded_loan_intent, encoded_home_ownership,
encoded_cb_default, encoded_city
)
colnames(x_final) <- make.names(colnames(x_final))
<- x_final |> select(-all_of(factor_cols))
x_final
<- xgb.DMatrix(
dmat data = as.matrix(x_final)
)
# Predict PD
$Predicted_PD <- predict(xgb_model, dmat)
synth
# Add bins for slicers
<- synth |>
synth mutate(
age_bin = cut(person_age,
breaks = c(20, 25, 30, 35, 40, 45, 100),
labels = c("20-24", "25-29", "30-34", "35-39", "40-44", "45+"),
include.lowest = TRUE
),income_bin = cut(person_income,
breaks = c(0, 25000, 50000, 75000, 100000, 1e7),
labels = c("0-25k", "25-50k", "50-75k", "75-100k", "100k+"),
include.lowest = TRUE
),loan_amount_bin = cut(loan_amnt,
breaks = c(0, 5000, 10000, 20000, 40000, 1e7),
labels = c("0-5k", "5-10k", "10-20k", "20-40k", "40k+"),
include.lowest = TRUE
),lti_bin = cut(loan_to_income_ratio,
breaks = c(0, 0.1, 0.2, 0.3, 1),
labels = c("0-0.1", "0.1-0.2", "0.2-0.3", "0.3+"),
include.lowest = TRUE
)
)
<- synth |>
output mutate(client_ID = paste0("FAKE_", str_pad(row_number(), 5, pad = "0")))
# Cross-validation
<- predict(xgb_model, newdata = dtest)
xgb_pred <- ifelse(xgb_pred > 0.5, 1, 0) |> as.factor()
xgb_pred
<- confusionMatrix(xgb_pred, x_test$loan_status |> as.factor())
conf_matrix
<- data.frame(Value = conf_matrix$byClass) |>
xgb_metrics ::rownames_to_column(var = "Metric")
tibble
glimpse(output)
Rows: 50,001
Columns: 21
$ client_ID <chr> "FAKE_00001", "FAKE_00002", "FAKE_00003", "F…
$ loan_status <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
$ person_age <dbl> 25, 28, 29, 24, 36, 36, 27, 33, 34, 25, 24, …
$ person_income <dbl> 43200, 59700, 233500, 43300, 31500, 147400, …
$ person_home_ownership <chr> "RENT", "MORTGAGE", "OWN", "OWN", "MORTGAGE"…
$ person_emp_length <dbl> 1, 3, 1, 7, 0, 14, 3, 0, 8, 8, 6, 1, 1, 3, 8…
$ loan_intent <fct> DEBTCONSOLIDATION, DEBTCONSOLIDATION, DEBTCO…
$ loan_amnt <dbl> 3900, 9400, 1900, 13200, 5900, 23900, 4300, …
$ cb_person_default_on_file <fct> N, Y, N, N, N, N, N, Y, N, N, N, N, Y, N, N,…
$ city <fct> Houston, Quebec City, Los Angeles, Buffalo, …
$ loan_to_income_ratio <dbl> 0.090277778, 0.157453936, 0.008137045, 0.304…
$ other_debt <dbl> 8393.495, 5202.016, 24855.584, 12734.842, 35…
$ debt_to_income_ratio <dbl> 0.2739014, 0.2374459, 0.1111667, 0.5844703, …
$ credit_utilization_ratio <dbl> 0.26, 0.45, 0.27, 0.67, 0.37, 0.22, 0.23, 0.…
$ loan_int_rate <dbl> 17.63, 11.67, 9.39, 13.15, 11.88, 11.21, 9.3…
$ is_train <int> 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,…
$ Predicted_PD <dbl> 0.9993345141, 0.0396767929, 0.0001281199, 0.…
$ age_bin <fct> 20-24, 25-29, 25-29, 20-24, 35-39, 35-39, 25…
$ income_bin <fct> 25-50k, 50-75k, 100k+, 25-50k, 25-50k, 100k+…
$ loan_amount_bin <fct> 0-5k, 5-10k, 0-5k, 10-20k, 5-10k, 20-40k, 0-…
$ lti_bin <fct> 0-0.1, 0.1-0.2, 0-0.1, 0.3+, 0.1-0.2, 0.1-0.…
glimpse(xgb_metrics)
Rows: 11
Columns: 2
$ Metric <chr> "Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value…
$ Value <dbl> 0.9830531, 0.7012026, 0.9205314, 0.9215805, 0.9205314, 0.983053…