Synthetic Data For Simulations

Author

Aleksei Prishchepo

Published

September 23, 2025

In this document we generate synthetic data for simulation purposes. The synthetic data is generated by sampling from the original dataset with some added noise to numeric features. We then use a pre-trained on the original data XGBoost model to predict the probability of default (PD) for each synthetic borrower.

dataset <- read.csv("prepared_dataset.csv")

This script will return two tables: the first one will contain synthetic data and the second one will contain metrics of the XGBoost model.

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(caret)
Loading required package: ggplot2
Loading required package: lattice
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.1.0     ✔ tidyr     1.3.1
✔ readr     2.1.5     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ purrr::lift()   masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
numeric_cols <- c(
  "person_age", "person_income", "person_emp_length", "loan_amnt",
  "loan_to_income_ratio", "other_debt", "debt_to_income_ratio",
  "credit_utilization_ratio", "loan_int_rate"
)
factor_cols <- c(
  "loan_intent", "person_home_ownership",
  "cb_person_default_on_file", "city"
)
selected_variables <- c(
  "loan_status", "person_age", "person_income", "person_home_ownership",
  "person_emp_length", "loan_intent", "loan_amnt", "cb_person_default_on_file",
  "city", "loan_to_income_ratio", "other_debt", "debt_to_income_ratio",
  "credit_utilization_ratio", "loan_int_rate"
)

# Calculate proportions of categorical variables
dataset <- dataset |>
  mutate(across(all_of(factor_cols), as.factor))

prop_home_ownership <- prop.table(table(dataset$person_home_ownership))
prop_loan_intent <- prop.table(table(dataset$loan_intent))
prop_cb_default <- prop.table(table(dataset$cb_person_default_on_file))
prop_city <- prop.table(table(dataset$city))

# Save levels to ensure consistency later
loan_intent_levels <- levels(dataset$loan_intent)
home_ownership_levels <- levels(dataset$person_home_ownership)
cb_default_levels <- levels(dataset$cb_person_default_on_file)
city_levels <- levels(dataset$city)

# Select columns
encoded_dataset <- dataset |> select(all_of(c("is_train", selected_variables)))

# Encode factors
encoded_loan_intent <- model.matrix(~ loan_intent - 1, data = dataset) |>
  as.data.frame()
encoded_home_ownership <- model.matrix(~ person_home_ownership - 1,
  data = dataset
) |> as.data.frame()
encoded_cb_default <- model.matrix(~ cb_person_default_on_file - 1,
  data = dataset
) |> as.data.frame()
encoded_city <- model.matrix(~ city - 1, data = dataset) |> as.data.frame()

encoded_dataset <- cbind(
  encoded_dataset, encoded_loan_intent, encoded_home_ownership, 
  encoded_cb_default, encoded_city
)

encoded_dataset <- encoded_dataset |> select(-all_of(factor_cols))

# Correct column names
colnames(encoded_dataset) <- make.names(colnames(encoded_dataset))

# Prepare data

x_train <- encoded_dataset |>
  filter(is_train == 1) |>
  select(-is_train)
x_test <- encoded_dataset |>
  filter(is_train == 0) |>
  select(-is_train)

dtrain <- xgb.DMatrix(
  data = as.matrix(select(x_train, -loan_status)),
  label = x_train$loan_status
)

dtest <- xgb.DMatrix(
  data = as.matrix(select(x_test, -loan_status)),
  label = x_test$loan_status
)

# Train
params <- list(
  objective = "binary:logistic",
  eval_metric = "auc"
)

xgb_model <- xgb.train(
  params = params, data = dtrain, nrounds = 200,
  watchlist = list(train = dtrain, test = dtest),
  verbose = 0
)


# Generate synthetic data

# Parameters
N_synth <- 50000

# Sampling function
sample_by_purpose <- function(dataset, N, prop_weights) {
  purposes <- names(prop_weights)
  counts <- round(N * prop_weights |> unname() / sum(prop_weights))

  sampled <- map2_dfr(purposes, counts, function(p, cnt) {
    base <- dataset |> filter(loan_intent == p)
    if (nrow(base) == 0) {
      return(tibble())
    }

    base |> sample_n(size = cnt, replace = TRUE)
  })
  sampled
}
# Generate synthetic borrowers
synth <- sample_by_purpose(dataset, N_synth, prop_loan_intent)

# Add jitter to numeric features
synth <- synth |>
  mutate(across(all_of(numeric_cols), ~ .x * exp(rnorm(n(), 0, 0.05))))
# Round where appropriate
synth$person_age <- round(synth$person_age)
synth$person_emp_length <- round(synth$person_emp_length)
synth$loan_amnt <- round(synth$loan_amnt / 100) * 100
synth$loan_int_rate <- round(synth$loan_int_rate, 2)
synth$person_income <- round(synth$person_income / 100) * 100
synth$credit_utilization_ratio <- round(synth$credit_utilization_ratio, 2)

synth$loan_to_income_ratio <- synth$loan_amnt / synth$person_income

# Assign home ownership randomly according to weights
synth$person_home_ownership <- sample(names(prop_home_ownership),
  nrow(synth),
  replace = TRUE, prob = prop_home_ownership
)

# Select variables
x_final <- synth |>
  select(all_of(selected_variables)) |>
  select(-loan_status)

# Convert to factors
x_final <- x_final |>
  mutate(across(where(is.character), as.factor)) |>
  data.frame()

# Ensure factor levels match training data
x_final$loan_intent <- factor(x_final$loan_intent, levels = loan_intent_levels)
x_final$person_home_ownership <- factor(x_final$person_home_ownership,
  levels = home_ownership_levels
)
x_final$cb_person_default_on_file <- factor(x_final$cb_person_default_on_file,
  levels = cb_default_levels
)
x_final$city <- factor(x_final$city, levels = city_levels)

# Encode factors
encoded_loan_intent <- model.matrix(~ loan_intent - 1, data = x_final) |>
  as.data.frame()
encoded_home_ownership <- model.matrix(~ person_home_ownership - 1,
  data = x_final
) |> as.data.frame()
encoded_cb_default <- model.matrix(~ cb_person_default_on_file - 1,
  data = x_final
) |> as.data.frame()
encoded_city <- model.matrix(~ city - 1, data = x_final) |> as.data.frame()

x_final <- cbind(
  x_final, encoded_loan_intent, encoded_home_ownership,
  encoded_cb_default, encoded_city
)

colnames(x_final) <- make.names(colnames(x_final))

x_final <- x_final |> select(-all_of(factor_cols))

dmat <- xgb.DMatrix(
  data = as.matrix(x_final)
)

# Predict PD
synth$Predicted_PD <- predict(xgb_model, dmat)

# Add bins for slicers
synth <- synth |>
  mutate(
    age_bin = cut(person_age,
      breaks = c(20, 25, 30, 35, 40, 45, 100),
      labels = c("20-24", "25-29", "30-34", "35-39", "40-44", "45+"),
      include.lowest = TRUE
    ),
    income_bin = cut(person_income,
      breaks = c(0, 25000, 50000, 75000, 100000, 1e7),
      labels = c("0-25k", "25-50k", "50-75k", "75-100k", "100k+"),
      include.lowest = TRUE
    ),
    loan_amount_bin = cut(loan_amnt,
      breaks = c(0, 5000, 10000, 20000, 40000, 1e7),
      labels = c("0-5k", "5-10k", "10-20k", "20-40k", "40k+"),
      include.lowest = TRUE
    ),
    lti_bin = cut(loan_to_income_ratio,
      breaks = c(0, 0.1, 0.2, 0.3, 1),
      labels = c("0-0.1", "0.1-0.2", "0.2-0.3", "0.3+"),
      include.lowest = TRUE
    )
  )

output <- synth |>
  mutate(client_ID = paste0("FAKE_", str_pad(row_number(), 5, pad = "0")))

# Cross-validation
xgb_pred <- predict(xgb_model, newdata = dtest)
xgb_pred <- ifelse(xgb_pred > 0.5, 1, 0) |> as.factor()

conf_matrix <- confusionMatrix(xgb_pred, x_test$loan_status |> as.factor())

xgb_metrics <- data.frame(Value = conf_matrix$byClass) |>
  tibble::rownames_to_column(var = "Metric")

glimpse(output)
Rows: 50,001
Columns: 21
$ client_ID                 <chr> "FAKE_00001", "FAKE_00002", "FAKE_00003", "F…
$ loan_status               <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
$ person_age                <dbl> 25, 28, 29, 24, 36, 36, 27, 33, 34, 25, 24, …
$ person_income             <dbl> 43200, 59700, 233500, 43300, 31500, 147400, …
$ person_home_ownership     <chr> "RENT", "MORTGAGE", "OWN", "OWN", "MORTGAGE"…
$ person_emp_length         <dbl> 1, 3, 1, 7, 0, 14, 3, 0, 8, 8, 6, 1, 1, 3, 8…
$ loan_intent               <fct> DEBTCONSOLIDATION, DEBTCONSOLIDATION, DEBTCO…
$ loan_amnt                 <dbl> 3900, 9400, 1900, 13200, 5900, 23900, 4300, …
$ cb_person_default_on_file <fct> N, Y, N, N, N, N, N, Y, N, N, N, N, Y, N, N,…
$ city                      <fct> Houston, Quebec City, Los Angeles, Buffalo, …
$ loan_to_income_ratio      <dbl> 0.090277778, 0.157453936, 0.008137045, 0.304…
$ other_debt                <dbl> 8393.495, 5202.016, 24855.584, 12734.842, 35…
$ debt_to_income_ratio      <dbl> 0.2739014, 0.2374459, 0.1111667, 0.5844703, …
$ credit_utilization_ratio  <dbl> 0.26, 0.45, 0.27, 0.67, 0.37, 0.22, 0.23, 0.…
$ loan_int_rate             <dbl> 17.63, 11.67, 9.39, 13.15, 11.88, 11.21, 9.3…
$ is_train                  <int> 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,…
$ Predicted_PD              <dbl> 0.9993345141, 0.0396767929, 0.0001281199, 0.…
$ age_bin                   <fct> 20-24, 25-29, 25-29, 20-24, 35-39, 35-39, 25…
$ income_bin                <fct> 25-50k, 50-75k, 100k+, 25-50k, 25-50k, 100k+…
$ loan_amount_bin           <fct> 0-5k, 5-10k, 0-5k, 10-20k, 5-10k, 20-40k, 0-…
$ lti_bin                   <fct> 0-0.1, 0.1-0.2, 0-0.1, 0.3+, 0.1-0.2, 0.1-0.…
glimpse(xgb_metrics)
Rows: 11
Columns: 2
$ Metric <chr> "Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value…
$ Value  <dbl> 0.9830531, 0.7012026, 0.9205314, 0.9215805, 0.9205314, 0.983053…