Scripts for Power Query

Author

Aleksei Prishchepo

Published

September 20, 2025

In this document, we will create R scripts that can be used in Power Query within Power BI. The scripts will cover data preparation, encoding categorical variables, and building a logistic regression model with marginal effects and cross-validation.

1 Load data

dataset <- readxl::read_excel("Credit_Risk_Dataset_Onyx_Data_September_25.xlsx")

2 Prepared Data

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(caret)

Loading required package: ggplot2

Loading required package: lattice

# Select variables of interest (see )

selected_variables <- c(
  "client_ID", "loan_status", "person_age", "person_income",
  "person_home_ownership", "person_emp_length", "loan_intent", "loan_amnt",
  "loan_status", "cb_person_default_on_file", "city", "loan_to_income_ratio",
  "other_debt", "debt_to_income_ratio", "credit_utilization_ratio", 
  "loan_int_rate"
)

dataset <- dataset |> select(all_of(selected_variables)) |> data.frame()

# Replace outliers

dataset <- dataset |>
  mutate(
    person_age = ifelse(person_age > 100,
      median(person_age, na.rm = TRUE),
      person_age
    ),
    person_emp_length = ifelse(person_emp_length > 70,
      median(person_emp_length, na.rm = TRUE),
      person_emp_length
    )
  )

# Impute missing
dataset <- dataset |>
  mutate(across(where(is.numeric), ~ ifelse(is.na(.),
    median(., na.rm = TRUE), .
  )))

# Split into train and test
set.seed(123)

train_index <- createDataPartition(dataset$loan_status, p = 0.7, list = FALSE)

dataset[train_index, "is_train"] <- 1
dataset[-train_index, "is_train"] <- 0

output <- dataset

glimpse(output)

Rows: 32,581
Columns: 16
$ client_ID                 <chr> "CUST_00001", "CUST_00002", "CUST_00003", "C…
$ loan_status               <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,…
$ person_age                <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21, 22, …
$ person_income             <dbl> 59000, 9600, 9600, 65500, 54400, 9900, 77100…
$ person_home_ownership     <chr> "RENT", "OWN", "MORTGAGE", "RENT", "RENT", "…
$ person_emp_length         <dbl> 4, 5, 1, 4, 8, 2, 8, 5, 8, 6, 6, 2, 2, 4, 2,…
$ loan_intent               <chr> "PERSONAL", "EDUCATION", "MEDICAL", "MEDICAL…
$ loan_amnt                 <dbl> 35000, 1000, 5500, 35000, 35000, 2500, 35000…
$ cb_person_default_on_file <chr> "Y", "N", "N", "N", "Y", "N", "N", "N", "N",…
$ city                      <chr> "Toronto", "Toronto", "Swansea", "Vancouver"…
$ loan_to_income_ratio      <dbl> 0.5932203, 0.1041667, 0.5729167, 0.5343511, …
$ other_debt                <dbl> 8402.454, 1607.803, 2760.506, 7155.286, 1562…
$ debt_to_income_ratio      <dbl> 0.7356348, 0.2716461, 0.8604693, 0.6435922, …
$ credit_utilization_ratio  <dbl> 0.49555669, 0.58543602, 0.75073184, 0.379333…
$ loan_int_rate             <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14, 12.…
$ is_train                  <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,…

readr::write_csv(output, "prepared_dataset.csv")

3 Marginal Effects from GLM

dataset <- read.csv("prepared_dataset.csv")

library(dplyr)
library(caret)

# Select variables for GLM (see 01_eda.qmd)

continuous_variables <- c(
  "person_emp_length", "loan_amnt", "loan_to_income_ratio"
)
factor_columns <- c(
  "person_home_ownership", "loan_intent", "cb_person_default_on_file"
)

train <- dataset |> select(c(
  "loan_status", all_of(continuous_variables),
  all_of(factor_columns)
))

train <- train |> mutate(across(where(is.character), as.factor)) |> data.frame()

# First, train without standardizing
glm_model <- glm(loan_status ~ .,
  family = "binomial", data = train
)

p_hat <- predict(glm_model, type = "response")

# Extract coefficients
betas <- coef(glm_model)

# Add betas for omitted levels of factors
for (col in factor_columns) {
  levels_col <- levels(train[[col]])
  for (lvl in levels_col) {
    var_name <- paste0(col, lvl)
    if (!(var_name %in% names(betas))) {
      betas[var_name] <- 0
    }
  }
}

# Compute marginal effects: β_j * p * (1-p)
AME <- sapply(1:length(betas), function(j) {
  mean(betas[j] * p_hat * (1 - p_hat), na.rm = TRUE)
})

names(AME) <- names(betas)
raw_effects <- data.frame(effect_raw = AME, coef = betas) |>
  tibble::rownames_to_column(var = "Variable")


# Now standardize continuous variables and retrain
train <- train |> mutate(across(
  .cols = c(continuous_variables),
  .fns = ~ (. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE)
))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(...)`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(continuous_variables)

  # Now:
  data %>% select(all_of(continuous_variables))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

glm_model <- glm(loan_status ~ .,
  family = "binomial", data = train
)

p_hat <- predict(glm_model, type = "response")

# Extract coefficients
betas <- coef(glm_model)

# Add betas for omitted levels of factors
for (col in factor_columns) {
  levels_col <- levels(train[[col]])
  for (lvl in levels_col) {
    var_name <- paste0(col, lvl)
    if (!(var_name %in% names(betas))) {
      betas[var_name] <- 0
    }
  }
}

# Compute marginal effects: β_j * p * (1-p)
AME <- sapply(1:length(betas), function(j) {
  mean(betas[j] * p_hat * (1 - p_hat), na.rm = TRUE)
})

names(AME) <- names(betas)

output <- data.frame(effect_std = AME) |>
  tibble::rownames_to_column(var = "Variable")

output <- output |> left_join(raw_effects, by = "Variable")

# Separate factor and continuous variables

reg_templ <- paste(factor_columns, collapse = "|")

output <- output |> 
  mutate(Variable = gsub("[\\(\\)]", "", Variable))
output <- output |> mutate(Factor = gsub(reg_templ, "", Variable))
output <- output |> mutate(Factor = stringr::str_replace(Factor, Variable, ""))

output <- output |> 
  mutate(Variable = gsub(paste0("(", reg_templ, ").+"), "\\1", Variable))

output

                    Variable   effect_std    effect_raw          coef
1                  Intercept -0.226071106 -3.826266e-01 -3.0539654641
2          person_emp_length -0.006671196 -1.675753e-03 -0.0133751627
3                  loan_amnt -0.053430210 -8.451357e-06 -0.0000674552
4       loan_to_income_ratio  0.154137503  1.439880e+00 11.4925187838
5      person_home_ownership  0.086267040  8.626704e-02  0.6885475098
6      person_home_ownership -0.170707587 -1.707076e-01 -1.3625167192
7      person_home_ownership  0.106229568  1.062296e-01  0.8478800805
8                loan_intent -0.094288288 -9.428829e-02 -0.7525697667
9                loan_intent  0.017260559  1.726056e-02  0.1377665794
10               loan_intent -0.022701433 -2.270143e-02 -0.1811933670
11               loan_intent -0.065168974 -6.516897e-02 -0.5201515576
12               loan_intent -0.116198538 -1.161985e-01 -0.9274482406
13 cb_person_default_on_file  0.141680158  1.416802e-01  1.1308318911
14     person_home_ownership  0.000000000  0.000000e+00  0.0000000000
15               loan_intent  0.000000000  0.000000e+00  0.0000000000
16 cb_person_default_on_file  0.000000000  0.000000e+00  0.0000000000
              Factor
1                   
2                   
3                   
4                   
5              OTHER
6                OWN
7               RENT
8          EDUCATION
9    HOMEIMPROVEMENT
10           MEDICAL
11          PERSONAL
12           VENTURE
13                 Y
14          MORTGAGE
15 DEBTCONSOLIDATION
16                 N

summary(glm_model)


Call:
glm(formula = loan_status ~ ., family = "binomial", data = train)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -1.80441    0.04373 -41.259  < 2e-16 ***
person_emp_length          -0.05325    0.01736  -3.067 0.002159 ** 
loan_amnt                  -0.42646    0.02133 -19.991  < 2e-16 ***
loan_to_income_ratio        1.23026    0.02121  58.001  < 2e-16 ***
person_home_ownershipOTHER  0.68855    0.25421   2.709 0.006757 ** 
person_home_ownershipOWN   -1.36252    0.09139 -14.908  < 2e-16 ***
person_home_ownershipRENT   0.84788    0.03622  23.409  < 2e-16 ***
loan_intentEDUCATION       -0.75257    0.05209 -14.449  < 2e-16 ***
loan_intentHOMEIMPROVEMENT  0.13777    0.05695   2.419 0.015551 *  
loan_intentMEDICAL         -0.18119    0.04927  -3.678 0.000235 ***
loan_intentPERSONAL        -0.52015    0.05301  -9.813  < 2e-16 ***
loan_intentVENTURE         -0.92745    0.05620 -16.503  < 2e-16 ***
cb_person_default_on_fileY  1.13083    0.03677  30.756  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 34182  on 32580  degrees of freedom
Residual deviance: 25971  on 32568  degrees of freedom
AIC: 25997

Number of Fisher Scoring iterations: 5

4 Cross-Validation for GLM

dataset <- read.csv("prepared_dataset.csv")

library(dplyr)
library(tidyr)
library(caret)

dataset <- dataset |>
  mutate(
    loan_status = as.factor(loan_status)
  )

selected_variables <- c(
  "loan_status", "person_emp_length", "loan_amnt", "loan_to_income_ratio",
  "person_home_ownership", "loan_intent", "cb_person_default_on_file"
)

train <- dataset |>
  filter(is_train == 1) |>
  select(all_of(selected_variables))

test <- dataset |>
  filter(is_train == 0) |>
  select(all_of(selected_variables)) 

glm_model <- glm(loan_status ~ .,
  family = "binomial", data = train
)

glm_probs <- predict(glm_model, newdata = test, type = "response")

glm_pred <- ifelse(glm_probs > 0.5, 1, 0) |> as.factor()

# Confusion matrix
conf_matrix <- confusionMatrix(glm_pred, test$loan_status)

output <- data.frame(Value = conf_matrix$byClass) |>
  tibble::rownames_to_column(var = "Parameter") |>
  pivot_wider(names_from = Parameter, values_from = Value)

output

# A tibble: 1 × 11
  Sensitivity Specificity `Pos Pred Value` `Neg Pred Value` Precision Recall
        <dbl>       <dbl>            <dbl>            <dbl>     <dbl>  <dbl>
1       0.957       0.402            0.849            0.727     0.849  0.957
# ℹ 5 more variables: F1 <dbl>, Prevalence <dbl>, `Detection Rate` <dbl>,
#   `Detection Prevalence` <dbl>, `Balanced Accuracy` <dbl>