Clustering Analysis

Author

Aleksei Prishchepo

Published

October 24, 2025

The goal of this document is to perform clustering analysis on the companies based on their characteristics to identify distinct groups of companies with similar profiles.

1 Load data

file_path <- "DataDNA Dataset Challenge - Consumer Financial Complaints Dataset - October 2025.xlsx"
dataset <- readxl::read_excel(file_path, sheet = 2)

2 Clustering Analysis

We create a chunk of code that performs k-means clustering on the dataset based on selected features. We will standardize the features before clustering to ensure they are on the same scale.

library(dplyr)
library(cluster)
library(scales)
library(stats)


dataset <- dataset |>
  janitor::clean_names()

df <- dataset |>
  filter(!is.na(enforcement_history), !is.na(avg_response_time_days)) |>
  mutate(
    enforcement = ifelse(enforcement_history == "Yes", 1, 0),
    size_num = case_when(
      company_size_tier == "Small" ~ 1,
      company_size_tier == "Medium" ~ 2,
      company_size_tier == "Large" ~ 3,
      TRUE ~ NA_real_
    )
  ) |>
  select(
    company_id_1081, enforcement, size_num, reputation_score, market_share_percent,
    timely_response_rate, avg_response_time_days, complaints_per_1pct_share
  )

features <- df |>
  select(enforcement, size_num, reputation_score, market_share_percent,
    timely_response_rate, avg_response_time_days, complaints_per_1pct_share) |>
  mutate(across(everything(), scale))

silhouette_scores <- data.frame(k = integer(), avg_sil_width = numeric())

for (k in 2:10) {
  set.seed(123)
  km <- kmeans(features, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(features))
  silhouette_scores <- rbind(
    silhouette_scores,
    data.frame(k = k, avg_sil_width = mean(ss[, 3]))
  )
}

best_k <- silhouette_scores$k[which.max(silhouette_scores$avg_sil_width)]

set.seed(123)
final_km <- kmeans(features, centers = best_k, nstart = 25)

clustered <- df |>
  mutate(cluster = final_km$cluster)

cluster_summary <- clustered |>
  group_by(cluster) |>
  summarise(
    n_companies = n(),
    avg_complaint_rate = mean(complaints_per_1pct_share, na.rm = TRUE),
    avg_company_size = mean(size_num, na.rm = TRUE),
    avg_market_share_perc = mean(market_share_percent, na.rm = TRUE),
    enforcement_share = mean(enforcement, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    cluster_label = paste0("Cluster ", cluster)
  )


cluster_quality <- silhouette_scores |>
  mutate(
    is_best = (k == best_k)
  )

clustered |> tail()
# A tibble: 6 × 9
  company_id_1081 enforcement size_num reputation_score market_share_percent
  <chr>                 <dbl>    <dbl>            <dbl>                <dbl>
1 COMP-1076                 0        1               68               0.0274
2 COMP-1077                 0        1               56               0.0099
3 COMP-1078                 0        1               87               0.0472
4 COMP-1079                 0        1               52               0.109 
5 COMP-1080                 0        1               99               0.0485
6 COMP-1081                 0        1               89               0.028 
# ℹ 4 more variables: timely_response_rate <dbl>, avg_response_time_days <dbl>,
#   complaints_per_1pct_share <dbl>, cluster <int>
cluster_summary
# A tibble: 3 × 7
  cluster n_companies avg_complaint_rate avg_company_size avg_market_share_perc
    <int>       <int>              <dbl>            <dbl>                 <dbl>
1       1         279               325.             2.39                0.209 
2       2         611              2251.             1                   0.0458
3       3         191              1786.             1.24                0.0715
# ℹ 2 more variables: enforcement_share <dbl>, cluster_label <chr>
cluster_quality
   k avg_sil_width is_best
1  2     0.2548143   FALSE
2  3     0.2647291    TRUE
3  4     0.2397746   FALSE
4  5     0.2066999   FALSE
5  6     0.2034004   FALSE
6  7     0.1953937   FALSE
7  8     0.1856649   FALSE
8  9     0.1953324   FALSE
9 10     0.1979762   FALSE

The results indicate that the optimal number of clusters is 3 based on the silhouette scores. The clusters can be interpreted as follows:

  1. Cluster 1: Large and medium companies with low complaint rates and practically unexistent enforcement history.
  2. Cluster 2: Small companies with high complaint rates and an zero enforcement history.
  3. Cluster 3: Medium and small companies having enforcement history with high complaint rates.

3 Plot clusters

Let’s visualize the clusters based on company size and complaint rate.

library(ggplot2)

ggplot(
  clustered,
  aes(x = as.factor(size_num), y = complaints_per_1pct_share, color = as.factor(cluster))
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Company Clusters based on Size and Complaint Rate",
    x = "Company Size (1=Small, 2=Medium, 3=Large)",
    y = "Complaints per 1% Market Share",
    color = "Cluster"
  ) +
  theme_minimal()

The plot reveals the same patterns observed in the causal analysis, confirming that the clusters align with company size. Therefore it doesn’t add much new information to the report alone, but we can try plotting principal components.

4 PCA Analysis

Principal Component Analysis (PCA) can help visualize the clusters that could make a good addition to the report.

library(dplyr)
library(stats)

df <- df |> left_join(
  clustered |> select(company_id_1081, cluster),
  by = "company_id_1081"
)

# Standardize features
features <- df |>
  select(enforcement, size_num, complaints_per_1pct_share) |>
  mutate(across(everything(), scale))

pca_model <- prcomp(features, center = TRUE, scale. = TRUE)

# Extract first two principal components
pca_data <- as.data.frame(pca_model$x[, 1:2])
colnames(pca_data) <- c("PC1", "PC2")

# Combine with company info
pca_result <- df |>
  bind_cols(pca_data) |>
  mutate(
    cluster = as.factor(cluster)
  )

# Variance explained by first two components
explained_var <- summary(pca_model)$importance[2, 1:2] * 100
explained_df <- data.frame(
  Component = c("PC1", "PC2"),
  VarianceExplained = explained_var
)

pca_result |> tail()
# A tibble: 6 × 11
  company_id_1081 enforcement size_num reputation_score market_share_percent
  <chr>                 <dbl>    <dbl>            <dbl>                <dbl>
1 COMP-1076                 0        1               68               0.0274
2 COMP-1077                 0        1               56               0.0099
3 COMP-1078                 0        1               87               0.0472
4 COMP-1079                 0        1               52               0.109 
5 COMP-1080                 0        1               99               0.0485
6 COMP-1081                 0        1               89               0.028 
# ℹ 6 more variables: timely_response_rate <dbl>, avg_response_time_days <dbl>,
#   complaints_per_1pct_share <dbl>, cluster <fct>, PC1 <dbl>, PC2 <dbl>
explained_df
    Component VarianceExplained
PC1       PC1            48.546
PC2       PC2            33.330

The first two principal components explain a significant portion of the variance in the data, allowing us to visualize the clusters effectively.

5 Plot PCA Results

library(ggplot2)
ggplot(
  pca_result,
  aes(x = PC1, y = PC2, color = cluster)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "PCA of Company Features",
    x = paste0("PC1 (", round(explained_var[1], 1), "% variance)"),
    y = paste0("PC2 (", round(explained_var[2], 1), "% variance)"),
    color = "Cluster"
  ) +
  theme_minimal()

The plot shows that the first two principal components effectively separate the clusters identified earlier, confirming the distinct profiles of the companies based on their size. This PCA analysis doesn’t add much new information to the report, so we will also skip it.

6 References