library(dplyr)
library(readr)
dataset <- read.csv("dataset.csv")
dataset <- dataset |>
mutate(
date_received = as.Date(date_received)
)Time Series Forecast
The date of receipt of complaints can be used to create time series forecasts of complaint volumes over time, segmented by state. This can help in identifying trends.
1 Load data
2 Daily granularity (too slow)
Note on loading packages: we can only use packages available in Power BI environment (listed here).
library(dplyr)
library(lubridate)
library(forecast)
library(janitor)
dataset <- dataset |>
clean_names()
h_days <- 90 # forecast horizon (~3 months)
min_date <- min(dataset$date_received)
max_date <- max(dataset$date_received)
# Aggregate by date and state
daily <- dataset |>
mutate(date = as.Date(date_received)) |>
group_by(state, date) |>
summarise(complaints = n(), .groups = "drop")
# Define modeling function for each state
forecast_state <- function(df) {
state_name <- unique(df$state)
# Fill missing dates for this state
all_days <- tibble(date = seq(min_date, max_date, by = "day"))
df_full <- all_days |>
left_join(df, by = "date") |>
mutate(
state = state_name,
complaints = ifelse(is.na(complaints), 0, complaints)
)
# Skip if too few data points
if (nrow(df_full) < 100) {
return(
tibble(
state = state_name,
date = df_full$date,
actual = df_full$complaints,
forecast = NA_real_,
lower95 = NA_real_,
upper95 = NA_real_
)
)
}
# Convert to daily time series
ts_data <- ts(df_full$complaints, frequency = 7) # weekly cycle assumed
# Fit model (use ARIMA for medium series, TBATS for complex ones)
if (length(ts_data) > 1000) {
model <- tbats(ts_data)
} else {
model <- auto.arima(ts_data)
}
fc <- forecast(model, h = h_days)
# Combine historical and forecasted
result <- tibble(
state = state_name,
date = seq(min_date, by = "day", length.out = length(ts_data) + h_days),
actual = c(df_full$complaints, rep(NA, h_days)),
forecast = c(rep(NA, length(ts_data)), as.numeric(fc$mean)),
lower95 = c(rep(NA, length(ts_data)), fc$lower[,2]),
upper95 = c(rep(NA, length(ts_data)), fc$upper[,2])
)
return(result)
}
output <- daily |>
group_by(state) |>
group_modify(~ forecast_state(.x)) |>
ungroup()
output |> tail()# A tibble: 6 × 6
state date actual forecast lower95 upper95
<chr> <date> <dbl> <dbl> <dbl> <dbl>
1 WY 2023-11-21 NA 0.0270 -0.172 0.226
2 WY 2023-11-22 NA 0.0207 -0.178 0.219
3 WY 2023-11-23 NA 0.00983 -0.189 0.209
4 WY 2023-11-24 NA 0.0229 -0.176 0.222
5 WY 2023-11-25 NA 0.00296 -0.196 0.202
6 WY 2023-11-26 NA 0.0141 -0.185 0.213
The daily granularity approach is quite slow, which is critical for using in PowerQuery. Therefore, we will switch to monthly granularity for faster processing.
3 Monthly granularity (faster)
We will reload the raw data to start fresh as if it runs in Power BI. We only use the first sheet of the Excel file that contains the time series.
file_path <- "DataDNA Dataset Challenge - Consumer Financial Complaints Dataset - October 2025.xlsx"
dataset <- readxl::read_excel(file_path, sheet = 1)library(dplyr)
library(lubridate)
library(forecast)
library(janitor)
dataset <- dataset |>
clean_names()
dataset <- dataset |> mutate(
date_submitted =
as.Date(date_submitted, origin = "1899-12-30"),
date_received =
as.Date(date_received, origin = "1899-12-30"),
company_response_date =
as.Date(company_response_date, origin = "1899-12-30")
)
h_months <- 6 # forecast horizon
min_date <- min(dataset$date_received)
max_date <- max(dataset$date_received)
monthly <- dataset |>
mutate(month = floor_date(as.Date(date_received), "month")) |>
group_by(state, month) |>
summarise(complaints = n(), .groups = "drop") |>
arrange(state, month)
# Modeling function for each state
forecast_state_monthly <- function(df) {
state_name <- unique(df$state)
# Fill missing months with zeros
all_months <- tibble(month = seq(min_date, max_date, by = "month"))
df_full <- all_months |>
left_join(df, by = "month") |>
mutate(
state = state_name,
complaints = ifelse(is.na(complaints), 0, complaints)
)
# Require at least 12 months to fit
if (nrow(df_full) < 12) {
return(
tibble(
state = state_name,
month = df_full$month,
actual = df_full$complaints,
forecast = NA_real_,
lower95 = NA_real_,
upper95 = NA_real_
)
)
}
ts_data <- ts(df_full$complaints, frequency = 12)
# Fit ARIMA; if fails, fall back to ETS
fit <- tryCatch(auto.arima(ts_data), error = function(e) NULL)
if (is.null(fit)) {
print(paste("ARIMA failed for state:", state_name, "- trying ETS"))
fit <- tryCatch(ets(ts_data), error = function(e) NULL)
}
if (is.null(fit)) {
return(
tibble(
state = state_name,
month = df_full$month,
actual = df_full$complaints,
forecast = NA_real_,
lower95 = NA_real_,
upper95 = NA_real_
)
)
}
fc <- forecast(fit, h = h_months)
# Combine actual + forecast
result <- tibble(
state = state_name,
month = seq(min_date, by = "month", length.out = length(ts_data) + h_months),
actual = c(df_full$complaints, rep(NA, h_months)),
forecast = c(rep(NA, length(ts_data)), as.numeric(fc$mean)),
lower95 = c(rep(NA, length(ts_data)), fc$lower[, 2]),
upper95 = c(rep(NA, length(ts_data)), fc$upper[, 2])
)
return(result)
}
# Apply across states
output <- monthly |>
group_by(state) |>
group_modify(~ forecast_state_monthly(.x)) |>
ungroup()Warning: Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
Unknown or uninitialised column: `state`.
# Fill the latest data points with actual values (for visualization consistency)
output <- output |> mutate(
forecast =
ifelse(month == floor_date(as.Date(max_date), "month"), actual, forecast),
lower95 =
ifelse(month == floor_date(as.Date(max_date), "month"), actual, lower95),
upper95 =
ifelse(month == floor_date(as.Date(max_date), "month"), actual, upper95),
)
# Add rolling mean for smoother actual trend line
output <- output |>
group_by(state) |>
arrange(month) |>
mutate(
actual_ma = zoo::rollmean(actual, k = 12, fill = NA, align = "right")
) |>
ungroup()
# View the tail of the output; delete this line in production
output |> tail()# A tibble: 6 × 7
state month actual forecast lower95 upper95 actual_ma
<chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
1 VA 2024-02-01 NA 39.5 20.5 58.5 NA
2 VT 2024-02-01 NA 0.811 -0.981 2.60 NA
3 WA 2024-02-01 NA 23.2 11.5 34.8 NA
4 WI 2024-02-01 NA 3.83 -0.736 8.39 NA
5 WV 2024-02-01 NA 1.85 -1.04 4.73 NA
6 WY 2024-02-01 NA 0.288 -1.03 1.61 NA
This approach works much faster and is suitable for Power BI integration. The output contains actual complaint counts, forecasts, and confidence intervals for each state on a monthly basis. The latest actual data point is used to anchor the forecast visually in the plot.
We’re going to include this code chunk to the Power BI report.