Analysis Document: Analysis of activity statement data

Author

Rob Heirene

Published

February 6, 2025

Preamble

This document outlines the code used for our analyses of data relating to use and impact of activity statements.

Set up analysis environment

Load packages and datasets:

Show set-up code
# install.packages("groundhog") # Install

library(groundhog) # Load


# List desired packages:
packages <- c('tidyverse', # Clean, organise, and visualise data
              "readxl", #  load Excel files
               "survey", # weight study data
              "weights", # Weighted statistics 
              "segmented",
              "wCorr", # Weighted correlations (https://cran.r-project.org/web/packages/wCorr/wCorr.pdf)
              'MASS', # Ordinal regression
              'lmtest', # Model diagnostics the timeseries regressions
              'performance', # Check model diagnostics
              'see', # supports above package
              'e1071', # Asses variable skewness
              'bestNormalize', # transformed variables
              'brant', # test regression model assumptions
              'pscl', # Compute model performance metrics
              'rms', # Compute model performance metrics
              'emmeans', # compute marginal means
              'DescTools', # Many functions but used when is predictor variables
              'kableExtra', # Make tables
              'formattable', #  Add visualisations to tables
              'gt', # Alternative table options
              'gtExtras', # Add colours to gt tables
              'gtsummary', # Create summary tables
              'plotly', # Add interactive elements to figures
              'htmlwidgets', # Make plotly plots HTML format
              'ggstatsplot', # Produce plots with combined statistical output
              'rstantools', # required for above package
              'ggstream', # smooooth area charts
              # 'ggsankey', # ggplot style sankey plots
              'ggrain', # Raincloud plots
              'networkD3', # Sankey plots
              'likert', # likert data plots
              'ggstats', # Also likert data plots that accept weights
              'viridis', # plot colours
              'scales', # Wrap labels in plots
              "patchwork", # Assemble/combine plots
              'rcartocolor', # plot colours
              'NatParksPalettes', # plot colours
              'sysfonts', # Special fonts for figures
              'showtext', # Special fonts for figures
              "data.table", # Work with very large datasets more easily
              "qs", # Save and store very large datasets
              'sessioninfo' # Detailed session info for reproducibility
              ) 


# Load desired package with versions specific to project start date:
groundhog.library(packages, "2024-05-29", force.install = TRUE)

Set up a standard theme for plots/data visualisations:

Show code
# Load new font for figures/graphs
font_add_google("Poppins")
showtext_auto()

# Save new theme for figures/graphs.This will determine the layout, presentation, font type and font size used in all data visualisations presented here:
plot_theme<- theme_classic() +
  theme(
    text=element_text(family="Poppins"),
    plot.title = element_text(hjust = 0.5, size = 26),
          plot.subtitle = element_text(hjust = 0.5, size = 22),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 22),
    plot.caption = element_text(size = 16),
    legend.title=element_text(size=18), 
    legend.text=element_text(size=16)
        ) 

# Colour schemes:

# Extract and customize the "Arches" palette
arches_palette <- natparks.pals("Arches", 11)

custom_palette1 <- c(
  arches_palette[1], 
  arches_palette[3], 
  arches_palette[5], 
  arches_palette[8], 
  arches_palette[10] 
)

custom_palette2 <- c(
  arches_palette[1], 
  arches_palette[3], 
  arches_palette[6], 
  arches_palette[9], 
  arches_palette[11] 
)

arches_palette_extra <- natparks.pals("Arches", 25)

custom_palette3 <- c(
  arches_palette_extra[1],  
  arches_palette_extra[3],  
  arches_palette_extra[5],  
  arches_palette_extra[14],  
  arches_palette_extra[15]
)

custom_palette4 <- c(
  arches_palette_extra[1],  
  arches_palette_extra[6],  
  arches_palette_extra[10],  
  arches_palette_extra[16],  
  arches_palette_extra[25]
)



custom_report_palette_1<-  c("#440154FF", 
                             "#3B528BFF", 
                             "#21908CFF", 
                             "#4AC16DFF",
                             "#9FDA3AFF")
custom_report_palette_2 <- c("#440154FF", 
                             "#414487FF", 
                             # "#2A788EFF", 
                             "#F2F2F2", 
                             "#4AC16DFF",
                             "#9FDA3AFF")

Load data

Load in master dataset containing all customer details (code hidden):

Process the data for analysis by removing individuals who didn`t pass the attention checks and those who did not complete the survey:

Show code
master_dataset <-  master_dataset_initial %>%
  as_tibble() %>%
    filter(SURVEY_STATUS_PGSI == "Complete") %>%
filter(PAS_6M_BET_FREQ_DAYS != 0) %>% # Remove anybody who didn't bet in the window of interest
     filter(
            (ATTENTION_CHECK_1 == "PASSED" | is.na(ATTENTION_CHECK_1)) &
            (ATTENTION_CHECK_2 == "PASSED" | is.na(ATTENTION_CHECK_2))
             ) %>%
  mutate(GHM_OVER_PRIORITISE = as.numeric(GHM_OVER_PRIORITISE),
    GHM_STRAIN = as.numeric(GHM_STRAIN),
    GHM_SEVERE = as.numeric(GHM_SEVERE)
  ) %>%
  # Enforce the order of factors so that they present properly in tables and graphs:
  mutate(PGSI_CATEGORY = factor(PGSI_CATEGORY, levels = c('No risk',
                                                              'Low risk',
                                                              'Moderate risk',
                                                              'High risk')))

Analyses - Survey

Recruitment

Provide some key figures for the participant section in the methods.

How many people were invited to the survey?

Show code
 master_dataset_initial %>%
  nrow()
[1] 24829

How many people completed up to the PGSI?

Show code
 master_dataset_initial %>%
   filter(SURVEY_STATUS_PGSI == "Complete") %>%
  nrow()
[1] 1959

How many people were removed for not passing the attention check?

Show code
 master_dataset_initial %>%
  filter(SURVEY_STATUS_PGSI == "Complete") %>%
  filter(ATTENTION_CHECK_1 == "NOT PASSED" | ATTENTION_CHECK_2 == "NOT PASSED")  %>%
  nrow()
[1] 309

And how many remaining people didn’t place a bet in the 6 month before survey?

Show code
master_dataset_initial %>%
  as_tibble() %>%
  filter(SURVEY_STATUS_PGSI == "Complete") %>%
  filter(ATTENTION_CHECK_1 == "PASSED" | ATTENTION_CHECK_2 == "PASSED")  %>%
filter(PAS_6M_BET_FREQ_DAYS == 0) %>% 
  nrow()
[1] 3

Leaving how many participants in our sample used for analyses:

Show code
 master_dataset_initial %>%
    as_tibble() %>%
  filter(SURVEY_STATUS_PGSI == "Complete") %>%
     filter(
            (ATTENTION_CHECK_1 == "PASSED" | is.na(ATTENTION_CHECK_1)) &
            (ATTENTION_CHECK_2 == "PASSED" | is.na(ATTENTION_CHECK_2))
             ) %>%
  filter(PAS_6M_BET_FREQ_DAYS != 0) %>% 
  nrow()
[1] 1647

Weighting

All of the results presented from now, unless otherwise specified, will be weighted outcomes. Let’s create the dataframe required for weighting:

Show code
# Create survey design object:
survey_design <- survey::svydesign(
  id = ~1,
  data = master_dataset,
  weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS
)

Sample summary

Produce weighted summary variables that describe the entire survey sample.

Show code
# Gender percentages
q_table_GENDER <- svytable(~GENDER, survey_design)
prop_table_GENDER <- prop.table(q_table_GENDER)
GENDER_Percentages <- prop_table_GENDER %>% 
  as.data.frame() %>%
  mutate('Percent' = round(Freq*100, 2)) %>%
  select(-Freq) %>%
  mutate(Characteristic = paste("Gender:", GENDER),
         Value = paste0(Percent, "%")) %>%
  select(Characteristic, Value)

# Age mean and SD
Age_mean <- svymean(~AGE, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(rowname == "mean")
Age_sd <- svyvar(~AGE, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
   rownames_to_column() %>%
  mutate(SD = sqrt(variance)) %>%
  select(SD)

Age_summary <- bind_cols(Age_mean, Age_sd) %>%
  mutate(Characteristic = "Age",
         Value = paste0(round(AGE, 2), " (", round(SD, 2), ")")) %>%
  select(Characteristic, Value)

# PGSI mean and SD
PGSI_TOTAL_mean <- svymean(~PGSI_TOTAL, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  
  rownames_to_column() %>%
  filter(rowname == "mean") 
PGSI_TOTAL_sd <- svyvar(~PGSI_TOTAL, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(SD = sqrt(variance)) %>%
  select(SD)

PGSI_summary <- bind_cols(PGSI_TOTAL_mean, PGSI_TOTAL_sd) %>%
  mutate(Characteristic = "PGSI Total Score",
         Value = paste0(round(PGSI_TOTAL, 2), " (", round(SD, 2), ")")) %>%
  select(Characteristic, Value)

# Education percentages
q_table_EDUCATION <- svytable(~EDUCATION, survey_design)
prop_table_EDUCATION <- prop.table(q_table_EDUCATION)
EDUCATION_Percentages <- prop_table_EDUCATION %>%
  as.data.frame()

# Check column names to see the correct variable name
# colnames(EDUCATION_Percentages)

EDUCATION_Percentages <- EDUCATION_Percentages %>%
  mutate('Percent' = round(Freq*100, 2)) %>%
  select(-Freq) %>%
  mutate(Characteristic = paste("Education:", EDUCATION),
         Value = paste0(Percent, "%")) %>%
  select(Characteristic, Value)

# Betting intensity mean and SD
PAS_30_BET_INTENSITY_mean <- svymean(~PAS_30_BET_INTENSITY, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(rowname == "mean") 
PAS_30_BET_INTENSITY_sd <- svyvar(~PAS_30_BET_INTENSITY, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  
  rownames_to_column() %>%
  mutate(SD = sqrt(variance)) %>%
  select(SD)

Betting_intensity_summary <- bind_cols(PAS_30_BET_INTENSITY_mean, PAS_30_BET_INTENSITY_sd) %>%
  mutate(Characteristic = "Past 30 Day Betting Intensity",
         Value = paste0(round(PAS_30_BET_INTENSITY, 2), " (", round(SD, 2), ")")) %>%
  select(Characteristic, Value)

# Past 30 day net outcome mean and SD
PAS_30_NET_OUTCOME_mean <- svymean(~PAS_30_NET_OUTCOME, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  
  rownames_to_column() %>%
  filter(rowname == "mean")
PAS_30_NET_OUTCOME_sd <- svyvar(~PAS_30_NET_OUTCOME, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(SD = sqrt(variance)) %>%
  select(SD)

Net_outcome_summary <- bind_cols(PAS_30_NET_OUTCOME_mean, PAS_30_NET_OUTCOME_sd) %>%
  mutate(Characteristic = "Past 30 Day Net Outcome",
         Value = paste0(round(PAS_30_NET_OUTCOME, 2), " (", round(SD, 2), ")")) %>%
  select(Characteristic, Value)

# Past 30 day deposit frequency mean and SD
PAS_30_TOTAL_DEPOSIT_FREQ_mean <- svymean(~PAS_30_TOTAL_DEPOSIT_FREQ, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(rowname == "mean") 

PAS_30_TOTAL_DEPOSIT_FREQ_sd <- svyvar(~PAS_30_TOTAL_DEPOSIT_FREQ, survey_design, na.rm = TRUE) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(SD = sqrt(variance)) %>%
  select(SD)

Deposit_freq_summary <- bind_cols(PAS_30_TOTAL_DEPOSIT_FREQ_mean, PAS_30_TOTAL_DEPOSIT_FREQ_sd) %>%
  mutate(Characteristic = "Past 30 Day Deposit Frequency",
         Value = paste0(round(PAS_30_TOTAL_DEPOSIT_FREQ, 2), " (", round(SD, 2), ")")) %>%
  select(Characteristic, Value)

# Active deposit limit percentages
q_table_ACTIVE_DEPOSIT_LIMIT <- svytable(~ACTIVE_DEPOSIT_LIMIT, survey_design)
prop_table_ACTIVE_DEPOSIT_LIMIT <- prop.table(q_table_ACTIVE_DEPOSIT_LIMIT)
Deposit_limit_Percentages <- prop_table_ACTIVE_DEPOSIT_LIMIT %>%
  as.data.frame() %>%
  mutate('Percent' = round(Freq*100, 2)) %>%
  select(-Freq) %>%
  mutate(Characteristic = paste("Active Deposit Limit:", ACTIVE_DEPOSIT_LIMIT),
         Value = paste0(Percent, "%")) %>%
  select(Characteristic, Value)

# Combine all summaries
combined_summary <- bind_rows(GENDER_Percentages,
                              Age_summary,
                              PGSI_summary,
                              EDUCATION_Percentages,
                              Betting_intensity_summary,
                              Net_outcome_summary,
                              Deposit_freq_summary,
                              Deposit_limit_Percentages)

# Display as a table
combined_summary %>%
  gt()%>%
  tab_options(data_row.padding = px(3)) %>%
  tab_header("Table 1. Sample characteristics and use of activity statements")
Table 1. Sample characteristics and use of activity statements
Characteristic Value
Gender: Female 13.79%
Gender: Male 84.62%
Gender: Unknown 1.59%
Age 44.06 (15.5)
PGSI Total Score 3.64 (4.12)
Education: Advanced diploma/diploma 9.05%
Education: Bachelor's degree 17.85%
Education: Certificate III/IV 19.23%
Education: Doctoral degree 0.33%
Education: Graduate diploma or graduate level certificate 8.47%
Education: Master's degree 4.05%
Education: Year 11 or below (includes Certificate I/II/n) 16.95%
Education: Year 12 24.07%
Past 30 Day Betting Intensity 7.87 (15.54)
Past 30 Day Net Outcome -181.01 (1226.21)
Past 30 Day Deposit Frequency 9.41 (28.48)
Active Deposit Limit: 0 90.45%
Active Deposit Limit: 1 9.55%

Confirm sample size for above:

Show code
nrow(master_dataset)
[1] 1647

Question summaries

Let’s provide some basic statistics for each of the questions relating to activity statement use.

Q: Preferred tools

The first question of interest was:

Please indicate how useful the following are in supporting you to track your gambling spending:

  • Activity Statements

  • Bank Statements

  • Setting a deposit limit on my account

  • Only gambling what I have in my account

  • Setting a budget

  • Recording my bets

  • Using site ‘check-in’ tool

  • Other

For this, we don’t want to provide loads of outcomes, let’s just plop the responses so we can see which tools were most preferred:

Show code
# Define a function to create weighted frequency and percentage tables
create_weighted_table <- function(var_name) {
  freq_table <- svytable(as.formula(paste("~", var_name)), survey_design)
  prop_table <- prop.table(freq_table)
  
  percentages <- prop_table %>%
    as.data.frame() %>%
    mutate('Percent' = round(Freq * 100, 2)) %>%
    rename(Response = !!sym(var_name)) %>%
    select(-Freq) %>%
    mutate(Variable = var_name) # Add variable name for identification
  
  return(percentages)
}

# List of variables to process
variables <- c("ACT2_1", "ACT2_2", "ACT2_3", "ACT2_4", "ACT2_5", "ACT2_6", "ACT2_7", "ACT2_8")

# Apply the function to each variable and combine results
weighted_tables <- lapply(variables, create_weighted_table)
combined_table <- bind_rows(weighted_tables) %>%
 mutate(Variable = case_when(
    Variable == "ACT2_1" ~ "Activity Statements",
    Variable == "ACT2_2" ~ "Bank Statements",
    Variable == "ACT2_3" ~ "Setting a deposit limit on my account",
    Variable == "ACT2_4" ~ "Only gambling what I have in my account",
    Variable == "ACT2_5" ~ "Setting a budget",
    Variable == "ACT2_6" ~ "Recording my bets",
    Variable == "ACT2_7" ~ "Using site 'check-in' tool",
    Variable == "ACT2_8" ~ "Other"
  )) %>%
   mutate(Variable = factor(Variable, levels = c(
    "Activity Statements",
    "Bank Statements",
    "Setting a deposit limit on my account",
    "Only gambling what I have in my account",
    "Setting a budget",
    "Recording my bets",
    "Using site 'check-in' tool",
    "Other"
  )))

# Ensure Response is a factor with the correct levels in the desired order
combined_table$Response <- factor(combined_table$Response, levels = rev(c('Not Important', 'Slightly Important', 'Moderately Important', 'Important', 'Very Important')))

# Display the combined table using gt
combined_table %>%
  pivot_wider(names_from = Response, values_from = Percent) %>%
  gt() %>%
   tab_options(data_row.padding = px(3)) %>%
  tab_header("Responses to most useful tools question", subtitle = md("***Weighted outcomes***"))
Responses to most useful tools question
Weighted outcomes
Variable Important Moderately Important Not Important Slightly Important Very Important
Activity Statements 20.75 16.77 36.06 15.51 10.91
Bank Statements 14.91 14.65 44.50 17.24 8.70
Setting a deposit limit on my account 15.12 11.94 47.92 11.90 13.12
Only gambling what I have in my account 29.19 18.27 16.64 13.33 22.57
Setting a budget 25.94 12.84 24.00 13.79 23.42
Recording my bets 15.57 17.82 38.02 16.76 11.83
Using site 'check-in' tool 7.32 11.55 64.36 12.64 4.13
Other 5.19 7.98 74.69 3.81 8.33

Now produce a plot of this data:

Show code
# Add line breaks for longer stringy strings:
combined_table$Variable <- gsub("Only gambling what I have in my account", 
                                         "Only gambling what I have\nin my account", 
                                         combined_table$Variable)

combined_table$Variable <- gsub("Setting a deposit limit on my account", 
                                         "Setting a deposit limit on\nmy account", 
                                         combined_table$Variable)

# Sum the proportions of "Moderately Important", "Important", and "Very Important" for each variable
importance_sum <- combined_table %>%
  filter(Response %in% c('Moderately Important', 'Important', 'Very Important')) %>%
  group_by(Variable) %>%
  summarise(Importance_Sum = round(sum(Percent),0))

# Order variables by the computed sum
order_vars <- importance_sum %>%
  arrange(Importance_Sum) %>%
  pull(Variable)

# Join the computed sum back to the main data and reorder the Variable factor
combined_table_w_counts <- combined_table %>%
  left_join(importance_sum, by = "Variable") %>%
  mutate(Variable = factor(Variable, levels = order_vars))

# Plot the data
Q2_summary_plot<- ggplot(combined_table_w_counts, aes(x = Variable, y = Percent, fill = Response)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(data = combined_table_w_counts %>% filter(Response == "Very Important"), 
            aes(x = Variable, y = 94, label = round(Importance_Sum, 2)), 
            hjust = 0, color = "white", fontface = "bold") +
  coord_flip() +
  # scale_fill_manual(values=natparks.pals("Arches")) +
    scale_fill_manual(values = custom_palette3) +
  labs(title = NULL, x = NULL, y = "Percentage (%)", fill = "Response") 

# Present plot in quarto doc:
Q2_summary_plot +
  plot_theme

Show code
# Present plot for report:

# ggplot(combined_table_w_counts, aes(x = Variable, y = Percent, fill = Response)) +
#   geom_bar(stat = "identity", position = "stack") +
#   geom_text(data = combined_table_w_counts %>% filter(Response == "Very Important"),
#             aes(x = Variable, y = 94, label = round(Importance_Sum, 2)),
#             hjust = 0, color = "white", fontface = "bold") +
#   coord_flip() +
#   # scale_fill_manual(values=natparks.pals("Arches")) +
#     scale_fill_manual(values = custom_report_palette_1) +
#   labs(title = NULL, x = NULL, y = "Percentage (%)", fill = "Response") +
#    theme_classic() +
#   theme(text = element_text(family = "Poppins"),
#           plot.margin = margin(t = 0, r = 0, b = 0, l = 5.5))


# Save plot with proper text dimensions for paper:
Q2_summary_plot_saved<- Q2_summary_plot +
  theme_classic() +
  theme(text = element_text(family = "Poppins"),
          plot.margin = margin(t = 0, r = 0, b = 0, l = 0))

ggsave("Figures/Q2_summary_plot.pdf", 
       plot = Q2_summary_plot_saved, 
       height = 80, 
       width = 170, 
       units = "mm")

The numbers in the above plot represent the total proportion of responses that were: ‘Moderately Important’, ‘Important’, or ‘Very Important’.

Q: Open statements

The next question asked was:

How often have you opened and read a monthly activity statement?

Show code
# master_dataset$ACT3_READ_FREQ

freq_table_ACT3 <- svytable(~ACT3_READ_FREQ, survey_design)
prop_table_ACT3<- prop.table(freq_table_ACT3)

ACT3_Percentages<- prop_table_ACT3 %>% 
  as.data.frame() %>%
mutate('Percent' = round(Freq*100,2)) %>%
rename(Response = ACT3_READ_FREQ) %>%
      mutate(Response = factor(Response, levels = c('Never',
                                                    'Rarely',
                                                    'Sometimes',
                                                    'Very Often',
                                                    'Always'))) %>%
  arrange(Response)

ACT3_Percentages %>%
select(-Freq) %>%
gt() %>%
  tab_header("How often have you opened and read a monthly activity statement?", subtitle = md("***Weighted outcomes***"))
How often have you opened and read a monthly activity statement?
Weighted outcomes
Response Percent
Never 23.07
Rarely 19.76
Sometimes 23.55
Very Often 13.29
Always 20.33

What proportion of customers said they sometimes, very often, or always open statements:

Show code
ACT3_Percentages %>%
filter(Response != "Never" & Response != "Rarely") %>%
summarise(sum=sum(Percent))
    sum
1 57.17

Now provide some summary summary statistics relating to gambling harm/risk:

Show code
# Compute mean and SD for PGSI grouped by ACT3_READ_FREQ
pgsi_mean_ACT3_READ_FREQ <- svyby(~PGSI_TOTAL, ~ACT3_READ_FREQ, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT3_READ_FREQ,
        Mean_PGSI = PGSI_TOTAL)
pgsi_SD_ACT3_READ_FREQ <- svyby(~PGSI_TOTAL, ~ACT3_READ_FREQ, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(PGSI_SD = sqrt(PGSI_TOTAL)) %>%
 select(Response = ACT3_READ_FREQ,
        SD_PGSI = PGSI_SD)

# Compute mean and SD for Gambling harms measure grouped by ACT3_READ_FREQ
GHM_mean_ACT3_READ_FREQ <- svyby(~GHM_TOTAL, ~ACT3_READ_FREQ, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT3_READ_FREQ,
        Mean_GHM = GHM_TOTAL)
GHM_SD_ACT3_READ_FREQ <- svyby(~GHM_TOTAL, ~ACT3_READ_FREQ, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(GHM_SD = sqrt(GHM_TOTAL)) %>%
 select(Response = ACT3_READ_FREQ,
        SD_GHM = GHM_SD)

Act_3_Risk_scores<- pgsi_mean_ACT3_READ_FREQ %>%
left_join(pgsi_SD_ACT3_READ_FREQ, by = 'Response') %>%
left_join(GHM_mean_ACT3_READ_FREQ, by = 'Response') %>%
  left_join(GHM_SD_ACT3_READ_FREQ, by = 'Response') %>%
  mutate_if(is.numeric, round, 2) %>%
  mutate(Response = factor(Response, levels = c('Never', 'Rarely', 'Sometimes', 'Very Often', 'Always'))) %>%
  arrange(Response)

gt(Act_3_Risk_scores) %>%
     tab_options(data_row.padding = px(3)) %>%
  tab_header("Gambling harm scores grouped by response to: How often have you opened and read a monthly activity statement?", subtitle = md("***Weighted outcomes***"))
Gambling harm scores grouped by response to: How often have you opened and read a monthly activity statement?
Weighted outcomes
Response Mean_PGSI SD_PGSI Mean_GHM SD_GHM
Never 3.62 4.36 1.38 2.57
Rarely 3.93 4.40 1.52 2.87
Sometimes 3.78 4.04 1.59 2.98
Very Often 3.83 4.07 1.38 2.26
Always 2.90 3.52 1.01 2.14

Let’s also provide a table that shows us the proportion of responses in relation to each PGSI category:

Show code
pgsi_cat_ACT3_READ_FREQ  <- svytable(~ACT3_READ_FREQ + PGSI_CATEGORY, survey_design)

# Convert to a data frame and compute proportions
pgsi_cat_df_ACT3_READ_FREQ <- as.data.frame(pgsi_cat_ACT3_READ_FREQ) %>%
  group_by(PGSI_CATEGORY) %>%
  mutate(Proportion = Freq / sum(Freq)) %>%
  ungroup() %>%
  select(Response = ACT3_READ_FREQ, PGSI_CATEGORY, Proportion) %>%
  mutate(Proportion = round(Proportion * 100, 2)) %>%
    mutate(Response = factor(Response, levels = c('Never', 'Rarely', 'Sometimes', 'Very Often', 'Always'))) %>%
  arrange(Response)

# Present as table:
pgsi_cat_df_ACT3_READ_FREQ %>%
  pivot_wider(names_from = PGSI_CATEGORY, values_from = Proportion) %>%
  gt() %>%
  tab_options(data_row.padding = px(3)) %>%
  tab_header(
    title = "Proportion of customers in each PGSI category by response to: How often have you opened and read a monthly activity statement?",
    subtitle = md("***Weighted outcomes***")
  )
Proportion of customers in each PGSI category by response to: How often have you opened and read a monthly activity statement?
Weighted outcomes
Response No risk Low risk Moderate risk High risk
Never 26.02 22.12 20.29 25.65
Rarely 21.34 17.30 18.33 24.40
Sometimes 19.31 25.61 24.88 24.41
Very Often 10.28 13.63 15.34 13.48
Always 23.05 21.33 21.16 12.06

Plot this data:

Show code
# Sum the proportions of "Moderately Important", "Important", and "Very Important" for each variable
importance_sum_ACT3_READ_FREQ <- pgsi_cat_df_ACT3_READ_FREQ %>%
  filter(Response %in% c('Always', 'Very Often', 'Sometimes')) %>%
  group_by(PGSI_CATEGORY) %>%
  summarise(Importance_Sum = round(sum(Proportion),0))

# Join the computed sum back to the main data table:
pgsi_cat_df_ACT3_READ_FREQ_w_counts <- pgsi_cat_df_ACT3_READ_FREQ %>%
  left_join(importance_sum_ACT3_READ_FREQ, by = "PGSI_CATEGORY")

Q3_summary_plot <-
  pgsi_cat_df_ACT3_READ_FREQ_w_counts %>%
    mutate(Response = factor(Response, levels = c('Always', 'Very Often', 'Sometimes', 'Rarely', 'Never'))) %>%
  # arrange(desc(Response)) %>%
ggplot(aes(x = PGSI_CATEGORY, y = Proportion, fill = Response)) +
  geom_bar(stat = "identity", position = "stack") +
    geom_text(data = pgsi_cat_df_ACT3_READ_FREQ_w_counts %>% filter(Response == "Always"), 
            aes(x = PGSI_CATEGORY, y = 94, label = round(Importance_Sum, 2)), 
            hjust = 0, color = "white", fontface = "bold") +
  coord_flip() +
  # scale_fill_manual(values = natparks.pals("Arches", 5)) +
    scale_fill_manual(values = custom_palette1) +
  labs(title = NULL, x = NULL, y = "Percentage (%)", fill = "Response") +
    theme(text = element_text(family = "Poppins"))

# Present plot in quarto doc:
Q3_summary_plot +
  plot_theme

Show code
# Safe plot with proper text dimensions for paper:
Q3_summary_plot_saved<- Q3_summary_plot +
  theme_classic() +
  theme(text = element_text(family = "Poppins"),
        legend.title = element_text(size = 9),  
        legend.text = element_text(size = 8))

ggsave("Figures/Q3_summary_plot_2.pdf", 
       plot = Q3_summary_plot_saved, 
       height = 1.9, 
       width = 6, 
       units = "in")

The numbers in the above plot represent the total proportion of responses that were: ‘Sometimes’, ‘Very often’, or ‘Always’.

Q: Attention to statements

The next question asked was:

Upon opening, how closely did you read the monthly activity statements?

Note that there was an error in our survey such that people who stated they “Always” read statements did not receive this question. We do not present outcomes from this question in the manuscript as a result but include them here for transparency.

Show code
# master_dataset$ACT4_READ_CLOSE
freq_table_ACT4 <- svytable(~ACT4_READ_CLOSE, survey_design)
prop_table_ACT4<- prop.table(freq_table_ACT4)

ACT4_Percentages<- prop_table_ACT4 %>% 
  as.data.frame() %>%
mutate('Percent' = round(Freq*100,2)) %>%
rename(Response = ACT4_READ_CLOSE) %>%
      mutate(Response = factor(Response, levels = c('Not at all',
                                                    'Slightly',
                                                    'Moderately',
                                                    'Very',
                                                    'Extremely'))) %>%
  arrange(Response)

ACT4_Percentages %>%
select(-Freq) %>%
gt() %>%
  tab_header("Upon opening, how closely did you read the monthly activity statements?", subtitle = md("***Weighted outcomes***"))
Upon opening, how closely did you read the monthly activity statements?
Weighted outcomes
Response Percent
Not at all 35.04
Slightly 24.12
Moderately 25.11
Very 14.08
Extremely 1.66

Now provide some summary summary statistics relating to gambling harm/risk:

Show code
# Compute mean and SD for PGSI grouped by ACT4_READ_CLOSE
pgsi_mean_ACT4_READ_CLOSE <- svyby(~PGSI_TOTAL, ~ACT4_READ_CLOSE, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT4_READ_CLOSE,
        Mean_PGSI = PGSI_TOTAL)
pgsi_SD_ACT4_READ_CLOSE <- svyby(~PGSI_TOTAL, ~ACT4_READ_CLOSE, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(PGSI_SD = sqrt(PGSI_TOTAL)) %>%
 select(Response = ACT4_READ_CLOSE,
        SD_PGSI = PGSI_SD)

# Compute mean and SD for Gambling harms measure grouped by ACT4_READ_CLOSE
GHM_mean_ACT4_READ_CLOSE <- svyby(~GHM_TOTAL, ~ACT4_READ_CLOSE, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT4_READ_CLOSE,
        Mean_GHM = GHM_TOTAL)
GHM_SD_ACT4_READ_CLOSE <- svyby(~GHM_TOTAL, ~ACT4_READ_CLOSE, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(GHM_SD = sqrt(GHM_TOTAL)) %>%
 select(Response = ACT4_READ_CLOSE,
        SD_GHM = GHM_SD)

Act_4_Risk_scores<- pgsi_mean_ACT4_READ_CLOSE %>%
left_join(pgsi_SD_ACT4_READ_CLOSE, by = 'Response') %>%
left_join(GHM_mean_ACT4_READ_CLOSE, by = 'Response') %>%
  left_join(GHM_SD_ACT4_READ_CLOSE, by = 'Response') %>%
  mutate_if(is.numeric, round, 2) %>%
  mutate(Response = factor(Response, levels = c('Not at all',
                                                    'Slightly',
                                                    'Moderately',
                                                    'Very',
                                                    'Extremely'))) %>%
  arrange(Response)

gt(Act_4_Risk_scores) %>%
     tab_options(data_row.padding = px(3)) %>%
  tab_header("Gambling harm scores grouped by response to: Upon opening, how closely did you read the monthly activity statements?", subtitle = md("***Weighted outcomes***"))
Gambling harm scores grouped by response to: Upon opening, how closely did you read the monthly activity statements?
Weighted outcomes
Response Mean_PGSI SD_PGSI Mean_GHM SD_GHM
Not at all 3.77 4.41 1.50 2.94
Slightly 3.77 4.08 1.34 2.54
Moderately 3.81 3.93 1.47 2.50
Very 3.86 4.63 1.63 2.91
Extremely 2.83 2.90 1.56 2.45

Again, also provide a table that shows us the proportion of responses in relation to each PGSI category:

Show code
pgsi_cat_ACT4_READ_CLOSE  <- svytable(~ACT4_READ_CLOSE + PGSI_CATEGORY, survey_design)

# Convert to a data frame and compute proportions
pgsi_cat_df_ACT4_READ_CLOSE <- as.data.frame(pgsi_cat_ACT4_READ_CLOSE) %>%
  group_by(PGSI_CATEGORY) %>%
  mutate(Proportion = Freq / sum(Freq)) %>%
  ungroup() %>%
  select(Response = ACT4_READ_CLOSE, PGSI_CATEGORY, Proportion) %>%
  mutate(Proportion = round(Proportion * 100, 2)) %>%
    mutate(Response = factor(Response, levels = c('Not at all',
                                                    'Slightly',
                                                    'Moderately',
                                                    'Very',
                                                    'Extremely'))) %>%
  arrange(Response)

# Present as table:
pgsi_cat_df_ACT4_READ_CLOSE %>%
  pivot_wider(names_from = PGSI_CATEGORY, values_from = Proportion) %>%
  gt() %>%
  tab_options(data_row.padding = px(3)) %>%
  tab_header(
    title = "Proportion of customers in each PGSI category by response to: Upon opening, how closely did you read the monthly activity statements?",
    subtitle = md("***Weighted outcomes***")
  )
Proportion of customers in each PGSI category by response to: Upon opening, how closely did you read the monthly activity statements?
Weighted outcomes
Response No risk Low risk Moderate risk High risk
Not at all 38.49 34.73 31.21 37.78
Slightly 23.68 23.79 25.31 22.96
Moderately 21.89 27.57 24.81 26.79
Very 14.21 12.28 16.61 11.69
Extremely 1.73 1.63 2.06 0.79

Plot this data:

Show code
# Sum the proportions of "Moderately Important", "Important", and "Very Important" for each variable
importance_sum_ACT4_READ_CLOSE <- pgsi_cat_df_ACT4_READ_CLOSE %>%
  filter(Response %in% c('Moderately',
                                                    'Very',
                                                    'Extremely')) %>%
  group_by(PGSI_CATEGORY) %>%
  summarise(Importance_Sum = round(sum(Proportion),0))

# Join the computed sum back to the main data table:
pgsi_cat_df_ACT4_READ_CLOSE_w_counts <- pgsi_cat_df_ACT4_READ_CLOSE %>%
  left_join(importance_sum_ACT4_READ_CLOSE, by = "PGSI_CATEGORY")

Q4_summary_plot <-
  pgsi_cat_df_ACT4_READ_CLOSE_w_counts %>%
    mutate(Response = factor(Response, levels = c('Extremely',
'Very',
'Moderately',
'Slightly',
'Not at all'))) %>%
  # arrange(desc(Response)) %>%
ggplot(aes(x = PGSI_CATEGORY, y = Proportion, fill = Response)) +
  geom_bar(stat = "identity", position = "stack") +
    geom_text(data = pgsi_cat_df_ACT4_READ_CLOSE_w_counts %>% filter(Response == "Extremely"), 
            aes(x = PGSI_CATEGORY, y = 94, label = round(Importance_Sum, 2)), 
            hjust = 0, color = "white", fontface = "bold") +
  coord_flip() +
  # scale_fill_manual(values = natparks.pals("Arches", 5)) +
      scale_fill_manual(values = custom_palette1) +
  labs(title = NULL, x = NULL, y = "Percentage (%)", fill = "Response") +
    theme(text = element_text(family = "Poppins"))

# Present plot in quarto doc:
Q4_summary_plot +
  plot_theme

Show code
# Save plot with proper text dimensions for paper:
Q4_summary_plot_saved<- Q4_summary_plot +
  theme_classic() +
  theme(text = element_text(family = "Poppins"),
        legend.title = element_text(size = 9),  
        legend.text = element_text(size = 8))

ggsave("Figures/Q4_summary_plot.pdf", 
       plot = Q4_summary_plot_saved, 
       height = 1.9, 
       width = 6, 
       units = "in")

The numbers in the above plot represent the total proportion of responses that were: ‘Moderately’, ‘Very’, or ‘Extremely’.

Q: Statement impact

The next question asked was:

To what extent if any did reading your activity statement result in any change in your gambling behaviour?

Note that there was an error in our survey such that people who stated they “Always” read statements did not receive this question. The group that stated they “Never” opened their statements should have been excluded from this question. As such, we will exclude those who gave this response from the summary of outcomes here.

Show code
master_dataset_ACT3_never_removed   <-  master_dataset %>%
      filter(ACT3_READ_FREQ != "Never") 

# Create Updated survey design object:
survey_design_ACT3_never_removed <- survey::svydesign(
  id = ~1,
  data = master_dataset_ACT3_never_removed,
  weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS
)

# master_dataset$ACT5_BEHAVIOUR

      
freq_table_ACT5 <- svytable(~ACT5_BEHAVIOUR, survey_design_ACT3_never_removed)
prop_table_ACT5<- prop.table(freq_table_ACT5)

ACT5_Percentages<- prop_table_ACT5 %>% 
  as.data.frame() %>%
mutate('Percent' = round(Freq*100,2)) %>%
rename(Response = ACT5_BEHAVIOUR) %>%
      mutate(Response = factor(Response, levels = c('Greatly increased my gambling',
                                                    'Increased my gambling',
                                                    'No change in my gambling',
                                                    'Decreased my gambling',
                                                    'Greatly decreased my gambling'))) %>%
  arrange(Response)

ACT5_Percentages %>%
select(-Freq) %>%
gt() %>%
  tab_header("To what extent if any did reading your activity statement result in any change in your gambling behaviour?", subtitle = md("***Weighted outcomes***"))
To what extent if any did reading your activity statement result in any change in your gambling behaviour?
Weighted outcomes
Response Percent
Greatly increased my gambling 0.29
Increased my gambling 0.50
No change in my gambling 81.60
Decreased my gambling 15.40
Greatly decreased my gambling 2.21

Compute some stats to tell us what the ratio of people who said they decreased their gambling relative to those who said they increased their gambling following statement use:

Show code
# Sum the frequencies for "Decreased my gambling" and "Greatly decreased my gambling"
decreased_gambling <- sum(ACT5_Percentages$Freq[ACT5_Percentages$Response %in% 
                                                c("Decreased my gambling", "Greatly decreased my gambling")])

# Sum the frequencies for "Increased my gambling" and "Greatly increased my gambling"
increased_gambling <- sum(ACT5_Percentages$Freq[ACT5_Percentages$Response %in% 
                                                c("Increased my gambling", "Greatly increased my gambling")])

# Calculate the ratio
ratio <- decreased_gambling / increased_gambling

# Print the ratio in a readable format
cat("The ratio of people who said they decreased their gambling because of reading their statements relative to those who said they increased their gambling was", round(ratio, 2), ":1\n")
The ratio of people who said they decreased their gambling because of reading their statements relative to those who said they increased their gambling was 22.14 :1

Now provide some summary summary statistics relating to gambling harm/risk:

Show code
# Compute mean and SD for PGSI grouped by ACT5_BEHAVIOUR
pgsi_mean_ACT5_BEHAVIOUR <- svyby(~PGSI_TOTAL, ~ACT5_BEHAVIOUR, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT5_BEHAVIOUR,
        Mean_PGSI = PGSI_TOTAL)
pgsi_SD_ACT5_BEHAVIOUR <- svyby(~PGSI_TOTAL, ~ACT5_BEHAVIOUR, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(PGSI_SD = sqrt(PGSI_TOTAL)) %>%
 select(Response = ACT5_BEHAVIOUR,
        SD_PGSI = PGSI_SD)

# Compute mean and SD for Gambling harms measure grouped by ACT5_BEHAVIOUR
GHM_mean_ACT5_BEHAVIOUR <- svyby(~GHM_TOTAL, ~ACT5_BEHAVIOUR, survey_design, svymean, na.rm = TRUE) %>%
 select(Response = ACT5_BEHAVIOUR,
        Mean_GHM = GHM_TOTAL)
GHM_SD_ACT5_BEHAVIOUR <- svyby(~GHM_TOTAL, ~ACT5_BEHAVIOUR, survey_design, svyvar, na.rm = TRUE) %>%
  mutate(GHM_SD = sqrt(GHM_TOTAL)) %>%
 select(Response = ACT5_BEHAVIOUR,
        SD_GHM = GHM_SD)

Act_5_Risk_scores<- pgsi_mean_ACT5_BEHAVIOUR %>%
left_join(pgsi_SD_ACT5_BEHAVIOUR, by = 'Response') %>%
left_join(GHM_mean_ACT5_BEHAVIOUR, by = 'Response') %>%
  left_join(GHM_SD_ACT5_BEHAVIOUR, by = 'Response') %>%
  mutate_if(is.numeric, round, 2) %>%
  mutate(Response = factor(Response, levels = c('Greatly increased my gambling',
                                                    'Increased my gambling',
                                                    'No change in my gambling',
                                                    'Decreased my gambling',
                                                    'Greatly decreased my gambling'))) %>%
  arrange(Response)

gt(Act_5_Risk_scores) %>%
     tab_options(data_row.padding = px(3)) %>%
  tab_header("Gambling harm scores grouped by response to: To what extent if any did reading your activity statement result in any change in your gambling behaviour?", subtitle = md("***Weighted outcomes***"))
Gambling harm scores grouped by response to: To what extent if any did reading your activity statement result in any change in your gambling behaviour?
Weighted outcomes
Response Mean_PGSI SD_PGSI Mean_GHM SD_GHM
Greatly increased my gambling 1.12 1.47 0.03 0.21
Increased my gambling 12.01 9.66 7.51 6.74
No change in my gambling 3.44 4.03 1.23 2.38
Decreased my gambling 5.78 4.41 2.88 3.59
Greatly decreased my gambling 4.74 4.10 2.62 4.48

As previously (but particularly here), the above figures are a little hard to interpret, and I think it actually is more informative to look at how responses to this are distributed across the PGSI categories:

Show code
pgsi_cat_ACT5_BEHAVIOUR  <- svytable(~ACT5_BEHAVIOUR + PGSI_CATEGORY, survey_design)

# Convert to a data frame and compute proportions
pgsi_cat_df_ACT5_BEHAVIOUR <- as.data.frame(pgsi_cat_ACT5_BEHAVIOUR) %>%
  group_by(PGSI_CATEGORY) %>%
  mutate(Proportion = Freq / sum(Freq)) %>%
  ungroup() %>%
  select(Response = ACT5_BEHAVIOUR, PGSI_CATEGORY, Proportion) %>%
  mutate(Proportion = round(Proportion * 100, 2)) %>%
    mutate(Response = factor(Response, levels = c('Greatly increased my gambling',
                                                    'Increased my gambling',
                                                    'No change in my gambling',
                                                    'Decreased my gambling',
                                                    'Greatly decreased my gambling'))) %>%
  arrange(Response)

# Present as a table:
pgsi_cat_df_ACT5_BEHAVIOUR %>%
  pivot_wider(names_from = PGSI_CATEGORY, values_from = Proportion) %>%
  gt() %>%
  tab_options(data_row.padding = px(3)) %>%
  tab_header(
    title = "Proportion of customers in each PGSI category by response to: To what extent if any did reading your activity statement result in any change in your gambling behaviour?",
    subtitle = md("***Weighted outcomes***")
  )
Proportion of customers in each PGSI category by response to: To what extent if any did reading your activity statement result in any change in your gambling behaviour?
Weighted outcomes
Response No risk Low risk Moderate risk High risk
Greatly increased my gambling 0.40 0.39 0.02 0.00
Increased my gambling 0.44 0.00 0.37 1.49
No change in my gambling 94.43 91.46 80.05 74.45
Decreased my gambling 3.88 5.61 17.84 19.79
Greatly decreased my gambling 0.85 2.54 1.73 4.26

Plot this data:

Show code
# Sum the proportions of "Moderately Important", "Important", and "Very Important" for each variable
importance_sum_ACT5_BEHAVIOUR <- pgsi_cat_df_ACT5_BEHAVIOUR %>%
  filter(Response %in% c('Decreased my gambling',
                         'Greatly decreased my gambling')) %>%
  group_by(PGSI_CATEGORY) %>%
  summarise(Importance_Sum = round(sum(Proportion),0))

# Join the computed sum back to the main data table:
pgsi_cat_df_ACT5_BEHAVIOUR_w_counts <- pgsi_cat_df_ACT5_BEHAVIOUR %>%
  left_join(importance_sum_ACT5_BEHAVIOUR, by = "PGSI_CATEGORY")

Q5_summary_plot <-
  pgsi_cat_df_ACT5_BEHAVIOUR_w_counts %>%
    mutate(Response = factor(Response, levels = c('Greatly decreased my gambling',
'Decreased my gambling',
'No change in my gambling',
'Increased my gambling',
'Greatly increased my gambling'
))) %>%
  # arrange(desc(Response)) %>%
ggplot(aes(x = PGSI_CATEGORY, y = Proportion, fill = Response)) +
  geom_bar(stat = "identity", position = "stack") +
    geom_text(data = pgsi_cat_df_ACT5_BEHAVIOUR_w_counts %>% filter(Response == "Decreased my gambling"), 
            aes(x = PGSI_CATEGORY, y = 101, label = round(Importance_Sum, 2)), 
            hjust = 0, color = "black", size = 3) +
  coord_flip() +
    scale_fill_manual(values = custom_palette2) +
  # scale_fill_manual(values = natparks.pals("Arches", 5)) +
  labs(title = NULL, x = NULL, y = "Percentage (%)", fill = "") +
    theme(text = element_text(family = "Poppins"))

# Present plot in quarto doc:
Q5_summary_plot +
  plot_theme

Show code
# Save plot with proper text dimensions for paper:
Q5_summary_plot_saved<- Q5_summary_plot +
  theme_classic() +
  theme(text = element_text(family = "Poppins"),
        legend.title = element_text(size = 9),  
        legend.text = element_text(size = 8),
        legend.position = "top"
         # plot.margin = margin(t = 0, r = 10, b = 0, l = 0)  
  ) +
   guides(fill = guide_legend(nrow = 5))  +
  scale_y_continuous(limits = c(0, 105), breaks = c(0,25,50,75,100)) 

ggsave("Figures/Q5_summary_plot.pdf", 
       plot = Q5_summary_plot_saved, 
       height = 90, 
       width = 85, 
       units = "mm")

The numbers in the above plot represent the total proportion of responses that were: ‘Decreased my gambling’ or ‘Greatly decreased my gambling’.

Grouped responses

Show code
# Ensure that factor levels are set correctly
survey_design$variables$ACT1_TRACK <- factor(survey_design$variables$ACT1_TRACK, 
                                             levels = c("Never", "Rarely", "Sometimes", "Very Often", "Always"))
survey_design$variables$ACT3_READ_FREQ <- factor(survey_design$variables$ACT3_READ_FREQ, 
                                                 levels = c("Never", "Rarely", "Sometimes", "Very Often", "Always"))

# Function to create frequency and proportion tables
create_freq_table <- function(variable1, variable2, survey_design) {
  # Create a formula from the variable names
  formula <- as.formula(paste("~", variable1, "+", variable2))
  
  # Calculate the frequency and proportion tables
  freq_table <- svytable(formula, survey_design)
  prop_table <- prop.table(freq_table, margin = 1)
  
  # Convert tables to data frames
  freq_df <- as.data.frame(freq_table) %>%
    rename(Frequency = Freq) %>%
    mutate(Percentage = round(Frequency / sum(Frequency) * 100, 2))
  
  prop_df <- as.data.frame(prop_table) %>%
    rename(Proportion = Freq) %>%
    mutate('Percentage of tracking reponse group' = ifelse(is.na(Proportion), 0, round(Proportion * 100, 2)))
  
  list(freq_table = freq_df, prop_table = prop_df)
}

# Function to create summary table with gt
create_summary_table <- function(variable1, variable2, survey_design, title, subtitle) {
  tables <- create_freq_table(variable1, variable2, survey_design)
  
  # Sort the data frame by variable1 (ACT1_TRACK) and variable2 (ACT3_READ_FREQ)
  sorted_prop_table <- tables$prop_table %>%
    arrange(factor(!!sym(variable1), levels = c("Never", "Rarely", "Sometimes", "Very Often", "Always")),
            factor(!!sym(variable2), levels = c("Never", "Rarely", "Sometimes", "Very Often", "Always")))
  
  sorted_prop_table %>%
    select(-Proportion) %>%
    rename('Tracking frequency '= ACT1_TRACK,
      'Activity statement frequency' = ACT3_READ_FREQ) %>%
    gt() %>%
    tab_header(
      title = title,
      subtitle = md(subtitle)
    ) %>%
    tab_options(data_row.padding = px(3))
}

# Create the frequency table for Q4
Q4_table <- create_summary_table("ACT1_TRACK", "ACT3_READ_FREQ", survey_design,
                                 "Frequency of statement use grouped by people's reported spending tracking rate",
                                 "***Weighted outcomes***")

# Display the table
Q4_table
Frequency of statement use grouped by people's reported spending tracking rate
Weighted outcomes
Tracking frequency Activity statement frequency Percentage of tracking reponse group
Never Never 50.80
Never Rarely 21.77
Never Sometimes 11.45
Never Very Often 6.17
Never Always 9.80
Rarely Never 29.95
Rarely Rarely 27.74
Rarely Sometimes 21.81
Rarely Very Often 9.91
Rarely Always 10.58
Sometimes Never 21.38
Sometimes Rarely 19.85
Sometimes Sometimes 33.03
Sometimes Very Often 11.81
Sometimes Always 13.93
Very Often Never 14.68
Very Often Rarely 15.83
Very Often Sometimes 25.23
Very Often Very Often 24.66
Very Often Always 19.61
Always Never 15.58
Always Rarely 16.13
Always Sometimes 18.26
Always Very Often 10.02
Always Always 40.00

Regression models

Next, we run a series of ordinal regression models to identify the predictors of activity use & impact.

Ordinal regression

Great resource on conducting order of regression in R and interpreting the outcomes:
https://www.bookdown.org/rwnahhas/RMPH/blr-ordinal.html

This section is divided into subsections by the question addressed by each model.

Who uses statements?

This core question will involve using ACT3 (statement read frequency) as the outcome, with several predictor variables of theoretical and practical interest the predictor variables we decided include are:

  • Age
  • Gender
  • Education level
  • PGSI Category
  • Past 30-day betting intensity (i.e., bets per active day)
  • Past 30-day net outcome
  • Past 30-day no. deposits
  • Current deposit limit set

There are several steps we need to take for each ordinal regression model that are outlined below.

1. Check the order of the levels of our outcome variable

We need to ensure we know the probabilities our model is estimating (we want less to more engagement):

Show code
master_dataset$ACT3_READ_FREQ <- as.factor(master_dataset$ACT3_READ_FREQ)

master_dataset$ACT3_READ_FREQ <- factor(master_dataset$ACT3_READ_FREQ, levels = rev(c('Always', 'Very Often', 'Sometimes', 'Rarely', 'Never')))

levels(master_dataset$ACT3_READ_FREQ)
[1] "Never"      "Rarely"     "Sometimes"  "Very Often" "Always"    
2. Check all predictors are correctly formatted

If we have categorical predictors (Which we do), we need to ensure they are set to factors with the levels in a meaningful order.

The key variables of interest here are gender, PGSI category, education level, and current deposit limit.

Looking at the complexity of the different categories involved in education level in our data, and it make sense to converted to a numeric variable with higher values indicating more education:

Show code
# Gender:
# master_dataset %>%
#   count(GENDER)

master_dataset$GENDER <- factor(master_dataset$GENDER, levels = rev(c('Unknown', 'Female', 'Male')))

# levels(master_dataset$GENDER)

# Brand:
# master_dataset %>%
#   count(BRAND)

master_dataset$BRAND <- factor(master_dataset$BRAND, levels = c('Op1', 'Op2'))

# levels(master_dataset$BRAND)


# PGSI:
# master_dataset %>%
#   count(PGSI_CATEGORY)

# levels(master_dataset$PGSI_CATEGORY)

# Education level:
master_dataset$EDUCATION <- factor(master_dataset$EDUCATION, levels = rev(c(
  "Doctoral degree",
  "Master's degree",
  "Graduate diploma or graduate level certificate",
  "Bachelor's degree",
  "Advanced diploma/diploma",
  "Certificate III/IV",
  "Year 12",
  "Year 11 or below (includes Certificate I/II/n)"
)))

# levels(master_dataset$EDUCATION)

# Define the numeric levels for EDUCATION
education_levels <- c(
  "Year 11 or below (includes Certificate I/II/n)" = 1,
  "Year 12" = 2,
  "Certificate III/IV" = 3,
  "Advanced diploma/diploma" = 4,
  "Bachelor's degree" = 5,
  "Graduate diploma or graduate level certificate" = 6,
  "Master's degree" = 7,
  "Doctoral degree" = 8
)

# Convert EDUCATION to numeric
master_dataset$EDUCATION_NUM <- as.numeric(factor(master_dataset$EDUCATION, levels = names(education_levels), labels = education_levels))

# Current deposit limit:
master_dataset$ACTIVE_DEPOSIT_LIMIT <- factor(master_dataset$ACTIVE_DEPOSIT_LIMIT,
                                             levels = c('0', '1'),
                                             labels = c('No', 'Yes'))

# Check the updated levels
# levels(master_dataset$ACTIVE_DEPOSIT_LIMIT)
3. Re-define the weights (survey design)

Also do this, also drop any empty values so we can get a sense of sample size for this analysis and save them being removed from the model. One other thing I’m going to also do here but is based on checks performed later is winsorize the net outcome variable as it has quite extreme outliers:

Show code
master_dataset_who_uses_statements <- master_dataset %>%
drop_na(ACT3_READ_FREQ,
        EDUCATION_NUM)

# Winsorize the Net outcome variable
winsorized_net_outcome <- Winsorize(master_dataset_who_uses_statements$PAS_30_NET_OUTCOME, probs = c(0.01, 0.99))

# Add the winsorized variable to new dataset:
master_dataset_who_uses_statements$winsorized_PAS_30_NET_OUTCOME <- winsorized_net_outcome

# Define survey design With updated dataset
survey_design_regression_model_who_uses_statements <- svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, data = master_dataset_who_uses_statements)

The number of individuals included in the model is: 1213 [this value is derived directly from code and should update accordingly].

4. Run the model

For this, we’ll use the survey R Package and the svyolr() function. POLR stands for proportional odds logistic regression (proportional odds, in this context referring to assumption that the relationship between each pair of outcome groups is the same [ similar odds ratios across all response options]).

Note: we tried using the MASS package and the polr() function for this analysis, but it is the weighted population sample size and therefore leads to biased standard errors and model estimates. The actual function used isbased very closely on this function.

Show code
 # Fit the model using survey package:
who_uses_statements_model <- svyolr(ACT3_READ_FREQ ~ 
                                      AGE +
                                      GENDER +
                                      EDUCATION_NUM +
                                      PGSI_CATEGORY + 
                                      PAS_30_BET_INTENSITY +
                                      winsorized_PAS_30_NET_OUTCOME +
                                      PAS_30_TOTAL_DEPOSIT_FREQ +
                                      ACTIVE_DEPOSIT_LIMIT, 
                       design = survey_design_regression_model_who_uses_statements)

summary(who_uses_statements_model)
Call:
svyolr(ACT3_READ_FREQ ~ AGE + GENDER + EDUCATION_NUM + PGSI_CATEGORY + 
    PAS_30_BET_INTENSITY + winsorized_PAS_30_NET_OUTCOME + PAS_30_TOTAL_DEPOSIT_FREQ + 
    ACTIVE_DEPOSIT_LIMIT, design = survey_design_regression_model_who_uses_statements)

Coefficients:
                                      Value   Std. Error    t value
AGE                            2.677789e-03 0.0044001047  0.6085739
GENDERFemale                  -4.459078e-01 0.1808314044 -2.4658760
GENDERUnknown                  1.124762e+00 0.5428440805  2.0719805
EDUCATION_NUM                  2.148225e-02 0.0331689434  0.6476617
PGSI_CATEGORYLow risk          1.757147e-01 0.1710681298  1.0271621
PGSI_CATEGORYModerate risk     1.931243e-01 0.1649160591  1.1710461
PGSI_CATEGORYHigh risk        -4.073229e-01 0.1987460889 -2.0494637
PAS_30_BET_INTENSITY           6.740973e-04 0.0042139078  0.1599696
winsorized_PAS_30_NET_OUTCOME -8.128533e-05 0.0001473825 -0.5515263
PAS_30_TOTAL_DEPOSIT_FREQ      1.372829e-03 0.0025314106  0.5423178
ACTIVE_DEPOSIT_LIMITYes        3.756651e-01 0.1973360167  1.9036826

Intercepts:
                     Value   Std. Error t value
Never|Rarely         -0.9869  0.2749    -3.5897
Rarely|Sometimes     -0.0242  0.2718    -0.0889
Sometimes|Very Often  0.9300  0.2732     3.4044
Very Often|Always     1.6870  0.2774     6.0816
5. Check the assumption of proportional odds

The function used to check this doesn’t accept a model from the svyolr() function, so I’m going to run a nonweighted ordinal regression using MASS::polr() and then feed it to the function to check this assumption.

I will also display the results from the nonweighted model to give us a comparison of weighted vs. nonweighted outcomes.

Let’s start by performing the non-weighted model and showing the results:

Show code
# Fit the model using polr without survey weights
polr_who_uses_statements_model <- MASS::polr(ACT3_READ_FREQ ~ 
                                      AGE +
                                      GENDER +
                                      EDUCATION_NUM +
                                      PGSI_CATEGORY + 
                                      PAS_30_BET_INTENSITY +
                                      PAS_30_NET_OUTCOME +
                                      PAS_30_TOTAL_DEPOSIT_FREQ +
                                      ACTIVE_DEPOSIT_LIMIT, 
                         data = master_dataset, 
                         Hess = TRUE)

summary(polr_who_uses_statements_model)
Call:
MASS::polr(formula = ACT3_READ_FREQ ~ AGE + GENDER + EDUCATION_NUM + 
    PGSI_CATEGORY + PAS_30_BET_INTENSITY + PAS_30_NET_OUTCOME + 
    PAS_30_TOTAL_DEPOSIT_FREQ + ACTIVE_DEPOSIT_LIMIT, data = master_dataset, 
    Hess = TRUE)

Coefficients:
                                Value Std. Error t value
AGE                         4.009e-04  3.784e-03  0.1060
GENDERFemale               -4.438e-01  1.671e-01 -2.6554
GENDERUnknown               1.031e+00  4.468e-02 23.0695
EDUCATION_NUM               4.728e-02  2.876e-02  1.6442
PGSI_CATEGORYLow risk       2.370e-01  1.524e-01  1.5549
PGSI_CATEGORYModerate risk  1.377e-01  1.438e-01  0.9578
PGSI_CATEGORYHigh risk     -2.820e-01  1.685e-01 -1.6738
PAS_30_BET_INTENSITY        1.732e-03  3.049e-03  0.5681
PAS_30_NET_OUTCOME         -3.447e-05  5.321e-05 -0.6477
PAS_30_TOTAL_DEPOSIT_FREQ   1.841e-03  1.295e-03  1.4222
ACTIVE_DEPOSIT_LIMITYes     3.542e-01  1.758e-01  2.0155

Intercepts:
                     Value   Std. Error t value
Never|Rarely         -1.0479  0.2399    -4.3682
Rarely|Sometimes     -0.0747  0.2372    -0.3148
Sometimes|Very Often  0.8668  0.2391     3.6251
Very Often|Always     1.6365  0.2437     6.7148

Residual Deviance: 3840.941 
AIC: 3870.941 
(434 observations deleted due to missingness)

And now perform brant test of proportional odds:

Show code
# Perform brant test:
brant(polr_who_uses_statements_model)
------------------------------------------------------------ 
Test for            X2  df  probability 
------------------------------------------------------------ 
Omnibus             39.82   33  0.19
AGE             15.72   3   0
GENDERFemale            0.87    3   0.83
GENDERUnknown           2.15    3   0.54
EDUCATION_NUM           2.5 3   0.48
PGSI_CATEGORYLow risk       3.41    3   0.33
PGSI_CATEGORYModerate risk  2.4 3   0.49
PGSI_CATEGORYHigh risk  0.5 3   0.92
PAS_30_BET_INTENSITY        4.11    3   0.25
PAS_30_NET_OUTCOME      0.98    3   0.81
PAS_30_TOTAL_DEPOSIT_FREQ   0.47    3   0.92
ACTIVE_DEPOSIT_LIMITYes 1.88    3   0.6
------------------------------------------------------------ 

H0: Parallel Regression Assumption holds

Looking at the above, we can see that the overall model (omnibus) has a p value (probability) above 0.05, and so we fail to reject the null hypothesis that the proportional odds assumption holds. Age is the only variable with a p value lower than the conventional threshold, given the overall model outcome and the values for all of the predictors, I think it’s safe to proceed with the model as is.

6. Check assumption of multicollinearity

We will do this just using variable inflation factors values:

Show code
vif(who_uses_statements_model)
                          AGE                  GENDERFemale 
                    10.866169                      1.261673 
                GENDERUnknown                 EDUCATION_NUM 
                     1.070470                      4.988931 
        PGSI_CATEGORYLow risk    PGSI_CATEGORYModerate risk 
                     2.360605                      2.914197 
       PGSI_CATEGORYHigh risk          PAS_30_BET_INTENSITY 
                     2.065627                      1.572782 
winsorized_PAS_30_NET_OUTCOME     PAS_30_TOTAL_DEPOSIT_FREQ 
                     2.037711                      1.951067 
      ACTIVE_DEPOSIT_LIMITYes                  Never|Rarely 
                     1.120347                     20.233654 
             Rarely|Sometimes          Sometimes|Very Often 
                    38.115846                     39.407728 
            Very Often|Always 
                    22.512546 

No issues there.

7. Check linearity of continuous predictors

Is the relationship between continuous predictors and the log odds of the outcome variable linear?

Show code
# Predicted probabilities
predicted_probs <- predict(who_uses_statements_model, type = "probs")

# Visualize relationship between AGE and predicted probabilities
age_relation_who_uses_statements<- ggplot(master_dataset_who_uses_statements, aes(x = AGE, y = predicted_probs[, "Always"])) +
  geom_point() +
  geom_smooth(method = "loess") +
    xlab("Age") +
    ylab("Predicted Prob = Always)") +
  plot_theme

Education_relation_who_uses_statements <- ggplot(master_dataset_who_uses_statements, aes(x = EDUCATION_NUM, y = predicted_probs[, "Always"])) +
  geom_point() +
  geom_smooth(method = "loess") +
    xlab("Education level") +
    ylab("Predicted Prob = Always)") +
  plot_theme

pas_30_bet_intensity_relation_who_uses_statements <- ggplot(master_dataset_who_uses_statements, aes(x = PAS_30_BET_INTENSITY, y = predicted_probs[, "Always"])) +
  geom_point() +
  geom_smooth(method = "loess") +
  xlab("PAS 30 Bet Intensity") +
  ylab("Predicted Prob = Always)") +
  plot_theme

pas_30_net_outcome_relation_who_uses_statements <- ggplot(master_dataset_who_uses_statements, aes(x = winsorized_PAS_30_NET_OUTCOME, y = predicted_probs[, "Always"])) +
  geom_point() +
  geom_smooth(method = "loess") +
  xlab("PAS 30 Net Outcome") +
  ylab("Predicted Prob = Always)") +
  plot_theme

pas_30_total_deposit_freq_relation_who_uses_statements <- ggplot(master_dataset_who_uses_statements, aes(x = PAS_30_TOTAL_DEPOSIT_FREQ, y = predicted_probs[, "Always"])) +
  geom_point() +
  geom_smooth(method = "loess") +
  xlab("PAS 30 Total Deposit Frequency") +
  ylab("Predicted Prob = Always)") +
  plot_theme

# Combine plots using patchwork
combined_plots <- (
  age_relation_who_uses_statements +
  Education_relation_who_uses_statements +
  pas_30_bet_intensity_relation_who_uses_statements +
  pas_30_net_outcome_relation_who_uses_statements +
  pas_30_total_deposit_freq_relation_who_uses_statements
)

combined_plots +
  plot_layout(ncol = 2, byrow = TRUE)  # Adjust the number of columns and layout as needed

Looking at the above, other than net outcome, which is a bit of a strange variable, all variables are mostly linearly related to the outcome. It’s also More difficult to transform net outcome properly as there are minus values

8. Table outcomes
Show code
# Calculate the number of participants invovled in the analyses:
n_uses_statements <- nrow(master_dataset_who_uses_statements)

who_uses_statements_model_tbl_beta <-  tbl_regression(who_uses_statements_model,
  estimate_fun = purrr::partial(style_ratio, digits = 3),
   label = c(
      AGE ~ "Age",
      GENDER ~ "Gender",
      EDUCATION_NUM ~ "Education level",
      PGSI_CATEGORY ~ "PGSI category",
      PAS_30_BET_INTENSITY ~ "Betting intensity",
      winsorized_PAS_30_NET_OUTCOME ~ "Net outcome",
      PAS_30_TOTAL_DEPOSIT_FREQ ~ "Total no. deposits",
      ACTIVE_DEPOSIT_LIMIT ~ "Active deposit limit")) %>%
  italicize_levels() %>%
    modify_column_hide(columns = c('conf.low', 'conf.high', 'ci')) 


who_uses_statements_model_tbl_ORs <-  tbl_regression(who_uses_statements_model,
                exponentiate = TRUE,
               estimate_fun = purrr::partial(style_ratio, digits = 2),
    pvalue_fun = ~ style_pvalue(.x, digits = 3),
   label = c(
      AGE ~ "Age",
      GENDER ~ "Gender",
      EDUCATION_NUM ~ "Education level",
      PGSI_CATEGORY ~ "PGSI category",
      PAS_30_BET_INTENSITY ~ "Betting intensity",
      winsorized_PAS_30_NET_OUTCOME ~ "Net outcome",
      PAS_30_TOTAL_DEPOSIT_FREQ ~ "Total no. deposits",
      ACTIVE_DEPOSIT_LIMIT ~ "Active deposit limit")) %>%
  add_global_p() %>%
  italicize_levels()

combined_reg_tables_who_uses_statements<- tbl_merge(tbls = list(who_uses_statements_model_tbl_beta, who_uses_statements_model_tbl_ORs)) %>%
   modify_spanning_header(everything() ~ NA_character_)

combined_reg_tables_who_uses_statements  %>%
as_gt() %>%
tab_options(data_row.padding = px(2)) %>%
tab_header(
    title = "Table 2. Ordinal Regression: Predictors of Statement Use Frequency",
    subtitle = glue::glue("N = {n_uses_statements}")
  ) 
Table 2. Ordinal Regression: Predictors of Statement Use Frequency
N = 1213
Characteristic log(OR)1 OR1 95% CI1 p-value
Age 0.003 1.00 0.99, 1.01 0.543
Gender


0.005
    Male
    Female -0.446 0.64 0.45, 0.91
    Unknown 1.125 3.08 1.06, 8.92
Education level 0.021 1.02 0.96, 1.09 0.517
PGSI category


0.003
    No risk
    Low risk 0.176 1.19 0.85, 1.67
    Moderate risk 0.193 1.21 0.88, 1.68
    High risk -0.407 0.67 0.45, 0.98
Betting intensity 0.001 1.00 0.99, 1.01 0.873
Net outcome 0.000 1.00 1.00, 1.00 0.581
Total no. deposits 0.001 1.00 1.00, 1.01 0.588
Active deposit limit


0.057
    No
    Yes 0.376 1.46 0.99, 2.14
1 OR = Odds Ratio, CI = Confidence Interval

Essentially, this table is our response to the question: Who uses activity statements?

The above table indicates that females were much less likely to report using activity statements frequently than males, whereas having an unknown gender was strongly predictive of use. Having a high-risk PGSI score was associated with less use.

Note that all variables relating to the account data are aggregate variables covering the 30 days prior to the survey.

Who benefited most from statements?

To answer this question, we will use the ACT5 responses (To what extent if any did reading your activity statement result in any change in your gambling behaviour?). We will dichotomised responses into Responses that indicate a positive reduction in gambling behaviour and responses that indicate no change in behaviour. Very few people said that the gambling had increased as a result of using statements, so it’s not possible to separate out and look at this group independently.

The following responses are available in the outcome variable:

  • ‘Greatly increased my gambling’
  • ‘Increased my gambling’
  • ‘No change in my gambling’ – Reference group
  • ‘Decreased my gambling’ - Combined positive change group
  • ‘Greatly decreased my gambling’ - Combined positive change group

We will use a logistic regression model to this question.

It make sense omit people from this analysis Who said that they never use statements.

1. Check outcome variable

Correctly set the levels of our outcome variable.

Show code
master_dataset$ACT5_BEHAVIOUR <- as.factor(master_dataset$ACT5_BEHAVIOUR)
# master_dataset %>% 
# count(ACT5_BEHAVIOUR)

# Ensure consistent levels for ACT3_READ_FREQ
master_dataset$ACT3_READ_FREQ <- factor(master_dataset$ACT3_READ_FREQ,
                                        levels = c("Never", "Rarely", "Sometimes", "Very Often", "Always"))

master_dataset_statement_impact<- master_dataset %>%
    filter(ACT3_READ_FREQ != "Never") %>%
    filter(ACT5_BEHAVIOUR != 'Greatly increased my gambling' & 
         ACT5_BEHAVIOUR != 'Increased my gambling') %>% # This seems to remove all the Empty values as well
mutate(STATEMENT_IMPACT = case_when(
      ACT5_BEHAVIOUR %in% c('Greatly decreased my gambling', 'Decreased my gambling') ~ 'Decreased my gambling',
    TRUE ~ ACT5_BEHAVIOUR # Keep other responses unchanged
  )) %>%
mutate(STATEMENT_IMPACT = as.factor(STATEMENT_IMPACT))

master_dataset_statement_impact$STATEMENT_IMPACT <- relevel(master_dataset_statement_impact$STATEMENT_IMPACT, ref = "No change in my gambling")

# levels(master_dataset_statement_impact$STATEMENT_IMPACT)

# master_dataset_statement_impact %>%
# count(ACT3_READ_FREQ)

# master_dataset_statement_impact %>% 
# count(STATEMENT_IMPACT)
2. Check all predictors are correctly formatted

We have already done this for the model above.

3. Re-define the weights (survey design)

Whilst we have done this above, we have also now edited the outcome variable to make it a ordered factor and so we need to create a new design object using the exact same method just post-ACT4_READ_CLOSE-adjustment:

Show code
master_dataset_statement_impact <- master_dataset_statement_impact %>%
drop_na(ACT5_BEHAVIOUR,
        EDUCATION_NUM)

# Winsorize the Net outcome variable
winsorized_net_outcome <- Winsorize(master_dataset_statement_impact$PAS_30_NET_OUTCOME, probs = c(0.01, 0.99))

# Add the winsorized variable to new dataset:
master_dataset_statement_impact$winsorized_PAS_30_NET_OUTCOME <- winsorized_net_outcome

# Define survey design With updated dataset
survey_design_regression_model_statement_impact <- svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, data = master_dataset_statement_impact)

The number of individuals included in the model is: 690 [this value is derived directly from code and should update accordingly].

4. Run the model

For this model, we’ll again use the survey R Package and the svyolr() function.

Show code
 # Fit the model using survey package:
statement_impact_model <- svyglm(STATEMENT_IMPACT ~ 
                                      ACT3_READ_FREQ +
                                      AGE + 
                                      GENDER +
                                      EDUCATION_NUM +
                                      PGSI_CATEGORY + 
                                      PAS_30_BET_INTENSITY +
                                      winsorized_PAS_30_NET_OUTCOME +
                                      PAS_30_TOTAL_DEPOSIT_FREQ +
                                      ACTIVE_DEPOSIT_LIMIT, 
                       design = survey_design_regression_model_statement_impact,
                       family = quasibinomial())

summary(statement_impact_model)

Call:
svyglm(formula = STATEMENT_IMPACT ~ ACT3_READ_FREQ + AGE + GENDER + 
    EDUCATION_NUM + PGSI_CATEGORY + PAS_30_BET_INTENSITY + winsorized_PAS_30_NET_OUTCOME + 
    PAS_30_TOTAL_DEPOSIT_FREQ + ACTIVE_DEPOSIT_LIMIT, design = survey_design_regression_model_statement_impact, 
    family = quasibinomial())

Survey design:
svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, 
    data = master_dataset_statement_impact)

Coefficients:
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   -2.584e+00  7.532e-01  -3.431 0.000639 ***
ACT3_READ_FREQSometimes        1.121e+00  3.807e-01   2.946 0.003334 ** 
ACT3_READ_FREQVery Often       2.275e+00  3.896e-01   5.839 8.15e-09 ***
AGE                           -3.002e-02  1.141e-02  -2.632 0.008690 ** 
GENDERFemale                  -3.393e-01  5.268e-01  -0.644 0.519744    
GENDERUnknown                  3.865e-01  1.265e+00   0.305 0.760149    
EDUCATION_NUM                 -3.932e-02  7.929e-02  -0.496 0.620116    
PGSI_CATEGORYLow risk          1.011e-02  5.069e-01   0.020 0.984095    
PGSI_CATEGORYModerate risk     1.248e+00  4.231e-01   2.948 0.003304 ** 
PGSI_CATEGORYHigh risk         1.541e+00  4.862e-01   3.169 0.001596 ** 
PAS_30_BET_INTENSITY           1.722e-02  5.803e-03   2.967 0.003110 ** 
winsorized_PAS_30_NET_OUTCOME -4.623e-05  1.246e-04  -0.371 0.710678    
PAS_30_TOTAL_DEPOSIT_FREQ      5.802e-03  4.807e-03   1.207 0.227822    
ACTIVE_DEPOSIT_LIMITYes       -2.449e-01  4.562e-01  -0.537 0.591524    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasibinomial family taken to be 1.051759)

Number of Fisher Scoring iterations: 5
5. Check model assumptions
Show code
check_model(statement_impact_model)

These look pretty good!

6. Table outcomes
Show code
# Calculate the number of participants invovled in the analyses:
n_statement_impact <- nrow(master_dataset_statement_impact)


statement_impact_model_tbl_beta <-  tbl_regression(statement_impact_model,
  estimate_fun = purrr::partial(style_ratio, digits = 3),
   label = c(
      ACT3_READ_FREQ ~ "How frequently do you read statements?",
      AGE ~ "Age",
      GENDER ~ "Gender",
      EDUCATION_NUM ~ "Education level",
      PGSI_CATEGORY ~ "PGSI group",
      PAS_30_BET_INTENSITY ~ "Betting intensity",
      winsorized_PAS_30_NET_OUTCOME ~ "Net outcome",
      PAS_30_TOTAL_DEPOSIT_FREQ ~ "Total no. deposits",
      ACTIVE_DEPOSIT_LIMIT ~ "Active deposit limit")) %>%
  italicize_levels() %>%
    modify_column_hide(columns = c('p.value','conf.low', 'conf.high', 'ci')) # remove cols to prevent duplication and excessive information in the final table


statement_impact_model_tbl_ORs <-  tbl_regression(statement_impact_model,
                exponentiate = TRUE,
               estimate_fun = purrr::partial(style_ratio, digits = 2),
    pvalue_fun = ~ style_pvalue(.x, digits = 3),
   label = c(
      ACT3_READ_FREQ ~ "How frequently do you read statements?",
      AGE ~ "Age",
      GENDER ~ "Gender",
      EDUCATION_NUM ~ "Education level",
      PGSI_CATEGORY ~ "PGSI group",
      PAS_30_BET_INTENSITY ~ "Betting intensity",
      winsorized_PAS_30_NET_OUTCOME ~ "Net outcome",
      PAS_30_TOTAL_DEPOSIT_FREQ ~ "Total no. deposits",
      ACTIVE_DEPOSIT_LIMIT ~ "Active deposit limit")) %>%
  add_global_p() %>%
  italicize_levels()

combined_reg_tables_statement_impact<- tbl_merge(tbls = list(statement_impact_model_tbl_beta, statement_impact_model_tbl_ORs)
          ) %>%
   modify_spanning_header(everything() ~ NA_character_)

combined_reg_tables_statement_impact  %>%
as_gt() %>%
tab_options(data_row.padding = px(3.5)) %>%
tab_header(
    title = "Table 3. Logistic Regression: Predictors of self-reported reductions in gambling as a result of reading statements",
    subtitle = glue::glue("N = {n_statement_impact}")
  ) 
Table 3. Logistic Regression: Predictors of self-reported reductions in gambling as a result of reading statements
N = 690
Characteristic log(OR)1 OR1 95% CI1 p-value
How frequently do you read statements?


<0.001
    Rarely
    Rarely
    Rarely
    Rarely
    Sometimes 1.121 3.07 1.45, 6.48
    Very Often 2.275 9.73 4.53, 20.9
Age -0.030 0.97 0.95, 0.99 0.008
Gender


0.778
    Male
    Female -0.339 0.71 0.25, 2.00
    Unknown 0.386 1.47 0.12, 17.7
Education level -0.039 0.96 0.82, 1.12 0.620
PGSI group


<0.001
    No risk
    Low risk 0.010 1.01 0.37, 2.73
    Moderate risk 1.248 3.48 1.52, 7.99
    High risk 1.541 4.67 1.80, 12.1
Betting intensity 0.017 1.02 1.01, 1.03 0.003
Net outcome 0.000 1.00 1.00, 1.00 0.711
Total no. deposits 0.006 1.01 1.00, 1.02 0.227
Active deposit limit


0.591
    No
    Yes -0.245 0.78 0.32, 1.92
1 OR = Odds Ratio, CI = Confidence Interval

The above table indicates that reading statements more closely, being younger, having a higher PGSI score, a higher rate of betting intensity, and more deposits were all associated with a greater likelihood of a self-reported reduction in gambling as a result of using activity statements.

Note that all variables relating to the account data are aggregate variables covering the 30 days prior to the survey.

Does reading statements improve recall accuracy?

To answer this question, we’re going to run two linear regression models.

In the first, percentage accuracy of spend well-being the outcome and statement read frequency (ACT3) as predictor. We’ll also include betting frequency as a predictor to account for variation in betting volume.

In the second, percentage accuracy of spend well-being the outcome and statement read frequency (ACT3) as predictor. We’ll also include betting frequency as a predictor to account for variation in betting volume, and the number of days since they last received a statement to determine if this impacts accuracy.

An important, will use the percentage discrepancy variable for both models. This represents the difference between the estimated and actual value, as a percentage of the actual value. All values are absolute (no negatives), so larger values indicate a greater discrepancy (i.e. poorer recall).

1. Check outcome variable

As our outcomes are continuous, let’s compute summary figures.

Start with spend:

Show code
summary(master_dataset$PAS_SPEND_PERCENT_DIFF)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
    0.00    19.05    62.02   260.56    92.78 92207.69      246 

Standard deviation:

Show code
sd(master_dataset$PAS_SPEND_PERCENT_DIFF, na.rm = TRUE)
[1] 2976.843

How skewed is it:

Show code
skewness(master_dataset$PAS_SPEND_PERCENT_DIFF, na.rm = TRUE)
[1] 25.16902

This variable has extreme values and is highly skewed, so let’s convert it using a transformation in the model. I have checked the model diagnostics when using the non-transformed version of this variable which confirmed the need for transformation.

And now net outcome:

Show code
summary(master_dataset$PAS_NET_OUTCOME_PERCENT_DIFF)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
      0.0      22.2      88.3    2567.5     197.0 1999900.0       289 

Standard deviation:

Show code
sd(master_dataset$PAS_NET_OUTCOME_PERCENT_DIFF, na.rm = TRUE)
[1] 55308.65

How skewed is it:

Show code
skewness(master_dataset$PAS_NET_OUTCOME_PERCENT_DIFF, na.rm = TRUE)
[1] 34.79296

This variable is also highly skewed and will likely require transformation. As with spend,I have checked the model diagnostics when using the non-transformed version of this variable which confirmed the need for transformation.

2. Check all predictors

We don’t have any categorical predictors here. One thing We need to do is engineer a variable that tells us the number of days between completing the survey and receiving their activity statement as we don’t currently have that available in the dataset.

Show code
master_dataset <- master_dataset %>%
  mutate(START_DATE = as.Date(START_DATE), 
         DAYS_SINCE_STATEMENT = as.numeric(START_DATE - as.Date('2024-01-02')))

# Check new variable:
# master_dataset %>%
#   select(START_DATE, 
#          DAYS_SINCE_STATEMENT) %>%
# print(n=50)
3. Re-define the weights (survey design)

This time we need to drop any empty the values in our predictors and outcome and remove anyone who didn’t bed in the 30 days prior to the survey:

Show code
# colnames(master_dataset)

# That set up new datasets will transform variables and removed empty values:
master_dataset_spend_accuracy <- master_dataset %>%
drop_na(ACT3_READ_FREQ,
        PAS_SPEND_PERCENT_DIFF) %>%
  filter(PAS_30_BET_FREQ_BETS != 0) %>%
   mutate(PAS_SPEND_PERCENT_DIFF_transformed = sqrt(PAS_SPEND_PERCENT_DIFF))
    # mutate(PAS_SPEND_PERCENT_DIFF_transformed = orderNorm(PAS_SPEND_PERCENT_DIFF)$x.t) # That this was recommended by BestNormalise but led to poorer model diagnostics on almost all measures


master_dataset_net_outcome_accuracy <- master_dataset %>%
drop_na(ACT3_READ_FREQ,
        PAS_NET_OUTCOME_PERCENT_DIFF) %>%
  filter(PAS_30_BET_FREQ_BETS != 0) %>%
   mutate(PAS_NET_OUTCOME_PERCENT_DIFF_transformed = sqrt(PAS_NET_OUTCOME_PERCENT_DIFF))



# Define survey designs:
survey_design_regression_model_spend_accuracy <- svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, data = master_dataset_spend_accuracy)


survey_design_regression_model_net_outcome_accuracy <- svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, data = master_dataset_net_outcome_accuracy)

The number of individuals included in the spend accuracy model is: 1075 [this value is derived directly from code and should update accordingly].

The number of individuals included in the net outcome accuracy model is: 1034 [this value is derived directly from code and should update accordingly].

4. Run the model

For this model, we’ll use a the svyglm() function from the survey package to run the model with weights. A simple lm() model accept weights, but these are precision weights and not sampling rates, we want to use the latter.

Show code
# Spend:
spend_accuracy_model <- svyglm(PAS_SPEND_PERCENT_DIFF_transformed ~
                                            PAS_30_BET_FREQ_BETS+
                                            DAYS_SINCE_STATEMENT +
                                            ACT3_READ_FREQ, 
                              design = survey_design_regression_model_spend_accuracy)

summary(spend_accuracy_model)

Call:
svyglm(formula = PAS_SPEND_PERCENT_DIFF_transformed ~ PAS_30_BET_FREQ_BETS + 
    DAYS_SINCE_STATEMENT + ACT3_READ_FREQ, design = survey_design_regression_model_spend_accuracy)

Survey design:
svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, 
    data = master_dataset_spend_accuracy)

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               8.5511743  2.4584900   3.478 0.000525 ***
PAS_30_BET_FREQ_BETS     -0.0022037  0.0009586  -2.299 0.021700 *  
DAYS_SINCE_STATEMENT      0.2616324  0.1566764   1.670 0.095234 .  
ACT3_READ_FREQRarely     -0.5814193  2.4437070  -0.238 0.811985    
ACT3_READ_FREQSometimes  -2.8014444  2.0149949  -1.390 0.164728    
ACT3_READ_FREQVery Often  0.3004373  2.8101568   0.107 0.914879    
ACT3_READ_FREQAlways     -2.8756521  1.9478848  -1.476 0.140160    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 284.3139)

Number of Fisher Scoring iterations: 2
Show code
# cor(master_dataset_spend_accuracy$DAYS_SINCE_STATEMENT, master_dataset_spend_accuracy$PAS_SPEND_PERCENT_DIFF_transformed)

# Net outcome:

net_outcome_accuracy_model <- svyglm(PAS_NET_OUTCOME_PERCENT_DIFF_transformed ~
                                            PAS_30_BET_FREQ_BETS +
                                            DAYS_SINCE_STATEMENT +
                                            ACT3_READ_FREQ , 
                              design = survey_design_regression_model_net_outcome_accuracy)

summary(net_outcome_accuracy_model)

Call:
svyglm(formula = PAS_NET_OUTCOME_PERCENT_DIFF_transformed ~ PAS_30_BET_FREQ_BETS + 
    DAYS_SINCE_STATEMENT + ACT3_READ_FREQ, design = survey_design_regression_model_net_outcome_accuracy)

Survey design:
svydesign(ids = ~1, weights = ~ELIGIBLE_SURVEY_SAMPLE_WEIGHTS, 
    data = master_dataset_net_outcome_accuracy)

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)   
(Intercept)               6.276475   6.932044   0.905  0.36545   
PAS_30_BET_FREQ_BETS     -0.007107   0.002610  -2.723  0.00658 **
DAYS_SINCE_STATEMENT      0.763400   0.539652   1.415  0.15748   
ACT3_READ_FREQRarely      3.782467   4.128098   0.916  0.35974   
ACT3_READ_FREQSometimes   0.450654   2.843691   0.158  0.87411   
ACT3_READ_FREQVery Often  1.923367   3.026740   0.635  0.52527   
ACT3_READ_FREQAlways     12.665455  11.059882   1.145  0.25241   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 3921.111)

Number of Fisher Scoring iterations: 2
5. Check assumption of models

Spend model first:

Show code
check_model(spend_accuracy_model)

Show code
# cooksd <- cooks.distance(spend_accuracy_model_continuous)
# plot(cooksd, pch = 19, frame = FALSE, main = "Cook's Distance Plot")
# abline(h = 4/length(cooksd), col = "red")  # Highlight the cutoff line (usually 4/n)

No real concerns here. I did some additional checks for outliers and didn’t find any issues. The VIF for betting frequency is relatively high (correlates with read freq) but out of the running the model without its co-linear counterpart it doesn’t seem to impact the coefficients or SEs at all and VIF <5.

And now the net outcome model:

Show code
check_model(net_outcome_accuracy_model)

6. Table outcomes
Show code
spend_accuracy_model_tbl_beta <- tbl_regression(spend_accuracy_model,
  estimate_fun = purrr::partial(style_ratio, digits = 3),
   label = c(
      PAS_30_BET_FREQ_BETS ~ "Number of bets",
      DAYS_SINCE_STATEMENT ~ "Number of days since last statement",
  ACT3_READ_FREQ ~ "How often do you read statements?")) %>%
  italicize_levels() 

net_outcome_accuracy_model_tbl_beta <- tbl_regression(net_outcome_accuracy_model,
  estimate_fun = purrr::partial(style_ratio, digits = 3),
   label = c(
      PAS_30_BET_FREQ_BETS ~ "Number of bets",
      DAYS_SINCE_STATEMENT ~ "Number of days since last statement",
  ACT3_READ_FREQ ~ "How often do you read statements?")) %>%
  italicize_levels() 

Now, let’s combine the outcomes from the two linear regression models into one table for the paper:

Show code
n_spend_accuracy <- nrow(master_dataset_spend_accuracy)
n_net_outcome_accuracy <- nrow(master_dataset_net_outcome_accuracy)

# Combine to one table
tbl_merge(
  tbls = list(
    spend_accuracy_model_tbl_beta,
    net_outcome_accuracy_model_tbl_beta
  ),
  tab_spanner = c(
    glue::glue("**Predictors of spend recall (*N* = {n_spend_accuracy})**"),
    glue::glue("**Predictors of net outcome recall (*N* = {n_net_outcome_accuracy})**"))
) %>%
as_gt() %>%
tab_options(data_row.padding = px(3.5)) %>%
 tab_header("Table 4. Impact of statement use on ability to recall recent gambling spend and net outcome")
Table 4. Impact of statement use on ability to recall recent gambling spend and net outcome
Characteristic Predictors of spend recall (N = 1075) Predictors of net outcome recall (N = 1034)
Beta 95% CI1 p-value Beta 95% CI1 p-value
Number of bets -0.002 -0.004, 0.000 0.022 -0.007 -0.012, -0.002 0.007
Number of days since last statement 0.262 -0.046, 0.569 0.10 0.763 -0.296, 1.822 0.2
How often do you read statements?





    Never

    Rarely -0.581 -5.376, 4.214 0.8 3.782 -4.318, 11.88 0.4
    Sometimes -2.801 -6.755, 1.152 0.2 0.451 -5.129, 6.031 0.9
    Very Often 0.300 -5.214, 5.814 >0.9 1.923 -4.016, 7.863 0.5
    Always -2.876 -6.698, 0.946 0.14 12.67 -9.037, 34.37 0.3
1 CI = Confidence Interval

The above table shows that, after accounting for betting frequency, self-reported frequency of reading activity statements was not associated with a reduced discrepancy between estimated and actual spend (turnover) or net outcome (i.e., total amount won or lost).

7. Compute group means

One final thing of interest we can do here is compute estimated marginal means (or least-squares means) for each response group in terms of their spend and net-outcome recall accuracy.

Remember, larger values indicate poorer recall.

We can use the emmeans package to compute estimated means, but then we will need to convert them back to the original scale as they were normalised using a square root transformation.

Spend predicted means:

Show code
em_means_transformed_spend <- emmeans(spend_accuracy_model, "ACT3_READ_FREQ")

# Extract the EMMs and their confidence intervals
em_means_df_spend <- as.data.frame(summary(em_means_transformed_spend))

# Back-transform the means and confidence intervals from the square root scale to the original scale
em_means_df_spend <- em_means_df_spend %>%
  mutate(
    emmean_original = round(emmean^2,2),
    lower_CL_original = round(lower.CL^2,2),
    upper_CL_original = round(upper.CL^2,2)
  )

gt(em_means_df_spend)
ACT3_READ_FREQ emmean SE df lower.CL upper.CL emmean_original lower_CL_original upper_CL_original
Never 11.303925 1.8571916 1068 7.659767 14.948083 127.78 58.67 223.45
Rarely 10.722506 1.4737868 1068 7.830659 13.614352 114.97 61.32 185.35
Sometimes 8.502481 0.6459812 1068 7.234944 9.770017 72.29 52.34 95.45
Very Often 11.604362 1.9837293 1068 7.711913 15.496812 134.66 59.47 240.15
Always 8.428273 0.5225401 1068 7.402951 9.453595 71.04 54.80 89.37

The ‘emmean_original’ Column and the related CIs are what we need here. They could be joined with the above table.

Net outcome predicted means:

Show code
em_means_transformed_net_outcome <- emmeans(net_outcome_accuracy_model, "ACT3_READ_FREQ")

# Extract the EMMs and their confidence intervals
em_means_df_net_outcom <- as.data.frame(summary(em_means_transformed_net_outcome))

# Back-transform the means and confidence intervals from the square root scale to the original scale
em_means_df_net_outcome <- em_means_df_net_outcom %>%
  mutate(
    emmean_original = round(emmean^2,2),
    lower_CL_original = round(lower.CL^2,2),
    upper_CL_original = round(upper.CL^2,2)
  )

gt(em_means_df_net_outcome)
ACT3_READ_FREQ emmean SE df lower.CL upper.CL emmean_original lower_CL_original upper_CL_original
Never 14.18511 1.828769 1027 10.596564 17.77367 201.22 112.29 315.90
Rarely 17.96758 3.608392 1027 10.886918 25.04825 322.83 118.52 627.41
Sometimes 14.63577 2.179599 1027 10.358792 18.91274 214.21 107.30 357.69
Very Often 16.10848 2.448008 1027 11.304814 20.91215 259.48 127.80 437.32
Always 26.85057 10.498692 1027 6.249232 47.45191 720.95 39.05 2251.68

Behavioural tracking over time

The final thing we can look at is look at how people’s behaviour and indicators of risk (e.g., deposit amounts) vary over time and around the dates that people receive activity statements.

We can actually look at this broken down by responses to whether people read statements and the entire sample of individuals invited to the survey.

We need to look at this over a relatively long period of time, say 4 months, to avoid wrongly inferring impact from irregularities over one or two months. Let’s isolate everyone in the sample who would have received 4 activity statements from July to December (meaning they had to bet in every month from June to November)

Show code
customer_IDs <- master_dataset_initial %>%
  filter(BET_IN_AUGUST == TRUE) %>%
  filter(BET_IN_SEPTEMBER == TRUE) %>%
  filter(BET_IN_OCTOBER == TRUE) %>%
  filter(BET_IN_NOVEMBER == TRUE) %>%
select(CLIENT_ID,
SURVEY_STATUS_PGSI,
ACT3_READ_FREQ, 
ACT5_BEHAVIOUR) %>%
as_tibble()

How many people does this give us for all analyses:

Show code
nrow(customer_IDs)
[1] 9570

Bets per day

Let’s start with the number of bets made per day around the dates statements were sent. This is going to take some time to process, as the wagers dataset is very large.

Process raw wagers

Now we need to load in the, raw, bet by bet data for the entire sample and filter to only include the above customers who bet regularly in our window of interest.

Load in dataset (note that this is so large that we have to load and filter it using the data.table package) and add the column names (this step has been commented out to avoid it running each time we run the script):

Show code
# wagers_data <- fread(file = wagers_filepath)
# 
# # Filter to restrict the date range:
# filtered_wagers <- wagers_data[
#   BET_PLACED_DATE > as.Date("2023-08-20") & BET_PLACED_DATE < as.Date("2023-12-20")
# ]
# 
# # Join function to only include customers isolated above:
# filtered_wagers <- filtered_wagers[customer_IDs, on = "CLIENT_ID", nomatch = 0]
# 
# # quick check of filtered dataset:
# # as_tibble(filtered_wagers)

And now write this to a .csv file for storing and loading in. This will avoid having to access the full dataset again (which is very time consuming):

Show code
# Write to new .csv file:
# fwrite(filtered_wagers, file.path(filtered_wagers_folder, "wagers_filtered_activity_statement_sample.csv"), row.names = FALSE)

wagers_data_filtered <- fread(file = filtered_wagers_filepath)
# wagers_data_filtered <- read_csv(filtered_wagers_filepath)

Start preparing the data for plotting and the subsequent analysis by summarising the number of bets per day (we later group the outcomes by PGSI status, so some steps are taken here to ensure PGSI status is included in this dataset).

We also need to restricted the data to the period of interest. Based on discussion, it seems appropriate to look at 15 days either side of each statement, so we need to include all dates starting from 15 days before the first activity statement and ending 15 days after the last statement.

The statement dates are as follows: “2023-09-05”, “2023-10-04”, “2023-11-01”, “2023-12-04”

This means that the first date in our window of interest should be 21 August, and the last 19 December (all 2023).

Show code
wagers_data_filtered_selected_sample<- wagers_data_filtered %>%
filter(BET_PLACED_DATE > as.Date("2023-08-21") & 
BET_PLACED_DATE < as.Date("2023-12-19")) 

# rm(wagers_data_filtered)

# Extract PGSI:
pgsi_data_for_survey_sample<- master_dataset_initial %>%
select(CLIENT_ID, PGSI_CATEGORY, ALL_SAMPLE_WEIGHTS)

wagers_data_filtered_selected_sample_pgsi<- pgsi_data_for_survey_sample %>%
right_join(wagers_data_filtered_selected_sample) %>%
mutate(PGSI_CATEGORY = case_when(is.na(PGSI_CATEGORY) ~ "None available",
PGSI_CATEGORY == "Low risk" | PGSI_CATEGORY == "No risk" ~ "No-low risk",
PGSI_CATEGORY == "Moderate risk" | PGSI_CATEGORY == "High risk" ~ "Moderate-high risk",
TRUE ~ as.character(PGSI_CATEGORY))) %>%
as_tibble()

# Check we have the same customers across all stages:
# n_distinct(customer_IDs$CLIENT_ID)
# n_distinct(wagers_data_filtered_selected_sample_pgsi$CLIENT_ID)
# n_distinct(wagers_data_filtered_selected_sample$CLIENT_ID)

# Generate a list of all dates in the range of BET_PLACED_DATE
all_dates <- seq(min(wagers_data_filtered_selected_sample_pgsi$BET_PLACED_DATE),
                 max(wagers_data_filtered_selected_sample_pgsi$BET_PLACED_DATE), by = "day")

# # How many people are there in each PGSI group:
#  wagers_data_filtered_selected_sample_pgsi %>%
#     distinct(CLIENT_ID, .keep_all = TRUE) %>%
#     count(PGSI_CATEGORY)


# Compute daily number of bets summary :
bets_per_day<-wagers_data_filtered_selected_sample_pgsi %>%
# filter(PGSI_CATEGORY != "None available") %>%
  group_by(CLIENT_ID, BET_PLACED_DATE) %>%
  summarise(
       total_bets_per_day = n(),
      .groups = 'drop') %>%
group_by(BET_PLACED_DATE) %>%
 summarise(
       median_bets_per_day = median(total_bets_per_day),
      .groups = 'drop') 

# Check coding:
# bets_per_day %>%
# print(n=100)

Check the number of people for these analyses:

Show code
n_bets_analyses<- wagers_data_filtered %>%
filter(BET_PLACED_DATE > as.Date("2023-08-21") & 
BET_PLACED_DATE < as.Date("2023-12-19")) %>%
as_tibble() %>%
distinct(CLIENT_ID) %>%
nrow()

There are a total of 9570 people included (number code generated).

Plot outcomes

Now plot the data, highlighting weekends for time reference & the Melbourne cup (a major race event in Australia widely bet on):

Show code
# Dates to add vertical lines
statement_dates <- as.Date(c("2023-09-05", "2023-10-04", "2023-11-01", "2023-12-04"))

Melbourne_cup_date<- as.Date("2023-11-07") 

bets_per_day$weekend <- weekdays(bets_per_day$BET_PLACED_DATE) %in% c("Saturday", "Sunday")
# Convert BET_PLACED_DATE to Date class explicitly
bets_per_day$BET_PLACED_DATE <- as.Date(bets_per_day$BET_PLACED_DATE)


# Example plot with vertical lines
bets_per_day_plot<- ggplot(bets_per_day, aes(x = BET_PLACED_DATE, y = median_bets_per_day)) +
  geom_line(linewidth = 1) +
geom_vline(xintercept = statement_dates, linetype = "dashed", color = "black", linewidth = 1) +
geom_vline(xintercept = Melbourne_cup_date, linetype = "solid", color = "dodgerblue", linewidth = 1) + # Melbourne cup
# geom_vline(xintercept = as.Date("2023-10-21"), linetype = "solid", color = "royalblue", linewidth = 1) + # Caulfield cup
  labs(x = "", y = "Median number of\n  bets per day") +
geom_rect(data = subset(bets_per_day, weekend),
            aes(xmin = BET_PLACED_DATE - 0.5, xmax = BET_PLACED_DATE + 0.5, 
                ymin = -Inf, ymax = Inf, fill = "Weekend\ndays"),
            alpha = 0.3, color = NA) +
  scale_fill_manual(values = c("Weekend\ndays" = "grey"), name = ""
                    # guide = FALSE
) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
 scale_y_continuous(labels = scales::comma) +
  theme_classic() +
  theme(text = element_text(family = "Poppins"))

ggsave(filename = "Figures/Bets per day trend plot.pdf", 
       plot = bets_per_day_plot, 
       device = "pdf", 
       width = 8.9, height = 2.14, units = "in")

bets_per_day_plot + plot_theme

Make a plot that’s more consistent with ITS analyses:

Show code
# Update dataset to include a 'Period' variable:
bets_per_day <- bets_per_day %>%
  mutate(Period = case_when(
    BET_PLACED_DATE < statement_dates[1] ~ "Before 1st Statement",
    between(BET_PLACED_DATE, statement_dates[1], statement_dates[2]) ~ "After 1st Statement",
    between(BET_PLACED_DATE, statement_dates[2], statement_dates[3]) ~ "After 2nd Statement",
    between(BET_PLACED_DATE, statement_dates[3], statement_dates[4]) ~ "After 3rd Statement",
    BET_PLACED_DATE > statement_dates[4] ~ "After 4th Statement"
  )) %>%
  mutate( # Enforce levels for plot presentation
    Period = factor(Period, levels = c("Before 1st Statement", 
                                       "After 1st Statement", 
                                       "After 2nd Statement", 
                                       "After 3rd Statement", 
                                       "After 4th Statement",
                                       "Melbourne Cup")) )


# Plot with points and separate trend lines
bets_per_day_ITS_plot<- ggplot(bets_per_day, aes(x = BET_PLACED_DATE, y = median_bets_per_day, color = Period)) +
  geom_point() +  # Use points instead of lines
  geom_smooth(method = "lm", se = FALSE, aes(group = Period)) +  # Add linear trend lines per period
  geom_vline(xintercept = statement_dates, linetype = "dashed", color = "black", linewidth = 1) +
  # geom_vline(xintercept = Melbourne_cup_date, linetype = "solid", color = "dodgerblue", linewidth = 1) +
  # geom_rect(data = subset(bets_per_day, weekend),
  #           aes(xmin = BET_PLACED_DATE - 0.5, xmax = BET_PLACED_DATE + 0.5, ymin = -Inf, ymax = Inf, fill = "Weekend"),
            # alpha = 0.2, color = NA) +
  # scale_fill_manual(values = c("Weekend" = "grey"), guide = FALSE) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_continuous(labels = scales::comma) +
  # scale_color_viridis(discrete = TRUE, option = "G") + # Old colour scheme pre-review
  scale_color_manual(values = custom_palette4) +
  labs(x = "", y = "Median number of\n bets per day", color = "Period") +
 theme_classic() +
  theme(
    text=element_text(family="Poppins"))
  
ggsave(filename = "Figures/Bets per day ITS plot.pdf", 
       plot = bets_per_day_ITS_plot, 
       device = "pdf", 
       width = 8.9, height = 2.2, units = "in")

bets_per_day_ITS_plot +
plot_theme

Combine plots
Show code
combined_bets_per_day_plots<- bets_per_day_plot/
bets_per_day_ITS_plot +
  plot_annotation(tag_levels = 'A') & 
  theme(
    plot.tag = element_text(size = 14, face = "bold"),  
    plot.tag.position = c(-0.02, 0.95), 
    plot.margin = margin(t = 5, r = 0, b = 0, l = 10)  
  )

ggsave(filename = "Figures/Combined bets per day plots.pdf", 
       plot = combined_bets_per_day_plots, 
       device = "pdf", 
       width = 8.9, height = 4.2, units = "in")

Regression

To statistically explore trends in the data around the dates of statements, we will use interrupted time series analysis – a type of linear model that can compare the outcome variable before and after statement dates. We can include an overall, post-intervention impact variable (0 = before, 1 = after), as well as a variable indicating the trend in the outcome post-intervention. The Time variable in the model is calculated as the number of days from the first date recorded in the dataset to each subsequent date. This transformation normalizes the date to a continuous scale starting from zero, which is essential for the model to correctly interpret the passage of time as a predictor. This variable helps in assessing how the total bets per day change independently of the interventions and other variables over the entire time period covered by the dataset.

Note that we first tried these models using total sum of bets for everyone per day and the AIC and BIC values were considerably higher (2000+ for AIC).

Show code
bets_per_day_model_data <- bets_per_day

bets_per_day_model_data$Time <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE - min(bets_per_day_model_data$BET_PLACED_DATE))

# Variable for paydays:

bets_per_day_model_data$pay_days<- weekdays(bets_per_day_model_data$BET_PLACED_DATE) %in% c("Wednesday", "Thursday")

# Create a dummy variable for Melbourne cup:
bets_per_day_model_data$Melbourne_cup <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE == Melbourne_cup_date)


# Create intervention indicators and time after intervention variables
bets_per_day_model_data$Post_Intervention1 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates[1])
bets_per_day_model_data$Post_Intervention2 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates[2])
bets_per_day_model_data$Post_Intervention3 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates[3])
bets_per_day_model_data$Post_Intervention4 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates[4])

# Time variables for each intervention
bets_per_day_model_data$Time_After_Intervention1 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates[1]))
bets_per_day_model_data$Time_After_Intervention2 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates[2]))
bets_per_day_model_data$Time_After_Intervention3 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates[3]))
bets_per_day_model_data$Time_After_Intervention4 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates[4]))

ITS_gls_multi_bets_day <- gls(median_bets_per_day  ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days +
                                         Melbourne_cup,
                     data = bets_per_day_model_data,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_bets_day_summary <- summary(ITS_gls_multi_bets_day)

ITS_gls_multi_bets_day_summary
Generalized least squares fit by REML
  Model: median_bets_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: bets_per_day_model_data 
       AIC      BIC    logLik
  362.1528 402.1044 -166.0764

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.6502359 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)               4.478597 0.2967847 15.090392  0.0000
Time                      0.013174 0.0390258  0.337584  0.7363
Post_Intervention1        0.019510 0.4014263  0.048603  0.9613
Time_After_Intervention1 -0.005554 0.0402431 -0.138016  0.8905
Post_Intervention2        0.204267 0.2975903  0.686403  0.4940
Time_After_Intervention2 -0.021639 0.0174114 -1.242801  0.2167
Post_Intervention3        0.299075 0.3019144  0.990596  0.3241
Time_After_Intervention3  0.008814 0.0163761  0.538245  0.5915
Post_Intervention4       -0.093232 0.3356961 -0.277729  0.7818
Time_After_Intervention4  0.014783 0.0366293  0.403584  0.6873
weekendTRUE               1.658923 0.1408524 11.777738  0.0000
pay_daysTRUE              0.907800 0.1528463  5.939299  0.0000
Melbourne_cup             3.617408 0.7269128  4.976399  0.0000

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.828                                          
Post_Intervention1        0.466 -0.788                                   
Time_After_Intervention1  0.790 -0.954  0.624                            
Post_Intervention2        0.043 -0.032  0.209 -0.165                     
Time_After_Intervention2  0.008 -0.028  0.309 -0.174 -0.003              
Post_Intervention3        0.039 -0.013  0.017  0.006  0.278 -0.473       
Time_After_Intervention3  0.012 -0.004  0.010 -0.004  0.470 -0.553  0.126
Post_Intervention4       -0.004 -0.008  0.009  0.007  0.007 -0.006  0.214
Time_After_Intervention4  0.016  0.004 -0.005 -0.004  0.004 -0.009  0.174
weekendTRUE              -0.181 -0.025  0.030  0.024 -0.023  0.024 -0.045
pay_daysTRUE             -0.319  0.092 -0.090 -0.069 -0.106  0.025 -0.122
Melbourne_cup            -0.042  0.006 -0.005 -0.004 -0.016  0.014 -0.280
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.323                            
Time_After_Intervention4 -0.170 -0.513                     
weekendTRUE              -0.020  0.096 -0.108              
pay_daysTRUE             -0.035  0.006 -0.042  0.500       
Melbourne_cup             0.174 -0.068 -0.096  0.122  0.125

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-2.1241294 -0.7610523  0.1216169  0.4962657  2.2931884 

Residual standard error: 1.037316 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_bets_day)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Whilst the above indicates that residuals are non-randomly distributed over time, the pattern is mostly accounted for by weekends, which are included as a variable in the model (plus the residuals are mostly small).

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_bets_day)

# Compute the Durbin-Watson statistic
DW_STAT_Bets_day<- sum(diff(residuals_gls)^2) / sum(residuals_gls^2)

The Durbin-Watson statistic calculated from the model’s residuals is 2.17. This value is very close to 2, which typically indicates little to no autocorrelation; however, since it’s slightly below 2, there might be a minimal autocorrelation present. In the context of our model, this suggests that while the model has accounted for most of the autocorrelation inherent in the data, there could be slight dependencies in residuals that were not entirely captured. This level of autocorrelation is generally not problematic.

Deposit amount per day

Process data

Here we can and load in the full dataset of transactions for all customers. This dataset is very large, but small by comparison to the raw wagers dataset.

Show code
transactions_data<- fread(transactions_filepath, header = FALSE, stringsAsFactors = FALSE)
transactions_headings <- fread(transactions_headings, header = FALSE, stringsAsFactors = FALSE)
# Add heading/column names:
colnames(transactions_data) <- as.matrix(transactions_headings)

transactions_data <- as_tibble(transactions_data)

Now filter the data to only include the customers who bet in our window of interest and only deposit transactions:

Show code
transactions_data_filtered<- semi_join(transactions_data, customer_IDs, by = "CLIENT_ID") %>% # Logic check: Keep all clients in the X present in Y
filter(TYPE == "Deposit") 

This gives us a total of 2895414 deposits to summarise (number generated from code).

Start preparing the data for analysis by summing the total amount deposited per day as restricting to the time period of interest discussed in relation to the bets per day variable above:

Show code
deposit_amount_per_day<- transactions_data_filtered %>%
filter(DATE > as.Date("2023-08-21") & DATE < as.Date("2023-12-19")) %>%
group_by(CLIENT_ID, DATE) %>%
 summarise(
       total_deposited_per_day = sum(AMOUNT),
      .groups = 'drop') %>%
group_by(DATE) %>%
 summarise(
       median_deposited_per_day = median(total_deposited_per_day),
      .groups = 'drop') 

# Check coding:
# deposit_amount_per_day %>%
# print(n=100)

Check the number of people for these analyses:

Show code
n_deposits_analyses<- transactions_data_filtered %>%
filter(DATE > as.Date("2023-08-21") & DATE < as.Date("2023-12-19")) %>%
distinct(CLIENT_ID) %>%
nrow()

There are a total of 9004 people included (number code generated).

Plot outcomes

And now plot the data, again highlighting weekends for time reference & the Melbourne cup (a major race event in Australia widely bet on):

Show code
# Convert DATE to Date class explicitly
deposit_amount_per_day$DATE <- as.Date(deposit_amount_per_day$DATE)
 # Recognise weekends in the data:
deposit_amount_per_day$weekend <- weekdays(deposit_amount_per_day$DATE) %in% c("Saturday", "Sunday")

deposit_amount_per_day$pay_days<- weekdays(deposit_amount_per_day$DATE) %in% c("Wednesday", "Thursday")

# Plot
deposits_per_day_plot <- ggplot(deposit_amount_per_day, aes(x = DATE, y = median_deposited_per_day)) +
  geom_line(linewidth = 1) +
  geom_vline(xintercept = statement_dates, linetype = "dashed", color = "#333434", linewidth = 1) +
  geom_vline(xintercept = Melbourne_cup_date, linetype = "solid", color = "dodgerblue", linewidth = 1.2) + # Melbourne cup
# geom_vline(xintercept = as.Date("2023-10-21"), linetype = "solid", color = "royalblue", linewidth = 1) + # Caulfield cup
  labs(x = "", y = "Median deposit\n amount ($AUD)\n per day") +
# geom_rect(data = subset(deposit_amount_per_day, weekend),
#             aes(xmin = DATE - 0.5, xmax = DATE + 0.5, 
#                 ymin = -Inf, ymax = Inf, fill = "Weekend"),
#             alpha = 0.3, color = NA) +
geom_rect(data = subset(deposit_amount_per_day, pay_days),
            aes(xmin = DATE - 0.5, xmax = DATE + 0.5, 
                ymin = -Inf, ymax = Inf, fill = "Paydays"),
            alpha = 0.1, color = NA) +
  scale_fill_manual(values = c(
# "Weekend" = "grey",
"Paydays" = "dodgerblue"),
name = ""
                    # guide = FALSE
) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
 scale_y_continuous(labels = scales::comma,
limits = c(90,150)
) +
  theme_classic() +
  theme(text = element_text(family = "Poppins"))

ggsave(filename = "Figures/Deposits per day trend plot.pdf", 
       plot = deposits_per_day_plot, 
       device = "pdf", 
       width = 8.9, height = 2.14, units = "in")

deposits_per_day_plot + plot_theme

Again, make a more ITS-style plot:

Show code
# Wragnle data with periods:
deposit_amount_per_day <- deposit_amount_per_day %>%
  mutate(
    Period = case_when(
      DATE < statement_dates[1] ~ "Before 1st Statement",
      between(DATE, statement_dates[1], statement_dates[2]) ~ "After 1st Statement",
      between(DATE, statement_dates[2], statement_dates[3]) ~ "After 2nd Statement",
      between(DATE, statement_dates[3], statement_dates[4]) ~ "After 3rd Statement",
      DATE > statement_dates[4] ~ "After 4th Statement"
    )) %>%
  mutate( # Enforce levels for plot presentation
    Period = factor(Period, levels = c("Before 1st Statement", 
                                       "After 1st Statement", 
                                       "After 2nd Statement", 
                                       "After 3rd Statement", 
                                       "After 4th Statement",
                                       "Melbourne Cup")) )


# Plot with points and separate trend lines for each period
deposits_per_day_ITS_plot<- ggplot(deposit_amount_per_day, aes(x = DATE, y = median_deposited_per_day, color = Period)) +
  geom_point() +  # Use points instead of lines
  geom_smooth(method = "lm", se = FALSE, aes(group = Period)) +  # Add linear trend lines per period
  geom_vline(xintercept = statement_dates, linetype = "dashed", color = "#333434", linewidth = 1) +
    # scale_color_viridis(discrete = TRUE, option = "G") + # Old colour sceheme pre-review
  scale_color_manual(values = custom_palette4) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  scale_y_continuous(labels = scales::comma, limits = c(90, 150)) +
  labs(x = "", y = "Median deposit\n amount ($AUD)\n per day", color = "") +
  theme_classic() +
  theme(text = element_text(family = "Poppins"))

ggsave(filename = "Figures/Deposits per day ITS plot.pdf", 
       plot = deposits_per_day_ITS_plot, 
       device = "pdf", 
       width = 8.9, height = 2.3, units = "in")

deposits_per_day_ITS_plot + plot_theme

Combine plots
Show code
combined_deposits_per_day_plots<- deposits_per_day_plot/
deposits_per_day_ITS_plot +
  plot_annotation(tag_levels = 'A') & 
  theme(
    plot.tag = element_text(size = 14, face = "bold"),  
    plot.tag.position = c(-0.02, 0.95), 
    plot.margin = margin(t = 5, r = 0, b = 0, l = 10)  
  )

ggsave(filename = "Figures/Combined deposits per day plots.pdf", 
       plot = combined_deposits_per_day_plots, 
       device = "pdf", 
       width = 8.9, height = 4.2, units = "in")

Regression

We will use the same interrupted time series analysis method here.

Show code
deposit_amount_per_day_model_data <- deposit_amount_per_day

deposit_amount_per_day_model_data$Time <- as.numeric(deposit_amount_per_day_model_data$DATE - min(deposit_amount_per_day_model_data$DATE))

# Create a dummy variable for Melbourne cup:
deposit_amount_per_day_model_data$Melbourne_cup <- as.numeric(deposit_amount_per_day_model_data$DATE == Melbourne_cup_date)


# Create intervention indicators and time after intervention variables
deposit_amount_per_day_model_data$Post_Intervention1 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates[1])
deposit_amount_per_day_model_data$Post_Intervention2 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates[2])
deposit_amount_per_day_model_data$Post_Intervention3 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates[3])
deposit_amount_per_day_model_data$Post_Intervention4 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates[4])

# Time variables for each intervention
deposit_amount_per_day_model_data$Time_After_Intervention1 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates[1]))
deposit_amount_per_day_model_data$Time_After_Intervention2 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates[2]))
deposit_amount_per_day_model_data$Time_After_Intervention3 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates[3]))
deposit_amount_per_day_model_data$Time_After_Intervention4 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates[4]))


ITS_gls_multi_deposit <- gls(median_deposited_per_day ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days+
                                         Melbourne_cup,
                     data = deposit_amount_per_day_model_data,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_deposit_summary<- summary(ITS_gls_multi_deposit)
ITS_gls_multi_deposit_summary
Generalized least squares fit by REML
  Model: median_deposited_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: deposit_amount_per_day_model_data 
       AIC      BIC    logLik
  927.2585 967.2101 -448.6292

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.2007793 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)              100.47510  5.850017 17.175181  0.0000
Time                       0.54174  0.759310  0.713469  0.4771
Post_Intervention1         1.42231  7.771017  0.183028  0.8551
Time_After_Intervention1  -0.39111  0.791551 -0.494104  0.6223
Post_Intervention2         1.74790  5.988400  0.291880  0.7709
Time_After_Intervention2  -0.50240  0.356261 -1.410208  0.1614
Post_Intervention3         3.49881  5.971017  0.585965  0.5591
Time_After_Intervention3   0.20514  0.331763  0.618339  0.5377
Post_Intervention4        -1.99163  6.760898 -0.294581  0.7689
Time_After_Intervention4   0.94527  0.711469  1.328622  0.1868
weekendTRUE                2.35161  2.606831  0.902094  0.3691
pay_daysTRUE              17.92099  2.776488  6.454554  0.0000
Melbourne_cup            -10.42445 13.078798 -0.797050  0.4272

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.830                                          
Post_Intervention1        0.437 -0.764                                   
Time_After_Intervention1  0.787 -0.950  0.586                            
Post_Intervention2        0.030 -0.019  0.206 -0.182                     
Time_After_Intervention2  0.001 -0.014  0.315 -0.198 -0.003              
Post_Intervention3        0.028 -0.010  0.012  0.005  0.273 -0.472       
Time_After_Intervention3  0.010 -0.002  0.006 -0.002  0.471 -0.562  0.144
Post_Intervention4       -0.002 -0.009  0.010  0.008  0.006 -0.003  0.206
Time_After_Intervention4  0.007  0.006 -0.006 -0.005  0.000 -0.005  0.172
weekendTRUE              -0.151 -0.045  0.043  0.041 -0.017  0.022 -0.029
pay_daysTRUE             -0.261  0.067 -0.074 -0.044 -0.102  0.025 -0.112
Melbourne_cup            -0.031  0.002 -0.003 -0.001 -0.011  0.007 -0.217
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.317                            
Time_After_Intervention4 -0.179 -0.509                     
weekendTRUE              -0.025  0.092 -0.086              
pay_daysTRUE             -0.038 -0.007 -0.016  0.454       
Melbourne_cup             0.137 -0.052 -0.075  0.105  0.109

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-1.8146024 -0.7062712 -0.1672987  0.5461755  2.9293210 

Residual standard error: 12.82473 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_deposit)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_deposit)

# Compute the Durbin-Watson statistic
sum(diff(residuals_gls)^2) / sum(residuals_gls^2)
[1] 1.833228

This value is also close to 2, indicating no issues.

Table regression outcomes

Show code
# Extract terms/varaibles from regression outputs:
bets_model_terms <-  ITS_gls_multi_bets_day_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

deposits_model_terms <-  ITS_gls_multi_deposit_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

# Extract coefficient tables from regression outputs:
bets_summary <- ITS_gls_multi_bets_day_summary$tTable %>% as_tibble() %>%
bind_cols(bets_model_terms)
deposit_summary <- ITS_gls_multi_deposit_summary$tTable %>% as_tibble() %>%
bind_cols(deposits_model_terms)


# Compute 95% confidence intervals:
bets_summary <- bets_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

deposit_summary <- deposit_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

# Prepare each dataset for tabling:
bets_data_table <- bets_summary  %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

deposit_data_table <- deposit_summary  %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

# Join and tidy: 
final_table_data<- bind_cols(bets_data_table,deposit_data_table) %>%
select(Characteristic = 1, everything()) %>%
select( 
Characteristic,
Beta...2,
`95%CI...4`,
pvalue...3,
Beta...6,
`95%CI...8`,
pvalue...7) %>%
mutate(Characteristic = str_replace_all(Characteristic, c(
    "\\(Intercept\\)" = "Intercept",
    "Post_Intervention(\\d+)" = "Post-statement \\1 (level)",
    "Time_After_Intervention(\\d+)" = "Time after statement \\1 (slope)",
    "weekendTRUE" = "Weekend days",
    "pay_daysTRUE" = "Pay days",
    "Melbourne_cup" = "Melbourne Cup"
  )))



# Create the final table
final_table_data %>%
  gt() %>%
tab_options(data_row.padding = px(3)) %>%
  tab_spanner(
    label = glue::glue("Median number of bets per day (*N* = {n_bets_analyses})"),
    columns = vars(`Beta...2`, `95%CI...4`,`pvalue...3`)
  ) %>%
  tab_spanner(
    label =  glue::glue("Median deposit amount per day (*N* = {n_deposits_analyses})"),
    columns = vars(`Beta...6`,  `95%CI...8`,`pvalue...7`)) %>%
  cols_label(
    'Beta...2' = 'Beta',
    '95%CI...4' = '95% CI',
    'pvalue...3' = 'p-value',
    'Beta...6' = 'Beta',
    '95%CI...8' = '95% CI',
    'pvalue...7' = 'p-value'
  ) %>%
tab_header(md("Table 5. Interrupted Time Series Analysis: Impact of Activity Statements on Betting Behaviour"))
Table 5. Interrupted Time Series Analysis: Impact of Activity Statements on Betting Behaviour
Characteristic Median number of bets per day (*N* = 9570) Median deposit amount per day (*N* = 9004)
Beta 95% CI p-value Beta 95% CI p-value
Intercept 4.479 3.897, 5.06 0.000 100.475 89.009, 111.941 0.000
Time 0.013 -0.063, 0.09 0.736 0.542 -0.947, 2.03 0.477
Post-statement 1 (level) 0.020 -0.767, 0.806 0.961 1.422 -13.809, 16.654 0.855
Time after statement 1 (slope) -0.006 -0.084, 0.073 0.890 -0.391 -1.943, 1.16 0.622
Post-statement 2 (level) 0.204 -0.379, 0.788 0.494 1.748 -9.989, 13.485 0.771
Time after statement 2 (slope) -0.022 -0.056, 0.012 0.217 -0.502 -1.201, 0.196 0.161
Post-statement 3 (level) 0.299 -0.293, 0.891 0.324 3.499 -8.204, 15.202 0.559
Time after statement 3 (slope) 0.009 -0.023, 0.041 0.592 0.205 -0.445, 0.855 0.538
Post-statement 4 (level) -0.093 -0.751, 0.565 0.782 -1.992 -15.243, 11.26 0.769
Time after statement 4 (slope) 0.015 -0.057, 0.087 0.687 0.945 -0.449, 2.34 0.187
Weekend days 1.659 1.383, 1.935 0.000 2.352 -2.758, 7.461 0.369
Pay days 0.908 0.608, 1.207 0.000 17.921 12.479, 23.363 0.000
Melbourne Cup 3.617 2.193, 5.042 0.000 -10.424 -36.059, 15.21 0.427

Sensitivity analyses

High-risk sample

To be sure that the above outcomes don’t mask true effects in more at-risk gamblers, we will now repeat the The Same analyses using only participants with survey responses who scored in the moderate to high risk range of the PGSI (i.e., 3+).

Bets per day

We’ll repeat the above steps, but first we need to group the bets per day data by PGSI group.

Process data

Show code
# Compute daily number of bets summary :
bets_per_day_PGSI<-wagers_data_filtered_selected_sample_pgsi %>%
# filter(PGSI_CATEGORY != "None available") %>%
  group_by(CLIENT_ID, BET_PLACED_DATE, PGSI_CATEGORY) %>%
  summarise(
       total_bets_per_day = n(),
      .groups = 'drop') %>%
group_by(BET_PLACED_DATE, PGSI_CATEGORY) %>%
 summarise(
       median_bets_per_day = median(total_bets_per_day),
      .groups = 'drop') 

Check the number of people for these analyses:

Show code
n_bets_analyses_PGSI<- wagers_data_filtered_selected_sample_pgsi %>%
filter(BET_PLACED_DATE > as.Date("2023-08-21") & 
BET_PLACED_DATE < as.Date("2023-12-19")) %>%
filter(PGSI_CATEGORY == "Moderate-high risk") %>% 
as_tibble() %>%
distinct(CLIENT_ID) %>%
nrow()

There are a total of 658 people included (number code generated).

Plot data

Show code
# Dates to add vertical lines
statement_dates <- as.Date(c("2023-09-05", "2023-10-04", "2023-11-01", "2023-12-04"))

Melbourne_cup_date<- as.Date("2023-11-07") 

bets_per_day_PGSI$weekend <- weekdays(bets_per_day_PGSI$BET_PLACED_DATE) %in% c("Saturday", "Sunday")
# Convert BET_PLACED_DATE to Date class explicitly
bets_per_day_PGSI$BET_PLACED_DATE <- as.Date(bets_per_day_PGSI$BET_PLACED_DATE)


# Example plot with vertical lines
bets_per_day_PGSI %>%
filter(PGSI_CATEGORY == "Moderate-high risk") %>% 
ggplot(aes(x = BET_PLACED_DATE, y = median_bets_per_day)) +
  geom_line(linewidth = 1) +
geom_vline(xintercept = statement_dates, linetype = "dashed", color = "black", linewidth = 1) +
geom_vline(xintercept = Melbourne_cup_date, linetype = "solid", color = "dodgerblue", linewidth = 1) + # Melbourne cup
# geom_vline(xintercept = as.Date("2023-10-21"), linetype = "solid", color = "royalblue", linewidth = 1) + # Caulfield cup
  labs(x = "", y = "Median number of\n  bets per day") +
geom_rect(data = subset(bets_per_day_PGSI, weekend),
            aes(xmin = BET_PLACED_DATE - 0.5, xmax = BET_PLACED_DATE + 0.5, 
                ymin = -Inf, ymax = Inf, fill = "Weekend"),
            alpha = 0.3, color = NA) +
  scale_fill_manual(values = c("Weekend" = "grey"),
                    guide = FALSE) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
 scale_y_continuous(labels = scales::comma) +
  plot_theme

Regression

Show code
bets_per_day_model_data_PGSI <- bets_per_day_PGSI %>%
filter(PGSI_CATEGORY == "Moderate-high risk") %>%
mutate(BET_PLACED_DATE = as.Date(BET_PLACED_DATE))

bets_per_day_model_data_PGSI$Time <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE - min(bets_per_day_model_data_PGSI$BET_PLACED_DATE))
# Variable for paydays:

bets_per_day_model_data_PGSI$pay_days<- weekdays(bets_per_day_model_data_PGSI$BET_PLACED_DATE) %in% c("Wednesday", "Thursday")

# Create a dummy variable for Melbourne cup:
bets_per_day_model_data_PGSI$Melbourne_cup <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE == Melbourne_cup_date)


# Create intervention indicators and time after intervention variables
bets_per_day_model_data_PGSI$Post_Intervention1 <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE >= statement_dates[1])
bets_per_day_model_data_PGSI$Post_Intervention2 <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE >= statement_dates[2])
bets_per_day_model_data_PGSI$Post_Intervention3 <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE >= statement_dates[3])
bets_per_day_model_data_PGSI$Post_Intervention4 <- as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE >= statement_dates[4])

# Time variables for each intervention
bets_per_day_model_data_PGSI$Time_After_Intervention1 <- pmax(0, as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE - statement_dates[1]))
bets_per_day_model_data_PGSI$Time_After_Intervention2 <- pmax(0, as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE - statement_dates[2]))
bets_per_day_model_data_PGSI$Time_After_Intervention3 <- pmax(0, as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE - statement_dates[3]))
bets_per_day_model_data_PGSI$Time_After_Intervention4 <- pmax(0, as.numeric(bets_per_day_model_data_PGSI$BET_PLACED_DATE - statement_dates[4]))

ITS_gls_multi_bets_day_PGSI <- gls(median_bets_per_day  ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days +
                                         Melbourne_cup,
                     data = bets_per_day_model_data_PGSI,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_bets_day_PGSI_summary <- summary(ITS_gls_multi_bets_day_PGSI)

ITS_gls_multi_bets_day_PGSI_summary
Generalized least squares fit by REML
  Model: median_bets_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: bets_per_day_model_data_PGSI 
       AIC      BIC    logLik
  495.8425 535.7941 -232.9213

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.5213665 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)               6.462094 0.6155542 10.498009  0.0000
Time                      0.102849 0.0807591  1.273529  0.2056
Post_Intervention1       -0.857507 0.8310100 -1.031885  0.3045
Time_After_Intervention1 -0.072339 0.0834333 -0.867031  0.3879
Post_Intervention2       -0.338194 0.6211784 -0.544439  0.5873
Time_After_Intervention2 -0.061223 0.0364306 -1.680534  0.0958
Post_Intervention3       -0.065636 0.6276651 -0.104572  0.9169
Time_After_Intervention3  0.053301 0.0341740  1.559692  0.1218
Post_Intervention4       -1.859825 0.6992823 -2.659619  0.0090
Time_After_Intervention4  0.148091 0.0756592  1.957336  0.0529
weekendTRUE               2.547373 0.2871567  8.871022  0.0000
pay_daysTRUE              0.941605 0.3141624  2.997193  0.0034
Melbourne_cup             3.156297 1.4930510  2.113991  0.0369

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.828                                          
Post_Intervention1        0.460 -0.783                                   
Time_After_Intervention1  0.789 -0.953  0.616                            
Post_Intervention2        0.041 -0.029  0.210 -0.168                     
Time_After_Intervention2  0.007 -0.027  0.311 -0.178 -0.003              
Post_Intervention3        0.036 -0.012  0.016  0.005  0.278 -0.473       
Time_After_Intervention3  0.011 -0.003  0.009 -0.004  0.471 -0.555  0.131
Post_Intervention4       -0.004 -0.009  0.009  0.007  0.007 -0.006  0.212
Time_After_Intervention4  0.013  0.005 -0.006 -0.005  0.003 -0.008  0.174
weekendTRUE              -0.177 -0.030  0.034  0.029 -0.022  0.024 -0.042
pay_daysTRUE             -0.308  0.086 -0.086 -0.062 -0.105  0.025 -0.120
Melbourne_cup            -0.039  0.005 -0.004 -0.003 -0.015  0.013 -0.268
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.321                            
Time_After_Intervention4 -0.172 -0.512                     
weekendTRUE              -0.021  0.095 -0.103              
pay_daysTRUE             -0.036  0.003 -0.035  0.498       
Melbourne_cup             0.167 -0.065 -0.092  0.120  0.121

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-2.1780409 -0.6370901 -0.0575573  0.8245235  2.6891407 

Residual standard error: 1.806969 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_bets_day_PGSI)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Whilst the above indicates that residuals are non-randomly distributed over time, the pattern is mostly accounted for by weekends, which are included as a variable in the model (plus the residuals are mostly small).

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_bets_day_PGSI)

# Compute the Durbin-Watson statistic
DW_STAT_Bets_day<- sum(diff(residuals_gls)^2) / sum(residuals_gls^2)

The Durbin-Watson statistic calculated from the model’s residuals is 2.09. Again, the value is very close to 2, which typically indicates little to no autocorrelation.

Deposit amount per day

Process data
Show code
deposit_amount_per_day_PGSI<- pgsi_data_for_survey_sample %>%
right_join(transactions_data_filtered) %>%
mutate(PGSI_CATEGORY = case_when(is.na(PGSI_CATEGORY) ~ "None available",
PGSI_CATEGORY == "Low risk" | PGSI_CATEGORY == "No risk" ~ "No-low risk",
PGSI_CATEGORY == "Moderate risk" | PGSI_CATEGORY == "High risk" ~ "Moderate-high risk",
TRUE ~ as.character(PGSI_CATEGORY))) %>%
filter(DATE > as.Date("2023-08-21") & DATE < as.Date("2023-12-19")) %>%
group_by(CLIENT_ID, DATE, PGSI_CATEGORY) %>%
 summarise(
       total_deposited_per_day = sum(AMOUNT),
      .groups = 'drop') %>%
group_by(DATE, PGSI_CATEGORY) %>%
 summarise(
       median_deposited_per_day = median(total_deposited_per_day),
      .groups = 'drop')  %>%
as_tibble()

# Check coding:
# deposit_amount_per_day_PGSI %>%
# print(n=100)

Check the number of people for these analyses:

Show code
n_deposits_analyses_PGSI<- pgsi_data_for_survey_sample %>%
right_join(transactions_data_filtered) %>%
mutate(PGSI_CATEGORY = case_when(is.na(PGSI_CATEGORY) ~ "None available",
PGSI_CATEGORY == "Low risk" | PGSI_CATEGORY == "No risk" ~ "No-low risk",
PGSI_CATEGORY == "Moderate risk" | PGSI_CATEGORY == "High risk" ~ "Moderate-high risk",
TRUE ~ as.character(PGSI_CATEGORY))) %>%
filter(DATE > as.Date("2023-08-21") & DATE < as.Date("2023-12-19")) %>%
filter(PGSI_CATEGORY == "Moderate-high risk") %>%
distinct(CLIENT_ID) %>%
nrow()

There are a total of 647 people included (number code generated).

Plot outcomes

And now plot the data, again highlighting weekends for time reference & the Melbourne cup (a major race event in Australia widely bet on):

Show code
# Convert DATE to Date class explicitly
deposit_amount_per_day_PGSI$DATE <- as.Date(deposit_amount_per_day_PGSI$DATE)
 # Recognise weekends in the data:
deposit_amount_per_day_PGSI$weekend <- weekdays(deposit_amount_per_day_PGSI$DATE) %in% c("Saturday", "Sunday")

deposit_amount_per_day_PGSI$pay_days<- weekdays(deposit_amount_per_day_PGSI$DATE) %in% c("Wednesday", "Thursday")

# Plot
deposit_amount_per_day_PGSI %>%
filter(PGSI_CATEGORY == "Moderate-high risk") %>%
# print(n=100)
ggplot(aes(x = DATE, y = median_deposited_per_day)) +
  geom_line(linewidth = 1) +
geom_vline(xintercept = statement_dates, linetype = "dashed", color = "dodgerblue", linewidth = 1) +
geom_vline(xintercept = Melbourne_cup_date, linetype = "solid", color = "#333434", linewidth = 1.2) + # Melbourne cup
# geom_vline(xintercept = as.Date("2023-10-21"), linetype = "solid", color = "royalblue", linewidth = 1) + # Caulfield cup
  labs(x = "", y = "Median deposit\n amount ($AUD)\n per day") +
geom_rect(data = subset(deposit_amount_per_day_PGSI, weekend),
            aes(xmin = DATE - 0.5, xmax = DATE + 0.5,
                ymin = -Inf, ymax = Inf, fill = "Weekend"),
            alpha = 0.1, color = NA) +
geom_rect(data = subset(deposit_amount_per_day_PGSI, pay_days),
            aes(xmin = DATE - 0.5, xmax = DATE + 0.5, 
                ymin = -Inf, ymax = Inf, fill = "Payday"),
            alpha = 0.05, color = NA) +
  scale_fill_manual(values = c("Weekend" = "grey",
"Payday" = "dodgerblue"),
                    guide = FALSE) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
 scale_y_continuous(labels = scales::comma,
limits = c(90,200)) +
plot_theme

Regression

We will use the same interrupted time series analysis method here.

Show code
deposit_amount_per_day_PGSI_model_data <- # Plot
deposit_amount_per_day_PGSI %>%
filter(PGSI_CATEGORY == "Moderate-high risk") 

deposit_amount_per_day_PGSI_model_data$Time <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE - min(deposit_amount_per_day_PGSI_model_data$DATE))

# Create a dummy variable for Melbourne cup:
deposit_amount_per_day_PGSI_model_data$Melbourne_cup <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE == Melbourne_cup_date)


# Create intervention indicators and time after intervention variables
deposit_amount_per_day_PGSI_model_data$Post_Intervention1 <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE >= statement_dates[1])
deposit_amount_per_day_PGSI_model_data$Post_Intervention2 <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE >= statement_dates[2])
deposit_amount_per_day_PGSI_model_data$Post_Intervention3 <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE >= statement_dates[3])
deposit_amount_per_day_PGSI_model_data$Post_Intervention4 <- as.numeric(deposit_amount_per_day_PGSI_model_data$DATE >= statement_dates[4])

# Time variables for each intervention
deposit_amount_per_day_PGSI_model_data$Time_After_Intervention1 <- pmax(0, as.numeric(deposit_amount_per_day_PGSI_model_data$DATE - statement_dates[1]))
deposit_amount_per_day_PGSI_model_data$Time_After_Intervention2 <- pmax(0, as.numeric(deposit_amount_per_day_PGSI_model_data$DATE - statement_dates[2]))
deposit_amount_per_day_PGSI_model_data$Time_After_Intervention3 <- pmax(0, as.numeric(deposit_amount_per_day_PGSI_model_data$DATE - statement_dates[3]))
deposit_amount_per_day_PGSI_model_data$Time_After_Intervention4 <- pmax(0, as.numeric(deposit_amount_per_day_PGSI_model_data$DATE - statement_dates[4]))


ITS_gls_multi_deposit_PGSI <- gls(median_deposited_per_day ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days+
                                         Melbourne_cup,
                     data = deposit_amount_per_day_PGSI_model_data,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_deposit_PGSI_summary<- summary(ITS_gls_multi_deposit_PGSI)
ITS_gls_multi_deposit_PGSI_summary
Generalized least squares fit by REML
  Model: median_deposited_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: deposit_amount_per_day_PGSI_model_data 
       AIC      BIC    logLik
  1064.217 1104.168 -517.1084

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.3516454 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)              129.61769 10.114503 12.815034  0.0000
Time                       1.16492  1.320958  0.881875  0.3798
Post_Intervention1        -3.07744 13.573112 -0.226730  0.8211
Time_After_Intervention1  -1.00430  1.370101 -0.733009  0.4652
Post_Intervention2         2.22811 10.293565  0.216457  0.8290
Time_After_Intervention2  -1.56131  0.607267 -2.571038  0.0115
Post_Intervention3         4.26987 10.332166  0.413260  0.6803
Time_After_Intervention3   1.79256  0.567481  3.158811  0.0021
Post_Intervention4         3.44368 11.577537  0.297445  0.7667
Time_After_Intervention4  -0.37481  1.236322 -0.303168  0.7624
weekendTRUE                7.61295  4.611627  1.650818  0.1017
pay_daysTRUE              15.37989  5.019483  3.064039  0.0028
Melbourne_cup            -12.38356 23.789089 -0.520556  0.6038

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.828                                          
Post_Intervention1        0.449 -0.774                                   
Time_After_Intervention1  0.788 -0.951  0.602                            
Post_Intervention2        0.036 -0.025  0.209 -0.175                     
Time_After_Intervention2  0.004 -0.022  0.314 -0.187 -0.003              
Post_Intervention3        0.033 -0.011  0.014  0.005  0.276 -0.473       
Time_After_Intervention3  0.011 -0.003  0.008 -0.004  0.472 -0.558  0.138
Post_Intervention4       -0.003 -0.009  0.010  0.008  0.007 -0.004  0.209
Time_After_Intervention4  0.010  0.006 -0.006 -0.005  0.002 -0.007  0.173
weekendTRUE              -0.165 -0.038  0.038  0.035 -0.019  0.023 -0.036
pay_daysTRUE             -0.287  0.077 -0.080 -0.053 -0.103  0.024 -0.116
Melbourne_cup            -0.035  0.003 -0.003 -0.001 -0.013  0.010 -0.245
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.318                            
Time_After_Intervention4 -0.176 -0.511                     
weekendTRUE              -0.023  0.093 -0.094              
pay_daysTRUE             -0.037 -0.002 -0.025  0.482       
Melbourne_cup             0.153 -0.059 -0.085  0.114  0.115

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-2.1035063 -0.7872957 -0.1092635  0.7106866  2.2661012 

Residual standard error: 25.00267 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_deposit_PGSI)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_deposit_PGSI)

# Compute the Durbin-Watson statistic
sum(diff(residuals_gls)^2) / sum(residuals_gls^2)
[1] 2.125928

This value is also close to 2, indicating no issues.

Table regression outcomes

Show code
# Extract terms/varaibles from regression outputs:
bets_model_terms <-  ITS_gls_multi_bets_day_PGSI_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

deposits_model_terms <-  ITS_gls_multi_deposit_PGSI_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

# Extract coefficient tables from regression outputs:
bets_summary <- ITS_gls_multi_bets_day_PGSI_summary$tTable %>% as_tibble() %>%
bind_cols(bets_model_terms)
deposit_summary <- ITS_gls_multi_deposit_PGSI_summary$tTable %>% as_tibble() %>%
bind_cols(deposits_model_terms)


# Compute 95% confidence intervals:
bets_summary_PGSI <- bets_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

deposit_summary_PGSI <- deposit_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

# Prepare each dataset for tabling:
bets_data_table_PGSI <- bets_summary_PGSI %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

deposit_data_table_PGSI <- deposit_summary_PGSI  %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

# Join and tidy: 
final_table_data<- bind_cols(bets_data_table_PGSI, deposit_data_table_PGSI) %>%
select(Characteristic = 1, everything()) %>%
select( 
Characteristic,
Beta...2,
`95%CI...4`,
pvalue...3,
Beta...6,
`95%CI...8`,
pvalue...7) %>%
mutate(Characteristic = str_replace_all(Characteristic, c(
    "\\(Intercept\\)" = "Intercept",
    "Post_Intervention(\\d+)" = "Post-statement \\1 (level)",
    "Time_After_Intervention(\\d+)" = "Time after statement \\1 (slope)",
    "weekendTRUE" = "Weekend days",
    "pay_daysTRUE" = "Pay days",
    "Melbourne_cup" = "Melbourne Cup"
  )))



# Create the final table
final_table_data %>%
  gt() %>%
tab_options(data_row.padding = px(3)) %>%
  tab_spanner(
    label = glue::glue("Median number of bes per day (*N* = {n_bets_analyses_PGSI})"),
    columns = vars(`Beta...2`, `95%CI...4`,`pvalue...3`)
  ) %>%
  tab_spanner(
    label =  glue::glue("Median deposit amount per day (*N* = {n_deposits_analyses_PGSI})"),
    columns = vars(`Beta...6`,  `95%CI...8`,`pvalue...7`)) %>%
  cols_label(
    'Beta...2' = 'Beta',
    '95%CI...4' = '95% CI',
    'pvalue...3' = 'p-value',
    'Beta...6' = 'Beta',
    '95%CI...8' = '95% CI',
    'pvalue...7' = 'p-value'
  ) %>%
tab_header(md("Table 6. Interrupted Time Series Analysis: Impact of Activity Statements on Betting Behaviour Among Moderate-High-Risk PGSI Groups"))
Table 6. Interrupted Time Series Analysis: Impact of Activity Statements on Betting Behaviour Among Moderate-High-Risk PGSI Groups
Characteristic Median number of bes per day (*N* = 658) Median deposit amount per day (*N* = 647)
Beta 95% CI p-value Beta 95% CI p-value
Intercept 6.462 5.256, 7.669 0.000 129.618 109.793, 149.442 0.000
Time 0.103 -0.055, 0.261 0.206 1.165 -1.424, 3.754 0.380
Post-statement 1 (level) -0.858 -2.486, 0.771 0.304 -3.077 -29.681, 23.526 0.821
Time after statement 1 (slope) -0.072 -0.236, 0.091 0.388 -1.004 -3.69, 1.681 0.465
Post-statement 2 (level) -0.338 -1.556, 0.879 0.587 2.228 -17.947, 22.403 0.829
Time after statement 2 (slope) -0.061 -0.133, 0.01 0.096 -1.561 -2.752, -0.371 0.012
Post-statement 3 (level) -0.066 -1.296, 1.165 0.917 4.270 -15.981, 24.521 0.680
Time after statement 3 (slope) 0.053 -0.014, 0.12 0.122 1.793 0.68, 2.905 0.002
Post-statement 4 (level) -1.860 -3.23, -0.489 0.009 3.444 -19.248, 26.136 0.767
Time after statement 4 (slope) 0.148 0, 0.296 0.053 -0.375 -2.798, 2.048 0.762
Weekend days 2.547 1.985, 3.11 0.000 7.613 -1.426, 16.652 0.102
Pay days 0.942 0.326, 1.557 0.003 15.380 5.542, 25.218 0.003
Melbourne Cup 3.156 0.23, 6.083 0.037 -12.384 -59.01, 34.243 0.604

Lagged statement dates

The next and final thing we can do is implement a slight delay/lag in the statement dates, in case the impact of statements is not realised until a day or so after they are sent to customers. To do this, we can simply add a single day to the statement dates.

NOTE: We have also tried this with two days, but only present the outcomes from a 1-day lag for ease and space. The outcomes from the one-day and two-day lag were the same – that is, no different to the original model outcomes.

We won’t re-plot the data here as we are using the exact same data as the original models, just with the updated statement dates.

Bets-per-day Regression

Show code
statement_dates_lag1 <- statement_dates + 1

# Create intervention indicators and time after intervention variables
bets_per_day_model_data$Post_Intervention1 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates_lag1[1])
bets_per_day_model_data$Post_Intervention2 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates_lag1[2])
bets_per_day_model_data$Post_Intervention3 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates_lag1[3])
bets_per_day_model_data$Post_Intervention4 <- as.numeric(bets_per_day_model_data$BET_PLACED_DATE >= statement_dates_lag1[4])


# Time variables for each intervention
bets_per_day_model_data$Time_After_Intervention1 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates_lag1[1]))
bets_per_day_model_data$Time_After_Intervention2 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates_lag1[2]))
bets_per_day_model_data$Time_After_Intervention3 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates_lag1[3]))
bets_per_day_model_data$Time_After_Intervention4 <- pmax(0, as.numeric(bets_per_day_model_data$BET_PLACED_DATE - statement_dates_lag1[4]))

ITS_gls_multi_bets_day_lag1 <- gls(median_bets_per_day  ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days +
                                         Melbourne_cup,
                     data = bets_per_day_model_data,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_bets_day_lag1_summary <- summary(ITS_gls_multi_bets_day_lag1)

ITS_gls_multi_bets_day_lag1_summary
Generalized least squares fit by REML
  Model: median_bets_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: bets_per_day_model_data 
       AIC     BIC    logLik
  362.1345 402.086 -166.0672

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.6508883 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)               4.448177 0.2853556 15.588188  0.0000
Time                      0.018197 0.0336477  0.540814  0.5898
Post_Intervention1       -0.073059 0.3757501 -0.194436  0.8462
Time_After_Intervention1 -0.007923 0.0352071 -0.225037  0.8224
Post_Intervention2        0.136113 0.2957069  0.460298  0.6462
Time_After_Intervention2 -0.023552 0.0173799 -1.355148  0.1783
Post_Intervention3        0.344527 0.3035744  1.134900  0.2590
Time_After_Intervention3  0.004821 0.0166449  0.289650  0.7727
Post_Intervention4        0.029399 0.3533643  0.083197  0.9339
Time_After_Intervention4  0.011410 0.0402177  0.283710  0.7772
weekendTRUE               1.662572 0.1403352 11.847149  0.0000
pay_daysTRUE              0.925935 0.1522167  6.083006  0.0000
Melbourne_cup             3.552559 0.7369312  4.820746  0.0000

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.831                                          
Post_Intervention1        0.447 -0.759                                   
Time_After_Intervention1  0.781 -0.940  0.556                            
Post_Intervention2        0.024 -0.022  0.212 -0.197                     
Time_After_Intervention2  0.021 -0.031  0.334 -0.201 -0.017              
Post_Intervention3        0.024 -0.007  0.011  0.001  0.269 -0.460       
Time_After_Intervention3 -0.009  0.003  0.003 -0.010  0.468 -0.543  0.076
Post_Intervention4        0.016 -0.010  0.010  0.008  0.006 -0.006  0.239
Time_After_Intervention4  0.002  0.006 -0.004 -0.005  0.004 -0.008  0.157
weekendTRUE              -0.215  0.023 -0.034 -0.012 -0.036  0.003 -0.060
pay_daysTRUE             -0.337  0.117 -0.120 -0.092 -0.043 -0.023 -0.066
Melbourne_cup            -0.044  0.010 -0.012 -0.007 -0.012  0.009 -0.307
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.349                            
Time_After_Intervention4 -0.146 -0.527                     
weekendTRUE               0.003  0.038 -0.085              
pay_daysTRUE              0.029 -0.059  0.010  0.502       
Melbourne_cup             0.210 -0.104 -0.091  0.128  0.117

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-2.1217193 -0.7724966  0.1411195  0.4794216  2.2884875 

Residual standard error: 1.037712 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_bets_day_lag1)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_bets_day_lag1)

# Compute the Durbin-Watson statistic
DW_STAT_Bets_day<- sum(diff(residuals_gls)^2) / sum(residuals_gls^2)

The Durbin-Watson statistic calculated from the model’s residuals is 2.18.

Deposit amount per day Regression

Show code
# Create intervention indicators and time after intervention variables
deposit_amount_per_day_model_data$Post_Intervention1 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates_lag1[1])
deposit_amount_per_day_model_data$Post_Intervention2 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates_lag1[2])
deposit_amount_per_day_model_data$Post_Intervention3 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates_lag1[3])
deposit_amount_per_day_model_data$Post_Intervention4 <- as.numeric(deposit_amount_per_day_model_data$DATE >= statement_dates_lag1[4])

# Time variables for each intervention
deposit_amount_per_day_model_data$Time_After_Intervention1 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates_lag1[1]))
deposit_amount_per_day_model_data$Time_After_Intervention2 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates_lag1[2]))
deposit_amount_per_day_model_data$Time_After_Intervention3 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates_lag1[3]))
deposit_amount_per_day_model_data$Time_After_Intervention4 <- pmax(0, as.numeric(deposit_amount_per_day_model_data$DATE - statement_dates_lag1[4]))


ITS_gls_multi_deposit_lag1<- gls(median_deposited_per_day ~ Time + 
                                         Post_Intervention1 + Time_After_Intervention1 +
                                         Post_Intervention2 + Time_After_Intervention2 +
                                         Post_Intervention3 + Time_After_Intervention3 +
Post_Intervention4 +
Time_After_Intervention4 +
                                         weekend +
pay_days+
                                         Melbourne_cup,
                     data = deposit_amount_per_day_model_data,
                     correlation = corAR1(form = ~ Time | factor(weekend)))


ITS_gls_multi_deposit_lag1_summary<- summary(ITS_gls_multi_deposit_lag1)
ITS_gls_multi_deposit_lag1_summary
Generalized least squares fit by REML
  Model: median_deposited_per_day ~ Time + Post_Intervention1 + Time_After_Intervention1 +      Post_Intervention2 + Time_After_Intervention2 + Post_Intervention3 +      Time_After_Intervention3 + Post_Intervention4 + Time_After_Intervention4 +      weekend + pay_days + Melbourne_cup 
  Data: deposit_amount_per_day_model_data 
       AIC      BIC    logLik
  927.0326 966.9842 -448.5163

Correlation Structure: ARMA(1,0)
 Formula: ~Time | factor(weekend) 
 Parameter estimate(s):
      Phi1 
-0.2126782 

Coefficients:
                             Value Std.Error   t-value p-value
(Intercept)              100.43308  5.626791 17.849085  0.0000
Time                       0.52959  0.667296  0.793640  0.4292
Post_Intervention1         1.99993  7.396331  0.270395  0.7874
Time_After_Intervention1  -0.45082  0.704432 -0.639976  0.5236
Post_Intervention2         3.61383  5.902931  0.612210  0.5417
Time_After_Intervention2  -0.47191  0.352662 -1.338133  0.1837
Post_Intervention3         3.82380  5.919941  0.645918  0.5197
Time_After_Intervention3   0.23256  0.331344  0.701875  0.4843
Post_Intervention4        -0.13337  6.985036 -0.019093  0.9848
Time_After_Intervention4   0.89765  0.777447  1.154615  0.2508
weekendTRUE                2.39039  2.579284  0.926764  0.3562
pay_daysTRUE              18.11509  2.755671  6.573747  0.0000
Melbourne_cup            -10.46383 13.120190 -0.797536  0.4269

 Correlation: 
                         (Intr) Time   Pst_I1 T_A_I1 Pst_I2 T_A_I2 Pst_I3
Time                     -0.834                                          
Post_Intervention1        0.433 -0.747                                   
Time_After_Intervention1  0.781 -0.938  0.538                            
Post_Intervention2        0.015 -0.012  0.207 -0.209                     
Time_After_Intervention2  0.014 -0.018  0.333 -0.219 -0.007              
Post_Intervention3        0.015 -0.003  0.006  0.000  0.265 -0.466       
Time_After_Intervention3 -0.005  0.003  0.001 -0.007  0.469 -0.557  0.118
Post_Intervention4        0.012 -0.009  0.009  0.008  0.003 -0.002  0.215
Time_After_Intervention4 -0.004  0.007 -0.006 -0.006  0.002 -0.005  0.156
weekendTRUE              -0.183  0.002 -0.020  0.006 -0.030  0.002 -0.043
pay_daysTRUE             -0.280  0.095 -0.107 -0.072 -0.033 -0.025 -0.046
Melbourne_cup            -0.033  0.006 -0.008 -0.004 -0.007  0.003 -0.234
                         T_A_I3 Pst_I4 T_A_I4 wkTRUE p_TRUE
Time                                                       
Post_Intervention1                                         
Time_After_Intervention1                                   
Post_Intervention2                                         
Time_After_Intervention2                                   
Post_Intervention3                                         
Time_After_Intervention3                                   
Post_Intervention4       -0.329                            
Time_After_Intervention4 -0.155 -0.534                     
weekendTRUE              -0.005  0.038 -0.065              
pay_daysTRUE              0.023 -0.060  0.032  0.458       
Melbourne_cup             0.162 -0.075 -0.071  0.110  0.101

Standardized residuals:
       Min         Q1        Med         Q3        Max 
-1.8179549 -0.7035393 -0.1576547  0.5555697  2.9483472 

Residual standard error: 12.82458 
Degrees of freedom: 119 total; 106 residual

Check model residuals:

Show code
residuals_plot <- residuals(ITS_gls_multi_deposit_lag1)
plot(residuals_plot, type = "l", main = "Residuals of GLS Model", xlab = "Time", ylab = "Residuals")
abline(h = 0, col = "red")

Now check for autocorrelation in residuals that could indicate the model has not captured all time-dependent structures:

Show code
# Manual Computation of Durbin-Watson Statistic (thanks ChatGPT) to check for autocorrelation in the residuals:
residuals_gls <- residuals(ITS_gls_multi_deposit_lag1)

# Compute the Durbin-Watson statistic
sum(diff(residuals_gls)^2) / sum(residuals_gls^2)
[1] 1.844208

This value is also close to 2, indicating no issues.

Table outcomes

Show code
# Extract terms/varaibles from regression outputs:
bets_model_lag1_terms <-  ITS_gls_multi_bets_day_lag1_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

deposits_model_lag1_terms <-  ITS_gls_multi_deposit_lag1_summary$coefficients %>% as.matrix() %>%
as.data.frame() %>%
rownames_to_column() %>%
select(term = rowname)

# Extract coefficient tables from regression outputs:
bets_lag1_summary <- ITS_gls_multi_bets_day_lag1_summary$tTable %>% as_tibble() %>%
bind_cols(bets_model_lag1_terms)
deposit_lag1_summary <- ITS_gls_multi_deposit_lag1_summary$tTable %>% as_tibble() %>%
bind_cols(deposits_model_lag1_terms)


# Compute 95% confidence intervals:
bets_lag1_summary <- bets_lag1_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

deposit_lag1_summary <- deposit_lag1_summary %>%
  mutate(
    ci_lower = Value  - 1.96 * Std.Error,
    ci_upper = Value  + 1.96 * Std.Error
  )

# Prepare each dataset for tabling:
bets_lag1_data_table <- bets_lag1_summary  %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

deposit_lag1_data_table <- deposit_lag1_summary  %>%
select(term, Beta = Value, ci_lower, ci_upper, `p-value`) %>%
 mutate(pvalue = format(`p-value`, scientific = FALSE)) %>%
select(-`p-value`) %>%
  mutate(Beta = round(Beta, 3),  
    ci_lower = round(ci_lower, 3),
    ci_upper = round(ci_upper, 3),
pvalue = round(as.numeric(pvalue), 3)) %>%
mutate("95%CI" = paste(ci_lower, ci_upper, sep = ", ")) %>%
select(-ci_lower,
      -ci_upper)

# Join and tidy: 
final_table_data_lag1<- bind_cols(bets_lag1_data_table, deposit_lag1_data_table) %>%
select(Characteristic = 1, everything()) %>%
select( 
Characteristic,
Beta...2,
`95%CI...4`,
pvalue...3,
Beta...6,
`95%CI...8`,
pvalue...7) %>%
mutate(Characteristic = str_replace_all(Characteristic, c(
    "\\(Intercept\\)" = "Intercept",
    "Post_Intervention(\\d+)" = "Post-statement \\1 (level)",
    "Time_After_Intervention(\\d+)" = "Time after statement \\1 (slope)",
    "weekendTRUE" = "Weekend days",
    "pay_daysTRUE" = "Pay days",
    "Melbourne_cup" = "Melbourne Cup"
  )))



# Create the final table
final_table_data_lag1 %>%
  gt() %>%
tab_options(data_row.padding = px(3)) %>%
  tab_spanner(
    label = glue::glue("Median number of bets per day (*N* = {n_bets_analyses})"),
    columns = vars(`Beta...2`, `95%CI...4`,`pvalue...3`)
  ) %>%
  tab_spanner(
    label =  glue::glue("Median deposit amount per day (*N* = {n_deposits_analyses})"),
    columns = vars(`Beta...6`,  `95%CI...8`,`pvalue...7`)) %>%
  cols_label(
    'Beta...2' = 'Beta',
    '95%CI...4' = '95% CI',
    'pvalue...3' = 'p-value',
    'Beta...6' = 'Beta',
    '95%CI...8' = '95% CI',
    'pvalue...7' = 'p-value'
  ) %>%
tab_header(md("Table 7. Interrupted Time Series Analysis: Lagged Impact of Activity Statements on Betting Behaviour"))
Table 7. Interrupted Time Series Analysis: Lagged Impact of Activity Statements on Betting Behaviour
Characteristic Median number of bets per day (*N* = 9570) Median deposit amount per day (*N* = 9004)
Beta 95% CI p-value Beta 95% CI p-value
Intercept 4.448 3.889, 5.007 0.000 100.433 89.405, 111.462 0.000
Time 0.018 -0.048, 0.084 0.590 0.530 -0.778, 1.837 0.429
Post-statement 1 (level) -0.073 -0.81, 0.663 0.846 2.000 -12.497, 16.497 0.787
Time after statement 1 (slope) -0.008 -0.077, 0.061 0.822 -0.451 -1.832, 0.93 0.524
Post-statement 2 (level) 0.136 -0.443, 0.716 0.646 3.614 -7.956, 15.184 0.542
Time after statement 2 (slope) -0.024 -0.058, 0.011 0.178 -0.472 -1.163, 0.219 0.184
Post-statement 3 (level) 0.345 -0.25, 0.94 0.259 3.824 -7.779, 15.427 0.520
Time after statement 3 (slope) 0.005 -0.028, 0.037 0.773 0.233 -0.417, 0.882 0.484
Post-statement 4 (level) 0.029 -0.663, 0.722 0.934 -0.133 -13.824, 13.557 0.985
Time after statement 4 (slope) 0.011 -0.067, 0.09 0.777 0.898 -0.626, 2.421 0.251
Weekend days 1.663 1.388, 1.938 0.000 2.390 -2.665, 7.446 0.356
Pay days 0.926 0.628, 1.224 0.000 18.115 12.714, 23.516 0.000
Melbourne Cup 3.553 2.108, 4.997 0.000 -10.464 -36.179, 15.252 0.427

Reproducibility of this analysis

Package stability

To ensure that every time this script runs it will use the same versions of the packages used during the script development, all packages are loaded using the groundhog.library() function from the groundhog package.

Rendering & publishing

This document is rendered in HTML format and published online using Quarto Pub by using the following command in the terminal:

  • quarto publish quarto-pub activity_statement_anlyses_main.qmd

Session details

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.1 (2024-06-14 ucrt)
 os       Windows 11 x64 (build 26100)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  English_Australia.utf8
 ctype    English_Australia.utf8
 tz       Australia/Sydney
 date     2025-02-06
 pandoc   3.2 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package          * version  date (UTC) lib source
 bestNormalize    * 1.9.1    2023-08-18 [1] CRAN (R 4.4.1)
 brant            * 0.3-0    2020-09-22 [1] CRAN (R 4.4.1)
 data.table       * 1.15.4   2024-03-30 [1] CRAN (R 4.4.0)
 DescTools        * 0.99.54  2024-02-03 [1] CRAN (R 4.4.0)
 dplyr            * 1.1.4    2023-11-17 [1] CRAN (R 4.4.1)
 e1071            * 1.7-14   2023-12-06 [1] CRAN (R 4.4.0)
 emmeans          * 1.10.2   2024-05-20 [1] CRAN (R 4.4.0)
 forcats          * 1.0.0    2023-01-29 [1] CRAN (R 4.4.1)
 formattable      * 0.2.1    2021-01-07 [1] CRAN (R 4.4.1)
 ggplot2          * 3.5.1    2024-04-23 [1] CRAN (R 4.4.1)
 ggrain           * 0.0.4    2024-01-23 [1] CRAN (R 4.4.1)
 ggstats          * 0.6.0    2024-04-05 [1] CRAN (R 4.4.0)
 ggstatsplot      * 0.12.3   2024-04-06 [1] CRAN (R 4.4.0)
 ggstream         * 0.1.0    2021-05-06 [1] CRAN (R 4.4.1)
 groundhog        * 3.2.1    2024-09-29 [2] CRAN (R 4.4.1)
 gt               * 0.10.1   2024-01-17 [1] CRAN (R 4.4.0)
 gtExtras         * 0.5.0    2023-09-15 [1] CRAN (R 4.4.1)
 gtsummary        * 1.7.2    2023-07-15 [1] CRAN (R 4.4.0)
 Hmisc            * 5.1-3    2024-05-28 [1] CRAN (R 4.4.1)
 htmlwidgets      * 1.6.4    2023-12-06 [1] CRAN (R 4.4.1)
 kableExtra       * 1.4.0    2024-01-24 [1] CRAN (R 4.4.1)
 likert           * 1.3.5    2016-12-31 [1] CRAN (R 4.4.1)
 lmtest           * 0.9-40   2022-03-21 [1] CRAN (R 4.4.1)
 lubridate        * 1.9.3    2023-09-27 [1] CRAN (R 4.4.1)
 MASS             * 7.3-60.2 2024-05-14 [1] local
 Matrix           * 1.7-0    2024-03-22 [1] CRAN (R 4.4.1)
 NatParksPalettes * 0.2.0    2022-10-09 [1] CRAN (R 4.4.1)
 networkD3        * 0.4      2017-03-18 [1] CRAN (R 4.4.1)
 nlme             * 3.1-164  2023-11-27 [1] CRAN (R 4.4.0)
 patchwork        * 1.2.0    2024-01-08 [1] CRAN (R 4.4.0)
 performance      * 0.11.0   2024-03-22 [1] CRAN (R 4.4.0)
 plotly           * 4.10.4   2024-01-13 [1] CRAN (R 4.4.1)
 pscl             * 1.5.9    2024-01-31 [1] CRAN (R 4.4.1)
 purrr            * 1.0.2    2023-08-10 [1] CRAN (R 4.4.1)
 qs               * 0.26.3   2024-05-16 [1] CRAN (R 4.4.0)
 rcartocolor      * 2.1.1    2023-05-13 [1] CRAN (R 4.4.1)
 readr            * 2.1.5    2024-01-10 [1] CRAN (R 4.4.1)
 readxl           * 1.4.3    2023-07-06 [1] CRAN (R 4.4.1)
 rms              * 6.8-1    2024-05-27 [1] CRAN (R 4.4.0)
 rstantools       * 2.4.0    2024-01-31 [1] CRAN (R 4.4.1)
 scales           * 1.3.0    2023-11-28 [1] CRAN (R 4.4.1)
 see              * 0.8.4    2024-04-29 [1] CRAN (R 4.4.0)
 segmented        * 2.1-0    2024-05-14 [1] CRAN (R 4.4.0)
 sessioninfo      * 1.2.2    2021-12-06 [1] CRAN (R 4.4.1)
 showtext         * 0.9-7    2024-03-02 [1] CRAN (R 4.4.1)
 showtextdb       * 3.0      2020-06-04 [1] CRAN (R 4.4.1)
 stringr          * 1.5.1    2023-11-14 [1] CRAN (R 4.4.1)
 survey           * 4.4-2    2024-03-20 [1] CRAN (R 4.4.1)
 survival         * 3.6-4    2024-04-24 [1] CRAN (R 4.4.0)
 sysfonts         * 0.8.9    2024-03-02 [1] CRAN (R 4.4.1)
 tibble           * 3.2.1    2023-03-20 [1] CRAN (R 4.4.1)
 tidyr            * 1.3.1    2024-01-24 [1] CRAN (R 4.4.1)
 tidyverse        * 2.0.0    2023-02-22 [1] CRAN (R 4.4.1)
 viridis          * 0.6.5    2024-01-29 [1] CRAN (R 4.4.1)
 viridisLite      * 0.4.2    2023-05-02 [1] CRAN (R 4.4.1)
 wCorr            * 1.9.8    2023-08-19 [1] CRAN (R 4.4.1)
 weights          * 1.0.4    2021-06-10 [1] CRAN (R 4.4.1)
 xtable           * 1.8-4    2019-04-21 [1] CRAN (R 4.4.1)
 zoo              * 1.8-12   2023-04-13 [1] CRAN (R 4.4.1)

 [1] C:/Users/rhei4496/AppData/Local/R/win-library/4.4
 [2] C:/Users/rhei4496/AppData/Local/Programs/R/R-4.4.1/library

──────────────────────────────────────────────────────────────────────────────
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default


locale:
[1] LC_COLLATE=English_Australia.utf8  LC_CTYPE=English_Australia.utf8   
[3] LC_MONETARY=English_Australia.utf8 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.utf8    

time zone: Australia/Sydney
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] sessioninfo_1.2.2      qs_0.26.3              data.table_1.15.4     
 [4] showtext_0.9-7         showtextdb_3.0         sysfonts_0.8.9        
 [7] NatParksPalettes_0.2.0 rcartocolor_2.1.1      patchwork_1.2.0       
[10] scales_1.3.0           viridis_0.6.5          viridisLite_0.4.2     
[13] ggstats_0.6.0          likert_1.3.5           xtable_1.8-4          
[16] networkD3_0.4          ggrain_0.0.4           ggstream_0.1.0        
[19] rstantools_2.4.0       ggstatsplot_0.12.3     htmlwidgets_1.6.4     
[22] plotly_4.10.4          gtsummary_1.7.2        gtExtras_0.5.0        
[25] gt_0.10.1              formattable_0.2.1      kableExtra_1.4.0      
[28] DescTools_0.99.54      emmeans_1.10.2         rms_6.8-1             
[31] pscl_1.5.9             brant_0.3-0            bestNormalize_1.9.1   
[34] e1071_1.7-14           see_0.8.4              performance_0.11.0    
[37] lmtest_0.9-40          zoo_1.8-12             wCorr_1.9.8           
[40] segmented_2.1-0        nlme_3.1-164           MASS_7.3-60.2         
[43] weights_1.0.4          Hmisc_5.1-3            survey_4.4-2          
[46] survival_3.6-4         Matrix_1.7-0           readxl_1.4.3          
[49] lubridate_1.9.3        forcats_1.0.0          stringr_1.5.1         
[52] dplyr_1.1.4            purrr_1.0.2            readr_2.1.5           
[55] tidyr_1.3.1            tibble_3.2.1           ggplot2_3.5.1         
[58] tidyverse_2.0.0        groundhog_3.2.1       

loaded via a namespace (and not attached):
  [1] fontawesome_0.5.2      gghalves_0.1.4         httr_1.4.7            
  [4] insight_0.19.11        doParallel_1.0.17      tools_4.4.1           
  [7] doRNG_1.8.6            backports_1.5.0        utf8_1.2.4            
 [10] R6_2.5.1               statsExpressions_1.5.4 mgcv_1.9-1            
 [13] lazyeval_0.2.2         jomo_2.7-6             withr_3.0.0           
 [16] gridExtra_2.3          textshaping_0.4.0      quantreg_5.98         
 [19] cli_3.6.2              sandwich_3.1-0         labeling_0.4.3        
 [22] sass_0.4.9             mvtnorm_1.2-5          polspline_1.1.25      
 [25] proxy_0.4-27           commonmark_1.9.1       systemfonts_1.1.0     
 [28] foreign_0.8-86         svglite_2.1.3          DHARMa_0.4.6          
 [31] parallelly_1.37.1      labelled_2.13.0        rstudioapi_0.16.0     
 [34] RApiSerialize_0.1.3    generics_0.1.3         shape_1.4.6.1         
 [37] gtools_3.9.5           car_3.1-2              fansi_1.0.6           
 [40] abind_1.4-5            lifecycle_1.0.4        multcomp_1.4-25       
 [43] yaml_2.3.8             carData_3.0-5          recipes_1.0.10        
 [46] paletteer_1.6.0        gdata_3.0.0            mitml_0.4-5           
 [49] butcher_0.3.4          lattice_0.22-6         haven_2.5.4           
 [52] zeallot_0.1.0          pillar_1.9.0           knitr_1.46            
 [55] boot_1.3-30            gld_2.6.6              estimability_1.5.1    
 [58] future.apply_1.11.2    codetools_0.2-20       pan_1.9               
 [61] glue_1.7.0             broom.helpers_1.15.0   vctrs_0.6.5           
 [64] cellranger_1.1.0       gtable_0.3.5           rematch2_2.1.2        
 [67] datawizard_0.10.0      ggpp_0.5.7             gower_1.0.1           
 [70] xfun_0.44              prodlim_2023.08.28     correlation_0.8.4     
 [73] coda_0.19-4.1          timeDate_4032.109      iterators_1.0.14      
 [76] hardhat_1.3.1          lava_1.8.0             TH.data_1.1-2         
 [79] ipred_0.9-14           rpart_4.1.23           colorspace_2.1-0      
 [82] DBI_1.2.2              nnet_7.3-19            Exact_3.2             
 [85] mnormt_2.1.1           tidyselect_1.2.1       curl_5.2.1            
 [88] compiler_4.4.1         glmnet_4.1-8           htmlTable_2.4.2       
 [91] mice_3.16.0            SparseM_1.82           expm_0.999-9          
 [94] xml2_1.3.6             stringfish_0.16.0      bayestestR_0.13.2     
 [97] checkmate_2.3.1        psych_2.4.3            digest_0.6.35         
[100] minqa_1.2.7            rmarkdown_2.27         htmltools_0.5.8.1     
[103] pkgconfig_2.0.3        base64enc_0.1-3        lme4_1.1-35.3         
[106] fastmap_1.2.0          rlang_1.1.3            farver_2.1.2          
[109] jsonlite_1.8.8         magrittr_2.0.3         polynom_1.4-1         
[112] Formula_1.2-5          parameters_0.21.7      munsell_0.5.1         
[115] Rcpp_1.0.12            stringi_1.8.4          rootSolve_1.8.2.4     
[118] plyr_1.8.9             ggrepel_0.9.5          parallel_4.4.1        
[121] listenv_0.9.1          lmom_3.0               splines_4.4.1         
[124] hms_1.1.3              igraph_2.0.3           markdown_1.12         
[127] rngtools_1.5.2         effectsize_0.8.8       reshape2_1.4.4        
[130] evaluate_0.23          mitools_2.4            RcppParallel_5.1.7    
[133] nloptr_2.0.3           tzdb_0.4.0             foreach_1.5.2         
[136] MatrixModels_0.5-3     future_1.33.2          broom_1.0.6           
[139] ragg_1.3.2             class_7.3-22           cluster_2.1.6         
[142] timechange_0.3.0       globals_0.16.3        

END