Introduction to Multiple Testing

1 1. Introduction

This tutorial illustrates the problem of multiple hypothesis testing using experimental data from Karlan and List (2007) on donation behavior using R.

2 2. Load and Prepare Data

Download the data from this link and save it as data.csv in the same directory as this R Markdown file.

data <- read_csv("data.csv")

data <- data %>%
  mutate(
    amountmat = amount * (1 + ratio),
    groupid = case_when(
      redcty == 1 & red0 == 1 ~ 1,
      redcty == 0 & red0 == 1 ~ 2,
      redcty == 0 & red0 == 0 ~ 3,
      redcty == 1 & red0 == 0 ~ 4,
      TRUE ~ NA_real_
    )
  )

3 3. Multiple Outcomes

Hypothesis testing with multiple outcomes. We consider four outcome variables and we are interested in testing whether the average treatment effect is zero for each one of those outcomes.

outcomes <- list(
  "Response rate" = "gave",
  "Dollars given w/o match" = "amount",
  "Dollars given + match" = "amountmat",
  "Amount change" = "amountchange"
)

# Run regressions and collect results
results <- map_dfr(names(outcomes), function(label) {
  outcome_var <- outcomes[[label]]
  model <- lm(as.formula(paste0(outcome_var, " ~ treatment")), data = data)
  
  tidy(model) %>%
    filter(term == "treatment") %>%
    mutate(
      Outcome = label,
      DI = estimate,
      `Unadj. p` = p.value
    ) %>%
    select(Outcome, DI, `Unadj. p`)
})

# First: add multiple testing adjustments
results <- results %>%
  mutate(
    `Bonf.` = p.adjust(`Unadj. p`, method = "bonferroni"),
    `Holm` = p.adjust(`Unadj. p`, method = "holm")
  )


# Add and format stars for all p-values
results <- results %>%
  mutate(
    stars_unadj = case_when(
      as.numeric(`Unadj. p`) < 0.01 ~ "***",
      as.numeric(`Unadj. p`) < 0.10 ~ "*",
      TRUE ~ ""
    ),
    stars_bonf = case_when(
      as.numeric(`Bonf.`) < 0.01 ~ "***",
      as.numeric(`Bonf.`) < 0.10 ~ "*",
      TRUE ~ ""
    ),
    stars_holm = case_when(
      as.numeric(`Holm`) < 0.01 ~ "***",
      as.numeric(`Holm`) < 0.10 ~ "*",
      TRUE ~ ""
    ),
    `Unadj. p` = paste0(formatC(as.numeric(`Unadj. p`), format = "f", digits = 4), stars_unadj),
    `Bonf.`     = paste0(formatC(as.numeric(`Bonf.`), format = "f", digits = 4), stars_bonf),
    `Holm`      = paste0(formatC(as.numeric(`Holm`), format = "f", digits = 4), stars_holm),
    DI = round(DI, 4)
  ) %>%
  select(Outcome, DI, `Unadj. p`, `Bonf.`, `Holm`)


# View Table 1
results

## # A tibble: 4 × 5
##   Outcome                     DI `Unadj. p` Bonf.     Holm     
##   <chr>                    <dbl> <chr>      <chr>     <chr>    
## 1 Response rate           0.0042 0.0019***  0.0077*** 0.0058***
## 2 Dollars given w/o match 0.154  0.0628*    0.2513    0.1256   
## 3 Dollars given + match   2.09   0.0000***  0.0000*** 0.0000***
## 4 Amount change           6.33   0.5982     1.0000    0.5982

4 4. Subgroup effects

Hypothesis testing with multiple subgroups: We consider four subgroups: red county in a red state, blue county in a red state, red county in a blue state, and blue county in a blue state. We focus on the outcome response rate.

# Significance stars
add_stars <- function(p) {
  case_when(
    p < 0.01 ~ "***",
    p < 0.10 ~ "*",
    TRUE ~ ""
  )
}

# Define readable subgroup names corresponding to groupid 1–4
subgroups <- list(
  "Red county, red state"   = 1,
  "Blue county, red state"  = 2,
  "Blue county, blue state" = 4,
  "Red county, blue state"  = 3
)

# Focus on one outcome: response rate (gave)
outcome_var <- "gave"
outcome_label <- "Response rate"

# Loop over subgroups and run regression
results_subgroups <- map_dfr(names(subgroups), function(subgroup_name) {
  groupid <- subgroups[[subgroup_name]]
  
  model <- lm(as.formula(paste0(outcome_var, " ~ treatment")), data = data[data$groupid == groupid, ])
  
  tidy(model) %>%
    filter(term == "treatment") %>%
    mutate(
      Outcome = outcome_label,
      Subgroup = subgroup_name,
      DI = estimate,
      p_raw = p.value
    ) %>%
    select(Outcome, Subgroup, DI, p_raw)
})

# Multiple testing corrections
results_subgroups <- results_subgroups %>%
  mutate(
    p_bonf = p.adjust(p_raw, method = "bonferroni"),
    p_holm = p.adjust(p_raw, method = "holm")
  )

# Format and append stars
results_subgroups <- results_subgroups %>%
  mutate(
    DI = round(DI, 4),
    `Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
    `Bonf.` = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
    `Holm` = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
  ) %>%
  select(Outcome, Subgroup, DI, `Unadj. p`, `Bonf.`, `Holm`)

# View result
results_subgroups

## # A tibble: 4 × 6
##   Outcome       Subgroup                    DI `Unadj. p` Bonf.     Holm     
##   <chr>         <chr>                    <dbl> <chr>      <chr>     <chr>    
## 1 Response rate Red county, red state   0.0095 0.0003***  0.0010*** 0.0010***
## 2 Response rate Blue county, red state  0.007  0.0525*    0.2102    0.1576   
## 3 Response rate Blue county, blue state 0      0.9935     1.0000    0.9935   
## 4 Response rate Red county, blue state  0.0016 0.4685     1.0000    0.9370

5 5. Treatment Comparisons

Hypothesis testing with multiple treatments: We consider the three treatments for match ratio: 1:1, 2:1, and 3:1. We focus on the outcome dollars given not including match. Here we compare each treatment group to the control.

# Define the ratio groups of interest (assuming 3 levels of ratio)
treatment_ratios <- sort(unique(data$ratio[data$treatment == 1]))

# Run regression for each treatment group vs control
results_ratio <- map_dfr(treatment_ratios, function(r) {
  
  # Subset to control + one treatment arm
  subset_data <- data %>%
    filter((treatment == 0) | (ratio == r))
  
  # Define new treatment dummy: 1 if in treatment with ratio == r, 0 if control
  subset_data <- subset_data %>%
    mutate(ratiodummy = as.integer(treatment == 1 & ratio == r))
  
  # Run regression
  model <- lm(amount ~ ratiodummy, data = subset_data)
  
  # Extract treatment effect
  tidy(model) %>%
    filter(term == "ratiodummy") %>%
    mutate(
      Outcome = paste0("Treatment ratio = ", r),
      DI = estimate,
      p_raw = p.value
    ) %>%
    select(Outcome, DI, p_raw)
})

# Adjust for multiple testing
results_ratio <- results_ratio %>%
  mutate(
    p_bonf = p.adjust(p_raw, method = "bonferroni"),
    p_holm = p.adjust(p_raw, method = "holm")
  )

# Format and add stars
results_ratio <- results_ratio %>%
  mutate(
    DI = round(DI, 4),
    `Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
    `Bonf.`     = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
    `Holm`      = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
  ) %>%
  select(Outcome, DI, `Unadj. p`, `Bonf.`, `Holm`)

# View result
results_ratio

## # A tibble: 3 × 5
##   Outcome                DI `Unadj. p` Bonf.  Holm  
##   <chr>               <dbl> <chr>      <chr>  <chr> 
## 1 Treatment ratio = 1 0.123 0.2443     0.7329 0.4247
## 2 Treatment ratio = 2 0.213 0.0448*    0.1345 0.1345
## 3 Treatment ratio = 3 0.124 0.2123     0.6370 0.4247

6 6. Full Design

Hypothesis testing with multiple outcomes, subgroups, treatments: We consider four outcome variables: response rate, dollars given not including match, dollars given including match, and amount change. We also consider four subgroups: red county in a red state, blue county in a red state, red county in a blue state, and blue county in a blue state. Lastly, we compare the control to the three treatments for matching ratio: 1:1, 2:1, and 3:1.

treatment_ratios <- c(1, 2, 3)  # Ratio levels

# Significance stars function
add_stars <- function(p) {
  case_when(
    p < 0.01 ~ "***",
    p < 0.10 ~ "*",
    TRUE ~ ""
  )
}

# Loop through all outcome-subgroup-ratio combinations
results_all <- crossing(
  outcome_name = names(outcomes),
  subgroup_name = names(subgroups),
  ratio = treatment_ratios
) %>%
  mutate(
    outcome_var = outcomes[outcome_name],
    groupid = subgroups[subgroup_name]
  ) %>%
  pmap_dfr(function(outcome_name, subgroup_name, ratio, outcome_var, groupid) {
    
    df <- data %>%
      filter(groupid == !!groupid & (treatment == 0 | (treatment == 1 & ratio == !!ratio))) %>%
      mutate(treat_dummy = as.integer(treatment == 1 & ratio == !!ratio))
    
    model <- lm(as.formula(paste0(outcome_var, " ~ treat_dummy")), data = df)
    
    tidy(model) %>%
      filter(term == "treat_dummy") %>%
      mutate(
        Outcome = outcome_name,
        Subgroup = subgroup_name,
        Ratio = paste0(ratio, ":1"),
        DI = estimate,
        p_raw = p.value
      ) %>%
      select(Outcome, Subgroup, Ratio, DI, p_raw)
  })

# Adjust p-values
results_all <- results_all %>%
  mutate(
    p_bonf = p.adjust(p_raw, method = "bonferroni"),
    p_holm = p.adjust(p_raw, method = "holm")
  )

# Format + stars
results_all <- results_all %>%
  mutate(
    DI = round(DI, 4),
    `Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
    `Bonf.` = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
    `Holm`  = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
  ) %>%
  select(Outcome, Subgroup, Ratio, DI, `Unadj. p`, `Bonf.`, `Holm`)

# Preview
print(results_all, n = 10)

## # A tibble: 48 × 7
##    Outcome       Subgroup                Ratio      DI `Unadj. p` Bonf.  Holm  
##    <chr>         <chr>                   <chr>   <dbl> <chr>      <chr>  <chr> 
##  1 Amount change Blue county, blue state 1:1     0.929 0.4541     1.0000 1.0000
##  2 Amount change Blue county, blue state 2:1    -0.294 0.8225     1.0000 1.0000
##  3 Amount change Blue county, blue state 3:1     0.515 0.6755     1.0000 1.0000
##  4 Amount change Blue county, red state  1:1    92.3   0.4137     1.0000 1.0000
##  5 Amount change Blue county, red state  2:1    93.7   0.4080     1.0000 1.0000
##  6 Amount change Blue county, red state  3:1    94.3   0.3993     1.0000 1.0000
##  7 Amount change Red county, blue state  1:1   -52.0   0.2037     1.0000 1.0000
##  8 Amount change Red county, blue state  2:1    -0.445 0.6662     1.0000 1.0000
##  9 Amount change Red county, blue state  3:1     1.14  0.2544     1.0000 1.0000
## 10 Amount change Red county, red state   1:1     1.83  0.1365     1.0000 1.0000
## # ℹ 38 more rows

7 6. Plots

results_all <- crossing(
  outcome_name = names(outcomes),
  subgroup_name = names(subgroups),
  ratio = c(1, 2, 3)
) %>%
  mutate(
    outcome_var = outcomes[outcome_name],
    groupid = subgroups[subgroup_name]
  ) %>%
  pmap_dfr(function(outcome_name, subgroup_name, ratio, outcome_var, groupid) {
    df <- data %>%
      filter(groupid == !!groupid & (treatment == 0 | (treatment == 1 & ratio == !!ratio))) %>%
      mutate(treat_dummy = as.integer(treatment == 1 & ratio == !!ratio))
    model <- lm(as.formula(paste0(outcome_var, " ~ treat_dummy")), data = df)
    tidy(model, conf.int = TRUE) %>%
      filter(term == "treat_dummy") %>%
      mutate(
        Outcome = outcome_name,
        Subgroup = subgroup_name,
        Ratio = paste0(ratio, ":1"),
        DI = estimate,
        SE = std.error,
        p_raw = p.value,
        conf.low = conf.low,
        conf.high = conf.high
      )
  }) %>%
  mutate(
    label = paste0(Outcome, " | ", Subgroup, " | ", Ratio),
    significant = p_raw < 0.05,
    holm_significant = p.adjust(p_raw, "holm") < 0.05,
    label_display = ifelse(
      holm_significant,
      paste0("<span style='color:blue;'>", label, "</span>"),
      label
    )
  )

# Plot
plot_data <- results_all %>% filter(Outcome != "Amount change")

ggplot(plot_data, aes(x = reorder(label_display, DI), y = DI, ymin = conf.low, ymax = conf.high, color = significant)) +
  geom_pointrange() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  scale_color_manual(values = c("black", "red")) +
  labs(
    title = "Estimated Treatment Effects",
    subtitle = "Labels in <span style='color:blue;'>blue</span> are Holm-significant (p < 0.05)",
    x = "", y = "Estimated DI"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = ggtext::element_markdown(size = 7),
    legend.position = "bottom",
    plot.subtitle = element_markdown()
  )