This tutorial illustrates the problem of multiple hypothesis testing using experimental data from Karlan and List (2007) on donation behavior using R.
Download the data from this
link and save it as data.csv
in the same directory as
this R Markdown file.
data <- read_csv("data.csv")
data <- data %>%
mutate(
amountmat = amount * (1 + ratio),
groupid = case_when(
redcty == 1 & red0 == 1 ~ 1,
redcty == 0 & red0 == 1 ~ 2,
redcty == 0 & red0 == 0 ~ 3,
redcty == 1 & red0 == 0 ~ 4,
TRUE ~ NA_real_
)
)
Hypothesis testing with multiple outcomes. We consider four outcome variables and we are interested in testing whether the average treatment effect is zero for each one of those outcomes.
outcomes <- list(
"Response rate" = "gave",
"Dollars given w/o match" = "amount",
"Dollars given + match" = "amountmat",
"Amount change" = "amountchange"
)
# Run regressions and collect results
results <- map_dfr(names(outcomes), function(label) {
outcome_var <- outcomes[[label]]
model <- lm(as.formula(paste0(outcome_var, " ~ treatment")), data = data)
tidy(model) %>%
filter(term == "treatment") %>%
mutate(
Outcome = label,
DI = estimate,
`Unadj. p` = p.value
) %>%
select(Outcome, DI, `Unadj. p`)
})
# First: add multiple testing adjustments
results <- results %>%
mutate(
`Bonf.` = p.adjust(`Unadj. p`, method = "bonferroni"),
`Holm` = p.adjust(`Unadj. p`, method = "holm")
)
# Add and format stars for all p-values
results <- results %>%
mutate(
stars_unadj = case_when(
as.numeric(`Unadj. p`) < 0.01 ~ "***",
as.numeric(`Unadj. p`) < 0.10 ~ "*",
TRUE ~ ""
),
stars_bonf = case_when(
as.numeric(`Bonf.`) < 0.01 ~ "***",
as.numeric(`Bonf.`) < 0.10 ~ "*",
TRUE ~ ""
),
stars_holm = case_when(
as.numeric(`Holm`) < 0.01 ~ "***",
as.numeric(`Holm`) < 0.10 ~ "*",
TRUE ~ ""
),
`Unadj. p` = paste0(formatC(as.numeric(`Unadj. p`), format = "f", digits = 4), stars_unadj),
`Bonf.` = paste0(formatC(as.numeric(`Bonf.`), format = "f", digits = 4), stars_bonf),
`Holm` = paste0(formatC(as.numeric(`Holm`), format = "f", digits = 4), stars_holm),
DI = round(DI, 4)
) %>%
select(Outcome, DI, `Unadj. p`, `Bonf.`, `Holm`)
# View Table 1
results
## # A tibble: 4 × 5
## Outcome DI `Unadj. p` Bonf. Holm
## <chr> <dbl> <chr> <chr> <chr>
## 1 Response rate 0.0042 0.0019*** 0.0077*** 0.0058***
## 2 Dollars given w/o match 0.154 0.0628* 0.2513 0.1256
## 3 Dollars given + match 2.09 0.0000*** 0.0000*** 0.0000***
## 4 Amount change 6.33 0.5982 1.0000 0.5982
Hypothesis testing with multiple subgroups: We consider four subgroups: red county in a red state, blue county in a red state, red county in a blue state, and blue county in a blue state. We focus on the outcome response rate.
# Significance stars
add_stars <- function(p) {
case_when(
p < 0.01 ~ "***",
p < 0.10 ~ "*",
TRUE ~ ""
)
}
# Define readable subgroup names corresponding to groupid 1–4
subgroups <- list(
"Red county, red state" = 1,
"Blue county, red state" = 2,
"Blue county, blue state" = 4,
"Red county, blue state" = 3
)
# Focus on one outcome: response rate (gave)
outcome_var <- "gave"
outcome_label <- "Response rate"
# Loop over subgroups and run regression
results_subgroups <- map_dfr(names(subgroups), function(subgroup_name) {
groupid <- subgroups[[subgroup_name]]
model <- lm(as.formula(paste0(outcome_var, " ~ treatment")), data = data[data$groupid == groupid, ])
tidy(model) %>%
filter(term == "treatment") %>%
mutate(
Outcome = outcome_label,
Subgroup = subgroup_name,
DI = estimate,
p_raw = p.value
) %>%
select(Outcome, Subgroup, DI, p_raw)
})
# Multiple testing corrections
results_subgroups <- results_subgroups %>%
mutate(
p_bonf = p.adjust(p_raw, method = "bonferroni"),
p_holm = p.adjust(p_raw, method = "holm")
)
# Format and append stars
results_subgroups <- results_subgroups %>%
mutate(
DI = round(DI, 4),
`Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
`Bonf.` = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
`Holm` = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
) %>%
select(Outcome, Subgroup, DI, `Unadj. p`, `Bonf.`, `Holm`)
# View result
results_subgroups
## # A tibble: 4 × 6
## Outcome Subgroup DI `Unadj. p` Bonf. Holm
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Response rate Red county, red state 0.0095 0.0003*** 0.0010*** 0.0010***
## 2 Response rate Blue county, red state 0.007 0.0525* 0.2102 0.1576
## 3 Response rate Blue county, blue state 0 0.9935 1.0000 0.9935
## 4 Response rate Red county, blue state 0.0016 0.4685 1.0000 0.9370
Hypothesis testing with multiple treatments: We consider the three treatments for match ratio: 1:1, 2:1, and 3:1. We focus on the outcome dollars given not including match. Here we compare each treatment group to the control.
# Define the ratio groups of interest (assuming 3 levels of ratio)
treatment_ratios <- sort(unique(data$ratio[data$treatment == 1]))
# Run regression for each treatment group vs control
results_ratio <- map_dfr(treatment_ratios, function(r) {
# Subset to control + one treatment arm
subset_data <- data %>%
filter((treatment == 0) | (ratio == r))
# Define new treatment dummy: 1 if in treatment with ratio == r, 0 if control
subset_data <- subset_data %>%
mutate(ratiodummy = as.integer(treatment == 1 & ratio == r))
# Run regression
model <- lm(amount ~ ratiodummy, data = subset_data)
# Extract treatment effect
tidy(model) %>%
filter(term == "ratiodummy") %>%
mutate(
Outcome = paste0("Treatment ratio = ", r),
DI = estimate,
p_raw = p.value
) %>%
select(Outcome, DI, p_raw)
})
# Adjust for multiple testing
results_ratio <- results_ratio %>%
mutate(
p_bonf = p.adjust(p_raw, method = "bonferroni"),
p_holm = p.adjust(p_raw, method = "holm")
)
# Format and add stars
results_ratio <- results_ratio %>%
mutate(
DI = round(DI, 4),
`Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
`Bonf.` = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
`Holm` = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
) %>%
select(Outcome, DI, `Unadj. p`, `Bonf.`, `Holm`)
# View result
results_ratio
## # A tibble: 3 × 5
## Outcome DI `Unadj. p` Bonf. Holm
## <chr> <dbl> <chr> <chr> <chr>
## 1 Treatment ratio = 1 0.123 0.2443 0.7329 0.4247
## 2 Treatment ratio = 2 0.213 0.0448* 0.1345 0.1345
## 3 Treatment ratio = 3 0.124 0.2123 0.6370 0.4247
Hypothesis testing with multiple outcomes, subgroups, treatments: We consider four outcome variables: response rate, dollars given not including match, dollars given including match, and amount change. We also consider four subgroups: red county in a red state, blue county in a red state, red county in a blue state, and blue county in a blue state. Lastly, we compare the control to the three treatments for matching ratio: 1:1, 2:1, and 3:1.
treatment_ratios <- c(1, 2, 3) # Ratio levels
# Significance stars function
add_stars <- function(p) {
case_when(
p < 0.01 ~ "***",
p < 0.10 ~ "*",
TRUE ~ ""
)
}
# Loop through all outcome-subgroup-ratio combinations
results_all <- crossing(
outcome_name = names(outcomes),
subgroup_name = names(subgroups),
ratio = treatment_ratios
) %>%
mutate(
outcome_var = outcomes[outcome_name],
groupid = subgroups[subgroup_name]
) %>%
pmap_dfr(function(outcome_name, subgroup_name, ratio, outcome_var, groupid) {
df <- data %>%
filter(groupid == !!groupid & (treatment == 0 | (treatment == 1 & ratio == !!ratio))) %>%
mutate(treat_dummy = as.integer(treatment == 1 & ratio == !!ratio))
model <- lm(as.formula(paste0(outcome_var, " ~ treat_dummy")), data = df)
tidy(model) %>%
filter(term == "treat_dummy") %>%
mutate(
Outcome = outcome_name,
Subgroup = subgroup_name,
Ratio = paste0(ratio, ":1"),
DI = estimate,
p_raw = p.value
) %>%
select(Outcome, Subgroup, Ratio, DI, p_raw)
})
# Adjust p-values
results_all <- results_all %>%
mutate(
p_bonf = p.adjust(p_raw, method = "bonferroni"),
p_holm = p.adjust(p_raw, method = "holm")
)
# Format + stars
results_all <- results_all %>%
mutate(
DI = round(DI, 4),
`Unadj. p` = paste0(formatC(p_raw, format = "f", digits = 4), add_stars(p_raw)),
`Bonf.` = paste0(formatC(p_bonf, format = "f", digits = 4), add_stars(p_bonf)),
`Holm` = paste0(formatC(p_holm, format = "f", digits = 4), add_stars(p_holm))
) %>%
select(Outcome, Subgroup, Ratio, DI, `Unadj. p`, `Bonf.`, `Holm`)
# Preview
print(results_all, n = 10)
## # A tibble: 48 × 7
## Outcome Subgroup Ratio DI `Unadj. p` Bonf. Holm
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Amount change Blue county, blue state 1:1 0.929 0.4541 1.0000 1.0000
## 2 Amount change Blue county, blue state 2:1 -0.294 0.8225 1.0000 1.0000
## 3 Amount change Blue county, blue state 3:1 0.515 0.6755 1.0000 1.0000
## 4 Amount change Blue county, red state 1:1 92.3 0.4137 1.0000 1.0000
## 5 Amount change Blue county, red state 2:1 93.7 0.4080 1.0000 1.0000
## 6 Amount change Blue county, red state 3:1 94.3 0.3993 1.0000 1.0000
## 7 Amount change Red county, blue state 1:1 -52.0 0.2037 1.0000 1.0000
## 8 Amount change Red county, blue state 2:1 -0.445 0.6662 1.0000 1.0000
## 9 Amount change Red county, blue state 3:1 1.14 0.2544 1.0000 1.0000
## 10 Amount change Red county, red state 1:1 1.83 0.1365 1.0000 1.0000
## # ℹ 38 more rows
results_all <- crossing(
outcome_name = names(outcomes),
subgroup_name = names(subgroups),
ratio = c(1, 2, 3)
) %>%
mutate(
outcome_var = outcomes[outcome_name],
groupid = subgroups[subgroup_name]
) %>%
pmap_dfr(function(outcome_name, subgroup_name, ratio, outcome_var, groupid) {
df <- data %>%
filter(groupid == !!groupid & (treatment == 0 | (treatment == 1 & ratio == !!ratio))) %>%
mutate(treat_dummy = as.integer(treatment == 1 & ratio == !!ratio))
model <- lm(as.formula(paste0(outcome_var, " ~ treat_dummy")), data = df)
tidy(model, conf.int = TRUE) %>%
filter(term == "treat_dummy") %>%
mutate(
Outcome = outcome_name,
Subgroup = subgroup_name,
Ratio = paste0(ratio, ":1"),
DI = estimate,
SE = std.error,
p_raw = p.value,
conf.low = conf.low,
conf.high = conf.high
)
}) %>%
mutate(
label = paste0(Outcome, " | ", Subgroup, " | ", Ratio),
significant = p_raw < 0.05,
holm_significant = p.adjust(p_raw, "holm") < 0.05,
label_display = ifelse(
holm_significant,
paste0("<span style='color:blue;'>", label, "</span>"),
label
)
)
# Plot
plot_data <- results_all %>% filter(Outcome != "Amount change")
ggplot(plot_data, aes(x = reorder(label_display, DI), y = DI, ymin = conf.low, ymax = conf.high, color = significant)) +
geom_pointrange() +
geom_hline(yintercept = 0, linetype = "dashed") +
coord_flip() +
scale_color_manual(values = c("black", "red")) +
labs(
title = "Estimated Treatment Effects",
subtitle = "Labels in <span style='color:blue;'>blue</span> are Holm-significant (p < 0.05)",
x = "", y = "Estimated DI"
) +
theme_minimal() +
theme(
axis.text.y = ggtext::element_markdown(size = 7),
legend.position = "bottom",
plot.subtitle = element_markdown()
)