This is the notebook containing code and results of the analysis of the section editing experiment (T218851). This analysis is tracked in ticket T211239 and performed on SWAP.
Key findings:
# https://stackoverflow.com/a/35018739/1091835
# Inject a <script> + button into the notebook output so the reader can toggle
# visibility of all code-input cells. The JS uses the jQuery `$` that the
# classic Jupyter notebook page provides; the auto-hide on load is disabled
# (commented out) so code is visible by default.
library(IRdisplay)
display_html(
'<script>
code_show=true;
function code_toggle() {
if (code_show){
$(\'div.input\').hide();
} else {
$(\'div.input\').show();
}
code_show = !code_show
}
// $( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>'
)
⬆ Use that button to hide the code
# Packages:
library(glue)
library(zeallot)
library(magrittr)
import::from(dplyr, group_by, keep_where = filter, ungroup, summarize,
mutate, rename, select, arrange, n, left_join, distinct,
vars, everything, starts_with)
import::from(tidyr, spread, gather)
library(repr)
library(ggplot2)
library(patchwork)
# Helper functions:
import::from(polloi, compress)
# Replace infinite values (Inf / -Inf) in a vector with NA, leaving all
# other elements (including NA and NaN) untouched.
inf2na <- function(x) {
  replace(x, is.infinite(x), NA)
}
# Replace NaN values in a vector with NA; ordinary NAs and finite/infinite
# numbers pass through unchanged.
nan2na <- function(x) {
  replace(x, is.nan(x), NA)
}
# Replace missing values (NA, and NaN — which is.na() also matches) with 0.
na2zero <- function(x) {
  replace(x, is.na(x), 0)
}
# Evaluate `x` with both messages and warnings suppressed; used below to
# quiet wmf::query_hive()'s logging during the per-day fetch loop.
suppress_messages_warnings <- function(x) {
suppressMessages(suppressWarnings(x))
}
# Render a data frame as an HTML table in the notebook output.
# `...` is forwarded to knitr::kable() (e.g. digits, col.names).
to_html <- function(df, ...) {
  html_table <- knitr::kable(df, format = "html", ...)
  IRdisplay::display_html(as.character(html_table))
}
The data comes from client-side EventLogging which uses the EditAttemptStep schema and we fetch the daily session summaries using the following query:
# Hive query template: per-(wiki, user, session, page) summaries of mobile
# VisualEditor sessions from the EditAttemptStep EventLogging table. The
# ${year}/${month}/${day} and ${exclude_wikis} placeholders are filled per
# day via glue() inside query_data() below.
# NOTE(review): `bucket` is derived here from user_id parity and re-derived
# identically in R inside query_data(); the two must stay in sync.
query <- "USE event;
SELECT wiki,
event.user_id,
IF(event.user_id % 2 = 0, 'control', 'treatment') AS bucket,
event.session_token AS mw_session_token,
event.page_id,
MIN(dt) AS session_dt_start,
MAX(dt) AS session_dt_end,
MAX(event.user_editcount) AS user_edit_count,
MAX(event.init_type) AS init_type,
MAX(event.init_mechanism) AS init_mechanism,
SUM(IF(event.action = 'init', 1, 0)) > 0 AS ve_session_initialized,
SUM(IF(event.action = 'ready', 1, 0)) > 0 AS ve_session_readied,
SUM(IF(event.action = 'loaded', 1, 0)) > 0 AS ve_session_loaded,
SUM(IF(event.action = 'saveSuccess', 1, 0)) > 0 AS ve_session_succeeded,
SUM(IF(event.action = 'abort', 1, 0)) > 0 AS ve_session_aborted
FROM EditAttemptStep
WHERE year = ${year} AND month = ${month} AND day = ${day}
AND event.page_ns = 0 -- main articles only
AND event.user_id > 0 -- only logged-in users
AND event.page_id > 0 -- page creation VE sessions
AND wiki RLIKE 'wiki$'
AND NOT wiki IN('${exclude_wikis}')
AND event.session_token IS NOT NULL
AND event.editor_interface = 'visualeditor'
AND event.platform IN('phone', 'tablet')
GROUP BY wiki,
event.user_id, IF(event.user_id % 2 = 0, 'control', 'treatment'),
event.session_token, event.platform, event.page_id"
# Fetch the daily mobile-VE session summaries for the whole A/B test window:
# runs the Hive query template (global `query`) once per day and row-binds
# the per-day results into one data frame.
query_data <- function() {
# see https://phabricator.wikimedia.org/T211239
start_date <- as.Date("2019-03-29") # not 2019-03-18
end_date <- as.Date("2019-06-10")
test_dates <- seq(start_date, end_date, by = "day")
# Wikis excluded from the analysis: non-content wikis plus the 1st-wave
# wikis, collapsed into the quoted list used inside the SQL IN('...') clause.
exclude_wikis <- c(
'wikidatawiki', 'commonswiki', 'mediawikiwiki', 'metawiki',
'sourceswiki', 'specieswiki', 'outreachwiki', 'testwiki', 'incubatorwiki',
paste0(c('he', 'bn', 'zh_yue'), 'wiki') # 1st wave had 100% rollout (not A/B tested)
) %>% paste0(collapse = "', '")
results <- purrr::map_dfr(test_dates, function(date) {
# message("Fetching mobile VE session data from ", date)
c(year, month, day) %<-% wmf::extract_ymd(date)
# Fill this day's placeholders into the global template; the local `query`
# shadows (does not modify) the global one, so each iteration starts fresh.
query <- glue(query, .open = "${", .close = "}")
result <- suppress_messages_warnings(wmf::query_hive(query))
# Post-process: parse timestamps, turn Hive's "NULL"/"true" strings into
# NA / logicals, and add the R-side bucket and a cross-wiki user key.
result %<>%
mutate(
date = date,
bucket = ifelse(user_id %% 2 == 0, 'control', 'treatment'),
unique_user_id = paste(wiki, user_id)
) %>%
dplyr::mutate_at(vars(starts_with("session_dt_")), lubridate::ymd_hms) %>%
dplyr::mutate_at(vars(starts_with("init_")), ~ ifelse(.x == "NULL", NA, .x)) %>%
dplyr::mutate_at(vars(starts_with("ve_session")), ~ .x == "true") %>%
select(date, wiki, bucket, user_id, mw_session_token, page_id, everything()) %>%
arrange(date, wiki, bucket, user_id, session_dt_start)
# Date-vs-string comparison: the string is coerced to Date here.
if (date < "2019-04-02") {
wikis <- paste0(c('hi', 'ar', 'fa', 'id', 'mr', 'ms', 'ml', 'th', 'az', 'sq'), 'wiki')
result <- result[result$wiki %in% wikis, ] # only keep where test was live as of 28 March 2019
# 2 April 2019 is all remaining wikis
}
return(result)
})
return(results)
}
# Cache the (slow) Hive fetch: query and save on the first run, then reuse
# the saved copy on subsequent runs of the notebook.
if (!file.exists("daily_data.rds")) {
  daily_data <- query_data()
  readr::write_rds(daily_data, "daily_data.rds", compress = "gz")
} else {
  daily_data <- readr::read_rds("daily_data.rds")
}
The per-wiki user counts break down as follows:
# Distinct users per wiki, split by bucket, ranked by total so the table can
# show the top 10 wikis and collapse the rest into one row.
per_wiki_user_counts <- daily_data %>%
group_by(wiki, bucket) %>%
summarize(users = length(unique(unique_user_id))) %>%
ungroup %>%
spread(bucket, users, fill = 0) %>%
mutate(total = control + treatment) %>%
arrange(dplyr::desc(total)) %>%
mutate(rank = 1:dplyr::n(), in_top10 = rank <= 10)
# Collapse every wiki outside the top 10 into a single "<N> more wikis" row:
not_top10 <- per_wiki_user_counts %>%
keep_where(!in_top10) %>%
summarize(
wiki = paste(dplyr::n(), "more wikis"),
control = sum(control),
treatment = sum(treatment),
total = sum(total)
)
# Top-10 wikis plus the collapsed remainder; large counts are compressed
# (e.g. thousands -> "K") by polloi::compress for display:
per_wiki_user_counts %>%
keep_where(in_top10) %>%
dplyr::select(wiki, control, treatment, total) %>%
rbind(not_top10) %>%
dplyr::mutate_if(is.double, polloi::compress) %>%
to_html
# Daily per-bucket metrics, restricted to sessions that fully started up
# (init + ready + loaded), reshaped to long form for the faceted plot below.
daily_counts <- daily_data %>%
keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
group_by(bucket, date) %>%
summarize(
sessions = n(),
success_rate = sum(ve_session_succeeded) / sessions,
users = length(unique(unique_user_id)),
avg_sessions_per_user = sessions / users, # NOTE(review): trailing comma -- tolerated by dplyr but brittle
) %>%
ungroup %>%
gather(metric, value, -c(bucket, date))
# One panel per metric, with control vs treatment as colored lines:
options(repr.plot.width = 15, repr.plot.height = 10)
ggplot(daily_counts, aes(x = date, y = value, color = bucket)) +
geom_line(size = 1) +
facet_wrap(~ metric, scales = "free_y", ncol = 1) +
scale_x_date(date_breaks = "1 week", minor_breaks = NULL, date_labels = "%d %B\n%Y") +
scale_y_continuous(minor_breaks = NULL) +
scale_color_brewer(palette = "Set1") +
wmf::theme_facet(base_size = 14)
Raw summary stats:
# Whole-test-period summary of the same metrics, per bucket:
counts <- daily_data %>%
keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
group_by(bucket) %>%
summarize(
sessions = n(),
successes = sum(ve_session_succeeded),
success_rate = successes / sessions,
users = length(unique(unique_user_id)),
avg_sessions_per_user = sessions / users
) %>%
ungroup
counts %>% dplyr::mutate_if(is.integer, polloi::compress) %>% to_html
Here, we analyze the probability of a successful VE session once a session has been initiated (and VE has loaded), where success is "an edit is published". For this, we use the BCDA package to compare success probabilities $\pi_1$ and $\pi_2$ between the $n_1$ sessions in treatment bucket ("group 1" in the analysis below) and $n_2$ sessions in control bucket ("group 2" below). In this simple check, we consider each session independent of others, even though some sessions come from the same user.
We model the successes $y_1$ and $y_2$ with the Binomial distributions having parameters $(\pi_1, n_1)$ and $(\pi_2, n_2)$, respectively. We assign Beta priors on $\pi_1$ and $\pi_2$, which forms our traditional beta-binomial model and makes it very easy to sample from the posterior. Using those samples, we can calculate credible intervals for the quantities $\Delta_\pi = \pi_1 - \pi_2$ (the difference between the success probabilities) and the relative risk $\theta = \frac{\pi_1}{\pi_2}$.
# Build a 2x2 contingency table of (bucket x contributed) session counts and
# fit BCDA's beta-binomial model to it.
bb <- daily_data %>%
keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
mutate(contributed = ifelse(ve_session_succeeded, 'Yes', 'No')) %>%
group_by(bucket, contributed) %>%
dplyr::tally() %>%
ungroup() %>%
spread(contributed, n) %>%
{
# Convert the bucket-by-contributed tibble into a matrix with bucket rownames:
m <- as.matrix(.[, -1])
rownames(m) <- .$bucket
m
} %>%
BCDA::flip_rows() %>% # reorder rows so treatment is "group 1" (see prose above)
BCDA::flip_cols() %>% # reorder cols so successes ('Yes') come first
BCDA::beta_binom()
options(digits = 3)
BCDA::present_bbfit(bb, raw = TRUE) %>% to_html()
According to this approach (model), initialized VE sessions made by users in the treatment group had an increased probability of success compared to VE sessions initialized by users in the control group. The increase (of 0.92%) is from an average of 47.2% in the control group to 48.14% in the treatment group. That is, sessions with mobile section editing are 1.019 times as likely to result in a contribution as sessions without it.
# Session-level dataset with numeric outcome/treatment indicators for the
# regression models below (contributed in {0,1}, treated in {0,1}).
per_user_successes <- daily_data %>%
keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
mutate(
contributed = as.numeric(ve_session_succeeded),
treated = as.numeric(bucket == "treatment")
)
# Fix the wiki factor levels (sorted) so downstream model output is ordered
# consistently:
per_user_successes %<>%
mutate(wiki = factor(wiki, sort(unique(per_user_successes$wiki))))
# Plain logistic regression (no pooling) of success on treatment:
fit0 <- glm(contributed ~ treated, data = per_user_successes, family = binomial()) # AIC: 188224
options(digits = 4)
rbind(
car::deltaMethod(fit0, "treated / 4"), # "divide by 4" bound on change in probability
car::deltaMethod(fit0, "exp(treated)") # exp(beta) := odds ratio
) %>% to_html
These frequentist results are consistent with the Bayesian results above -- an increase of 0.92% in probability and an odds ratio of 1.038. However, there is one major unresolved issue.
The issue with the above approaches is that the sessions are assumed to be independent, which is not the actual case. Multiple sessions can belong to the same user who is just more (or less) likely than others to make edits. Furthermore, each language of Wikipedia can have its own base probability of an initiated VE session ending successfully. Therefore, a more correct model of success probability takes into consideration the within-user correlations and the between-user/within-wiki correlations. Let $y = 0$ if the initialized (and readied/loaded) VE session did not result in a contribution and $y = 1$ if it did. Then the outcome $y_i$ of the $i$-th session by $j$-th user from $k$-th wiki can be modeled as follows:
$$\begin{align} y_i & \sim \mathrm{Bernoulli}(\pi_i),~i = 1, \ldots, N\\ \mathrm{logit}(\pi_i) & = \beta x + \alpha_{j[i]},~i = 1, \ldots, N\\ \alpha_j & \sim \mathcal{N}(\gamma_{k[j]}, \sigma_\alpha),~j = 1, \ldots, J\\ \gamma_k & \sim \mathcal{N}(\mu, \sigma_\gamma),~k = 1, \ldots, K \end{align}$$suppressPackageStartupMessages({
library(lme4)
# library(arm)
})
# Multilevel logistic regression: varying intercepts for users nested within
# wikis (the model in the equations above), with a constant treatment effect.
fit1 <- glmer(
contributed ~ treated + (1 | wiki / unique_user_id),
data = per_user_successes,
family = binomial()
)
summary(fit1)
# Success probabilities implied by the fixed effects alone (population level):
control_prob <- unname(arm::invlogit(fixef(fit1)["(Intercept)"]))
treatment_prob <- unname(arm::invlogit(sum(fixef(fit1))))
# Display the two implied success probabilities and the derived comparisons:
data.frame(
bucket = c("control", "treatment"),
success_prob = scales::percent(c(control_prob, treatment_prob), 0.01)
) %>% to_html()
c(
prob_diff = scales::percent(treatment_prob - control_prob, 0.01), # estimate difference in success probability,
rel_risk = round(treatment_prob / control_prob, 3) # estimate relative risk
)
By taking into consideration that sessions from the same user would be similar and that users from the same wiki would behave similarly, we increase the ability of the model to extract the effect of the treatment. This gets us an approximate increase of 0.97% in success probability -- from 35.67% in the control group to 36.64% in the treatment.
# Delta-method interval for the "divide by 4" estimate of the change in
# probability under fit1, formatted as percentages:
options(digits = 5)
to_html(dplyr::mutate_if(car::deltaMethod(fit1, "treated / 4"), is.double, scales::percent))
Using the Gelman & Hill (2006) "divide by 4" rule and the delta method to get an upper bound on change in probability, we get an approximate 1.05% increase with a 95% Confidence Interval of (0.08-2.02)%
Gelman, A., & Hill, J. (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge University Press.
With $\hat{\gamma}_k$ as the $k$-th wiki's estimated intercept and $\hat{\beta}$ as the estimated effect of treatment on the linear scale, we can estimate the average probability of completion of the control and the treatment groups on $k$-th wiki as:
$$\begin{align} \mathrm{Pr}(y = 1 | x = 0) &= \mathrm{logit}^{-1}(\hat{\gamma}_k),~k = 1, \ldots, K\\ \mathrm{Pr}(y = 1 | x = 1) &= \mathrm{logit}^{-1}(\hat{\gamma}_k + \hat{\beta}),~k = 1, \ldots, K \end{align}$$Note: the model uses $\alpha_j$ as the user-level intercept. We are interested in completion probability of the "average user" of a wiki, so we're using the wiki-level intercept $\gamma_k$.
# Per-wiki estimated success probabilities for control vs treatment, built
# from fit1's wiki-level intercepts (the "average user" of each wiki).
varying_intercepts <- ranef(fit1) # these are conditional modes (differences between population-level and group-level)
# ^ these must be added to the overall, population-level intercept to get an estimate at the group-level
overall_intercept <- fixef(fit1)["(Intercept)"] # population-level intercept
treatment_effect <- fixef(fit1)["treated"] # coefficient belonging to "treated" indicator
wiki_intercepts <- dplyr::tibble(
  # Take wiki names from the ranef table's own rownames rather than from
  # levels(per_user_successes$wiki): this pairs each intercept with the wiki
  # it belongs to directly, instead of relying on the two orderings
  # happening to coincide.
  wiki = rownames(varying_intercepts$wiki),
  intercept = varying_intercepts$wiki$`(Intercept)`
) %>%
  mutate(
    control_prob = arm::invlogit(intercept + overall_intercept),
    treatment_prob = arm::invlogit(intercept + overall_intercept + treatment_effect),
    change = treatment_prob - control_prob
  ) %>%
  select(-intercept)
Here are some wikis:
# The ten wikis with the largest estimated improvement in success probability.
top10_improved <- wiki_intercepts %>%
  dplyr::top_n(10, change) %>%
  dplyr::pull(wiki)
# Show those alongside a fixed set of notable wikis, largest change first,
# with all probabilities formatted to 3 decimal places:
wiki_intercepts %>%
  keep_where(wiki %in% c(
    "enwiki", "ruwiki", "kowiki", "frwiki", "dewiki",
    "arwiki", "hiwiki", "hewiki", "jawiki", "zhwiki",
    top10_improved
  )) %>%
  arrange(dplyr::desc(change)) %>%
  dplyr::mutate_if(is.double, ~ scales::percent(.x, 0.001)) %>%
  to_html()
If we define exposure as the time (in days) that the user has had the ability to edit sections in VE on mobile since their first mobile VE session, then we can consider a model where the success probability of each session is also affected by this exposure time:
# NOTE(review): exploratory exposure-offset model, deliberately left commented
# out; per the AIC comparison noted below it did not improve on fit1.
# test_exposures <- per_user_successes %>%
# group_by(unique_user_id) %>%
# summarize(
# first_session_at = min(session_dt_start),
# final_session_at = max(session_dt_start)
# ) %>%
# mutate(time_in_test = purrr::map2_dbl(final_session_at, first_session_at, difftime, units = "days"))
# per_user_successes_augmented <- per_user_successes %>%
# dplyr::left_join(test_exposures, by = "unique_user_id") %>%
# mutate(exposure = purrr::map2_dbl(session_dt_start, first_session_at, difftime, units = "days"))
# fit2 <- update(fit1, . ~ . + offset(log(exposure + 1e-4)), data = per_user_successes_augmented) # AIC: 281280
We can also consider a model where the effect of section editing is allowed to vary by wiki:
# Varying-slope model: the treatment effect is allowed to differ by wiki,
# with per user-within-wiki intercepts via the interaction grouping term.
fit3 <- glmer(
contributed ~ treated + (1 | unique_user_id:wiki) + (1 + treated | wiki),
data = per_user_successes,
family = binomial()
)
# fit3 <- update(fit1, . ~ . + (treated | wiki)) # AIC: 168704
However, both of these models are worse (as determined by AIC) than our original one with the per-user, per-wiki varying intercepts and a constant effect of treatment, which means the additional complexity is unnecessary.
summary(fit3)
# Extract the per-wiki random treatment slopes and their conditional
# variances ("postVar" attribute) to form approximate 95% intervals:
varying_intercepts_3 <- ranef(fit3)
str(varying_intercepts_3)
per_wiki_treatment_effects <- dplyr::tibble(
# NOTE(review): assumes ranef() rows are ordered like the sorted factor
# levels; pairing via rownames(varying_intercepts_3$wiki) would be safer.
wiki = levels(per_user_successes$wiki),
effect = varying_intercepts_3$wiki$treated,
variance = apply(attr(varying_intercepts_3$wiki, "postVar"), 3, function(x) { x[2, 2] })
) %>%
mutate(
std_dev = sqrt(variance),
lower95 = effect - 1.96 * std_dev,
upper95 = effect + 1.96 * std_dev
)
summary(per_wiki_treatment_effects)
# Ten largest and ten smallest per-wiki effects:
per_wiki_treatment_effects %>%
dplyr::top_n(10, effect)
per_wiki_treatment_effects %>%
dplyr::top_n(10, dplyr::desc(effect))
per_wiki_treatment_effects %>%
keep_where(upper95 < 0 | lower95 > 0) # statistically significant per-wiki effects
# Point-range plot: each wiki's estimated treatment effect with its 95%
# interval, against a zero reference line:
options(repr.plot.width = 16, repr.plot.height = 8)
ggplot(per_wiki_treatment_effects) +
geom_hline(yintercept = 0, size = 0.5) +
geom_pointrange(aes(x = wiki, ymin = lower95, y = effect, ymax = upper95), color = "darkred") +
scale_y_continuous(minor_breaks = NULL) +
hrbrthemes::theme_ipsum() +
theme(panel.grid.major.x = element_blank(), axis.text.x = element_blank())
# Histogram of per-wiki treatment coefficients as returned by coef(fit3)
# (fixed effect plus each wiki's deviation):
temp_coefs <- coef(fit3)
options(repr.plot.width = 8, repr.plot.height = 4)
ggplot(temp_coefs$wiki, aes(x = treated)) +
geom_histogram(bins = 30) +
scale_x_continuous(minor_breaks = NULL) +
scale_y_continuous(breaks = NULL, minor_breaks = NULL) +
labs(
y = NULL, x = "Effect of section editing",
title = "Distribution of per-wiki section editing effects"
) +
hrbrthemes::theme_ipsum()
Since (enrolled) users at the start of the test have more opportunities to see/use section editing, they have potential to have much higher number of sessions than users who entered the test at a later time -- especially near the end of the test. Therefore, we have decided to compare "average number of sessions per user within first week of entrance into the test" between the two groups. As a result, users who entered the dataset in the last week of the test were not included in this analysis.
# For each user: the timestamp of their first fully-started VE session and
# the end of their first week in the test. Used below to restrict each
# user's sessions to that first week.
user_windows <- daily_data %>%
  keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
  group_by(unique_user_id) %>%
  summarize(
    first_sesh_in_test = min(session_dt_start),
    first_week_in_test = first_sesh_in_test + lubridate::days(7)
  ) # trailing comma after the last argument removed: brittle outside tidyverse verbs
# Users who entered the test in its final week cannot be observed for a full
# first week, so (per the note above) they must be excluded entirely.
last_possible_dt <- lubridate::ymd_hms("2019-06-10T00:00:00Z") - lubridate::days(7)
per_user_session_counts <- daily_data %>%
  keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
  dplyr::left_join(user_windows, by = "unique_user_id") %>%
  # Keep each user's sessions from their first week, and drop users whose
  # FIRST session falls after the cutoff. (Fix: the previous condition
  # `session_dt_start < last_possible_dt` filtered individual sessions, which
  # also truncated the first week of users who entered shortly before the
  # cutoff instead of only excluding late entrants.)
  keep_where(session_dt_start < first_week_in_test, first_sesh_in_test < last_possible_dt) %>%
  group_by(bucket, unique_user_id) %>%
  dplyr::tally() %>%
  ungroup %>%
  mutate(bucket = factor(bucket, c("control", "treatment")))
# Per-bucket summary of session counts within each user's first week:
per_user_session_counts %>%
group_by(bucket) %>%
summarize(
users = dplyr::n(),
total_sessions = sum(n),
avg_sessions_per_user = mean(n),
med_sessions_per_user = median(n),
percentile95 = quantile(n, 0.95),
percentile99 = quantile(n, 0.99)
) %>%
to_html
# Two-sample t-test of per-user session counts, control vs treatment:
t.test(n ~ bucket, data = per_user_session_counts, paired = FALSE)
Users with section editing had slightly more sessions (within 7 days of their first VE session) than users without, but that increase was not statistically significant (p-value = 0.8).