Section editing on mobile VE

This is the notebook containing code and results of the analysis of the section editing experiment (T218851). This analysis is tracked in ticket T211239 and performed on SWAP.

Key findings:

  • Mobile VE users with section editing were 1.03 times more likely to finish making an edit after starting
    • The probability of success (defined as successfully finishing a mobile VE session, resulting in an edit to the article) increased from 0.3567 in the control group to 0.3664 in the treatment
    • This 0.0097 increase in success probability is statistically significant at $\alpha=0.05$ level (p-value = 0.034)
    • This roughly translates to 1 fewer VE edits being abanded per every 100 VE edits started if we enable section editing for everyone, everywhere (compared to keeping the status quo)
  • The average number of sessions per user was 1.948 in the treatment group and 1.940 in the control group
    • This difference is not statistically significant (p-value = 0.8)
In [1]:
# https://stackoverflow.com/a/35018739/1091835
library(IRdisplay)

display_html(
'<script>  
code_show=true; 
function code_toggle() {
  if (code_show){
    $(\'div.input\').hide();
  } else {
    $(\'div.input\').show();
  }
  code_show = !code_show
}  
// $( document ).ready(code_toggle);
</script>
  <form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
 </form>'
)

⬆ Use that button to hide the code

In [2]:
# Packages:
library(glue)
library(zeallot)
library(magrittr)

import::from(dplyr, group_by, keep_where = filter, ungroup, summarize,
             mutate, rename, select, arrange, n, left_join, distinct,
             vars, everything, starts_with)
import::from(tidyr, spread, gather)

library(repr)
library(ggplot2)
library(patchwork)

# Helper functions:
import::from(polloi, compress)
inf2na <- function(x) {
    y <- x
    y[is.infinite(x)] <- NA
    return(y)
}
nan2na <- function(x) {
    y <- x
    y[is.nan(x)] <- NA
    return(y)
}
na2zero <- function(x) {
    y <- x
    y[is.na(x)] <- 0
    return(y)
}
suppress_messages_warnings <- function(x) {
    suppressMessages(suppressWarnings(x))
}
to_html <- function(df, ...) {
    df %>%
    knitr::kable(format = "html", ...) %>%
    as.character() %>%
    IRdisplay::display_html()
}

Data

The data comes from client-side EventLogging which uses the EditAttemptStep schema and we fetch the daily session summaries using the following query:

In [3]:
query <- "USE event;
SELECT wiki,
  event.user_id,
  IF(event.user_id % 2 = 0, 'control', 'treatment') AS bucket,
  event.session_token AS mw_session_token,
  event.page_id,
  MIN(dt) AS session_dt_start,
  MAX(dt) AS session_dt_end,
  MAX(event.user_editcount) AS user_edit_count,
  MAX(event.init_type) AS init_type,
  MAX(event.init_mechanism) AS init_mechanism,
  SUM(IF(event.action = 'init', 1, 0)) > 0 AS ve_session_initialized,
  SUM(IF(event.action = 'ready', 1, 0)) > 0 AS ve_session_readied,
  SUM(IF(event.action = 'loaded', 1, 0)) > 0 AS ve_session_loaded,
  SUM(IF(event.action = 'saveSuccess', 1, 0)) > 0 AS ve_session_succeeded,
  SUM(IF(event.action = 'abort', 1, 0)) > 0 AS ve_session_aborted
FROM EditAttemptStep
WHERE year = ${year} AND month = ${month} AND day = ${day}
  AND event.page_ns = 0 -- main articles only
  AND event.user_id > 0 -- only logged-in users
  AND event.page_id > 0 -- page creation VE sessions
  AND wiki RLIKE 'wiki$'
  AND NOT wiki IN('${exclude_wikis}')
  AND event.session_token IS NOT NULL
  AND event.editor_interface = 'visualeditor'
  AND event.platform IN('phone', 'tablet')
GROUP BY wiki,
  event.user_id, IF(event.user_id % 2 = 0, 'control', 'treatment'),
  event.session_token, event.platform, event.page_id"
In [4]:
query_data <- function() {
    
    # see https://phabricator.wikimedia.org/T211239
    start_date <- as.Date("2019-03-29") # not 2019-03-18
    end_date <- as.Date("2019-06-10")
    test_dates <- seq(start_date, end_date, by = "day")
    
    exclude_wikis <- c(
        'wikidatawiki', 'commonswiki', 'mediawikiwiki', 'metawiki',
        'sourceswiki', 'specieswiki', 'outreachwiki', 'testwiki', 'incubatorwiki',
        paste0(c('he', 'bn', 'zh_yue'), 'wiki') # 1st wave had 100% rollout (not A/B tested)
    ) %>% paste0(collapse = "', '")

    results <- purrr::map_dfr(test_dates, function(date) {
        # message("Fetching mobile VE session data from ", date)
        c(year, month, day) %<-% wmf::extract_ymd(date)
        query <- glue(query, .open = "${", .close = "}")
        result <- suppress_messages_warnings(wmf::query_hive(query))
        result %<>%
            mutate(
                date = date,
                bucket = ifelse(user_id %% 2 == 0, 'control', 'treatment'),
                unique_user_id = paste(wiki, user_id)
            ) %>%
            dplyr::mutate_at(vars(starts_with("session_dt_")), lubridate::ymd_hms) %>%
            dplyr::mutate_at(vars(starts_with("init_")), ~ ifelse(.x == "NULL", NA, .x)) %>%
            dplyr::mutate_at(vars(starts_with("ve_session")), ~ .x == "true") %>%
            select(date, wiki, bucket, user_id, mw_session_token, page_id, everything()) %>%
            arrange(date, wiki, bucket, user_id, session_dt_start)
        if (date < "2019-04-02") {
            wikis <- paste0(c('hi', 'ar', 'fa', 'id', 'mr', 'ms', 'ml', 'th', 'az', 'sq'), 'wiki')
            result <- result[result$wiki %in% wikis, ] # only keep where test was live as of 28 March 2019
            # 2 April 2019 is all remaining wikis
        }
        return(result)
    })
    
    return(results)

}

if (file.exists("daily_data.rds")) {
    daily_data <- readr::read_rds("daily_data.rds")
} else {
    daily_data <- query_data()
    readr::write_rds(daily_data, "daily_data.rds", compress = "gz")
}

The per-wiki user counts break down as follows:

In [5]:
per_wiki_user_counts <- daily_data %>%
    group_by(wiki, bucket) %>%
    summarize(users = length(unique(unique_user_id))) %>%
    ungroup %>%
    spread(bucket, users, fill = 0) %>%
    mutate(total = control + treatment) %>%
    arrange(dplyr::desc(total)) %>%
    mutate(rank = 1:dplyr::n(), in_top10 = rank <= 10)
not_top10 <- per_wiki_user_counts %>%
    keep_where(!in_top10) %>%
    summarize(
        wiki = paste(dplyr::n(), "more wikis"),
        control = sum(control),
        treatment = sum(treatment),
        total = sum(total)
    )
per_wiki_user_counts %>%
    keep_where(in_top10) %>%
    dplyr::select(wiki, control, treatment, total) %>%
    rbind(not_top10) %>%
    dplyr::mutate_if(is.double, polloi::compress) %>%
    to_html
wiki control treatment total
enwiki 11.59K 11.55K 23.14K
eswiki 2.35K 2.33K 4.68K
fawiki 1.41K 1.33K 2.75K
arwiki 1.32K 1.39K 2.71K
frwiki 1.31K 1.29K 2.6K
jawiki 1.04K 1.1K 2.13K
itwiki 973 948 1.92K
ruwiki 917 829 1.75K
zhwiki 854 845 1.7K
dewiki 835 858 1.69K
140 more wikis 5.46K 5.41K 10.87K
In [6]:
daily_counts <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    group_by(bucket, date) %>%
    summarize(
        sessions = n(),
        success_rate = sum(ve_session_succeeded) / sessions,
        users = length(unique(unique_user_id)),
        avg_sessions_per_user = sessions / users,
    ) %>%
    ungroup %>%
    gather(metric, value, -c(bucket, date))
In [7]:
options(repr.plot.width = 15, repr.plot.height = 10)
ggplot(daily_counts, aes(x = date, y = value, color = bucket)) +
    geom_line(size = 1) +
    facet_wrap(~ metric, scales = "free_y", ncol = 1) +
    scale_x_date(date_breaks = "1 week", minor_breaks = NULL, date_labels = "%d %B\n%Y") +
    scale_y_continuous(minor_breaks = NULL) +
    scale_color_brewer(palette = "Set1") +
    wmf::theme_facet(base_size = 14)

Raw summary stats:

In [8]:
counts <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    group_by(bucket) %>%
    summarize(
        sessions = n(),
        successes = sum(ve_session_succeeded),
        success_rate = successes / sessions,
        users = length(unique(unique_user_id)),
        avg_sessions_per_user = sessions / users
    ) %>%
    ungroup
counts %>% dplyr::mutate_if(is.integer, polloi::compress) %>% to_html
bucket sessions successes success_rate users avg_sessions_per_user
control 65.64K 31K 0.4722112 22.04K 2.978806
treatment 70.35K 33.87K 0.4813862 22.12K 3.180569

Probability of contribution

Here, we analyze the probability of a successful VE session once a session has been initiated (and VE has loaded), where success is "an edit is published". For this, we use the BCDA package to compare success probabilities $\pi_1$ and $\pi_2$ between the $n_1$ sessions in treatment bucket ("group 1" in the analysis below) and $n_2$ sessions in control bucket ("group 2" below). In this simple check, we consider each session independent of others, even though some sessions come from the same user.

We model the successes $y_1$ and $y_2$ with the Binomial distributions having parameters $(\pi_1, n_1)$ and $\pi_2, n_2)$, respectively. We assign Beta priors on $\pi_1$ and $\pi_2$, which forms our traditional beta-binomial model and makes it very easy to sample from the posterior. Using those those samples, we can calculate credible intervals for the quantities $\Delta_\pi = \pi_1 - \pi_2$ (the difference between the success probabilities) and the relative risk $\theta = \frac{\pi_1}{\pi_2}$.

In [9]:
bb <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    mutate(contributed = ifelse(ve_session_succeeded, 'Yes', 'No')) %>%
    group_by(bucket, contributed) %>%
    dplyr::tally() %>%
    ungroup() %>%
    spread(contributed, n) %>%
    {
        m <- as.matrix(.[, -1])
        rownames(m) <- .$bucket
        m
    } %>%
    BCDA::flip_rows() %>%
    BCDA::flip_cols() %>%
    BCDA::beta_binom()
In [10]:
options(digits = 3)
BCDA::present_bbfit(bb, raw = TRUE) %>% to_html()
Group 1 Group 2 Pr(Success) in Group 1 Pr(Success) in Group 2 Difference Relative Risk Odds Ratio
70351 65638 48.138% (47.762%, 48.505%) 47.217% (46.852%, 47.599%) 0.921% (0.391%, 1.442%) 1.020 (1.008, 1.031) 1.038 (1.016, 1.060)

According to this approach (model), initialized VE sessions made by users in the treatment group had an increased probability of success compared to VE sessions initialized by users in the control group. The increase (of 0.92%) is from an average of 47.2% in the control group to 48.14% in the treatment group. That is, sessions with mobile section editing are 1.019 times more likely to result in a contribution than sessions without it.

In [11]:
per_user_successes <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    mutate(
        contributed = as.numeric(ve_session_succeeded),
        treated = as.numeric(bucket == "treatment")
    )
per_user_successes %<>%
    mutate(wiki = factor(wiki, sort(unique(per_user_successes$wiki))))
In [12]:
fit0 <- glm(contributed ~ treated, data = per_user_successes, family = binomial()) # AIC: 188224
In [13]:
options(digits = 4)
rbind(
    car::deltaMethod(fit0, "treated / 4"),
    car::deltaMethod(fit0, "exp(treated)") # exp(beta) := odds ratio
) %>% to_html
Estimate SE 2.5 % 97.5 %
treated/4 0.0092 0.0027 0.0039 0.0145
exp(treated) 1.0375 0.0113 1.0154 1.0596

These frequentist results are consistent with the Bayesian results above -- an increase of 0.92% in probability and an odds ratio of 1.038. However, there is one major unresolved issue.

Multilevel model of success probability

The issue with the above approaches is that the sessions are assumed to be independent, which is not the actual case. Multiple sessions can belong to the same user who is just more (or less) likely than others to make edits. Furthermore, each language of Wikipedia can have its own base probability of an initiated VE session ending successfully. Therefore, a more correct model of success probability takes into consideration the within-user correlations and the between-user/within-wiki correlations. Let $y = 0$ if the initialized (and readied/loaded) VE session did not result in a contribution and $y = 1$ if it did. Then the outcome $y_i$ of the $i$-th session by $j$-th user from $k$-th wiki can be modeled as follows:

$$\begin{align} y_i & \sim \mathrm{Bernoulli}(\pi_i),~i = 1, \ldots, N\\ \log(\pi_i) & = \beta x + \alpha_{j[i]},~i = 1, \ldots, N\\ \alpha_j & \sim \mathcal{N}(\gamma_{k[j]}, \sigma_\alpha),~j = 1, \ldots, J\\ \gamma_k & \sim \mathcal{N}(\mu, \sigma_\gamma),~k = 1, \ldots, K \end{align}$$
In [14]:
suppressPackageStartupMessages({
    library(lme4)
    # library(arm)
})
In [15]:
fit1 <- glmer(
    contributed ~ treated + (1 | wiki / unique_user_id),
    data = per_user_successes,
    family = binomial()
)
summary(fit1)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: contributed ~ treated + (1 | wiki/unique_user_id)
   Data: per_user_successes

     AIC      BIC   logLik deviance df.resid 
  168701   168740   -84346   168693   135985 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-5.631 -0.644 -0.380  0.786  4.159 

Random effects:
 Groups              Name        Variance Std.Dev.
 unique_user_id:wiki (Intercept) 1.3088   1.14    
 wiki                (Intercept) 0.0843   0.29    
Number of obs: 135989, groups:  unique_user_id:wiki, 44154; wiki, 137

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.5897     0.0452  -13.05   <2e-16 ***
treated       0.0419     0.0198    2.12    0.034 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
        (Intr)
treated -0.223
In [16]:
control_prob <- unname(arm::invlogit(fixef(fit1)["(Intercept)"]))
treatment_prob <- unname(arm::invlogit(sum(fixef(fit1))))
data.frame(
    bucket = c("control", "treatment"),
    success_prob = scales::percent(c(control_prob, treatment_prob), 0.01)
) %>% to_html()
bucket success_prob
control 35.67%
treatment 36.64%
In [17]:
c(
    prob_diff = scales::percent(treatment_prob - control_prob, 0.01), # estimate difference in success probability,
    rel_risk = round(treatment_prob / control_prob, 3) # estimate relative risk
)
prob_diff
'0.97%'
rel_risk
'1.027'

By taking into consideration that sessions from the same user would be similar and that users from the same wiki would behave similarly, we increase the ability of the model to extract the effect of the treatment. This gets us an approximate increase of 0.97% in success probability -- from 35.67% in the control group to 36.64% in the treatment.

In [18]:
options(digits = 5)
to_html(dplyr::mutate_if(car::deltaMethod(fit1, "treated / 4"), is.double, scales::percent))
Estimate SE 2.5 % 97.5 %
1.05% 0.496% 0.0775% 2.02%

Using the Gelman & Hill (2006) "divide by 4" rule and the delta method to get an upper bound on change in probability, we get an approximate 1.05% increase with a 95% Confidence Interval of (0.08-2.02)%

Gelman, A., & Hill, J. (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge University Press.

Per-wiki improvements

With $\hat{\gamma}_k$ as the $k$-th wiki's estimated intercept and $\hat{\beta}$ as the estimated effect of treatment on the linear scale, we can estimate the average probability of completion of the control and the treatment groups on $k$-th wiki as:

$$\begin{align} \mathrm{Pr}(y = 1 | x = 0) &= \mathrm{logit}^{-1}(\hat{\gamma}_k),~k = 1, \ldots, K\\ \mathrm{Pr}(y = 1 | x = 1) &= \mathrm{logit}^{-1}(\hat{\gamma}_k + \hat{\beta}),~k = 1, \ldots, K \end{align}$$

Note: the model uses $\alpha_j$ to as the user-level intercept. We are interested in completion probability of the "average user" of a wiki, so we're using the wiki-level intercept $\gamma_k$.

In [19]:
varying_intercepts <- ranef(fit1) # these are conditional modes (differences between population-level and group-level)
# ^ these must be added to the overall, population-level intercept to get an estimate at the group-level
overall_intercept <- fixef(fit1)["(Intercept)"] # population-level intercept
treatment_effect <- fixef(fit1)["treated"] # coefficient belonging to "treated" indicator
wiki_intercepts <- dplyr::tibble(
    wiki = levels(per_user_successes$wiki),
    intercept = varying_intercepts$wiki$`(Intercept)`
) %>%
    mutate(
        control_prob = arm::invlogit(intercept + overall_intercept),
        treatment_prob = arm::invlogit(intercept + overall_intercept + treatment_effect),
        change = treatment_prob - control_prob
    ) %>%
    select(-intercept)

Here are some wikis:

In [20]:
top10_improved <- wiki_intercepts %>%
    dplyr::top_n(10, change) %>%
    select(wiki) %>%
    dplyr::pull()
wiki_intercepts %>%
    keep_where(wiki %in% c(
        "enwiki", "ruwiki", "kowiki", "frwiki", "dewiki",
        "arwiki", "hiwiki", "hewiki", "jawiki", "zhwiki"
    ) | wiki %in% top10_improved) %>%
    arrange(dplyr::desc(change)) %>%
    dplyr::mutate_if(is.double, ~ scales::percent(.x, 0.001)) %>%
    to_html()
wiki control_prob treatment_prob change
cawiki 45.364% 46.406% 1.042%
elwiki 44.861% 45.901% 1.040%
slwiki 44.844% 45.883% 1.040%
simplewiki 44.765% 45.804% 1.039%
glwiki 44.197% 45.234% 1.037%
ukwiki 44.088% 45.124% 1.036%
svwiki 43.993% 45.029% 1.036%
etwiki 43.395% 44.428% 1.033%
srwiki 42.662% 43.691% 1.029%
mkwiki 42.581% 43.609% 1.029%
dewiki 40.448% 41.463% 1.014%
jawiki 39.609% 40.616% 1.008%
enwiki 36.879% 37.861% 0.982%
kowiki 36.748% 37.729% 0.980%
ruwiki 36.400% 37.376% 0.977%
frwiki 36.399% 37.375% 0.976%
arwiki 35.435% 36.400% 0.965%
zhwiki 35.238% 36.201% 0.963%
hiwiki 32.582% 33.510% 0.928%

Additional models for consideration

If we define exposure as the time (in days) that the user has had the ability to edit sections in VE on mobile since their first mobile VE session, then we can consider a model where the success probability of each session is also affected by this exposure time:

In [21]:
# test_exposures <- per_user_successes %>%
#     group_by(unique_user_id) %>%
#     summarize(
#         first_session_at = min(session_dt_start),
#         final_session_at = max(session_dt_start)
#     ) %>%
#     mutate(time_in_test = purrr::map2_dbl(final_session_at, first_session_at, difftime, units = "days"))
# per_user_successes_augmented <- per_user_successes %>%
#     dplyr::left_join(test_exposures, by = "unique_user_id") %>%
#     mutate(exposure = purrr::map2_dbl(session_dt_start, first_session_at, difftime, units = "days"))
# fit2 <- update(fit1, . ~ . + offset(log(exposure + 1e-4)), data = per_user_successes_augmented) # AIC: 281280

We can also consider a model where the effect of section editing is allowed to vary by wiki:

In [22]:
fit3 <- glmer(
    contributed ~ treated + (1 | unique_user_id:wiki) + (1 + treated | wiki),
    data = per_user_successes,
    family = binomial()
)
# fit3 <- update(fit1, . ~ . + (treated | wiki)) # AIC: 168704

However, both of these models are worse (as determined by AIC) than our original one with the per-user, per-wiki varying intercepts and a constant effect of treatment, which means the additional complexity is unnecessary.

In [23]:
summary(fit3)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: contributed ~ treated + (1 | unique_user_id:wiki) + (1 + treated |  
    wiki)
   Data: per_user_successes

     AIC      BIC   logLik deviance df.resid 
  168702   168761   -84345   168690   135983 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-5.642 -0.641 -0.380  0.787  4.159 

Random effects:
 Groups              Name        Variance Std.Dev. Corr
 unique_user_id:wiki (Intercept) 1.30801  1.1437       
 wiki                (Intercept) 0.07803  0.2793       
                     treated     0.00794  0.0891   0.22
Number of obs: 135989, groups:  unique_user_id:wiki, 44154; wiki, 137

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.5983     0.0448  -13.35   <2e-16 ***
treated       0.0607     0.0313    1.94    0.053 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
        (Intr)
treated -0.212
In [24]:
varying_intercepts_3 <- ranef(fit3)
In [25]:
str(varying_intercepts_3)
List of 2
 $ unique_user_id:wiki:'data.frame':	44154 obs. of  1 variable:
  ..$ (Intercept): num [1:44154] 0.605 0.622 -0.494 -0.355 0.634 ...
  ..- attr(*, "postVar")= num [1, 1, 1:44154] 0.993 0.991 0.272 1.042 0.991 ...
 $ wiki               :'data.frame':	137 obs. of  2 variables:
  ..$ (Intercept): num [1:137] 0.0757 -0.0334 0.0392 -0.0646 -0.0176 ...
  ..$ treated    : num [1:137] 0.00879 -0.00153 0.00629 -0.0066 -0.00283 ...
  ..- attr(*, "postVar")= num [1:2, 1:2, 1:137] 0.07565 0.00518 0.00518 0.0079 0.07105 ...
 - attr(*, "class")= chr "ranef.mer"
In [26]:
per_wiki_treatment_effects <- dplyr::tibble(
    wiki = levels(per_user_successes$wiki),
    effect = varying_intercepts_3$wiki$treated,
    variance = apply(attr(varying_intercepts_3$wiki, "postVar"), 3, function(x) { x[2, 2] })
) %>%
    mutate(
        std_dev = sqrt(variance),
        lower95 = effect - 1.96 * std_dev,
        upper95 = effect + 1.96 * std_dev
    )
In [27]:
summary(per_wiki_treatment_effects)
     wiki               effect            variance           std_dev      
 Length:137         Min.   :-0.09945   Min.   :0.000795   Min.   :0.0282  
 Class :character   1st Qu.:-0.00624   1st Qu.:0.007054   1st Qu.:0.0840  
 Mode  :character   Median : 0.00096   Median :0.007735   Median :0.0879  
                    Mean   : 0.00215   Mean   :0.007169   Mean   :0.0842  
                    3rd Qu.: 0.01192   3rd Qu.:0.007881   3rd Qu.:0.0888  
                    Max.   : 0.08812   Max.   :0.007932   Max.   :0.0891  
    lower95           upper95       
 Min.   :-0.2276   Min.   :0.00804  
 1st Qu.:-0.1781   1st Qu.:0.16249  
 Median :-0.1715   Median :0.17306  
 Mean   :-0.1629   Mean   :0.16720  
 3rd Qu.:-0.1494   3rd Qu.:0.18153  
 Max.   :-0.0247   Max.   :0.24432  
In [28]:
per_wiki_treatment_effects %>%
    dplyr::top_n(10, effect)
A tibble: 10 × 6
wikieffectvariancestd_devlower95upper95
<chr><dbl><dbl><dbl><dbl><dbl>
cawiki 0.0752210.00671900.081970-0.0854400.23588
eswiki 0.0765290.00266700.051643-0.0246910.17775
etwiki 0.0377900.00750650.086640-0.1320250.20760
fiwiki 0.0442320.00640000.080000-0.1125680.20103
hrwiki 0.0514830.00680320.082482-0.1101810.21315
kowiki 0.0523970.00650890.080678-0.1057320.21053
simplewiki0.0630600.00700300.083684-0.1009610.22708
slwiki 0.0398800.00733340.085636-0.1279660.20773
svwiki 0.0881250.00635060.079691-0.0680690.24432
viwiki 0.0555940.00582140.076298-0.0939500.20514
In [29]:
per_wiki_treatment_effects %>%
    dplyr::top_n(10, dplyr::desc(effect))
A tibble: 10 × 6
wikieffectvariancestd_devlower95upper95
<chr><dbl><dbl><dbl><dbl><dbl>
dewiki-0.0994540.004271940.065360-0.227560.0286517
enwiki-0.0472230.000795070.028197-0.102490.0080428
frwiki-0.0440350.003674770.060620-0.162850.0747802
hiwiki-0.0525400.005895570.076783-0.203030.0979542
idwiki-0.0521610.004960830.070433-0.190210.0858877
kkwiki-0.0473020.007429750.086196-0.216250.1216424
mywiki-0.0378230.007402970.086040-0.206460.1308162
tewiki-0.0541110.007371440.085857-0.222390.1141686
thwiki-0.0604540.005851530.076495-0.210380.0894770
uzwiki-0.0513770.007522270.086731-0.221370.1186156
In [30]:
per_wiki_treatment_effects %>%
    keep_where(upper95 < 0 | lower95 > 0) # statistically significant per-wiki effects
A tibble: 0 × 6
wikieffectvariancestd_devlower95upper95
<chr><dbl><dbl><dbl><dbl><dbl>
In [31]:
options(repr.plot.width = 16, repr.plot.height = 8)
ggplot(per_wiki_treatment_effects) +
    geom_hline(yintercept = 0, size = 0.5) +
    geom_pointrange(aes(x = wiki, ymin = lower95, y = effect, ymax = upper95), color = "darkred") +
    scale_y_continuous(minor_breaks = NULL) +
    hrbrthemes::theme_ipsum() +
    theme(panel.grid.major.x = element_blank(), axis.text.x = element_blank())
In [32]:
temp_coefs <- coef(fit3)
In [33]:
options(repr.plot.width = 8, repr.plot.height = 4)
ggplot(temp_coefs$wiki, aes(x = treated)) +
    geom_histogram(bins = 30) +
    scale_x_continuous(minor_breaks = NULL) +
    scale_y_continuous(breaks = NULL, minor_breaks = NULL) +
    labs(
        y = NULL, x = "Effect of section editing",
        title = "Distribution of per-wiki section editing effects"
    ) +
    hrbrthemes::theme_ipsum()

Average sessions per user

Since (enrolled) users at the start of the test have more opportunities to see/use section editing, they have potential to have much higher number of sessions than users who entered the test at a later time -- especially near the end of the test. Therefore, we have decided to compare "average number of sessions per user within first week of entrance into the test" between the two groups. As a result, users who entered the dataset in the last week of the test were not included in this analysis.

In [34]:
user_windows <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    group_by(unique_user_id) %>%
    summarize(
        first_sesh_in_test = min(session_dt_start),
        first_week_in_test = first_sesh_in_test + lubridate::days(7),
    )
In [35]:
last_possible_dt <- lubridate::ymd_hms("2019-06-10T00:00:00Z") - lubridate::days(7)
In [36]:
per_user_session_counts <- daily_data %>%
    keep_where(ve_session_initialized, ve_session_readied, ve_session_loaded) %>%
    dplyr::left_join(user_windows, by = "unique_user_id") %>%
    keep_where(session_dt_start < first_week_in_test, session_dt_start < last_possible_dt) %>%
    group_by(bucket, unique_user_id) %>%
    dplyr::tally() %>%
    ungroup %>%
    mutate(bucket = factor(bucket, c("control", "treatment")))
In [37]:
per_user_session_counts %>%
    group_by(bucket) %>%
    summarize(
        users = dplyr::n(),
        total_sessions = sum(n),
        avg_sessions_per_user = mean(n),
        med_sessions_per_user = median(n),
        percentile95 = quantile(n, 0.95),
        percentile99 = quantile(n, 0.99)
    ) %>%
    to_html
bucket users total_sessions avg_sessions_per_user med_sessions_per_user percentile95 percentile99
control 19639 38091 1.9396 1 5 13
treatment 19763 38505 1.9483 1 5 12
In [38]:
t.test(n ~ bucket, data = per_user_session_counts, paired = FALSE)
	Welch Two Sample t-test

data:  n by bucket
t = -0.261, df = 38800, p-value = 0.79
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.074627  0.057070
sample estimates:
  mean in group control mean in group treatment 
                 1.9396                  1.9483 

Users with section editing had slightly more sessions (within 7 days of their first VE session) than users without, but that increase was not statistically significant (p-value = 0.8).

In [ ]: