Baseline Android Program Metrics for FY18-19

In FY18-19, Android team will focus editing features.

This is an initial exploration with two goals:

  1. Write relevant queries that can be productionized later (made to execute daily with data flowing into Druid/Superset)
  2. Learn what the program metrics look like right now so we can set our expectations accordingly

Metrics

When possible, we will compute daily counts that can be easily aggregated to a weekly/monthly/quarterly resolution. We will also perform a minor "edit drift" adjustment by recalculating the most recent 5-7 days (TBD) of edit metrics by filtering out reverted edits. That is, at any point the most recent 5-7 days of metrics are subject to change.

Note: platform-specific edit data was not available until mid-June via T194424. Prior to the deployment of the patch, all edits made in iOS and Android apps were lumped together under a single change tag "mobile app edit". Since multi-platform usage and platform switching are not common occurrences, any editor who has made a "mobile app edit"-tagged contribution previously is ineligble to be considered a new Android editor.

  • Edits (Required): number of "android app edit"-tagged contributions (including Wikidata descriptions)
  • Revert Ratio (Sorta required): proportion of "android app edit"-tagged contributions which have been reverted
  • New Editor Acquisition
    • New-to-Android Editors (Required): number of editors who have edited on Android for the first time irrespective of whether they have previously edited on iOS, mobile web, or desktop
    • Android-initiated New Editors (Optional): number of editors whose first contribution ever is through Android
  • Contributor Retention
    • New Editor Retention: of the users who registered in a given time span and made at least one edit in their first 30 days, the percentage that made at least one edit in their second 30 days.
    • Purely-Android Retention (Required): number of new Android editors who have made additional edits through the Android app in the 30 days following their first Android edit (does not include editors who have only made additional edits through other platforms following their first Android edit)
    • Android-initiated Retention (Optional): number of new Android editors whose first contribution ever is through Android and who have proceeded to make edits in the first 30 days of their first edit, including edits made on other platforms

Baselines

Here is where we'll measure our impact first:

  • Arabic Wikipedia (arwiki)
  • Bangla Wikipedia (bnwiki)
  • Persian Wikipedia (fawiki)
  • Finnish Wikipedia (fiwiki)
  • Hebrew Wikipedia (hewiki)
  • Hindi Wikipedia (hiwiki)
  • Italian Wikipedia (itwiki)
  • Marathi Wikipedia (mrwiki)
  • Portuguese Wikipedia (ptwiki)
  • Russian Wikipedia (ruwiki)
  • Swedish Wikipedia (svwiki)
  • Tamil Wikipedia (tawiki)
  • Chinese Wikipedia (zhwiki)
In [1]:
# https://stackoverflow.com/a/35018739/1091835
library(IRdisplay)

display_html(
'<script>  
code_show=true; 
function code_toggle() {
  if (code_show){
    $(\'div.input\').hide();
  } else {
    $(\'div.input\').show();
  }
  code_show = !code_show
}  
$( document ).ready(code_toggle);
</script>
  <form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
 </form>'
)
In [2]:
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
    library(magrittr); library(zeallot); library(glue); library(tidyverse)
})

start_date <- "20180710"
start_date_human <- format(as.Date(start_date, '%Y%m%d'), '%d %B %Y')
end_date <- "20190627"
end_date_human <- format(as.Date(end_date, '%Y%m%d'), '%d %B %Y')

target_wikis <- c(
    "arwiki" = "Arabic", "bnwiki" = "Bangla", "fawiki" = "Persian",
    "fiwiki" = "Finnish", "hewiki"= "Hebrew", "hiwiki" = "Hindi",
    "itwiki" = "Italian", "mrwiki" = "Marathi", "ptwiki" = "Portuguese",
    "ruwiki" = "Russian", "svwiki" = "Swedish", "tawiki" = "Tamil",
    "zhwiki" = "Chinese"
)

collapse_targets <- function(x) {
    return(forcats::fct_collapse(x,
        `Hindi & Marathi` = c("Hindi", "Marathi"),
        `Swedish & Finnish` = c("Swedish", "Finnish")
    ))
}

message("Baselines last computed up to: ", end_date_human)
Baselines last computed up to: 27 June 2019
In [3]:
options(jupyter.plot_mimetypes = "image/png", digits = 4)
In [4]:
# Interactive time series visualization:
shhh({
    library(xts)
    library(dygraphs)
})

# For working in Jupyter:
library(htmlwidgets)
library(IRdisplay)

# Helper functions:
make_dygraph <- function(xts_data, brewer_palette = "Paired", legend_width = 600, ...) {
    if (ncol(xts_data) > 10) {
        color_palette <- scales::hue_pal()(ncol(xts_data))
    } else {
        color_palette <- RColorBrewer::brewer.pal(ncol(xts_data), brewer_palette)
    }
    dy <- dygraph(xts_data, ...)%>%
        dyAxis("x", drawGrid = FALSE) %>%
        dyLegend(width = legend_width) %>%
        dyRangeSelector() %>%
        dyOptions(colors = color_palette)
    return(dy)
}
output_jupyter <- function(html_widget, name, width = 800, height = 600) {
    file_name <- glue("widget_{name}.html")
    # dir_name <- sub(".html", "_files", file_name, fixed = TRUE)
    saveWidget(html_widget, file_name, selfcontained = TRUE)
    display_html(glue('<iframe src="{file_name}" width="{width}" height="{height}"></iframe>'))
    # message(glue("Remember to download {file_name} when exporting notebook\nand to share it together with the exported HTML."))
}

Edits

In [5]:
query <- "SELECT
  DATE(LEFT(rev_timestamp, 8)) AS date,
  COUNT(*) AS n_edits
FROM (
  SELECT ct_rev_id AS rev_id
  FROM change_tag
  INNER JOIN change_tag_def ON (
    change_tag.ct_tag_id = change_tag_def.ctd_id
    AND change_tag_def.ctd_name = 'android app edit'
  )
) AS android_app_edits
INNER JOIN (
    -- Edits on visible pages:
    SELECT
      rev_id,
      rev_timestamp
    FROM revision
    WHERE rev_timestamp >= '{start_date}'
      AND rev_timestamp < '{end_date}'
    UNION ALL
    -- Edits on deleted pages:
    SELECT
      ar_rev_id AS rev_id,
      ar_timestamp AS rev_timestamp
    FROM archive
    WHERE ar_timestamp >= '{start_date}'
      AND ar_timestamp < '{end_date}'
) AS combined_revisions ON android_app_edits.rev_id = combined_revisions.rev_id
GROUP BY date;"
In [6]:
if (file.exists("per_wiki_edit_counts.csv")) {
    per_wiki_edit_counts <- readr::read_csv("per_wiki_edit_counts.csv", col_types = "Dci")
} else {
    per_wiki_edit_counts <- map_df(
        set_names(names(target_wikis), names(target_wikis)),
        ~ shhh(wmf::mysql_read(glue(query), .x)),
        .id = "wiki"
    ) %>%
        complete(date, wiki, fill = list(n_edits = 0)) %>%
        mutate(
            date = as.Date(date),
            wiki = collapse_targets(factor(target_wikis[wiki]))
        ) %>%
        group_by(date, wiki) %>%
        summarize(n_edits = sum(n_edits)) %>%
        ungroup
    readr::write_csv(per_wiki_edit_counts, "per_wiki_edit_counts.csv")
}
In [7]:
dy_edits_per_wiki <- per_wiki_edit_counts %>%
    mutate(week = lubridate::floor_date(date, "week")) %>%
    group_by(date = week, wiki) %>%
    summarize(n_edits = sum(n_edits)) %>%
    ungroup %>%
    spread(wiki, n_edits) %>%
    { xts(select(., -date), order.by = .$date) } %>%
    make_dygraph(
        main = "Weekly number of edits made with Android app, by Wikipedia",
        xlab = "Date", ylab = "Wikipedia edits"
    ) %>%
    dyOptions(stackedGraph = TRUE) %>%
    dyEvent("2018-10-24", "Nav Update", labelLoc = "bottom") %>%
    dyEvent("2019-04-23", "Suggested Edits v1", labelLoc = "bottom")
output_jupyter(dy_edits_per_wiki, "edit_counts_per_wiki", width = 900, height = 450)
In [8]:
major_releases <- dplyr::tibble(
    date = as.Date(c("2018-10-24", "2019-04-23")),
    release = c("Nav Update", "Suggested Edits v1")
)
In [9]:
options(repr.plot.width = 14, repr.plot.height = 7)
per_wiki_edit_counts %>%
    group_by(date) %>%
    summarize(n_edits = sum(n_edits)) %>%
    ggplot(aes(x = date)) +
    geom_ribbon(aes(ymin = 0, ymax = n_edits), fill = "gray90") +
    geom_line(aes(y = n_edits)) +
    geom_smooth(aes(y = n_edits), se = FALSE, method = "loess", size = 2) +
    scale_y_continuous(breaks = seq(0, 1000, 100)) +
    scale_x_date(date_breaks = "2 months", date_minor_breaks = "1 month", date_labels = "%a\n%m/%d") +
    geom_vline(aes(xintercept = date), data = major_releases, linetype = "dashed") +
    geom_label(aes(label = release, y = 0), data = major_releases) +
    hrbrthemes::theme_ipsum(base_size = 14, axis_title_size = 18) +
    labs(
        title = "Number of Wikipedia edits made with Android app",
        subtitle = paste("Across the following languages:", paste0(sort(target_wikis), collapse = ", ")),
        x = "Date", y = "Wikipedia edits"
    )

Wikidata Description Edits

In [10]:
wd_desc_edits_query <- "SET mapreduce.map.memory.mb=4096;
USE wmf;
WITH android_app_title_description_edits AS (
  SELECT
    SUBSTR(event_timestamp, 1, 10) AS date,
    event_comment AS revision_summary
  FROM mediawiki_history
  WHERE snapshot = '2019-05'
    AND wiki_db = 'wikidatawiki'
    AND ARRAY_CONTAINS(revision_tags, 'android app edit')
    AND event_entity = 'revision'
    AND event_timestamp >= '${start_date}'
    AND event_timestamp < '${end_date}'
    AND INSTR(event_comment, 'wbsetdescription') > 0
)
SELECT
  date,
  REGEXP_EXTRACT(revision_summary, '\\\\|([a-z]{2,})\\\\s', 1) AS language_code,
  COUNT(1) AS n_edits
FROM android_app_title_description_edits
WHERE REGEXP_EXTRACT(revision_summary, '\\\\|([a-z]{2,})\\\\s', 1) IN('ar', 'bn', 'fa', 'fi', 'he', 'hi', 'it', 'mr', 'pt', 'ru', 'sv', 'ta', 'zh')
GROUP BY date, REGEXP_EXTRACT(revision_summary, '\\\\|([a-z]{2,})\\\\s', 1);"
In [11]:
start_dates <- seq(as.Date("2018-08-01"), as.Date("2019-06-01"), by = "month")
end_dates <- start_dates
start_dates <- c(as.Date("2018-07-10"), start_dates[-11])
start_dates <- as.character(start_dates)
end_dates <- as.character(end_dates)
if (file.exists("wikidata_edit_counts.csv")) {
    wikidata_edit_counts <- readr::read_csv("wikidata_edit_counts.csv", col_types = "Dci")
} else {
    wikidata_edit_counts <- purrr::map2_dfr(start_dates, end_dates, function(start_date, end_date) {
        message("Counting title descriptions from ", substr(start_date, 1, 7))
        query <- glue(wd_desc_edits_query, .open = "${")
        result <- shhh(wmf::query_hive(query, heap_size = 2048))
        return(result)
    }) %>% mutate(date = as.Date(date))
    readr::write_csv(wikidata_edit_counts, "wikidata_edit_counts.csv")
}

wikidata_edit_counts %<>%
    mutate(
        wiki = paste0(language_code, "wiki"),
        wiki = forcats::fct_collapse(factor(target_wikis[wiki]), `Hindi & Marathi` = c("Hindi", "Marathi"))
    )
In [12]:
dy_wikidata_edits_per_language <- wikidata_edit_counts %>%
    complete(date, wiki, fill = list(n_edits = 0)) %>%
    mutate(month = lubridate::floor_date(date, "month")) %>%
    group_by(date = month, wiki) %>%
    summarize(n_edits = sum(n_edits)) %>%
    ungroup %>%
    spread(wiki, n_edits) %>%
    { xts(select(., -date), order.by = .$date) } %>%
    make_dygraph(
        main = "Number of Wikidata description edits made with Android app, by language",
        xlab = "Date", ylab = "Wikidata edits"
    ) %>%
    dyOptions(stackedGraph = TRUE) %>%
    dyEvent("2018-10-24", "Nav Update", labelLoc = "bottom") %>%
    dyEvent("2019-04-05", "Suggested Edits v1 beta", labelLoc = "bottom") %>%
    dyEvent("2019-04-23", "Suggested Edits v1 prod", labelLoc = "bottom")
output_jupyter(dy_wikidata_edits_per_language, "wikidata_edit_counts_per_language", width = 900, height = 900)
Warning message:
“Column `wiki` has different attributes on LHS and RHS of join”
In [13]:
options(repr.plot.width = 14, repr.plot.height = 7)
wikidata_edit_counts %>%
    complete(date, wiki, fill = list(n_edits = 0)) %>%
    mutate(month = lubridate::floor_date(date, "month")) %>%
    group_by(date = month, wiki) %>%
    summarize(n_edits = sum(n_edits)) %>%
    ungroup %>%
    ggplot(aes(x = date)) +
    geom_area(aes(y = n_edits, fill = wiki), position = "stack", color = "black", size = 0.25) +
    scale_x_date(date_breaks = "1 month", minor_breaks = NULL, date_labels = "%b\n%Y") +
    # scale_fill_brewer(palette = "Paired") +
    geom_vline(aes(xintercept = date), data = major_releases, linetype = "dashed") +
    geom_label(aes(label = release, y = 0), data = major_releases) +
    hrbrthemes::theme_ipsum(base_size = 14, axis_title_size = 18) +
    labs(
        title = "Number of Wikidata description edits made with Android app",
        subtitle = "This shows the total number of edits across languages and enables\nus to see how much each language contributed to the total count",
        x = "Date", y = "Wikidata edits"
    )
Warning message:
“Column `wiki` has different attributes on LHS and RHS of join”
In [14]:
options(repr.plot.width = 14, repr.plot.height = 7)
wikidata_edit_counts %>%
    group_by(date) %>%
    summarize(n_edits = sum(n_edits)) %>%
    ggplot(aes(x = date)) +
    geom_ribbon(aes(ymin = 0, ymax = n_edits), fill = "gray90") +
    geom_line(aes(y = n_edits)) +
    geom_smooth(aes(y = n_edits), method = "loess", se = FALSE, size = 1.1) +
    scale_x_date(date_breaks = "1 month", minor_breaks = NULL, date_labels = "%b\n%Y") +
    geom_vline(aes(xintercept = date), data = major_releases, linetype = "dashed") +
    geom_label(aes(label = release, y = 0), data = major_releases) +
    hrbrthemes::theme_ipsum(base_size = 14, axis_title_size = 18) +
    labs(
        title = "Number of Wikidata description edits made with Android app",
        x = "Date", y = "Wikidata edits"
    )

New Contributors

Here we take users who have registered on or after 10 July 2018 who have made at least one edit with the Android app. In this case we are restricting ourselves to users who created new accounts on the wikis, not those who had pre-existing accounts from elsewhere whose accounts were auto-created when they visited the wiki.

In [15]:
query <- "SELECT
  -- For every new user (and their registration date),
  -- count how many edits they made on each day:
  user_id,
  registration_date,
  edit_date,
  COUNT(*) AS n_edits
FROM (
  SELECT
    new_users.user_id AS user_id,
    registration_date,
    edit_date, edit_id
  FROM (
    -- New users (self-registered, not auto-created):
    SELECT
      DATE(LEFT(log_timestamp, 8)) AS registration_date,
      log_user AS user_id
    FROM logging
    WHERE log_type = 'newusers'
      AND log_action = 'create'
      AND log_timestamp >= '{start_date}'
      AND log_timestamp < '{end_date}'
  ) AS new_users
  INNER JOIN (
    (
      -- Edits made on visible pages with the Android app:
      SELECT
        DATE(LEFT(rev_timestamp, 8)) AS edit_date,
        rev_user AS user_id,
        rev_id AS edit_id
      FROM change_tag
      INNER JOIN change_tag_def ON (
        change_tag.ct_tag_id = change_tag_def.ctd_id
        AND change_tag_def.ctd_name = 'android app edit'
      )
      INNER JOIN revision ON (
        revision.rev_id = change_tag.ct_rev_id
        AND rev_timestamp >= '{start_date}'
        AND rev_timestamp < '{end_date}'
      )
    )
    UNION ALL
    (
      -- Edits made on deleted pages with the Android app:
      SELECT
        DATE(LEFT(ar_timestamp, 8)) AS edit_date,
        ar_user AS user_id,
        ar_id AS edit_id
      FROM change_tag
      INNER JOIN change_tag_def ON (
        change_tag.ct_tag_id = change_tag_def.ctd_id
        AND change_tag_def.ctd_name = 'android app edit'
      )
      INNER JOIN archive ON (
        archive.ar_id = change_tag.ct_rev_id
        AND ar_timestamp >= '{start_date}'
        AND ar_timestamp < '{end_date}'
      )
    )
  ) AS android_app_edits ON new_users.user_id = android_app_edits.user_id
) AS new_android_user_edits
GROUP BY user_id, registration_date, edit_date;"
In [16]:
if (file.exists("per_wiki_per_user_edit_counts.csv")) {
    per_wiki_per_user_edit_counts <- readr::read_csv("per_wiki_per_user_edit_counts.csv", col_types = "ciDDi")
} else {
    per_wiki_per_user_edit_counts <- set_names(names(target_wikis), names(target_wikis)) %>%
    map_df(~ shhh(wmf::mysql_read(glue(query), .x)), .id = "wiki") %>%
    mutate(
        registration_date = as.Date(registration_date),
        edit_date = as.Date(edit_date),
        wiki = factor(target_wikis[wiki])
    )
    readr::write_csv(per_wiki_per_user_edit_counts, "per_wiki_per_user_edit_counts.csv")
}
In [17]:
dy_new_contributor_acquisition <- per_wiki_per_user_edit_counts %>%
    distinct(wiki, user_id, registration_date) %>%
    mutate(wiki = collapse_targets(wiki)) %>%
    group_by(wiki, registration_date) %>%
    summarize(n_users = n()) %>%
    ungroup %>%
    complete(registration_date, wiki, fill = list(n_users = 0)) %>%
    mutate(month = lubridate::floor_date(registration_date, "month")) %>%
    filter(month < lubridate::floor_date(Sys.Date(), "month")) %>%
    group_by(registration_date = month, wiki) %>%
    summarize(n_users = sum(n_users)) %>%
    ungroup %>%
    spread(wiki, n_users) %>%
    { xts(select(., -registration_date), order.by = .$registration_date) } %>%
    make_dygraph(
        main = "Acquisition of new contributors, by language",
        xlab = "Date", ylab = "New Android editors"
    ) %>%
    dyOptions(stackedGraph = TRUE) %>%
    dyEvent("2018-10-24", "Nav Update", labelLoc = "bottom") %>%
    dyEvent("2019-04-05", "Suggested Edits v1 beta", labelLoc = "bottom") %>%
    dyEvent("2019-04-23", "Suggested Edits v1 prod", labelLoc = "bottom")
output_jupyter(dy_new_contributor_acquisition, "new_contributor_acquisition", width = 900, height = 450)
In [18]:
options(repr.plot.width = 14, repr.plot.height = 10)
per_wiki_per_user_edit_counts %>%
    distinct(wiki, user_id, registration_date) %>%
    mutate(wiki = collapse_targets(wiki)) %>%
    group_by(wiki, registration_date) %>%
    summarize(n_users = n()) %>%
    ungroup %>%
    complete(registration_date, wiki, fill = list(n_users = 0)) %>%
    mutate(month = lubridate::floor_date(registration_date, "month")) %>%
    filter(month < lubridate::floor_date(Sys.Date(), "month")) %>%
    group_by(registration_date = month, wiki) %>%
    summarize(n_users = sum(n_users)) %>%
    ungroup %>%
    ggplot() +
    geom_area(aes(x = registration_date, y = n_users, fill = wiki), position = "stack", color = "white") +
    scale_x_date(date_breaks = "1 month", minor_breaks = NULL, date_labels = "%b\n%Y") +
    geom_vline(aes(xintercept = date), data = major_releases, linetype = "dashed") +
    geom_label(aes(x = date, label = release, y = 0), data = major_releases) +
    hrbrthemes::theme_ipsum(base_size = 14, axis_title_size = 18) +
    labs(
        title = "Acquisition of new contributors by month",
        subtitle = glue("A new contributor is any newly-created account that has
            made at least one edit with the Android app since {start_date_human}"),
        x = "Account creation date", y = "New contributors"
    )