from IPython.display import HTML as display_html
import numpy as np
import pandas as pd
from wmfdata import hive, utils
from wmfdata.utils import num_str
# A selector to remove the margins from a pivot table, for use style `subset` arguments
subset_without_margins = pd.IndexSlice[
lambda df: ~df.index.str.match("total"),
lambda df: ~df.columns.get_level_values(0).str.match("total")
]
We have clean, complete data in EditAttemptStep starting on 8 February 2019 and we started oversampling mobile visual editor events 2019-03-06T00:23:49Z (https://phabricator.wikimedia.org/T212253).
display_html("""
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>
""")
attempts = hive.run("""
-- Transform the steps into attempts
with steps as (
select
event.editing_session_id as attempt_id,
unix_timestamp(
dt,
"yyyy-MM-dd'T'hh:mm:ssX"
) as ts,
wiki,
database_group,
event.platform as platform,
event.editor_interface as interface,
event.action as action,
event.is_oversample as is_oversample,
event.user_editcount as user_edit_count,
coalesce(event.user_class = "IP", False) as user_is_unregistered
from event_sanitized.editattemptstep
left join canonical_data.wikis
on wiki = database_code
where
-- Remove Flow and other non-standard edits
event.integration = "page" and
-- Remove events not needed to calculate session starts or completions
event.action in ("ready", "saveSuccess") and
-- One month of data
year = 2019 and (
(month = 5 and day >= 17) or
(month = 6 and day < 16)
)
-- Ensure that steps within each session are in chronological order without having to do a global order by
distribute by attempt_id
sort by ts asc
)
-- Group those steps into attempts
select
attempt_id,
from_unixtime(min(ts), "yyyy-MM-dd'T'hh:mm:ssX") as dt,
min(wiki) as wiki,
case
when min(database_group) != "wikipedia" then Null
when min(wiki) in ("enwiki", "nlwiki", "eswiki") then False
else True
end as ve_default_wiki,
min(platform) as platform,
-- Select the last editor in the session as the "real" one
collect_list(interface)[
size(collect_list(interface)) - 1
] as editor,
sum(cast(action = "saveSuccess" as int)) >= 1 as is_completed,
max(is_oversample) as is_oversample,
case
when max(user_is_unregistered) or max(user_edit_count) is null then "unregistered"
when max(user_edit_count) = 0 then "0"
when max(user_edit_count) < 10 then "1-9"
when max(user_edit_count) < 100 then "10-99"
when max(user_edit_count) < 1000 then "100-999"
else "1000+"
end as user_experience
from steps
group by attempt_id
""").assign(
dt=lambda df: pd.to_datetime(df["dt"]),
wiki=lambda df: pd.Categorical(df["wiki"]),
platform=lambda df: pd.Categorical(df["platform"]),
editor=lambda df: pd.Categorical(df["editor"]),
user_experience=lambda df: pd.Categorical(
df["user_experience"],
categories=["unregistered", "0", "1-9", "10-99", "100-999", "1000+"], ordered=True
)
)
We consider any session that reaches the ready
event as an attempt, to filter out what seems to be a large number of bot inits (https://phabricator.wikimedia.org/T209995).
daily_attempts = (
attempts
.query("~is_oversample")
.pivot_table(
columns=["platform", "editor"],
index="user_experience",
values="dt",
aggfunc="count",
margins="True",
margins_name="total"
)
.multiply((16/30)) # Correct for sampling, reduce to daily average
)
(
daily_attempts
.style
.background_gradient(
axis=None,
subset=(~daily_attempts.index.str.match("total"), ~daily_attempts.columns.get_level_values(0).str.match("total"))
)
.format(lambda x: num_str(int(x)))
)
completion = (
attempts
.query("~is_oversample")
.assign(user_experience=lambda df: df["user_experience"].cat.as_unordered()) # Work around https://github.com/pandas-dev/pandas/issues/25815
.pivot_table(
columns=["platform", "editor"],
index="user_experience",
values="is_completed",
aggfunc=lambda grp: grp.sum() / len(grp),
margins="True",
margins_name="total"
)
)
(
completion
.style
.background_gradient(
axis=None,
subset=(~completion.index.str.match("total"), ~completion.columns.get_level_values(0).str.match("total"))
)
.format("{:.0%}")
)
desktop_completion = (
attempts
.query("platform == 'desktop' & editor == 'visualeditor' & ~ve_default_wiki.isnull()")
.pivot_table(
columns=["ve_default_wiki", "editor"],
index="user_experience",
values="is_completed",
aggfunc=lambda grp: grp.sum() / len(grp),
margins="True",
margins_name="total"
)
)
desktop_completion.insert(2, "wikitext", completion[("desktop", "wikitext")])
desktop_completion = (
desktop_completion
.set_axis([
"visualeditor (non-default wiki)",
"visualeditor (default wiki)",
"wikitext",
"total"
], axis=1, inplace=False)
)
(
desktop_completion
.style
.background_gradient(
axis=None,
subset=(~desktop_completion.index.str.match("total"), ~desktop_completion.columns.get_level_values(0).str.match("total"))
)
.format("{:.0%}")
)
Has to be computed separately, because user agent info isn't included in the event_santitized
database.
hive.run("""
select
useragent.browser_family,
useragent.device_family,
useragent.os_family
from event.editattemptstep
where year = 2019 and month = 6 and day = 16 and hour = 1
limit 50
""")
!jupyter nbconvert --to html Editing-team-metrics.ipynb --output /srv/published-datasets/Editing-team-metrics