In [266]:
from IPython.display import HTML as display_html
import numpy as np
import pandas as pd
from wmfdata import hive, utils
from wmfdata.utils import num_str
In [263]:
# A selector to remove the margins from a pivot table, for use style `subset` arguments
subset_without_margins = pd.IndexSlice[
    lambda df: ~df.index.str.match("total"), 
    lambda df: ~df.columns.get_level_values(0).str.match("total")
]

We have clean, complete data in EditAttemptStep starting on 8 February 2019 and we started oversampling mobile visual editor events 2019-03-06T00:23:49Z (https://phabricator.wikimedia.org/T212253).

In [98]:
display_html("""
<script>  
    code_show=true; 
    function code_toggle() {
        if (code_show){
            $('div.input').hide();
        } else {
            $('div.input').show();
        }
        code_show = !code_show
    }  
    $( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
</form>
""")
Out[98]:

Attempts and completion by interface and experience

In [224]:
attempts = hive.run("""
-- Transform the steps into attempts
with steps as (
    select
        event.editing_session_id as attempt_id,
        unix_timestamp(
            dt,
            "yyyy-MM-dd'T'hh:mm:ssX"
        ) as ts,
        wiki,
        database_group,
        event.platform as platform,
        event.editor_interface as interface,
        event.action as action,
        event.is_oversample as is_oversample,
        event.user_editcount as user_edit_count,
        coalesce(event.user_class = "IP", False) as user_is_unregistered
    from event_sanitized.editattemptstep
    left join canonical_data.wikis
    on wiki = database_code
    where
        -- Remove Flow and other non-standard edits
        event.integration = "page" and
        -- Remove events not needed to calculate session starts or completions
        event.action in ("ready", "saveSuccess") and
        -- One month of data
        year = 2019 and (
            (month = 5 and day >= 17) or
            (month = 6 and day < 16)
        )
    -- Ensure that steps within each session are in chronological order without having to do a global order by
    distribute by attempt_id
    sort by ts asc
)
-- Group those steps into attempts
select
    attempt_id,
    from_unixtime(min(ts), "yyyy-MM-dd'T'hh:mm:ssX") as dt,
    min(wiki) as wiki,
    case
        when min(database_group) != "wikipedia" then Null
        when min(wiki) in ("enwiki", "nlwiki", "eswiki") then False
        else True
    end as ve_default_wiki,
    min(platform) as platform,
    -- Select the last editor in the session as the "real" one
    collect_list(interface)[
        size(collect_list(interface)) - 1
    ] as editor,
    sum(cast(action = "saveSuccess" as int)) >= 1 as is_completed,
    max(is_oversample) as is_oversample,
    case
        when max(user_is_unregistered) or max(user_edit_count) is null then "unregistered"
        when max(user_edit_count) = 0 then "0"
        when max(user_edit_count) < 10 then "1-9"
        when max(user_edit_count) < 100 then "10-99"
        when max(user_edit_count) < 1000 then "100-999"
        else "1000+"
    end as user_experience
from steps        
group by attempt_id
""").assign(
    dt=lambda df: pd.to_datetime(df["dt"]),
    wiki=lambda df: pd.Categorical(df["wiki"]),
    platform=lambda df: pd.Categorical(df["platform"]),
    editor=lambda df: pd.Categorical(df["editor"]),
    user_experience=lambda df: pd.Categorical(
        df["user_experience"],
        categories=["unregistered", "0", "1-9", "10-99", "100-999", "1000+"], ordered=True
    )
)

Average daily edit attempts

We consider any session that reaches the ready event as an attempt, to filter out what seems to be a large number of bot inits (https://phabricator.wikimedia.org/T209995).

In [276]:
daily_attempts = (
    attempts
    .query("~is_oversample")
    .pivot_table(
        columns=["platform", "editor"],
        index="user_experience",
        values="dt",
        aggfunc="count",
        margins="True",
        margins_name="total"
    )
    .multiply((16/30)) # Correct for sampling, reduce to daily average
)

(
    daily_attempts
    .style
    .background_gradient(
        axis=None, 
        subset=(~daily_attempts.index.str.match("total"), ~daily_attempts.columns.get_level_values(0).str.match("total"))
    )
    .format(lambda x: num_str(int(x)))
)
Out[276]:
platform desktop phone total
editor visualeditor wikitext wikitext-2017 visualeditor wikitext
user_experience
unregistered 170,000 390,000 9 7,900 1,100,000 1,600,000
0 4,000 8,400 56 960 4,300 32,000
1-9 5,300 12,000 160 1,800 5,000 250,000
10-99 6,000 19,000 410 1,200 5,200 25,000
100-999 4,700 28,000 1,100 750 6,600 42,000
1000+ 6,800 220,000 5,900 370 11,000 18,000
total 200,000 680,000 7,700 13,000 1,100,000 2,000,000

Completion rates

In [277]:
completion = (
    attempts
    .query("~is_oversample")
    .assign(user_experience=lambda df: df["user_experience"].cat.as_unordered()) # Work around https://github.com/pandas-dev/pandas/issues/25815
    .pivot_table(
        columns=["platform", "editor"],
        index="user_experience",
        values="is_completed",
        aggfunc=lambda grp: grp.sum() / len(grp),
        margins="True",
        margins_name="total"
    )
)

(
    completion
    .style
    .background_gradient(
        axis=None, 
        subset=(~completion.index.str.match("total"), ~completion.columns.get_level_values(0).str.match("total"))
    )
    .format("{:.0%}")
)
Out[277]:
platform desktop phone total
editor visualeditor wikitext wikitext-2017 visualeditor wikitext
user_experience
unregistered 4% 10% 0% 20% 2% 4%
0 26% 20% 10% 24% 22% 22%
1-9 46% 42% 23% 32% 32% 40%
10-99 55% 54% 42% 46% 41% 52%
100-999 64% 66% 53% 57% 50% 63%
1000+ 71% 80% 70% 68% 59% 78%
total 11% 37% 65% 28% 3% 16%

Desktop completion rate by interface, experience, and VE default status

In [330]:
desktop_completion = (
    attempts
    .query("platform == 'desktop' & editor == 'visualeditor' & ~ve_default_wiki.isnull()")
    .pivot_table(
        columns=["ve_default_wiki", "editor"],
        index="user_experience",
        values="is_completed",
        aggfunc=lambda grp: grp.sum() / len(grp),
        margins="True",
        margins_name="total"
    )
)

desktop_completion.insert(2, "wikitext", completion[("desktop", "wikitext")])

desktop_completion = (
    desktop_completion
    .set_axis([
        "visualeditor (non-default wiki)", 
        "visualeditor (default wiki)", 
        "wikitext", 
        "total"
    ], axis=1, inplace=False)
)

(
    desktop_completion
    .style
    .background_gradient(
        axis=None,
        subset=(~desktop_completion.index.str.match("total"), ~desktop_completion.columns.get_level_values(0).str.match("total"))
    )
    .format("{:.0%}")
)
Out[330]:
visualeditor (non-default wiki) visualeditor (default wiki) wikitext total
user_experience
unregistered 8% 4% 10% 4%
0 31% 26% 20% 27%
1-9 48% 44% 42% 55%
10-99 55% 55% 54% 45%
100-999 62% 65% 66% 64%
1000+ 71% 72% 80% 72%
total 30% 9% 37% 11%

User agent breakdown

Has to be computed separately, because user agent info isn't included in the event_santitized database.

In [ ]:
hive.run("""
select
    useragent.browser_family,
    useragent.device_family,
    useragent.os_family
from event.editattemptstep
where year = 2019 and month = 6 and day = 16 and hour = 1
limit 50
""")
In [332]:
!jupyter nbconvert --to html Editing-team-metrics.ipynb --output /srv/published-datasets/Editing-team-metrics
[NbConvertApp] Converting notebook Editing-team-metrics.ipynb to html
[NbConvertApp] Writing 330702 bytes to /srv/published-datasets/Editing-team-metrics.ipynb.html
Traceback (most recent call last):
  File "/home/neilpquinn-wmf/venv/bin/jupyter-nbconvert", line 11, in <module>
    sys.exit(main())
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/jupyter_core/application.py", line 266, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/nbconvert/nbconvertapp.py", line 337, in start
    self.convert_notebooks()
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/nbconvert/nbconvertapp.py", line 507, in convert_notebooks
    self.convert_single_notebook(notebook_filename)
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/nbconvert/nbconvertapp.py", line 479, in convert_single_notebook
    write_results = self.write_single_notebook(output, resources)
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/nbconvert/nbconvertapp.py", line 440, in write_single_notebook
    output, resources, notebook_name=notebook_name)
  File "/home/neilpquinn-wmf/venv/lib/python3.5/site-packages/nbconvert/writers/files.py", line 126, in write
    with io.open(dest, 'w', encoding='utf-8') as f:
OSError: [Errno 30] Read-only file system: '/srv/published-datasets/Editing-team-metrics.ipynb.html'
In [ ]: