This report is updating daily at 2AM UTC. If you have any comments or questions, please leave your feedback in the ticket: https://phabricator.wikimedia.org/T212414
The code for this notebook can be access at: https://github.com/wikimedia-research/Audiences-External_automatic_translation
from datetime import datetime
import dateutil
import pandas as pd
import plotly
import plotly.graph_objs as go
from plotly import tools
import wmfdata as wmf
plotly.offline.init_notebook_mode(connected=True)
current_date = datetime.utcnow().strftime("%Y-%m-%d")
wmf.utils.insert_code_toggle()
The following graphs break down the pageviews into two types:
translated_pageviews = wmf.hive.run("""
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
IF(
client_srp,
'Automatic translation from search result page',
'User initiated translation'
) AS initiation_method, (
to_language = 'id' OR
(to_language is null OR to_language='') AND home_language = 'id'
) AS is_id,
SUM(count) AS pageviews
FROM neilpquinn.toledo_pageviews
WHERE (
year = 2018 AND month = 12 AND day > 2 OR
year >= 2019
) AND
agent_type = 'user' AND
client_srp IS NOT NULL
GROUP BY
year, month, day,
client_srp,
(to_language = 'id' OR (to_language is null OR to_language='') AND home_language = 'id')
ORDER BY
year, month, day,
client_srp
LIMIT 1000000
""").query("date < @current_date") # Remove today's partial data
translated_pageviews_id = translated_pageviews.query("is_id").pivot(
index='date',
columns='initiation_method',
values='pageviews'
)
fig = {
'data': [{
'x': translated_pageviews_id.index,
'y': translated_pageviews_id[col],
'name': col
} for col in translated_pageviews_id.columns],
'layout': {
'yaxis': {'title': "Page Views"},
'title': {'text': 'Page views translated to Indonesian via Google Translate from all languages'},
'legend': {'x': 0, 'y': -0.25},
'hoverlabel': {'namelength' : -1},
'annotations': [{
'x':'2018-12-05',
'y':22397,
'text':'Google launched the<br>automatic translation<br>service for Indonesian<br>users on mobile',
'ax': 70,
'ay': 70
}]
}
}
plotly.offline.iplot(fig)
Note:
Please check https://discovery.wmflabs.org/external/ for total pageviews on all wikimedia projects by referrer class.
pageviews = wmf.hive.run("""
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
referer_class,
project = 'id.wikipedia' AS is_id,
SUM(view_count) AS pageviews
FROM wmf.projectview_hourly
WHERE (
year = 2018 AND month = 12 AND day > 2 OR
year >= 2019
) AND
project LIKE '%wikipedia' AND
agent_type = 'user' AND
access_method != 'mobile app'
GROUP BY
year, month, day,
access_method,
referer_class,
project = 'id.wikipedia'
ORDER BY
year, month, day
LIMIT 1000000
""").query("date < @current_date")
desktop_pageviews_id = (
pageviews
.query("is_id & access_method == 'desktop'")
.pivot(
index="date",
columns="referer_class",
values="pageviews"
)
)
# Convert each count into a proportion of the daily total
daily_totals = desktop_pageviews_id.sum(axis=1)
desktop_pageviews_id = desktop_pageviews_id.div(daily_totals, axis="index")
fig = {
'data': [{
'x': desktop_pageviews_id.index,
'y': desktop_pageviews_id[col],
'name': col
} for col in desktop_pageviews_id.columns],
'layout': {
'yaxis': {
'title': "Proportion of pageviews",
'tickformat': ',.3%'
},
'title': {'text': 'Proportion of desktop pageviews on Indonesian Wikipedia, by referrer class'},
'legend': {'x': 0, 'y': -0.4},
'hoverlabel': {'namelength': -1}
}
}
plotly.offline.iplot(fig)
mobile_pageviews_id = (
pageviews
.query("is_id & access_method == 'mobile web'")
.pivot(
index="date",
columns="referer_class",
values="pageviews"
)
)
# Convert each count into a proportion of the daily total
daily_totals = mobile_pageviews_id.sum(axis=1)
mobile_pageviews_id = mobile_pageviews_id.div(daily_totals, axis="index")
fig = {
'data': [{
'x': mobile_pageviews_id.index,
'y': mobile_pageviews_id[col],
'name': col
} for col in mobile_pageviews_id.columns],
'layout': {
'yaxis': {
'title': "Proportion of pageviews",
'tickformat': ',.3%'
},
'title': {'text': 'Proportion of mobile pageviews on Indonesian Wikipedia, by referrer class'},
'legend': {'x': 0, 'y': -0.4},
'hoverlabel': {'namelength': -1}
}
}
plotly.offline.iplot(fig)
idwiki_pageviews = (
pageviews
.query("is_id")
.groupby('date')
['pageviews']
.sum()
)
idwiki_autotranslated_pageviews = (
translated_pageviews
.query("is_id & initiation_method == 'Automatic translation from search result page'")
.groupby('date')
['pageviews']
.sum()
)
idwiki_autotranslation_prop = idwiki_autotranslated_pageviews / idwiki_pageviews
fig = {
'data': [{
'x': idwiki_autotranslation_prop.index,
'y': idwiki_autotranslation_prop.values,
}],
'layout': {
'yaxis': {
'title': "Proportion of total pageviews",
'tickformat': ',.3%'
},
'title': {'text': 'Share of Indonesian Wikipedia pageviews from auto-translation'},
'margin':{'l': 90},
'hoverlabel': {'namelength' : -1},
'annotations': [{
'x': '2018-12-05',
'y': 0.0037,
'text': 'Google launched the<br>automatic translation<br>service for Indonesian<br>users on mobile',
'ax': 70,
'ay': 70
}]
}
}
plotly.offline.iplot(fig)
The conversion rate in this section are event level conversion rate rather than user level conversion. Currently we don't have reliable method to identify user/session consistently from reading to editing. The conversion rates are calculated as:
Note:
init
and mtinfo
from ExternalGuidance drop drastically since June 17 2019. It has been fixed.eg_events = wmf.hive.run("""
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
event.action,
event.source_language,
event.target_language,
COUNT(1) as n_events
FROM event.externalguidance
WHERE (
year = 2019 and month = 3 and day > 13 OR
year = 2019 and month > 3 OR
year > 2019
) AND
NOT useragent.is_bot
GROUP BY
year, month, day,
event.action,
event.source_language,
event.target_language
ORDER BY year, month, day
LIMIT 1000000
""").query("date < @current_date") # Remove today's partial data
edits = wmf.hive.run("""
SELECT
SUBSTR(r.rev_timestamp, 0, 10) AS date,
IF(array_contains(t.tags, "mobile edit"), 'mobile', 'desktop') AS access_method,
IF(array_contains(t.tags, "campaign-external-machine-translation"), TRUE, FALSE) AS is_eg_edit,
r.`database` = 'idwiki' AS is_id,
r.rev_parent_id IS NULL AS is_new_page,
COUNT(DISTINCT r.rev_id) AS edits
FROM event.mediawiki_revision_tags_change t
RIGHT OUTER JOIN event.mediawiki_revision_create r
ON
t.rev_id = r.rev_id AND
r.year >= 2019 AND
t.year >= 2019 AND
r.page_id = t.page_id AND
r.`database` = t.`database` AND
r.page_namespace = t.page_namespace
WHERE
NOT r.performer.user_is_bot AND
NOT ARRAY_CONTAINS(r.performer.user_groups, 'bot') AND
SUBSTR(r.rev_timestamp, 0, 4) >= 2019 AND
r.meta.domain LIKE '%wikipedia%'
GROUP BY
SUBSTR(r.rev_timestamp, 0, 10),
ARRAY_CONTAINS(t.tags, "mobile edit"),
ARRAY_CONTAINS(t.tags, "campaign-external-machine-translation"),
r.`database` = 'idwiki', r.rev_parent_id IS NULL
ORDER BY date ASC
LIMIT 1000000
""").query("date < @current_date") # Remove today's partial data
eg_edits = edits.query("is_eg_edit").groupby(['date','is_id'], as_index = False)['edits'].sum()
eg_events['action'] = eg_events['action'].map({
'init': 'Access the translated page',
'specialpage': 'Access the contribution options page',
'createpage': 'Create a new page in local language',
'editpage': 'Edit an existing page in local language',
'mtinfo': 'View information about automatic translation',
'edit-original': 'Edit the original article'
})
el_by_action = (
eg_events
.groupby(['date','action'], as_index = False)
.sum()
.pivot(index='date',columns='action',values='n_events')
.merge(
eg_edits.groupby('date', as_index=False)['edits'].sum(),
how='left',
on='date')
.fillna(0)
.rename(columns={'edits': 'Edit saved successfully'})
.set_index('date')
)
data = [{'x': el_by_action.index,
'y': el_by_action[col],
'name': col
} for col in el_by_action.columns.difference(['Access the translated page', 'View information about automatic translation', 'Access the contribution options page'])] \
+ [{'x': el_by_action.index,
'y': el_by_action[col],
'name': col,
'xaxis': 'x2',
'yaxis': 'y2'
} for col in ['View information about automatic translation', 'Access the contribution options page']] \
+ [{'x': el_by_action.index,
'y': el_by_action['Access the translated page'],
'name': 'Access the translated page',
'xaxis': 'x3',
'yaxis': 'y3'}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline': True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline': True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline': True},
'yaxis': {'domain': [0, 0.333]},
'yaxis2': {'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'domain': [0.667, 1]},
'title': {'text': 'Number of events by action type, for all languages'},
'legend': {'x': 0, 'y': -0.42},
'hoverlabel': {'namelength': -1}
}
}
plotly.offline.iplot(fig)
el_funnel = eg_events.copy()
el_funnel.loc[
el_funnel.action.isin([
'Create a new page in local language', \
'Edit the original article', \
'Edit an existing page in local language'
]),
'action'] = 'edit'
el_funnel = (
el_funnel
.groupby(['date','action'], as_index = False)
.sum()
.pivot(index='date' , columns='action', values='n_events')
.merge(eg_edits.groupby('date', as_index=False)['edits'].sum(), how='left', on='date')
.fillna(0)
.set_index('date')
)
el_funnel['Translated page to automatic translation info'] = el_funnel['View information about automatic translation'].div(el_funnel['Access the translated page'])
el_funnel['Translated page to contribution options page'] = el_funnel['Access the contribution options page'].div(el_funnel['Access the translated page'])
el_funnel['Contribution options page to editing start'] = el_funnel['edit'].div(el_funnel['Access the contribution options page'])
# el_funnel['Translated page to editing start'] = el_funnel['edit'].div(el_funnel['Access the translated page'])
el_funnel['Editing start to edit saved'] = el_funnel['edits'].div(el_funnel['edit'])
# el_funnel['Translated page to edit saved'] = el_funnel['edits'].div(el_funnel['Access the translated page'])
el_funnel = el_funnel.fillna(0)
data = [{'x': el_funnel.index,
'y': el_funnel[col],
'name': col
} for col in ['Translated page to automatic translation info', 'Translated page to contribution options page']]
data = data + [{'x': el_funnel.index,
'y': el_funnel[col],
'name': col,
'xaxis': 'x2',
'yaxis': 'y2'
} for col in ['Contribution options page to editing start', 'Editing start to edit saved']]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showticklabels': False},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.5]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Conversion rate from access to contribution, for all languages'},
'legend': {'x': 0, 'y': -0.25},
'hoverlabel': {'namelength': -1}
}
}
plotly.offline.iplot(fig)
Note: The conversion rate from contribution options page to edit can be greater than 100% because users can click both buttons on that page: edit the original article or create a new page in local lanuage (or edit an existing page in local language).
el_by_action_id = (
eg_events[eg_events.target_language=='id']
.groupby(['date','action'],as_index = False)
.sum()
.pivot(index='date',columns='action',values='n_events')
.merge(
eg_edits[eg_edits.is_id].groupby('date', as_index=False)['edits'].sum(),
how='left',
on='date'
).fillna(0)
.rename(columns={'edits': 'Edit saved on Indonesian Wikipedia'})
.set_index('date')
)
data = [{'x': el_by_action_id.index,
'y': el_by_action_id[col],
'name': col
} for col in el_by_action_id.columns.difference(['Access the translated page', 'View information about automatic translation', 'Access the contribution options page'])] \
+ [{'x': el_by_action_id.index,
'y': el_by_action_id[col],
'name': col,
'xaxis': 'x2',
'yaxis': 'y2'
} for col in ['View information about automatic translation', 'Access the contribution options page']] \
+ [{'x': el_by_action_id.index,
'y': el_by_action_id['Access the translated page'],
'name': 'Access the translated page',
'xaxis': 'x3',
'yaxis': 'y3'}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline':True},
'yaxis': {'domain': [0, 0.333]},
'yaxis2': {'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'domain': [0.667, 1]},
'title': {'text': 'Number of events by action type, for pages translated to Indonesian'},
'legend': {'x': 0, 'y': -0.42},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
el_funnel_id = el_by_action_id.copy()
el_funnel_id['edit'] = el_funnel_id['Create a new page in local language'] + \
el_funnel_id['Edit the original article'] + \
el_funnel_id['Edit an existing page in local language']
el_funnel_id['Translated page to automatic translation info'] = el_funnel_id['View information about automatic translation'].div(el_funnel_id['Access the translated page'])
el_funnel_id['Translated page to contribution options page'] = el_funnel_id['Access the contribution options page'].div(el_funnel_id['Access the translated page'])
el_funnel_id['Contribution options page to editing start'] = el_funnel_id['edit'].div(el_funnel_id['Access the contribution options page'])
# el_funnel_id['Translated page to edit'] = el_funnel_id['edit'].div(el_funnel_id['Access the translated page'])
el_funnel_id['Editing start to edit saved(approx)'] = el_funnel_id['Edit saved on Indonesian Wikipedia'].div(el_funnel_id['edit'])
el_funnel_id = el_funnel_id.fillna(0)
data = [{'x': el_funnel_id.index,
'y': el_funnel_id[col],
'name': col
} for col in ['Translated page to automatic translation info', 'Translated page to contribution options page']]
data = data + [{'x': el_funnel_id.index,
'y': el_funnel_id[col],
'name': col,
'xaxis': 'x2',
'yaxis': 'y2'
} for col in ['Contribution options page to editing start', 'Editing start to edit saved(approx)']]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showticklabels': False},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.5]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Conversion rate from access to contribution, for pages translated to Indonesian'},
'legend': {'x': 0, 'y': -0.25},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
Note:
top_lang = (
eg_events[(eg_events.action=='Access the translated page')&(eg_events.source_language=='en')]
.groupby(['target_language'])
.sum()
.nlargest(5, 'n_events')
.index
)
el_top_lang = (
eg_events
.query("action == 'Access the translated page' & source_language == 'en' & target_language.isin(@top_lang)")
.groupby(['date','target_language'],as_index = False)
.sum()
.pivot(index='date',columns='target_language',values='n_events')
.fillna(0)
)
fig = {
'data': [{
'x': el_top_lang.index,
'y': el_top_lang[col],
'name': col
} for col in el_top_lang.columns],
'layout': {
'yaxis': {'title': "Number of events"},
'title': {'text': 'Number of access to the translated pages from English, by top 5 target languages'},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
The conversion rate in this section are event level conversion rate rather than user level conversion. Currently we don't have reliable method to identify user/session consistently from reading to editing. The conversion rates are calculated as:
Note:
wiki_edit_init = wmf.hive.run("""
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
IF(
(length(useragent.wmf_app_version) > 2) OR
(webhost rlike "(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)"),
'mobile',
'desktop'
) AS access_method,
wiki = 'idwiki' AS is_id,
-- is_oversample means this event is extra: 15/16
SUM(if(event.is_oversample, 0, 16)) AS init
FROM event.editattemptstep
WHERE
year >= 2019 AND
webhost LIKE '%wikipedia%' AND
NOT useragent.is_bot AND
event.action = 'ready' -- ready should eliminate those undetected bot
-- and event.init_mechanism != 'url'
GROUP BY
year, month, day,
wiki = 'idwiki',
IF(
(length(useragent.wmf_app_version) > 2) OR
(webhost rlike "(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)"),
'mobile',
'desktop'
)
ORDER BY
year, month, day
LIMIT 1000000
""").query("date < @current_date") # Remove today's partial data
wiki_funnel_pageviews = pageviews[pageviews.date >= '2019-01-01']
wiki_funnel_pageviews.loc[
wiki_funnel_pageviews.access_method.isin(['mobile web', 'mobile app']),
'access_method'
] = 'mobile'
all_lang_wiki_funnel = wiki_funnel_pageviews.groupby(['date', 'access_method'], as_index = False)['pageviews'].sum() \
.merge(wiki_edit_init.groupby(['date', 'access_method'], as_index = False)['init'].sum(),on=['date', 'access_method']) \
.merge(edits.groupby(['date', 'access_method'], as_index = False)['edits'].sum(),on=['date', 'access_method'])
all_lang_wiki_funnel['Reading to editing start'] = all_lang_wiki_funnel['init'].div(all_lang_wiki_funnel['pageviews'])
all_lang_wiki_funnel['Editing start to edit saved'] = all_lang_wiki_funnel['edits'].div(all_lang_wiki_funnel['init'])
all_lang_wiki_funnel['Reading to edit saved'] = all_lang_wiki_funnel['edits'].div(all_lang_wiki_funnel['pageviews'])
all_lang_wiki_funnel_desktop = all_lang_wiki_funnel[all_lang_wiki_funnel.access_method == 'desktop']
data = [{'x': all_lang_wiki_funnel_desktop.date,
'y': all_lang_wiki_funnel_desktop['Reading to edit saved'],
'name': 'Reading to edit saved'
},
{'x': all_lang_wiki_funnel_desktop.date,
'y': all_lang_wiki_funnel_desktop['Reading to editing start'],
'name': 'Reading to editing start',
'xaxis': 'x2',
'yaxis': 'y2'
},
{'x': all_lang_wiki_funnel_desktop.date,
'y': all_lang_wiki_funnel_desktop['Editing start to edit saved'],
'name': 'Editing start to edit saved',
'xaxis': 'x3',
'yaxis': 'y3'
}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline':True},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.333]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'tickformat': ',.3%', 'domain': [0.667, 1]},
'title': {'text': 'Conversion rate on desktop from reading to editing, for all languages'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
all_lang_wiki_funnel_mobile = all_lang_wiki_funnel[all_lang_wiki_funnel.access_method == 'mobile']
data = [{'x': all_lang_wiki_funnel_mobile.date,
'y': all_lang_wiki_funnel_mobile['Reading to edit saved'],
'name': 'Reading to edit saved'
},
{'x': all_lang_wiki_funnel_mobile.date,
'y': all_lang_wiki_funnel_mobile['Reading to editing start'],
'name': 'Reading to editing start',
'xaxis': 'x2',
'yaxis': 'y2'
},
{'x': all_lang_wiki_funnel_mobile.date,
'y': all_lang_wiki_funnel_mobile['Editing start to edit saved'],
'name': 'Editing start to edit saved',
'xaxis': 'x3',
'yaxis': 'y3'
}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline':True},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.333]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'tickformat': ',.3%', 'domain': [0.667, 1]},
'title': {'text': 'Conversion rate on mobile from reading to editing, for all languages'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1},
'annotations': [{'x':'2019-06-25', 'y':0, 'xref':'x3', 'yref':'y3', 'text':'Bug:T221197',
'ax': -30, 'ay': -50},
{'x':'2019-06-25', 'y':0.003, 'xref':'x2', 'yref':'y2', 'text':'Bug:T221197',
'ax': -30, 'ay': 50}]
}
}
plotly.offline.iplot(fig)
idwiki_funnel = wiki_funnel_pageviews[wiki_funnel_pageviews.is_id].groupby(['date', 'access_method'], as_index = False)['pageviews'].sum() \
.merge(wiki_edit_init[wiki_edit_init.is_id].groupby(['date', 'access_method'], as_index = False)['init'].sum(),on=['date', 'access_method']) \
.merge(edits[edits.is_id].groupby(['date', 'access_method'], as_index = False)['edits'].sum(),on=['date', 'access_method'])
idwiki_funnel['Reading to editing start'] = idwiki_funnel['init'].div(idwiki_funnel['pageviews'])
idwiki_funnel['Editing start to edit saved'] = idwiki_funnel['edits'].div(idwiki_funnel['init'])
idwiki_funnel['Reading to edit saved'] = idwiki_funnel['edits'].div(idwiki_funnel['pageviews'])
idwiki_funnel_desktop = idwiki_funnel[idwiki_funnel.access_method == 'desktop']
data = [{'x': idwiki_funnel_desktop.date,
'y': idwiki_funnel_desktop['Reading to edit saved'],
'name': 'Reading to edit saved'
},
{'x': idwiki_funnel_desktop.date,
'y': idwiki_funnel_desktop['Reading to editing start'],
'name': 'Reading to editing start',
'xaxis': 'x2',
'yaxis': 'y2'
},
{'x': idwiki_funnel_desktop.date,
'y': idwiki_funnel_desktop['Editing start to edit saved'],
'name': 'Editing start to edit saved',
'xaxis': 'x3',
'yaxis': 'y3'
}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline':True},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.333]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'tickformat': ',.3%', 'domain': [0.667, 1]},
'title': {'text': 'Conversion rate on desktop from reading to editing, for Indonesian Wikipedia'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
idwiki_funnel_mobile = idwiki_funnel[idwiki_funnel.access_method == 'mobile']
data = [{'x': idwiki_funnel_mobile.date,
'y': idwiki_funnel_mobile['Reading to edit saved'],
'name': 'Reading to edit saved'
},
{'x': idwiki_funnel_mobile.date,
'y': idwiki_funnel_mobile['Reading to editing start'],
'name': 'Reading to editing start',
'xaxis': 'x2',
'yaxis': 'y2'
},
{'x': idwiki_funnel_mobile.date,
'y': idwiki_funnel_mobile['Editing start to edit saved'],
'name': 'Editing start to edit saved',
'xaxis': 'x3',
'yaxis': 'y3'
}]
fig = {
'data': data,
'layout': {
'height': 700,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'xaxis3': {'showticklabels': False, 'anchor':'y3', 'showline':True},
'yaxis': {'tickformat': ',.3%', 'domain': [0, 0.333]},
'yaxis2': {'tickformat': ',.3%', 'domain': [0.333, 0.667], 'side': 'right'},
'yaxis3': {'tickformat': ',.3%', 'domain': [0.667, 1]},
'title': {'text': 'Conversion rate on mobile from reading to editing, for Indonesian Wikipedia'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1},
'annotations': [{'x':'2019-06-25', 'y':0, 'xref':'x3', 'yref':'y3', 'text':'Bug:T221197',
'ax': -30, 'ay': -50},
{'x':'2019-06-25', 'y':0.005, 'xref':'x2', 'yref':'y2', 'text':'Bug:T221197',
'ax': -30, 'ay': 50}]
}
}
plotly.offline.iplot(fig)
wiki_new_page = wmf.hive.run("""
select substr(rev_timestamp, 0, 10) AS date,
database='idwiki' as is_id,
rev_sha1 is not NULL as is_eg_edit,
count(*) as new_pages,
sum(cast(page_id is not NULL as int)) as deleted_pages
from (
select distinct c.rev_timestamp, c.`database`, c.rev_id, d.page_id, t.rev_sha1
from event.mediawiki_page_create c left outer join event.mediawiki_page_delete d on (
c.page_id = d.page_id
and c.`database`=d.`database`
and c.page_namespace = d.page_namespace
and c.year>=2019 and d.year>=2019)
left outer join event.mediawiki_revision_tags_change t on (
c.page_id = t.page_id
and c.rev_id = t.rev_id
and c.`database`=t.`database`
and c.page_namespace = t.page_namespace
and c.year>=2019 and t.year>=2019
and array_contains(t.tags, "campaign-external-machine-translation"))
where c.year>=2019
and substr(c.rev_timestamp, 0, 4) >= 2019
and c.meta.domain like '%wikipedia%'
and not c.performer.user_is_bot
and not array_contains(c.performer.user_groups, 'bot')
) as tbl
group by substr(rev_timestamp, 0, 10), database='idwiki', rev_sha1 is not NULL
order by date asc
limit 1000000
""")
wiki_new_page = wiki_new_page[wiki_new_page.date < current_date]
egprop_newpage_all = wiki_new_page.groupby(['date','is_eg_edit'], as_index=False)['new_pages'].sum() \
.pivot(index='date',columns='is_eg_edit',values='new_pages').fillna(0)
egprop_newpage_all.columns = ['All page creation', 'Page creation from external guidance extension']
data = [{'x': egprop_newpage_all.index,
'y': egprop_newpage_all['Page creation from external guidance extension'],
'name': 'Page creation from external guidance extension'
},
{'x': egprop_newpage_all.index,
'y': egprop_newpage_all['All page creation'],
'name': 'All page creation',
'xaxis': 'x2',
'yaxis': 'y2'
}]
fig = {
'data': data,
'layout': {
'height': 600,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'yaxis': {'domain': [0, 0.5]},
'yaxis2': {'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Number of new page creation on Wikipedia, for all languages'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
egprop_newpage_id = wiki_new_page[wiki_new_page.is_id].groupby(['date','is_eg_edit'], as_index=False)['new_pages'].sum() \
.pivot(index='date',columns='is_eg_edit',values='new_pages').fillna(0)
egprop_newpage_id.columns = ['All page creation', 'Page creation from external guidance extension']
data = [{'x': egprop_newpage_id.index,
'y': egprop_newpage_id['Page creation from external guidance extension'],
'name': 'Page creation from external guidance extension'
},
{'x': egprop_newpage_id.index,
'y': egprop_newpage_id['All page creation'],
'name': 'All page creation',
'xaxis': 'x2',
'yaxis': 'y2'
}]
fig = {
'data': data,
'layout': {
'height': 600,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'yaxis': {'domain': [0, 0.5]},
'yaxis2': {'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Number of new page creation on Indonesian Wikipedia'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
egprop_edit_all = edits.groupby(['date','is_eg_edit'], as_index=False)['edits'].sum() \
.pivot(index='date',columns='is_eg_edit',values='edits').fillna(0)
egprop_edit_all.columns = ['All edits', 'Edits from external guidance extension']
data = [{'x': egprop_edit_all.index,
'y': egprop_edit_all['Edits from external guidance extension'],
'name': 'Edits from external guidance extension'
},
{'x': egprop_edit_all.index,
'y': egprop_edit_all['All edits'],
'name': 'All edits',
'xaxis': 'x2',
'yaxis': 'y2'
}]
fig = {
'data': data,
'layout': {
'height': 600,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'yaxis': {'domain': [0, 0.5]},
'yaxis2': {'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Number of edits on Wikipedia, for all languages'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
egprop_edit_id = edits[edits.is_id].groupby(['date','is_eg_edit'], as_index=False)['edits'].sum() \
.pivot(index='date',columns='is_eg_edit',values='edits').fillna(0)
egprop_edit_id.columns = ['All edits', 'Edits from external guidance extension']
data = [{'x': egprop_edit_id.index,
'y': egprop_edit_id['Edits from external guidance extension'],
'name': 'Edits from external guidance extension'
},
{'x': egprop_edit_id.index,
'y': egprop_edit_id['All edits'],
'name': 'All edits',
'xaxis': 'x2',
'yaxis': 'y2'
}]
fig = {
'data': data,
'layout': {
'height': 600,
'xaxis': {'showline':True},
'xaxis2': {'showticklabels': False, 'anchor':'y2', 'showline':True},
'yaxis': {'domain': [0, 0.5]},
'yaxis2': {'domain': [0.5, 1], 'side': 'right'},
'title': {'text': 'Number of edits on Indonesian Wikipedia'},
'legend': {'x': 0, 'y': -0.2},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
page_delete_prop = wiki_new_page.groupby(['date','is_id'], as_index=False)['new_pages','deleted_pages'].sum()
page_delete_prop['prop'] = page_delete_prop['deleted_pages'].div(page_delete_prop['new_pages'])
page_delete_prop = page_delete_prop.pivot(index = 'date', columns='is_id', values='prop')
page_delete_prop.columns = ['Wikipedia in other languages', 'Indonesian Wikipedia']
eg_delte_prop = wiki_new_page[wiki_new_page.is_eg_edit].groupby('date')['new_pages','deleted_pages'].sum()
eg_delte_prop['New pages from External Guidance extension'] = eg_delte_prop['deleted_pages'].div(eg_delte_prop['new_pages'])
page_delete_prop = page_delete_prop.join(eg_delte_prop.drop(columns=['new_pages', 'deleted_pages']), how='left').fillna(0)
fig = {
'data': [{
'x': page_delete_prop.index,
'y': page_delete_prop[col],
'name': col
} for col in page_delete_prop.columns],
'layout': {
'yaxis': {'title': "Proportion of new pages",
'tickformat': ',.3%'
},
'title': {'text': 'Deletion rate of new pages'},
'legend': {'x': 0, 'y': -0.4},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
eg_edit_revert = pd.read_csv('external_machine_translation_edits_revert.tsv',sep='\t')
eg_edit_revert = eg_edit_revert.groupby('date')['is_reverted'].apply(lambda x: x.sum() / x.count()).to_frame()
eg_edit_revert.columns = ['Edits from External Guidance extension']
eg_edit_revert.index = pd.DatetimeIndex(eg_edit_revert.index)
edit_rever_date_rage = pd.date_range(start=eg_edit_revert.index.min(), end=eg_edit_revert.index.max())
eg_edit_revert = eg_edit_revert.reindex(edit_rever_date_rage, fill_value=0)
fig = {
'data': [{
'x': eg_edit_revert.index,
'y': eg_edit_revert['Edits from External Guidance extension'],
'name': 'Edits from External Guidance extension'
}],
'layout': {
'yaxis': {'title': "Revert Rate",
'tickformat': ',.3%'},
'title': {'text': 'Revert rate for edits from external guidance extension'},
'hoverlabel': {'namelength' : -1}
}
}
plotly.offline.iplot(fig)
# If day of the month is greater than 6, then use the previous month's snapshot
day_of_month = datetime.utcnow().strftime("%d")
if day_of_month > '06':
SNAPSHOT = (datetime.utcnow() - dateutil.relativedelta.relativedelta(months=1)).strftime("%Y-%m")
else:
SNAPSHOT = (datetime.utcnow() - dateutil.relativedelta.relativedelta(months=2)).strftime("%Y-%m")
wiki_revert = wmf.hive.run("""
select
substr(event_timestamp, 0, 10) as date,
wiki_db = 'idwiki' as is_id,
count(*) as edits,
sum(cast(revision_is_identity_reverted as int)) as reverted_edits
from wmf.mediawiki_history
where
event_entity = "revision" and
event_type = "create" and
event_timestamp >= "2018-12" and
substr(event_timestamp, 0, 7) <= SNAPSHOT and
-- ^not include data that's outside of the current snapshot (data from first few days of next month in incomplete)
snapshot = SNAPSHOT and
size(event_user_is_bot_by) < 1 and
mediawiki_history.wiki_db in (
select distinct database_code
from canonical_data.wikis
where database_group = 'wikipedia'
)
group by substr(event_timestamp, 0, 10), wiki_db = 'idwiki'
order by date
limit 1000000
""")
wiki_revert['revert_rate'] = wiki_revert['reverted_edits'].div(wiki_revert['edits'])
wiki_revert = wiki_revert.pivot(index = 'date', columns = 'is_id', values = 'revert_rate')
wiki_revert.columns = ['Wikipedia in other languages', 'Indonesian Wikipedia']
fig = {
'data': [{
'x': wiki_revert.index,
'y': wiki_revert[col],
'name': col
} for col in wiki_revert.columns],
'layout': {
'yaxis': {'title': "Revert Rate", 'tickformat': ',.3%'},
'title': {'text': 'Edit revert rate on Wikipedia'},
'hoverlabel': {'namelength' : -1},
'legend': {'x': 0, 'y': -0.25},
}
}
plotly.offline.iplot(fig)
# TODO:
# Proportion in overall edits: external guidance edits/all edits, idwiki vs all wikis