Persian (Farsi) Wikipedia community started a trial of blocking IP editing on content page since October 20,2021, which is the Wednesday of 42th week in 2021 T291018. This dashboard is to monitor related metrics to see how this impacts the health of the project. T292781. The trial is planned to end in 6 months.
Metrics:
%%capture --no-stderr
%set_env http_proxy=http://webproxy.eqiad.wmnet:8080
%set_env https_proxy=https://webproxy.eqiad.wmnet:8080
%set_env no_proxy=localhost,127.0.0.1
# to fix matplotlib warning on the default path (/nonexistent/.config/matplotlib) is not a writable directory
import os
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/"
import wmfdata
wmfdata.utils.insert_code_toggle()
from wmfdata import hive, mariadb
import pandas as pd
from datetime import datetime, timedelta, date
from wmfdata import spark
import findspark
SPARK_HOME = os.environ.get("SPARK_HOME", "/usr/lib/spark2")
findspark.init(SPARK_HOME)
spark_session = spark.get_session(type="yarn-large")
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
@ticker.FuncFormatter
def million_formatter(x, pos):
return "%d M" % round(x/1000000)
@ticker.FuncFormatter
def thousand_formatter(x, pos):
return "%d K" % round(x/1000)
now = pd.Timestamp.utcnow()
today=now.date()
last_Sunday=(today - timedelta(days=today.weekday()+1)).strftime("%Y-%m-%d")
month_1st_day= datetime.today().replace(day=1).strftime("%Y-%m-%d")
start_date_YYYY_MM_DD = '2022-01-03'
end_date_YYYY_MM_DD=last_Sunday
yr='2022'
pre_yr='2021'
start_date_YYYYMMDD = '20220103'
end_date_YYYYMMDD=datetime.today().replace(day=1).strftime("%Y%m%d")
if datetime.today().day < 7:
snapshot = datetime.today().replace(month=datetime.today().month-2).strftime("%Y-%m")
else:
snapshot = datetime.today().replace(month=datetime.today().month-1).strftime("%Y-%m")
lastyear_start_date_YYYY_MM_DD = '2021-01-01'
lastyear_end_date_YYYY_MM_DD='2021-12-31'
lastyear_start_date_YYYYMMDD = '20210101'
lastyear_end_date_YYYYMMDD='20211231'
last2years_start_date_YYYY_MM_DD = '2020-01-01'
last2years_end_date_YYYY_MM_DD='2020-12-31'
last2years_start_date_YYYYMMDD = '20200101'
last2years_end_date_YYYYMMDD='20201231'
last3years_start_date_YYYY_MM_DD = '2019-01-01'
last3years_end_date_YYYY_MM_DD='2019-12-31'
last3years_start_date_YYYYMMDD = '20190101'
last3years_end_date_YYYYMMDD='20191231'
weekly_user_editors_query='''
SELECT tmp.week_n, COUNT(tmp.user_id) AS user_editors
FROM
(SELECT WEEKOFYEAR(rev_timestamp) AS week_n, performer.user_id AS user_id, max(performer.user_text) AS user_text
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}')
AND performer.user_id IS NOT null
AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252)
AND `database`='fawiki'
GROUP BY WEEKOFYEAR(rev_timestamp), performer.user_id
)AS tmp
GROUP BY tmp.week_n
'''
df_weekly_user_editors=hive.run(weekly_user_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
weekly_user_editors_query='''
SELECT
weekofyear(event_timestamp) AS week_n, COUNT(DISTINCT event_user_id) wiki_user_editors
FROM wmf.mediawiki_history
WHERE
event_entity = 'revision' AND
event_type = 'create' AND
DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
event_user_is_anonymous = false AND page_namespace_is_content
AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp)
'''
df2_weekly_user_editors_pre_1_year=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
#df2_weekly_user_editors_pre_2_years=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df2_weekly_user_editors_pre_3_years=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly User Editors',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Editors', fontsize = 16)
ax.plot(df_weekly_user_editors['week_n'], df_weekly_user_editors['user_editors'], 'o-',label='Year 2022')
ax.plot(df2_weekly_user_editors_pre_1_year['week_n'], df2_weekly_user_editors_pre_1_year['wiki_user_editors'], 'o--',label='Year 2021')
ax.plot(df2_weekly_user_editors_pre_3_years['week_n'], df2_weekly_user_editors_pre_3_years['wiki_user_editors'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,3000)
ax.vlines(42, 0, 3000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
# set_xticks method before set_xticklabels to fix warning. ref: https://stackoverflow.com/questions/63723514/userwarning-fixedformatter-should-only-be-used-together-with-fixedlocator
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
weekly_edits_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,
SUM(IF(page_namespace = 0, 1, 0)) AS content_edits,
SUM(IF(page_namespace = 1, 1, 0)) AS talk_edits,
COUNT(*) AS total_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}')
AND `database`='fawiki'
GROUP BY WEEKOFYEAR(rev_timestamp)
ORDER BY week_n
LIMIT 100000
'''
df_weekly_edits=hive.run(weekly_edits_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
weekly_edits_query_2='''
SELECT
weekofyear(event_timestamp) AS week_n,
SUM(IF(page_namespace = 0, 1, 0)) AS content_edits,
SUM(IF(page_namespace = 1, 1, 0)) AS talk_edits,
COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
event_entity = 'revision' AND
event_type = 'create' AND
DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
snapshot = '{SNAPSHOT}'
AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp)
ORDER BY week_n
LIMIT 100000
'''
df2_weekly_edits_pre_1_year=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df2_weekly_edits_pre_2_years=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df2_weekly_edits_pre_3_years=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Total Edits',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Edits', fontsize = 16)
ax.plot(df_weekly_edits['week_n'], df_weekly_edits['total_edits'], 'o-',label='Year 2022')
ax.plot(df2_weekly_edits_pre_1_year['week_n'], df2_weekly_edits_pre_1_year['edits'], 'o--',label='Year 2021')
ax.plot(df2_weekly_edits_pre_3_years['week_n'], df2_weekly_edits_pre_3_years['edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,120000)
ax.vlines(42, 0, 120000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
#plt.savefig("1_fawiki_weekly_edits.png")
plt.show()
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Fawiki Weekly Edits on Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week', fontsize = 16)
ax[0].set_ylabel('Edits', fontsize = 16)
ax[0].plot(df_weekly_edits['week_n'],
df_weekly_edits['content_edits'], 'o-',label='Year 2022')
ax[0].plot(df2_weekly_edits_pre_1_year['week_n'],
df2_weekly_edits_pre_1_year['content_edits'], 'o--',label='Year 2021')
ax[0].plot(df2_weekly_edits_pre_3_years['week_n'],
df2_weekly_edits_pre_3_years['content_edits'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,120000)
ax[0].vlines(42, 0, 120000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(thousand_formatter)
ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
ax[1].set_title('Fawiki Weekly Edits on Article Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week', fontsize = 16)
ax[1].set_ylabel('Edits', fontsize = 16)
ax[1].plot(df_weekly_edits['week_n'],
df_weekly_edits['talk_edits'], 'o-',label='Year 2022')
ax[1].plot(df2_weekly_edits_pre_1_year['week_n'],
df2_weekly_edits_pre_1_year['talk_edits'], 'o--',label='Year 2021')
ax[1].plot(df2_weekly_edits_pre_3_years['week_n'],
df2_weekly_edits_pre_3_years['talk_edits'], ':',label='Year 2019')
ax[1].set_xlim(1,52)
ax[1].set_ylim(0,5000)
ax[1].vlines(42, 0, 5000, colors='k', linestyles='dashdot', label='Turned off')
ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(thousand_formatter)
ax3 = ax[1].twiny() # instantiate a second axes that shares the same x-axis
ax3.set_xlim(0,12)
ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')
plt.show()
query_weekly_reverts_48hrs='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
SUM(IF(page_namespace = 0, 1, 0)) AS reverted_content_edits,
SUM(IF(page_namespace = 1, 1, 0)) AS reverted_talk_edits,
COUNT(revision_id) AS reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 100000
'''
df_weekly_reverts_48hrs=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_weekly_reverts_48hrs_pre_1_year=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_weekly_reverts_48hrs_pre_2_years=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_weekly_reverts_48hrs_pre_3_years=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Reverts', fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], df_weekly_reverts_48hrs['reverted_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], df_weekly_reverts_48hrs_pre_1_year['reverted_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_3_years['week_n'], df_weekly_reverts_48hrs_pre_3_years['reverted_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,35000)
ax.vlines(42, 0, 35000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing on Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Reverts', fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1],
df_weekly_reverts_48hrs['reverted_content_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'],
df_weekly_reverts_48hrs_pre_1_year['reverted_content_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_3_years['week_n'],
df_weekly_reverts_48hrs_pre_3_years['reverted_content_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,35000)
ax.vlines(42, 0, 35000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing on Article Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Reverts', fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1],
df_weekly_reverts_48hrs['reverted_talk_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'],
df_weekly_reverts_48hrs_pre_1_year['reverted_talk_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_3_years['week_n'],
df_weekly_reverts_48hrs_pre_3_years['reverted_talk_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,600)
ax.vlines(42, 0, 600, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND size(event_user_is_bot_by) <= 0
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1
LEFT JOIN wmf.mediawiki_history AS h2 -- h1 is revert edits, h2 is the edits being reverted
ON h1.revision_id=h2.revision_first_identity_reverting_revision_id
AND h1.wiki_db = h2.wiki_db
AND h1.snapshot= h2.snapshot
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
-- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND h1.event_entity = "revision" and h2.event_entity = "revision"
AND (NOT h1.revision_is_identity_reverted OR h1.revision_seconds_to_identity_revert > 172800)
AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
AND h1.revision_is_identity_revert
AND h1.event_entity = 'revision' and h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
df_weekly_net_non_reverted_48hrs_edits=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_1_year=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_2_years=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_3_years=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits', fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits['week_n'][:-1], df_weekly_net_non_reverted_48hrs_edits['net_non_reverted_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_1_year['net_non_reverted_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_3_years['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_3_years['net_non_reverted_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,60000)
ax.vlines(42, 0, 60000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND page_namespace = 0
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND size(event_user_is_bot_by) <= 0
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1
LEFT JOIN wmf.mediawiki_history AS h2 -- h1 is revert edits, h2 is the edits being reverted
ON h1.revision_id=h2.revision_first_identity_reverting_revision_id
AND h1.wiki_db = h2.wiki_db
AND h1.snapshot= h2.snapshot
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
AND h1.page_namespace = 0 AND h2.page_namespace = 0
-- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND h1.event_entity = "revision" and h2.event_entity = "revision"
AND (NOT h1.revision_is_identity_reverted OR h1.revision_seconds_to_identity_revert > 172800)
AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
AND h1.revision_is_identity_revert
AND h1.event_entity = 'revision' and h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
df_weekly_net_non_reverted_48hrs_edits_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_2_years_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_3_years_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits on Content Pages Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits', fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits_c['week_n'][:-1],
df_weekly_net_non_reverted_48hrs_edits_c['net_non_reverted_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c['week_n'],
df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c['net_non_reverted_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_3_years_c['week_n'],
df_weekly_net_non_reverted_48hrs_edits_pre_3_years_c['net_non_reverted_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,60000)
ax.vlines(42, 0, 60000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND page_namespace = 1
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND size(event_user_is_bot_by) <= 0
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1
LEFT JOIN wmf.mediawiki_history AS h2 -- h1 is revert edits, h2 is the edits being reverted
ON h1.revision_id=h2.revision_first_identity_reverting_revision_id
AND h1.wiki_db = h2.wiki_db
AND h1.snapshot= h2.snapshot
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
AND h1.page_namespace = 1 AND h2.page_namespace = 1
-- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND h1.event_entity = "revision" and h2.event_entity = "revision"
AND (NOT h1.revision_is_identity_reverted OR h1.revision_seconds_to_identity_revert > 172800)
AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
AND h1.revision_is_identity_revert
AND h1.event_entity = 'revision' and h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
df_weekly_net_non_reverted_48hrs_edits_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_2_years_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_weekly_net_non_reverted_48hrs_edits_pre_3_years_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits on Article Talk Pages Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits', fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits_t['week_n'][:-1],
df_weekly_net_non_reverted_48hrs_edits_t['net_non_reverted_edits'][:-1], 'o-',label='Year 2022')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t['week_n'],
df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t['net_non_reverted_edits'], 'o--',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_3_years_t['week_n'],
df_weekly_net_non_reverted_48hrs_edits_pre_3_years_t['net_non_reverted_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,10000)
ax.vlines(42, 0, 10000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
weekly_edits_bot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS bot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}')
AND performer.user_id IS not null AND `database`='fawiki'
AND ( performer.user_is_bot=true OR performer.user_text regexp "^.*bot([^a-z].*$|$)")
GROUP BY WEEKOFYEAR(rev_timestamp)
'''
weekly_edits_nonbot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS nonbot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}')
AND performer.user_id IS not null AND `database`='fawiki'
AND ( performer.user_is_bot=false AND performer.user_text not regexp "^.*bot([^a-z].*$|$)")
GROUP BY WEEKOFYEAR(rev_timestamp)
'''
df_weekly_bot_edits=hive.run(weekly_edits_bot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_weekly_nonbot_edits=hive.run(weekly_edits_nonbot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
weekly_edits_bot_query_2='''
SELECT
weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
event_entity = 'revision' AND
event_type = 'create' AND
DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
event_user_is_anonymous = false AND
(size(event_user_is_bot_by_historical) > 0 OR size(event_user_is_bot_by) > 0
OR event_user_text regexp "^.*bot([^a-z].*$|$)" )
AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp)
'''
df2_weekly_bot_edits_pre_1_year=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df2_weekly_bot_edits_pre_2_years=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df2_weekly_bot_edits_pre_3_years=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
weekly_edits_nonbot_query_2='''
SELECT
weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
event_entity = 'revision' AND
event_type = 'create' AND
DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
event_user_is_anonymous = false AND
(size(event_user_is_bot_by_historical) = 0 AND size(event_user_is_bot_by) = 0
AND event_user_text not regexp "^.*bot([^a-z].*$|$)" )
AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp)
'''
df2_weekly_nonbot_edits_pre_1_year=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df2_weekly_nonbot_edits_pre_2_years=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df2_weekly_nonbot_edits_pre_2_years=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Weekly Edits by Registered Bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week', fontsize = 16)
ax[0].set_ylabel('Edits', fontsize = 16)
ax[0].plot(df_weekly_bot_edits['week_n'], df_weekly_bot_edits['bot_edits'], 'o-',label='Year 2022')
ax[0].plot(df2_weekly_bot_edits_pre_1_year['week_n'], df2_weekly_bot_edits_pre_1_year['edits'], 'o--',label='Year 2021')
ax[0].plot(df2_weekly_bot_edits_pre_3_years['week_n'], df2_weekly_bot_edits_pre_3_years['edits'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,80000)
ax[0].vlines(42, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(thousand_formatter)
ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
ax[1].set_title('Weekly Edits by Registered Non-bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Edits',fontsize = 16)
ax[1].set_xlim(1,52)
ax[1].set_ylim(0,80000)
ax[1].vlines(42, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)
ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')
ax[1].plot(df_weekly_nonbot_edits['week_n'], df_weekly_nonbot_edits['nonbot_edits'], 'o-',label='Year 2021')
ax[1].plot(df2_weekly_nonbot_edits_pre_1_year['week_n'], df2_weekly_nonbot_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax[1].plot(df2_weekly_nonbot_edits_pre_2_years['week_n'], df2_weekly_nonbot_edits_pre_2_years['edits'], ':',label='Year 2019')
ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(thousand_formatter)
plt.show()
query_block_weekly='''
SELECT WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n,
COUNT(1) AS blocks
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND log_type = 'block' AND wiki_db='fawiki'
AND log_action IN ('block','reblock')
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
df_block_weekly=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
df_block_weekly_pre_1_year=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
df_block_weekly_pre_2_years=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
df_block_weekly_pre_3_years=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD))
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Fawiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week', fontsize = 16)
ax[0].set_ylabel('Blocks', fontsize = 16)
ax[0].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2022')
ax[0].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2021')
ax[0].plot(df_block_weekly_pre_3_years['week_n'], df_block_weekly_pre_3_years['blocks'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,60000)
ax[0].vlines(42, 0, 60000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(thousand_formatter)
ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
ax[1].set_title('Fawiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Blocks',fontsize = 16)
ax[1].set_xlim(1,52)
ax[1].set_ylim(0,1600)
ax[1].vlines(42, 0, 1600, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)
ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')
ax[1].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021')
ax[1].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020')
ax[1].legend(loc='upper right')
plt.show()
query_accounts_created='''
SELECT WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss'))) AS week_n ,
COUNT(DISTINCT user_id) AS new_accounts
FROM wmf_raw.mediawiki_user
WHERe snapshot ='{SNAPSHOT}' AND substr(user_registration, 1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
AND wiki_db='fawiki' AND user_registration IS NOT NULL
GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 10000
'''
df_accounts_created=hive.run(query_accounts_created.format(SNAPSHOT=snapshot,START_YYYYMMDD= start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD ))
df_accounts_created_pre_1_year=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD ))
df_accounts_created_pre_2_years=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD ))
df_accounts_created_pre_3_years=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD ))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('fawiki Created Accounts',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('New Accounts', fontsize = 16)
ax.plot(df_accounts_created['week_n'][:-1], df_accounts_created['new_accounts'][:-1], 'o-',label='Year 2022')
ax.plot(df_accounts_created_pre_1_year['week_n'], df_accounts_created_pre_1_year['new_accounts'], 'o--',label='Year 2021')
ax.plot(df_accounts_created_pre_3_years['week_n'], df_accounts_created_pre_3_years['new_accounts'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,5000)
ax.vlines(42, 0, 5000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
Definition: Out of the non-bot users who registered in the week before the previous and made at least one edit in their first 30 days, the proportion who also edited during their second 30 days.
Due to the nature of the definition, this metric surfaces 2 months after user account creation.
query_retention_rate='''
-- only count non-bot user
SELECT
weekofyear(1st_month.user_creation_date) AS user_creation_cohort_week,
ROUND(SUM(CASE WHEN 2nd_month.edits IS NOT NULL THEN 1 ELSE 0 END) /COUNT(1), 6) AS retention_rate
FROM (
SELECT event_user_text AS user_name,
event_user_id AS user_id,
substr(event_user_creation_timestamp,1,10) AS user_creation_date,
count(*) AS edits
FROM wmf.mediawiki_history
WHERE snapshot = "{SNAPSHOT}"
AND wiki_db='fawiki'
AND event_entity = "revision"
AND event_type = "create"
AND NOT event_user_is_created_by_system
AND size(event_user_is_bot_by) = 0
AND event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND "{END_YYYY_MM_DD}"
AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
GROUP BY event_user_text,
event_user_id,
event_user_creation_timestamp
) 1st_month
LEFT JOIN
(
SELECT event_user_text AS user_name,
event_user_id AS user_id,
substr(event_user_creation_timestamp,1,10) AS user_creation_date,
count(*) AS edits
FROM wmf.mediawiki_history
WHERE snapshot = "{SNAPSHOT}"
AND wiki_db='fawiki'
AND event_entity = "revision"
AND event_type = "create"
AND NOT event_user_is_created_by_system
AND size(event_user_is_bot_by) = 0
AND event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND "{END_YYYY_MM_DD}"
AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") >= (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
AND unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
GROUP BY event_user_text,
event_user_id,
event_user_creation_timestamp
) 2nd_month
ON (
1st_month.user_id = 2nd_month.user_id
AND 1st_month.user_creation_date = 2nd_month.user_creation_date)
GROUP BY weekofyear(1st_month.user_creation_date)
'''
df_retention_rate=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_retention_rate_pre_1_year=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_retention_rate_pre_2_years=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_retention_rate_pre_3_years=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=last3years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last3years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Retention Rate',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('User Cohort (Creation Week)', fontsize = 16)
ax.set_ylabel('Retention Rate', fontsize = 16)
ax.plot(df_retention_rate['user_creation_cohort_week'][:-8], df_retention_rate['retention_rate'][:-8], 'o-',label='Year 2022')
ax.plot(df_retention_rate_pre_1_year['user_creation_cohort_week'], df_retention_rate_pre_1_year['retention_rate'], 'o--',label='Year 2021')
ax.plot(df_retention_rate_pre_3_years['user_creation_cohort_week'], df_retention_rate_pre_3_years['retention_rate'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,0.3)
ax.vlines(42, 0, 0.3, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
query_check_user='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp))) AS week_n,
CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END AS tool,
COUNT(cul_id) AS requests
FROM cu_log WHERE substr(cul_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp)))
, CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END
ORDER BY week_n
LIMIT 10000
'''
df_check_user=mariadb.run(query_check_user.format(START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD),'fawiki')
df_check_user_pre_1_year=mariadb.run(query_check_user.format(START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD),'fawiki')
df_check_user_pre_2_years=mariadb.run(query_check_user.format(START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD),'fawiki')
df_check_user_pre_3_years=mariadb.run(query_check_user.format(START_YYYYMMDD=last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD),'fawiki')
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Checkuser Requests',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week', fontsize = 16)
ax[0].set_ylabel('Requests', fontsize = 16)
ax[0].plot(df_check_user[df_check_user['tool']=='checkuser']['week_n'], df_check_user[df_check_user['tool']=='checkuser']['requests'], 'o-',label='Year 2022')
ax[0].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['requests'], 'o--',label='Year 2021')
ax[0].plot(df_check_user_pre_3_years[df_check_user_pre_3_years['tool']=='checkuser']['week_n'], df_check_user_pre_3_years[df_check_user_pre_3_years['tool']=='checkuser']['requests'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,200)
ax[0].vlines(42, 0, 200, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax2 = ax[0].twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
ax[1].set_title('Special Investigate Requests',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Requests',fontsize = 16)
ax[1].set_xlim(1,52)
ax[1].set_ylim(0,100)
ax[1].vlines(42, 0, 100, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)
ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')
ax[1].plot(df_check_user[df_check_user['tool']=='investigate']['week_n'], df_check_user[df_check_user['tool']=='investigate']['requests'], 'o-',label='Year 2021')
ax[1].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['requests'], 'o--',label='Year 2020')
ax[1].plot(df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['week_n'], df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['requests'], ':',label='Year 2019')
ax[1].legend(loc='upper right')
plt.show()
protected_pages_query='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n ,
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki'
AND log_action = 'protect'
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
df_protected_pages=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
df_protected_pages_pre_1_year=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
df_protected_pages_pre_2_years=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
df_protected_pages_pre_3_years=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Protected Pages', fontsize = 16)
ax.plot(df_protected_pages['week_n'][:-1], df_protected_pages['protected_pages'][:-1], 'o-',label='Year 2022')
ax.plot(df_protected_pages_pre_1_year['week_n'], df_protected_pages_pre_1_year['protected_pages'], 'o--',label='Year 2021')
ax.plot(df_protected_pages_pre_3_years['week_n'], df_protected_pages_pre_3_years['protected_pages'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,4000)
ax.vlines(42, 0, 4000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Pages -- Zoom In',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Protected Pages', fontsize = 16)
ax.plot(df_protected_pages['week_n'][:-1], df_protected_pages['protected_pages'][:-1], 'o-',label='Year 2021')
ax.plot(df_protected_pages_pre_1_year['week_n'], df_protected_pages_pre_1_year['protected_pages'], 'o--',label='Year 2020')
ax.plot(df_protected_pages_pre_2_years['week_n'], df_protected_pages_pre_2_years['protected_pages'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,300)
ax.vlines(42, 0, 300, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
protected_pages_query_content='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n ,
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki'
AND log_action = 'protect' AND log_namespace=0
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
df_protected_pages_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
df_protected_pages_pre_1_year_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
df_protected_pages_pre_2_years_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
df_protected_pages_pre_3_years_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Protected Pages', fontsize = 16)
ax.plot(df_protected_pages_c['week_n'][:-1], df_protected_pages_c['protected_pages'][:-1], 'o-',label='Year 2022')
ax.plot(df_protected_pages_pre_1_year_c['week_n'], df_protected_pages_pre_1_year_c['protected_pages'], 'o--',label='Year 2021')
ax.plot(df_protected_pages_pre_3_years_c['week_n'], df_protected_pages_pre_3_years_c['protected_pages'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,4000)
ax.vlines(42, 0, 4000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
protected_pages_query_talk='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n ,
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki'
AND log_action = 'protect' AND log_namespace=1
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
df_protected_pages_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
df_protected_pages_pre_1_year_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
df_protected_pages_pre_2_years_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
df_protected_pages_pre_3_years_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=last3years_start_date_YYYYMMDD, END_YYYYMMDD=last3years_end_date_YYYYMMDD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Protected Pages', fontsize = 16)
ax.plot(df_protected_pages_t['week_n'][:-1], df_protected_pages_t['protected_pages'][:-1], 'o-',label='Year 2022')
ax.plot(df_protected_pages_pre_1_year_t['week_n'], df_protected_pages_pre_1_year_t['protected_pages'], 'o--',label='Year 2021')
ax.plot(df_protected_pages_pre_3_years_t['week_n'], df_protected_pages_pre_3_years_t['protected_pages'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,20)
ax.vlines(42, 0, 20, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
Note
The bump in October 2021 is mainly from one user who protected 5867 pages in a month. It happened on content pages (namespace 0) in 40th week and on pages of namespace 100 in 41th week.
We don't have a perfect model to measure edits quality. The ORES model is known for being biased against IP editors. [^1] If including IP editors in baseline measurement. It's not a surprise that the damages/edits rate for all edits dropped since IP editing was turned off. Therefore, we also measure the edits quality of registered editors by excluding IP editors.
Meanwhile, ORES model is tested to be friendly to bot editors, not marking bot edits as damages usually. However, the number of bot edits could fluctuate dramatically month by month due to the nature of the bot function. To reduce the fluctuation in monthly trends, we exclude bot editors in analysis of registered editors.
Following analysis will focus on edits quality by all editors and by registered non-bot editors only.
query_edits_quality_damaging_model='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,
ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate
FROM event_sanitized.mediawiki_revision_score
WHERE year='{YEAR}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}')
AND `database`='fawiki'
GROUP BY WEEKOFYEAR(rev_timestamp)
ORDER BY week_n
LIMIT 10000
'''
df_edits_quality_damaging_rate=hive.run(query_edits_quality_damaging_model.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_edits_quality_damaging_rate_pre_1_year=hive.run(query_edits_quality_damaging_model.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_edits_quality_damaging_rate_pre_2_year=hive.run(query_edits_quality_damaging_model.format(YEAR=2020, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD , END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki ORES Damaging Rate (Including IP Editors)',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Damages/Edits Rate', fontsize = 16)
ax.plot(df_edits_quality_damaging_rate['week_n'], df_edits_quality_damaging_rate['damaging_rate'], 'o-',label='Year 2022')
ax.plot(df_edits_quality_damaging_rate_pre_1_year['week_n'], df_edits_quality_damaging_rate_pre_1_year['damaging_rate'], 'o--',label='Year 2021')
ax.plot(df_edits_quality_damaging_rate_pre_2_year['week_n'], df_edits_quality_damaging_rate_pre_2_year['damaging_rate'], ':',label='Year 2020')
ax.set_xlim(1,52)
ax.set_ylim(0,0.6)
ax.legend(loc='upper right')
ax.vlines(42, 0, 1, colors='k', linestyles='dashdot', label='Turned off')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
query_edits_quality_damaging_model_nonbot_registered_user='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,
ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate
FROM event_sanitized.mediawiki_revision_score
WHERE year='{YEAR}' AND (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}')
AND `database`='fawiki'
-- exclude IP editors in baseline as ORES is biased against anonymous editors
AND performer.user_id IS NOT NULL
-- exclude bot editors
AND not performer.user_is_bot
AND performer.user_text not regexp "^.*bot([^a-z].*$|$)"
GROUP BY WEEKOFYEAR(rev_timestamp)
ORDER BY week_n
LIMIT 10000
'''
df=spark_session.sql(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD))
df_edits_quality_damaging_rate_nonbot_registered_user=df.toPandas()
df=spark_session.sql(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year=df.toPandas()
df=spark_session.sql(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=2020, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD , END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
df_edits_quality_damaging_rate_nonbot_registered_user_pre_2_year=df.toPandas()
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki ORES Damaging Rate (Registered Non-bot Editors)',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week', fontsize = 16)
ax.set_ylabel('Damages/Edits Rate', fontsize = 16)
ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user['damaging_rate'], 'o-',label='Year 2022')
ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['damaging_rate'], 'o--',label='Year 2021')
ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user_pre_2_year['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user_pre_2_year['damaging_rate'], ':',label='Year 2020')
ax.set_xlim(1,52)
ax.set_ylim(0,0.6)
ax.vlines(42, 0, 1, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny() # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
miloc = plt.MultipleLocator(1)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
print("This dashboard was generated on {} at {}. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback please email jwang@wikimedia.org or product-analytics@wikimedia.org.".format(today.strftime("%Y-%m-%d"), now.strftime("%H:%M %Z")))