Persian (Farsi) Wikipedia community started a trial of blocking IP editing on content page since October 20,2021, which is the Wednesday of 42th week in 2021 T291018. This dashboard is to monitor related metrics to see how this impacts the health of the project. T292781. The trial is planned to end in 6 months.

Metrics:

In [46]:
%%capture --no-stderr
%set_env http_proxy=http://webproxy.eqiad.wmnet:8080
%set_env https_proxy=https://webproxy.eqiad.wmnet:8080
%set_env no_proxy=localhost,127.0.0.1
In [47]:
# to fix matplotlib warning on the default path (/nonexistent/.config/matplotlib) is not a writable directory
import os 
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/"
In [48]:
import wmfdata
wmfdata.utils.insert_code_toggle()
In [49]:
from wmfdata import hive, mariadb
import pandas as pd
from datetime import datetime, timedelta, date
In [ ]:
from wmfdata import spark
import findspark
SPARK_HOME = os.environ.get("SPARK_HOME", "/usr/lib/spark2")
findspark.init(SPARK_HOME)
spark_session = spark.get_session(type="yarn-large")
In [50]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
In [51]:
@ticker.FuncFormatter
def million_formatter(x, pos):
    return "%d M" % round(x/1000000)
@ticker.FuncFormatter
def thousand_formatter(x, pos):
    return "%d K" % round(x/1000)
In [52]:
now = pd.Timestamp.utcnow()
today=now.date()
In [53]:
last_Sunday=(today - timedelta(days=today.weekday()+1)).strftime("%Y-%m-%d")
month_1st_day= datetime.today().replace(day=1).strftime("%Y-%m-%d")
In [54]:
start_date_YYYY_MM_DD = '2021-01-04'
end_date_YYYY_MM_DD=last_Sunday
yr='2021'
In [55]:
pre_yr='2020'
In [56]:
start_date_YYYYMMDD = '20210104'
#end_date_YYYYMMDD=datetime.today().replace(day=1).strftime("%Y%m%d")
In [57]:
end_date_YYYYMMDD=(today - timedelta(days=today.weekday()+1)).strftime("%Y%m%d")
In [58]:
snapshot='2021-12'
In [12]:
if datetime.today().day < 7:
    snapshot = datetime.today().replace(month=datetime.today().month-2).strftime("%Y-%m")
else:
    snapshot = datetime.today().replace(month=datetime.today().month-1).strftime("%Y-%m")
In [111]:
lastyear_start_date_YYYY_MM_DD = '2020-01-01'
lastyear_end_date_YYYY_MM_DD='2020-12-31'
lastyear_start_date_YYYYMMDD = '20200101'
lastyear_end_date_YYYYMMDD='20201231'
last2years_start_date_YYYY_MM_DD = '2019-01-01'
last2years_end_date_YYYY_MM_DD='2019-12-31'
last2years_start_date_YYYYMMDD = '20190101'
last2years_end_date_YYYYMMDD='20191231'

Number of active editors (registered users)

In [112]:
weekly_user_editors_query='''
SELECT tmp.week_n, COUNT(tmp.user_id) AS user_editors
FROM
(SELECT  WEEKOFYEAR(rev_timestamp) AS week_n,  performer.user_id AS user_id, max(performer.user_text) AS user_text
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS NOT null 
AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252) 
AND `database`='fawiki'
GROUP BY   WEEKOFYEAR(rev_timestamp), performer.user_id 
)AS tmp
GROUP BY tmp.week_n
'''
In [113]:
df_weekly_user_editors=hive.run(weekly_user_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [259]:
df_weekly_user_editors
Out[259]:
week_n user_editors
0 1 1826
1 2 1972
2 3 1977
3 4 2022
4 5 2003
5 6 1925
6 7 1889
7 8 1989
8 9 2022
9 10 1916
10 11 1659
11 12 1811
12 13 1873
13 14 2071
14 15 1992
15 16 2127
16 17 2004
17 18 2052
18 19 2061
19 20 1816
20 21 1872
21 22 1801
22 23 1751
23 24 1698
24 25 1748
25 26 1709
26 27 1696
27 28 1740
28 29 1718
29 30 1665
30 31 1821
31 32 1760
32 33 1888
33 34 1826
34 35 1794
35 36 1746
36 37 1773
37 38 1682
38 39 1714
39 40 1811
40 41 1735
41 42 1685
42 43 1709
43 44 1684
44 45 1748
45 46 1791
46 47 1764
47 48 1852
48 49 1768
49 50 1799
50 51 1724
51 52 1405
In [114]:
weekly_user_editors_query='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(DISTINCT event_user_id) wiki_user_editors
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false  AND page_namespace_is_content
        AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [115]:
df2_weekly_user_editors_pre_1_year=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [116]:
df2_weekly_user_editors_pre_2_years=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [117]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly User Editors',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Editors',  fontsize = 16)

ax.plot(df_weekly_user_editors['week_n'], df_weekly_user_editors['user_editors'], 'o-',label='Year 2021')
ax.plot(df2_weekly_user_editors_pre_1_year['week_n'], df2_weekly_user_editors_pre_1_year['wiki_user_editors'], 'o--',label='Year 2020')
ax.plot(df2_weekly_user_editors_pre_2_years['week_n'], df2_weekly_user_editors_pre_2_years['wiki_user_editors'], ':',label='Year 2019')


ax.set_xlim(1,52)
ax.set_ylim(0,3000)
ax.vlines(42, 0, 3000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

# set_xticks method before set_xticklabels to fix warning. ref: https://stackoverflow.com/questions/63723514/userwarning-fixedformatter-should-only-be-used-together-with-fixedlocator
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
                    

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)

ax2.grid(axis='x', which='minor')

plt.show()

Number of edits

In [209]:
weekly_edits_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, 
        SUM(IF(page_namespace = 0, 1, 0)) AS content_edits,
        SUM(IF(page_namespace = 1, 1, 0)) AS talk_edits,
        COUNT(*) AS total_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND `database`='fawiki'
GROUP BY WEEKOFYEAR(rev_timestamp) 
ORDER BY week_n
LIMIT 100000
'''
In [210]:
df_weekly_edits=hive.run(weekly_edits_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [211]:
df_weekly_edits
Out[211]:
week_n content_edits talk_edits total_edits
0 1 38440 856 56200
1 2 38551 775 58167
2 3 42045 877 61617
3 4 44495 1067 64900
4 5 42443 1181 64879
5 6 40924 926 60560
6 7 44746 871 66404
7 8 47320 950 73780
8 9 46201 751 67734
9 10 45904 830 64090
10 11 39959 931 61826
11 12 46095 3260 75478
12 13 58146 906 80275
13 14 49221 849 68152
14 15 46646 1553 63510
15 16 46047 876 68297
16 17 44581 880 64435
17 18 53908 928 75591
18 19 47314 778 66402
19 20 42345 1171 57566
20 21 40604 1058 55735
21 22 45185 1059 61309
22 23 45083 957 67990
23 24 41277 810 60892
24 25 43859 782 59609
25 26 41557 1451 57184
26 27 44716 921 60758
27 28 38192 898 52607
28 29 31407 756 45790
29 30 39933 903 53998
30 31 34352 744 49104
31 32 32538 1055 45618
32 33 44541 1458 59886
33 34 56820 1340 72052
34 35 45903 903 66352
35 36 46963 1281 61935
36 37 41190 1043 55567
37 38 49650 1022 65104
38 39 60595 975 74118
39 40 50824 875 61601
40 41 41769 815 55753
41 42 34978 742 44240
42 43 27422 891 39183
43 44 33249 731 45147
44 45 25668 706 37382
45 46 30702 873 42163
46 47 26830 859 36390
47 48 27575 930 41168
48 49 26270 802 40505
49 50 22451 1017 34390
50 51 33378 1017 44452
51 52 18036 609 27618
In [212]:
weekly_edits_query_2='''
SELECT
        weekofyear(event_timestamp) AS week_n,  
        SUM(IF(page_namespace = 0, 1, 0)) AS content_edits,
        SUM(IF(page_namespace = 1, 1, 0)) AS talk_edits,
        COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        snapshot = '{SNAPSHOT}'
        AND wiki_db='fawiki' 
GROUP BY weekofyear(event_timestamp) 
ORDER BY week_n
LIMIT 100000
'''
In [213]:
df2_weekly_edits_pre_1_year=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [214]:
df2_weekly_edits_pre_2_years=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [217]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Total Edits',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Edits',  fontsize = 16)
ax.plot(df_weekly_edits['week_n'], df_weekly_edits['total_edits'], 'o-',label='Year 2021')
ax.plot(df2_weekly_edits_pre_1_year['week_n'], df2_weekly_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax.plot(df2_weekly_edits_pre_2_years['week_n'], df2_weekly_edits_pre_2_years['edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,120000)
ax.vlines(42, 0, 120000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
#plt.savefig("1_fawiki_weekly_edits.png")

plt.show()
In [237]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Fawiki Weekly Edits on Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Edits',  fontsize = 16)


ax[0].plot(df_weekly_edits['week_n'], 
           df_weekly_edits['content_edits'], 'o-',label='Year 2021')
ax[0].plot(df2_weekly_edits_pre_1_year['week_n'], 
           df2_weekly_edits_pre_1_year['content_edits'], 'o--',label='Year 2020')
ax[0].plot(df2_weekly_edits_pre_2_years['week_n'], 
           df2_weekly_edits_pre_2_years['content_edits'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,120000)
ax[0].vlines(42, 0, 120000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')

ax[0].yaxis.set_major_formatter(thousand_formatter)

ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Fawiki Weekly Edits on Article Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',  fontsize = 16)
ax[1].set_ylabel('Edits',  fontsize = 16)
ax[1].plot(df_weekly_edits['week_n'], 
           df_weekly_edits['talk_edits'], 'o-',label='Year 2021')
ax[1].plot(df2_weekly_edits_pre_1_year['week_n'], 
           df2_weekly_edits_pre_1_year['talk_edits'], 'o--',label='Year 2020')
ax[1].plot(df2_weekly_edits_pre_2_years['week_n'], 
           df2_weekly_edits_pre_2_years['talk_edits'], ':',label='Year 2019')
ax[1].set_xlim(1,52)
ax[1].set_ylim(0,5000)
ax[1].vlines(42, 0, 5000, colors='k', linestyles='dashdot', label='Turned off')
ax[1].legend(loc='upper right')

ax[1].yaxis.set_major_formatter(thousand_formatter)

ax3 = ax[1].twiny()  # instantiate a second axes that shares the same x-axis
ax3.set_xlim(0,12)
ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax3.tick_params(length=0)
ax3.set_xlabel('Month',fontsize=16)


ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

plt.show()

Number of reverts

Definition: number of edits reverted within 48 hours

In [226]:
query_weekly_reverts_48hrs='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
        SUM(IF(page_namespace = 0, 1, 0)) AS reverted_content_edits,
        SUM(IF(page_namespace = 1, 1, 0)) AS reverted_talk_edits,
        COUNT(revision_id) AS reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 100000
'''
In [227]:
df_weekly_reverts_48hrs=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [260]:
df_weekly_reverts_48hrs
Out[260]:
week_n reverted_content_edits reverted_talk_edits reverted_edits
0 1 4937 23 7425
1 2 5286 20 7562
2 3 4842 17 7060
3 4 4733 51 7586
4 5 6596 90 9234
5 6 4746 32 7231
6 7 5377 39 7976
7 8 5790 45 8324
8 9 5613 28 7965
9 10 5460 15 7941
10 11 4459 21 6869
11 12 4907 18 7432
12 13 4803 62 7099
13 14 6023 56 8435
14 15 5485 23 7567
15 16 6844 50 10117
16 17 5840 26 8029
17 18 7102 25 9240
18 19 5607 37 7828
19 20 4756 16 7384
20 21 5419 24 7737
21 22 4734 13 7105
22 23 4336 21 6863
23 24 4176 14 6412
24 25 3731 17 5799
25 26 3663 30 5989
26 27 4091 12 6221
27 28 4087 32 6118
28 29 3762 22 5928
29 30 4196 16 6442
30 31 4409 23 6533
31 32 4237 16 6236
32 33 5648 17 7837
33 34 5713 18 7757
34 35 5077 9 7831
35 36 4593 35 7265
36 37 4346 54 6870
37 38 4255 14 6513
38 39 9738 24 11949
39 40 4977 32 6389
40 41 4735 19 5555
41 42 3085 55 3601
42 43 1254 58 2116
43 44 1403 20 2118
44 45 1617 29 2497
45 46 1423 18 2159
46 47 1503 12 1945
47 48 1606 23 2416
48 49 1559 26 3031
49 50 1427 11 2379
50 51 1516 24 2537
51 52 1288 13 1933
In [228]:
df_weekly_reverts_48hrs_pre_1_year=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [229]:
df_weekly_reverts_48hrs_pre_2_years=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [230]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Reverts',  fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], df_weekly_reverts_48hrs['reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], df_weekly_reverts_48hrs_pre_1_year['reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_reverts_48hrs_pre_2_years['week_n'], df_weekly_reverts_48hrs_pre_2_years['reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,35000)
ax.vlines(42, 0, 35000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [231]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing on Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Reverts',  fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], 
        df_weekly_reverts_48hrs['reverted_content_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], 
        df_weekly_reverts_48hrs_pre_1_year['reverted_content_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_reverts_48hrs_pre_2_years['week_n'], 
        df_weekly_reverts_48hrs_pre_2_years['reverted_content_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,35000)
ax.vlines(42, 0, 35000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [236]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Reverts Within 48 Hours of Editing on Article Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Reverts',  fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], 
        df_weekly_reverts_48hrs['reverted_talk_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], 
        df_weekly_reverts_48hrs_pre_1_year['reverted_talk_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_reverts_48hrs_pre_2_years['week_n'], 
        df_weekly_reverts_48hrs_pre_2_years['reverted_talk_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,600)
ax.vlines(42, 0, 600, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')


ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Number of net non-reverted edits

Definition: number of edits which were not reverted within 48 hours (excluding bots and revert edits)

In [238]:
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND  size(event_user_is_bot_by) <= 0 
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted  OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT  weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1 
LEFT JOIN wmf.mediawiki_history AS h2  -- h1 is revert edits, h2 is the edits being reverted
     ON h1.revision_id=h2.revision_first_identity_reverting_revision_id  
      AND h1.wiki_db = h2.wiki_db
      AND h1.snapshot= h2.snapshot 
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
    -- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
    AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
    AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
    AND h1.event_entity = "revision" and h2.event_entity = "revision" 
    AND (NOT h1.revision_is_identity_reverted  OR h1.revision_seconds_to_identity_revert > 172800)
    AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
    AND h1.revision_is_identity_revert
    AND h1.event_entity = 'revision' and  h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM  t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
In [239]:
df_weekly_net_non_reverted_48hrs_edits=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [261]:
df_weekly_net_non_reverted_48hrs_edits
Out[261]:
week_n net_non_reverted_edits
0 1 32872
1 2 34399
2 3 36402
3 4 37841
4 5 37944
5 6 36816
6 7 35401
7 8 38249
8 9 38626
9 10 35662
10 11 34164
11 12 37792
12 13 36786
13 14 36650
14 15 38701
15 16 36933
16 17 38361
17 18 41055
18 19 36324
19 20 32822
20 21 32841
21 22 34681
22 23 35313
23 24 34789
24 25 35619
25 26 31865
26 27 35559
27 28 33339
28 29 31341
29 30 33212
30 31 33500
31 32 29774
32 33 34310
33 34 36131
34 35 37798
35 36 39038
36 37 35861
37 38 42988
38 39 34402
39 40 36481
40 41 35796
41 42 28319
42 43 27569
43 44 29913
44 45 28031
45 46 27484
46 47 24437
47 48 28554
48 49 29085
49 50 26613
50 51 25559
51 52 23723
In [240]:
df_weekly_net_non_reverted_48hrs_edits_pre_1_year=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [241]:
df_weekly_net_non_reverted_48hrs_edits_pre_2_years=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [249]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits',  fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits['week_n'][:-1], df_weekly_net_non_reverted_48hrs_edits['net_non_reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_1_year['net_non_reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_2_years['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_2_years['net_non_reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,60000)
ax.vlines(42, 0, 60000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [243]:
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND   page_namespace = 0 
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND  size(event_user_is_bot_by) <= 0 
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted  OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT  weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1 
LEFT JOIN wmf.mediawiki_history AS h2  -- h1 is revert edits, h2 is the edits being reverted
     ON h1.revision_id=h2.revision_first_identity_reverting_revision_id  
      AND h1.wiki_db = h2.wiki_db
      AND h1.snapshot= h2.snapshot 
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
    AND h1.page_namespace = 0 AND h2.page_namespace = 0
    -- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
    AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
    AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
    AND h1.event_entity = "revision" and h2.event_entity = "revision" 
    AND (NOT h1.revision_is_identity_reverted  OR h1.revision_seconds_to_identity_revert > 172800)
    AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
    AND h1.revision_is_identity_revert
    AND h1.event_entity = 'revision' and  h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM  t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
In [244]:
df_weekly_net_non_reverted_48hrs_edits_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))

df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))

df_weekly_net_non_reverted_48hrs_edits_pre_2_years_c=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_content.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [262]:
df_weekly_net_non_reverted_48hrs_edits_c
Out[262]:
week_n net_non_reverted_edits
0 1 21881
1 2 22597
2 3 23919
3 4 25772
4 5 23786
5 6 24263
6 7 22613
7 8 23085
8 9 23836
9 10 24263
10 11 23032
11 12 27171
12 13 25694
13 14 24015
14 15 26786
15 16 25048
16 17 26706
17 18 29834
18 19 25221
19 20 23235
20 21 22825
21 22 24456
22 23 24554
23 24 24970
24 25 26172
25 26 22082
26 27 25266
27 28 23443
28 29 21474
29 30 23480
30 31 23167
31 32 20810
32 33 24412
33 34 27078
34 35 27445
35 36 28901
36 37 26271
37 38 32135
38 39 25330
39 40 28900
40 41 23975
41 42 20884
42 43 18287
43 44 20289
44 45 18504
45 46 18731
46 47 16599
47 48 17282
48 49 18324
49 50 16853
50 51 16554
51 52 14793
In [250]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits on Content Pages Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits',  fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits_c['week_n'][:-1], 
        df_weekly_net_non_reverted_48hrs_edits_c['net_non_reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c['week_n'], 
        df_weekly_net_non_reverted_48hrs_edits_pre_1_year_c['net_non_reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_2_years_c['week_n'], 
        df_weekly_net_non_reverted_48hrs_edits_pre_2_years_c['net_non_reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,60000)
ax.vlines(42, 0, 60000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [246]:
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='fawiki'
AND   page_namespace = 1 
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND  size(event_user_is_bot_by) <= 0 
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted  OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT  weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1 
LEFT JOIN wmf.mediawiki_history AS h2  -- h1 is revert edits, h2 is the edits being reverted
     ON h1.revision_id=h2.revision_first_identity_reverting_revision_id  
      AND h1.wiki_db = h2.wiki_db
      AND h1.snapshot= h2.snapshot 
WHERE h1.wiki_db = 'fawiki' AND h1.snapshot='{SNAPSHOT}'
    AND h1.page_namespace = 1 AND h2.page_namespace = 1
    -- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
    AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
    AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
    AND h1.event_entity = "revision" and h2.event_entity = "revision" 
    AND (NOT h1.revision_is_identity_reverted  OR h1.revision_seconds_to_identity_revert > 172800)
    AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
    AND h1.revision_is_identity_revert
    AND h1.event_entity = 'revision' and  h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM  t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
In [247]:
df_weekly_net_non_reverted_48hrs_edits_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))

df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))

df_weekly_net_non_reverted_48hrs_edits_pre_2_years_t=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits_talk.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [263]:
df_weekly_net_non_reverted_48hrs_edits_t
Out[263]:
week_n net_non_reverted_edits
0 1 943
1 2 1040
2 3 1076
3 4 1286
4 5 1286
5 6 1139
6 7 1025
7 8 1074
8 9 799
9 10 1098
10 11 1165
11 12 1009
12 13 898
13 14 791
14 15 1223
15 16 882
16 17 971
17 18 1074
18 19 1005
19 20 983
20 21 819
21 22 931
22 23 776
23 24 713
24 25 635
25 26 679
26 27 681
27 28 690
28 29 590
29 30 733
30 31 610
31 32 737
32 33 1069
33 34 1083
34 35 705
35 36 942
36 37 767
37 38 719
38 39 751
39 40 631
40 41 682
41 42 563
42 43 731
43 44 643
44 45 611
45 46 786
46 47 776
47 48 852
48 49 685
49 50 924
50 51 931
51 52 720
In [253]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Weekly Edits on Article Talk Pages Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits',  fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits_t['week_n'][:-1], 
        df_weekly_net_non_reverted_48hrs_edits_t['net_non_reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t['week_n'], 
        df_weekly_net_non_reverted_48hrs_edits_pre_1_year_t['net_non_reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_2_years_t['week_n'], 
        df_weekly_net_non_reverted_48hrs_edits_pre_2_years_t['net_non_reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,10000)
ax.vlines(42, 0, 10000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Edits by bot and non-bot registered editors

In [139]:
weekly_edits_bot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS bot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null AND `database`='fawiki'
AND  ( performer.user_is_bot=true OR performer.user_text regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [140]:
weekly_edits_nonbot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,  COUNT(*) AS nonbot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null AND `database`='fawiki'
AND  ( performer.user_is_bot=false AND performer.user_text not regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [141]:
df_weekly_bot_edits=hive.run(weekly_edits_bot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [142]:
df_weekly_nonbot_edits=hive.run(weekly_edits_nonbot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [143]:
weekly_edits_bot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND
        (size(event_user_is_bot_by_historical) > 0 OR size(event_user_is_bot_by) > 0  
        OR event_user_text  regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [144]:
df2_weekly_bot_edits_pre_1_year=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [145]:
df2_weekly_bot_edits_pre_2_years=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [146]:
weekly_edits_nonbot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND 
        (size(event_user_is_bot_by_historical) = 0 AND size(event_user_is_bot_by) = 0  
        AND event_user_text not regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}' AND wiki_db='fawiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [147]:
df2_weekly_nonbot_edits_pre_1_year=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [148]:
df2_weekly_nonbot_edits_pre_2_years=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [149]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Weekly Edits by Registered Bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Edits',  fontsize = 16)
ax[0].plot(df_weekly_bot_edits['week_n'], df_weekly_bot_edits['bot_edits'], 'o-',label='Year 2021')
ax[0].plot(df2_weekly_bot_edits_pre_1_year['week_n'], df2_weekly_bot_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax[0].plot(df2_weekly_bot_edits_pre_2_years['week_n'], df2_weekly_bot_edits_pre_2_years['edits'], ':',label='Year 2019')


ax[0].set_xlim(1,52)
ax[0].set_ylim(0,80000)
ax[0].vlines(42, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(thousand_formatter)
ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Weekly Edits by Registered Non-bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Edits',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,80000)
ax[1].vlines(42, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_weekly_nonbot_edits['week_n'], df_weekly_nonbot_edits['nonbot_edits'], 'o-',label='Year 2021')
ax[1].plot(df2_weekly_nonbot_edits_pre_1_year['week_n'], df2_weekly_nonbot_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax[1].plot(df2_weekly_nonbot_edits_pre_2_years['week_n'], df2_weekly_nonbot_edits_pre_2_years['edits'], ':',label='Year 2019')

ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(thousand_formatter)

plt.show()

Number of blocks

In [150]:
query_block_weekly='''
SELECT WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n, 
COUNT(1) AS blocks
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND log_type = 'block'  AND wiki_db='fawiki'
AND  log_action IN ('block','reblock')
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
In [151]:
df_block_weekly=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
In [264]:
df_block_weekly
Out[264]:
week_n blocks
0 1 368
1 2 384
2 3 376
3 4 415
4 5 455
5 6 504
6 7 532
7 8 559
8 9 570
9 10 426
10 11 307
11 12 447
12 13 329
13 14 474
14 15 432
15 16 670
16 17 604
17 18 492
18 19 604
19 20 411
20 21 469
21 22 482
22 23 527
23 24 466
24 25 387
25 26 390
26 27 407
27 28 397
28 29 388
29 30 332
30 31 514
31 32 464
32 33 533
33 34 616
34 35 544
35 36 360
36 37 348
37 38 493
38 39 713
39 40 744
40 41 543
41 42 365
42 43 335
43 44 353
44 45 144
45 46 114
46 47 363
47 48 309
48 49 268
49 50 228
50 51 148
51 52 285
In [152]:
df_block_weekly_pre_1_year=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
In [153]:
df_block_weekly_pre_2_years=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
In [154]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Fawiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Blocks',  fontsize = 16)
ax[0].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021')
ax[0].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020')
ax[0].plot(df_block_weekly_pre_2_years['week_n'], df_block_weekly_pre_2_years['blocks'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,12000)
ax[0].vlines(42, 0, 120000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')

ax[0].yaxis.set_major_formatter(thousand_formatter)

ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')



ax[1].set_title('Fawiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Blocks',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,1200)
ax[1].vlines(42, 0, 1200, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax3.tick_params(length=0)
ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021')
ax[1].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020')
ax[1].legend(loc='upper right')


plt.show()

Number of accounts created

In [155]:
query_accounts_created='''
SELECT WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss'))) AS week_n ,
COUNT(DISTINCT user_id) AS new_accounts
FROM wmf_raw.mediawiki_user
WHERe snapshot ='{SNAPSHOT}' AND substr(user_registration, 1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
 AND wiki_db='fawiki' AND user_registration IS NOT NULL
 GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(user_registration,'yyyyMMddHHmmss'))) 
 ORDER BY week_n 
 LIMIT 10000
'''
In [156]:
df_accounts_created=hive.run(query_accounts_created.format(SNAPSHOT=snapshot,START_YYYYMMDD= start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD ))
In [265]:
df_accounts_created
Out[265]:
week_n new_accounts
0 1 2043
1 2 2213
2 3 2153
3 4 2036
4 5 2109
5 6 2061
6 7 2054
7 8 2096
8 9 2194
9 10 2000
10 11 1644
11 12 1834
12 13 1963
13 14 2106
14 15 1968
15 16 2302
16 17 2119
17 18 2002
18 19 2032
19 20 1791
20 21 1796
21 22 1778
22 23 1702
23 24 1635
24 25 1575
25 26 1539
26 27 1599
27 28 1641
28 29 1524
29 30 1570
30 31 1668
31 32 1723
32 33 2015
33 34 1868
34 35 1751
35 36 1671
36 37 1672
37 38 1680
38 39 1705
39 40 1907
40 41 1778
41 42 1743
42 43 1864
43 44 1930
44 45 1683
45 46 1773
46 47 1798
47 48 1788
48 49 1785
49 50 1744
50 51 1687
51 52 1507
In [157]:
df_accounts_created_pre_1_year=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD ))
In [158]:
df_accounts_created_pre_2_years=hive.run(query_accounts_created.format(SNAPSHOT=snapshot, START_YYYYMMDD= last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD ))
In [159]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('fawiki Created Accounts',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('New Accounts',  fontsize = 16)
ax.plot(df_accounts_created['week_n'][:-1], df_accounts_created['new_accounts'][:-1], 'o-',label='Year 2021')
ax.plot(df_accounts_created_pre_1_year['week_n'], df_accounts_created_pre_1_year['new_accounts'], 'o--',label='Year 2020')
ax.plot(df_accounts_created_pre_2_years['week_n'], df_accounts_created_pre_2_years['new_accounts'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,5000)
ax.vlines(42, 0, 5000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Retention rate

Definition: Out of the non-bot users who registered in the week before the previous and made at least one edit in their first 30 days, the proportion who also edited during their second 30 days.
Due to the nature of the definition, this metric surfaces 2 months after user account creation.

In [284]:
query_retention_rate='''
-- only count non-bot user 
SELECT 
          weekofyear(1st_month.user_creation_date) AS user_creation_cohort_week, 
          ROUND(SUM(CASE WHEN 2nd_month.edits IS NOT NULL THEN 1 ELSE 0 END) /COUNT(1), 6)          AS retention_rate
FROM      ( 
                   SELECT   event_user_text                           AS user_name, 
                            event_user_id                             AS user_id, 
                            substr(event_user_creation_timestamp,1,10) AS user_creation_date, 
                            count(*)                                  AS edits 
                   FROM     wmf.mediawiki_history 
                   WHERE    snapshot = "{SNAPSHOT}" 
                   AND      wiki_db='fawiki' 
                   AND      event_entity = "revision" 
                   AND      event_type = "create" 
                   AND      NOT event_user_is_created_by_system 
                   AND      size(event_user_is_bot_by) = 0 
                   AND      event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND      "{END_YYYY_MM_DD}" 
                   AND      unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) 
                   GROUP BY event_user_text, 
                            event_user_id, 
                            event_user_creation_timestamp
                            ) 1st_month 
LEFT JOIN 
          ( 
                   SELECT   event_user_text                           AS user_name, 
                            event_user_id                             AS user_id, 
                            substr(event_user_creation_timestamp,1,10) AS user_creation_date, 
                            count(*)                                  AS edits 
                   FROM     wmf.mediawiki_history 
                   WHERE    snapshot = "{SNAPSHOT}" 
                   AND      wiki_db='fawiki' 
                   AND      event_entity = "revision" 
                   AND      event_type = "create" 
                   AND      NOT event_user_is_created_by_system 
                   AND      size(event_user_is_bot_by) = 0 
                   AND      event_user_creation_timestamp BETWEEN "{START_YYYY_MM_DD}" AND      "{END_YYYY_MM_DD}" 
                   AND      unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") >= (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) 
                   AND      unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") < (unix_timestamp(event_user_creation_timestamp, "yyyy-MM-dd HH:mm:ss.0")  + (60*24*60*60)) 
                   GROUP BY event_user_text, 
                            event_user_id, 
                            event_user_creation_timestamp
                            ) 2nd_month 
ON        ( 
                    1st_month.user_id = 2nd_month.user_id 
          AND       1st_month.user_creation_date = 2nd_month.user_creation_date)
GROUP BY weekofyear(1st_month.user_creation_date)        
'''
In [285]:
df_retention_rate=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [162]:
df_retention_rate_pre_1_year=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [163]:
df_retention_rate_pre_2_years=hive.run(query_retention_rate.format(SNAPSHOT=snapshot,START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [286]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Retention Rate',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('User Cohort (Creation Week)',  fontsize = 16)
ax.set_ylabel('Retention Rate',  fontsize = 16)
ax.plot(df_retention_rate['user_creation_cohort_week'][:-8], df_retention_rate['retention_rate'][:-8], 'o-',label='Year 2021')
ax.plot(df_retention_rate_pre_1_year['user_creation_cohort_week'], df_retention_rate_pre_1_year['retention_rate'], 'o--',label='Year 2020')
ax.plot(df_retention_rate_pre_2_years['user_creation_cohort_week'], df_retention_rate_pre_2_years['retention_rate'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,0.3)
ax.vlines(42, 0, 0.3, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Checkuser checks

In [165]:
query_check_user='''
 SELECT weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp))) AS week_n,
   CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END AS tool,
   COUNT(cul_id) AS requests
 FROM cu_log WHERE substr(cul_timestamp,1,8)  BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
 GROUP BY weekofyear(from_unixtime(UNIX_timestamp(cul_timestamp)))
 , CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END 
 ORDER BY week_n
 LIMIT 10000
'''
In [166]:
df_check_user=mariadb.run(query_check_user.format(START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD),'fawiki')
In [283]:
df_check_user
Out[283]:
week_n tool requests
0 2 checkuser 9
1 3 checkuser 13
2 4 checkuser 3
3 5 checkuser 49
4 6 checkuser 97
5 6 investigate 3
6 7 checkuser 54
7 8 checkuser 56
8 9 checkuser 3
9 10 checkuser 11
10 11 checkuser 10
11 12 checkuser 8
12 13 checkuser 27
13 14 checkuser 53
14 14 investigate 1
15 15 checkuser 14
16 16 checkuser 13
17 17 checkuser 40
18 18 checkuser 73
19 21 checkuser 7
20 22 checkuser 19
21 23 checkuser 5
22 24 checkuser 29
23 26 checkuser 2
24 27 checkuser 1
25 27 investigate 1
26 28 checkuser 8
27 29 checkuser 1
28 30 checkuser 8
29 31 checkuser 18
30 32 checkuser 10
31 33 checkuser 18
32 34 checkuser 8
33 35 checkuser 11
34 37 checkuser 2
35 38 checkuser 80
36 39 checkuser 9
37 40 checkuser 4
38 42 checkuser 27
39 43 checkuser 2
40 44 checkuser 64
41 45 checkuser 87
42 46 checkuser 37
43 47 checkuser 6
44 48 checkuser 1
45 49 checkuser 54
46 50 checkuser 90
47 51 checkuser 86
48 52 checkuser 30
In [167]:
df_check_user_pre_1_year=mariadb.run(query_check_user.format(START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD),'fawiki')
In [168]:
df_check_user_pre_2_years=mariadb.run(query_check_user.format(START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD),'fawiki')
In [169]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Checkuser Requests',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Requests',  fontsize = 16)
ax[0].plot(df_check_user[df_check_user['tool']=='checkuser']['week_n'], df_check_user[df_check_user['tool']=='checkuser']['requests'], 'o-',label='Year 2021')
ax[0].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='checkuser']['requests'], 'o--',label='Year 2020')
ax[0].plot(df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='checkuser']['week_n'], df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='checkuser']['requests'], ':',label='Year 2019')


ax[0].set_xlim(1,52)
ax[0].set_ylim(0,200)
ax[0].vlines(42, 0, 200, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Special Investigate Requests',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Requests',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,100)
ax[1].vlines(42, 0, 100, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_check_user[df_check_user['tool']=='investigate']['week_n'], df_check_user[df_check_user['tool']=='investigate']['requests'], 'o-',label='Year 2021')
ax[1].plot(df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['week_n'], df_check_user_pre_1_year[df_check_user_pre_1_year['tool']=='investigate']['requests'], 'o--',label='Year 2020')
ax[1].plot(df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['week_n'], df_check_user_pre_2_years[df_check_user_pre_2_years['tool']=='investigate']['requests'], ':',label='Year 2019')


ax[1].legend(loc='upper right')
plt.show()

Number of pages protected

In [254]:
protected_pages_query='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n , 
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging 
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki' 
AND log_action = 'protect'  
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) 
ORDER BY week_n
LIMIT 100000
'''
In [255]:
df_protected_pages=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
In [282]:
df_protected_pages
Out[282]:
week_n protected_pages
0 1 77
1 2 72
2 3 65
3 4 79
4 5 129
5 6 112
6 7 131
7 8 122
8 9 122
9 10 127
10 11 94
11 12 91
12 13 110
13 14 133
14 15 162
15 16 197
16 17 102
17 18 110
18 19 118
19 20 120
20 21 112
21 22 98
22 23 98
23 24 94
24 25 105
25 26 101
26 27 103
27 28 104
28 29 101
29 30 124
30 31 140
31 32 154
32 33 192
33 34 197
34 35 199
35 36 193
36 37 134
37 38 177
38 39 157
39 40 2751
40 41 3391
41 42 102
42 43 69
43 44 281
44 45 55
45 46 798
46 47 38
47 48 55
48 49 44
49 50 40
50 51 46
51 52 43
In [256]:
df_protected_pages_pre_1_year=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
In [257]:
df_protected_pages_pre_2_years=hive.run(protected_pages_query.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
In [258]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Protected Pages',  fontsize = 16)
ax.plot(df_protected_pages['week_n'][:-1], df_protected_pages['protected_pages'][:-1], 'o-',label='Year 2021')
ax.plot(df_protected_pages_pre_1_year['week_n'], df_protected_pages_pre_1_year['protected_pages'], 'o--',label='Year 2020')
ax.plot(df_protected_pages_pre_2_years['week_n'], df_protected_pages_pre_2_years['protected_pages'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,4000)
ax.vlines(42, 0, 4000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [266]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Pages  --  Zoom In',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Protected Pages',  fontsize = 16)
ax.plot(df_protected_pages['week_n'][:-1], df_protected_pages['protected_pages'][:-1], 'o-',label='Year 2021')
ax.plot(df_protected_pages_pre_1_year['week_n'], df_protected_pages_pre_1_year['protected_pages'], 'o--',label='Year 2020')
ax.plot(df_protected_pages_pre_2_years['week_n'], df_protected_pages_pre_2_years['protected_pages'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,300)
ax.vlines(42, 0, 300, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [267]:
protected_pages_query_content='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n , 
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging 
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki' 
AND log_action = 'protect'  AND log_namespace=0
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) 
ORDER BY week_n
LIMIT 100000
'''
In [268]:
df_protected_pages_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
In [277]:
df_protected_pages_c
Out[277]:
week_n protected_pages
0 1 73
1 2 72
2 3 65
3 4 67
4 5 112
5 6 103
6 7 120
7 8 108
8 9 111
9 10 121
10 11 91
11 12 84
12 13 105
13 14 128
14 15 155
15 16 192
16 17 96
17 18 106
18 19 111
19 20 116
20 21 108
21 22 98
22 23 94
23 24 88
24 25 104
25 26 99
26 27 92
27 28 101
28 29 97
29 30 120
30 31 134
31 32 150
32 33 185
33 34 189
34 35 193
35 36 189
36 37 128
37 38 174
38 39 154
39 40 2735
40 41 247
41 42 100
42 43 48
43 44 277
44 45 40
45 46 789
46 47 34
47 48 51
48 49 42
49 50 34
50 51 40
51 52 38
In [269]:
df_protected_pages_pre_1_year_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
In [270]:
df_protected_pages_pre_2_years_c=hive.run(protected_pages_query_content.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
In [271]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Content Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Protected Pages',  fontsize = 16)
ax.plot(df_protected_pages_c['week_n'][:-1], df_protected_pages_c['protected_pages'][:-1], 'o-',label='Year 2021')
ax.plot(df_protected_pages_pre_1_year_c['week_n'], df_protected_pages_pre_1_year_c['protected_pages'], 'o--',label='Year 2020')
ax.plot(df_protected_pages_pre_2_years_c['week_n'], df_protected_pages_pre_2_years_c['protected_pages'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,4000)
ax.vlines(42, 0, 4000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [272]:
protected_pages_query_talk='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n , 
-- log_page has a lot of null rows. So use log_title instead. We might over count if the page title is changed.
COUNT(distinct log_title) AS protected_pages
FROM wmf_raw.mediawiki_logging 
WHERE snapshot ='{SNAPSHOT}' AND wiki_db='fawiki' 
AND log_action = 'protect'  AND log_namespace=1
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) 
ORDER BY week_n
LIMIT 100000
'''
In [273]:
df_protected_pages_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
In [278]:
df_protected_pages_t
Out[278]:
week_n protected_pages
0 3 1
1 4 7
2 5 5
3 6 2
4 7 4
5 8 2
6 10 1
7 13 1
8 14 1
9 15 5
10 16 3
11 17 1
12 19 4
13 23 2
14 24 1
15 26 2
16 28 1
17 29 1
18 32 3
19 33 1
20 35 1
21 37 1
22 38 2
23 40 1
24 41 1
25 42 2
26 43 7
27 44 1
28 45 3
29 46 1
30 47 1
31 48 1
32 49 1
33 50 1
34 51 2
35 52 2
In [274]:
df_protected_pages_pre_1_year_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
In [275]:
df_protected_pages_pre_2_years_t=hive.run(protected_pages_query_talk.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
In [281]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki Protected Talk Pages',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Protected Pages',  fontsize = 16)
ax.plot(df_protected_pages_t['week_n'][:-1], df_protected_pages_t['protected_pages'][:-1], 'o-',label='Year 2021')
ax.plot(df_protected_pages_pre_1_year_t['week_n'], df_protected_pages_pre_1_year_t['protected_pages'], 'o--',label='Year 2020')
ax.plot(df_protected_pages_pre_2_years_t['week_n'], df_protected_pages_pre_2_years_t['protected_pages'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,20)
ax.vlines(42, 0, 20, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Note

The bump in October 2021 is mainly from one user who protected 5867 pages in a month.

Quality of edits with ORES

We don't have a perfect model to measure edits quality. The ORES model is known for being biased against IP editors. [^1] If including IP editors in baseline measurement. It's not a surprise that the damages/edits rate for all edits dropped since IP editing was turned off. Therefore, we also measure the edits quality of registered editors by excluding IP editors.

Meanwhile, ORES model is tested to be friendly to bot editors, not marking bot edits as damages usually. However, the number of bot edits could fluctuate dramatically month by month due to the nature of the bot function. To reduce the fluctuation in monthly trends, we exclude bot editors in analysis of registered editors.

Following analysis will focus on edits quality by all editors and by registered non-bot editors only.

[^1]https://meta.wikimedia.org/wiki/Objective_Revision_Evaluation_Service/Issues#Bias_against_anonymous_editors

In [14]:
query_edits_quality_damaging_model='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, 
ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate
FROM event_sanitized.mediawiki_revision_score
WHERE year='{YEAR}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
AND `database`='fawiki'
GROUP BY WEEKOFYEAR(rev_timestamp)
ORDER BY week_n
LIMIT 10000
'''
In [16]:
df_edits_quality_damaging_rate=hive.run(query_edits_quality_damaging_model.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [80]:
df_edits_quality_damaging_rate_pre_1_year=hive.run(query_edits_quality_damaging_model.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [118]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki ORES Damaging Rate (Including IP Editors)',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Damages/Edits Rate',  fontsize = 16)
ax.plot(df_edits_quality_damaging_rate['week_n'], df_edits_quality_damaging_rate['damaging_rate'], 'o-',label='Year 2021')
ax.plot(df_edits_quality_damaging_rate_pre_1_year['week_n'], df_edits_quality_damaging_rate_pre_1_year['damaging_rate'], 'o--',label='Year 2020')

ax.set_xlim(1,52)
ax.set_ylim(0,0.4)
ax.legend(loc='upper right')
ax.vlines(42, 0, 0.4, colors='k', linestyles='dashdot', label='Turned off')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [104]:
query_edits_quality_damaging_model_nonbot_registered_user='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, 
ROUND(SUM(case when scores["damaging"].prediction[0]='true' then 1 else 0 end)/count(rev_id) ,4) AS damaging_rate
FROM event_sanitized.mediawiki_revision_score
WHERE year='{YEAR}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND `database`='fawiki' 
-- exclude IP editors in baseline as ORES is biased against anonymous editors
AND performer.user_id IS NOT NULL
-- exclude bot editors
AND not performer.user_is_bot
AND performer.user_text not regexp "^.*bot([^a-z].*$|$)"
GROUP BY WEEKOFYEAR(rev_timestamp)
ORDER BY week_n
LIMIT 10000
'''
In [105]:
df=spark_session.sql(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD , END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [106]:
df_edits_quality_damaging_rate_nonbot_registered_user=df.toPandas()
In [108]:
df=spark_session.sql(query_edits_quality_damaging_model_nonbot_registered_user.format(YEAR=pre_yr, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD , END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [109]:
df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year=df.toPandas()
In [110]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Fawiki ORES Damaging Rate (Registered Non-bot Editors)',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Damages/Edits Rate',  fontsize = 16)
ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user['damaging_rate'], 'o-',label='Year 2021')
ax.plot(df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['week_n'], df_edits_quality_damaging_rate_nonbot_registered_user_pre_1_year['damaging_rate'], 'o--',label='Year 2020')

ax.set_xlim(1,52)
ax.set_ylim(0,0.4)
ax.vlines(42, 0, 0.4, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()
In [176]:
print("This dashboard was generated on {} at {}. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback please email jwang@wikimedia.org or product-analytics@wikimedia.org.".format(today.strftime("%Y-%m-%d"), now.strftime("%H:%M %Z")))
This dashboard was generated on 2022-01-04 at 01:29 UTC. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback please email jwang@wikimedia.org or product-analytics@wikimedia.org.
In [ ]: