Ptwiki community decided to turn off editing for IP editors. This has been done using an AbuseFilter since October 4th, 2020 (Week 40th). This dashboard is to monitor related metrics to see how this impacts the health of the project in the long term. T264940

Metrics:

In [1]:
%%capture --no-stderr
%set_env http_proxy=http://webproxy.eqiad.wmnet:8080
%set_env https_proxy=https://webproxy.eqiad.wmnet:8080
%set_env no_proxy=localhost,127.0.0.1
In [2]:
# to fix matplotlib warning on the default path (/nonexistent/.config/matplotlib) is not a writable directory
import os 
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/"
In [3]:
import wmfdata
wmfdata.utils.insert_code_toggle()
In [4]:
from wmfdata import hive, mariadb
import pandas as pd
from datetime import datetime, timedelta, date
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
In [6]:
@ticker.FuncFormatter
def million_formatter(x, pos):
    return "%d M" % round(x/1000000)
@ticker.FuncFormatter
def thousand_formatter(x, pos):
    return "%d K" % round(x/1000)
In [7]:
now = pd.Timestamp.utcnow()
today=now.date()
In [8]:
last_Sunday=(today - timedelta(days=today.weekday()+1)).strftime("%Y-%m-%d")
month_1st_day= datetime.today().replace(day=1).strftime("%Y-%m-%d")
In [9]:
start_date_YYYY_MM_DD = '2021-01-04'
end_date_YYYY_MM_DD=last_Sunday
yr='2021'
In [10]:
pre_yr='2020'
In [11]:
start_date_YYYYMMDD = '20210104'
#end_date_YYYYMMDD=datetime.today().replace(day=1).strftime("%Y%m%d")
In [12]:
end_date_YYYYMMDD=(today - timedelta(days=today.weekday()+1)).strftime("%Y%m%d")
In [13]:
if datetime.today().day < 7:
    snapshot = datetime.today().replace(month=datetime.today().month-2).strftime("%Y-%m")
else:
    snapshot = datetime.today().replace(month=datetime.today().month-1).strftime("%Y-%m")
In [14]:
lastyear_start_date_YYYY_MM_DD = '2020-01-01'
lastyear_end_date_YYYY_MM_DD='2020-12-31'
lastyear_start_date_YYYYMMDD = '20200101'
lastyear_end_date_YYYYMMDD='20201231'
last2years_start_date_YYYY_MM_DD = '2019-01-01'
last2years_end_date_YYYY_MM_DD='2019-12-31'
last2years_start_date_YYYYMMDD = '20190101'
last2years_end_date_YYYYMMDD='20191231'

Number of active editors (registered users)

In [15]:
weekly_user_editors_query='''
SELECT tmp.week_n, COUNT(tmp.user_id) AS user_editors
FROM
(SELECT  WEEKOFYEAR(rev_timestamp) AS week_n,  performer.user_id AS user_id, max(performer.user_text) AS user_text
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS NOT null 
AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252) 
AND `database`='ptwiki'
GROUP BY   WEEKOFYEAR(rev_timestamp), performer.user_id 
)AS tmp
GROUP BY tmp.week_n
'''
In [16]:
df_weekly_user_editors=hive.run(weekly_user_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [17]:
weekly_user_editors_query='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(DISTINCT event_user_id) wiki_user_editors
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false  AND page_namespace_is_content
        AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [18]:
df2_weekly_user_editors_pre_1_year=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [19]:
df2_weekly_user_editors_pre_2_years=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [20]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Ptwiki Weekly User Editors',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Editors',  fontsize = 16)

ax.plot(df_weekly_user_editors['week_n'], df_weekly_user_editors['user_editors'], 'o-',label='Year 2021')
ax.plot(df2_weekly_user_editors_pre_1_year['week_n'], df2_weekly_user_editors_pre_1_year['wiki_user_editors'], 'o--',label='Year 2020')
ax.plot(df2_weekly_user_editors_pre_2_years['week_n'], df2_weekly_user_editors_pre_2_years['wiki_user_editors'], ':',label='Year 2019')


ax.set_xlim(1,52)
ax.set_ylim(0,5000)
ax.vlines(40, 0, 5000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

# set_xticks method before set_xticklabels to fix warning. ref: https://stackoverflow.com/questions/63723514/userwarning-fixedformatter-should-only-be-used-together-with-fixedlocator
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
                    

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)

ax2.grid(axis='x', which='minor')

plt.show()

Number of edits

In [21]:
weekly_edits_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND `database`='ptwiki'
GROUP BY WEEKOFYEAR(rev_timestamp) 
'''
In [22]:
df_weekly_edits=hive.run(weekly_edits_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [23]:
weekly_edits_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        snapshot = '{SNAPSHOT}'
        AND wiki_db='ptwiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [24]:
df2_weekly_edits_pre_1_year=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [25]:
df2_weekly_edits_pre_2_years=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [26]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Ptwiki Weekly Edits',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Edits',  fontsize = 16)
ax.plot(df_weekly_edits['week_n'], df_weekly_edits['edits'], 'o-',label='Year 2021')
ax.plot(df2_weekly_edits_pre_1_year['week_n'], df2_weekly_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax.plot(df2_weekly_edits_pre_2_years['week_n'], df2_weekly_edits_pre_2_years['edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,150000)
ax.vlines(40, 0, 150000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)
ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
#plt.savefig("1_ptwiki_weekly_edits.png")

plt.show()

Number of reverts

Definition 1: number of edits reverted by the snapshot time

In [27]:
query_weekly_reverts='''

SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki'
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND revision_is_identity_reverted 
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
'''
In [28]:
df_weekly_reverts=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [29]:
df_weekly_reverts_pre_1_year=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [30]:
df_weekly_reverts_pre_2_years=hive.run(query_weekly_reverts.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [31]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Ptwiki Weekly Reverted Edits',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Reverts',  fontsize = 16)
ax.plot(df_weekly_reverts['week_n'][:-1], df_weekly_reverts['reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_reverts_pre_1_year['week_n'], df_weekly_reverts_pre_1_year['reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_reverts_pre_2_years['week_n'], df_weekly_reverts_pre_2_years['reverted_edits'], ':',label='Year 2019')
ax.set_xlim(1,52)
ax.set_ylim(0,15000)
ax.vlines(40, 0, 15000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Definition 2: number of edits reverted within 48 hours

In [32]:
query_weekly_reverts_48hrs='''
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki'
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
'''
In [33]:
df_weekly_reverts_48hrs=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [34]:
df_weekly_reverts_48hrs_pre_1_year=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [35]:
df_weekly_reverts_48hrs_pre_2_years=hive.run(query_weekly_reverts_48hrs.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [36]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Ptwiki Weekly Reverts Within 48 Hours of Editing',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Reverts',  fontsize = 16)
ax.plot(df_weekly_reverts_48hrs['week_n'][:-1], df_weekly_reverts_48hrs['reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_reverts_48hrs_pre_1_year['week_n'], df_weekly_reverts_48hrs_pre_1_year['reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_reverts_48hrs_pre_2_years['week_n'], df_weekly_reverts_48hrs_pre_2_years['reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,15000)
ax.vlines(40, 0, 15000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Note: Around 60% reverts happened within 48 hours of editing on ptwiki.

Number of net non-reverted edits

Definition: number of edits which were not reverted within 48 hours (excluding bots and revert edits)

In [37]:
query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits='''
WITH t1 AS
(
SELECT weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(revision_id) AS non_reverted_edits
FROM wmf.mediawiki_history
WHERE snapshot= '{SNAPSHOT}' AND wiki_db='ptwiki'
-- for bots size(event_user_is_bot_by) returns 1/2, IP editors return -1, registered non-bot editors return 0
AND  size(event_user_is_bot_by) <= 0 
AND substr(event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
AND event_entity = "revision" AND (NOT revision_is_identity_reverted  OR revision_seconds_to_identity_revert > 172800)
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_N
LIMIT 1000000
),
t2 AS
(
SELECT  weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss'))) AS week_n,
 count(distinct h1.revision_id) AS revert_edits
FROM wmf.mediawiki_history AS h1 
LEFT JOIN wmf.mediawiki_history AS h2  -- h1 is revert edits, h2 is the edits being reverted
     ON h1.revision_id=h2.revision_first_identity_reverting_revision_id  
      AND h1.wiki_db = h2.wiki_db
      AND h1.snapshot= h2.snapshot 
WHERE h1.wiki_db = 'ptwiki' AND h1.snapshot='{SNAPSHOT}'
    -- for bots size(event_user_is_bot_by) returns 1 or 2, IP editors return -1, registered non-bot editors return 0
    AND size(h1.event_user_is_bot_by) <= 0 AND size(h2.event_user_is_bot_by) <= 0
    AND substr(h1.event_timestamp,1,10) BETWEEN '{START_YYYY_MM_DD}' AND '{END_YYYY_MM_DD}'
    AND h1.event_entity = "revision" and h2.event_entity = "revision" 
    AND (NOT h1.revision_is_identity_reverted  OR h1.revision_seconds_to_identity_revert > 172800)
    AND h2.revision_is_identity_reverted AND h2.revision_seconds_to_identity_revert < 172800
    AND h1.revision_is_identity_revert
    AND h1.event_entity = 'revision' and  h2.event_entity = 'revision'
GROUP BY weekofyear(from_unixtime(UNIX_timestamp(h1.event_timestamp,'yyyy-MM-dd HH:mm:ss')))
ORDER BY week_n
LIMIT 10000
)
SELECT t1.week_n, t1.non_reverted_edits - t2.revert_edits AS net_non_reverted_edits
FROM  t1
LEFT JOIN t2 ON t1.week_n=t2.week_n
'''
In [38]:
df_weekly_net_non_reverted_48hrs_edits=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [39]:
df_weekly_net_non_reverted_48hrs_edits_pre_1_year=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [40]:
df_weekly_net_non_reverted_48hrs_edits_pre_2_years=hive.run(query_weekly_non_reverted_nonbot_edits_48hr_exclude_revert_edits.format(SNAPSHOT=snapshot,START_YYYY_MM_DD= last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [41]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Ptwiki Weekly Edits Which Were Not Reverted in 48 Hours (Excluding Bot And Revert Edits)',fontweight="bold",fontsize = 16, y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Net Non-reverted Edits',  fontsize = 16)
ax.plot(df_weekly_net_non_reverted_48hrs_edits['week_n'][:-1], df_weekly_net_non_reverted_48hrs_edits['net_non_reverted_edits'][:-1], 'o-',label='Year 2021')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_1_year['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_1_year['net_non_reverted_edits'], 'o--',label='Year 2020')
ax.plot(df_weekly_net_non_reverted_48hrs_edits_pre_2_years['week_n'], df_weekly_net_non_reverted_48hrs_edits_pre_2_years['net_non_reverted_edits'], ':',label='Year 2019')

ax.set_xlim(1,52)
ax.set_ylim(0,150000)
ax.vlines(40, 0, 150000, colors='k', linestyles='dashdot', label='Turned off')
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(thousand_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.show()

Edits by bot and non-bot registered editors

In [42]:
weekly_edits_bot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS bot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null AND `database`='ptwiki'
AND  ( performer.user_is_bot=true OR performer.user_text regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [43]:
weekly_edits_nonbot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,  COUNT(*) AS nonbot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp <= '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null AND `database`='ptwiki'
AND  ( performer.user_is_bot=false AND performer.user_text not regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [44]:
df_weekly_bot_edits=hive.run(weekly_edits_bot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [45]:
df_weekly_nonbot_edits=hive.run(weekly_edits_nonbot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date_YYYY_MM_DD, END_YYYY_MM_DD=end_date_YYYY_MM_DD))
In [46]:
weekly_edits_bot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND
        (size(event_user_is_bot_by_historical) > 0 OR size(event_user_is_bot_by) > 0  
        OR event_user_text  regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [47]:
df2_weekly_bot_edits_pre_1_year=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [48]:
df2_weekly_bot_edits_pre_2_years=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [49]:
weekly_edits_nonbot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND 
        (size(event_user_is_bot_by_historical) = 0 AND size(event_user_is_bot_by) = 0  
        AND event_user_text not regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}' AND wiki_db='ptwiki'
GROUP BY weekofyear(event_timestamp) 
'''
In [50]:
df2_weekly_nonbot_edits_pre_1_year=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=lastyear_start_date_YYYY_MM_DD, END_YYYY_MM_DD=lastyear_end_date_YYYY_MM_DD))
In [51]:
df2_weekly_nonbot_edits_pre_2_years=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=last2years_start_date_YYYY_MM_DD, END_YYYY_MM_DD=last2years_end_date_YYYY_MM_DD))
In [52]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Weekly Edits by Registered Bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Edits',  fontsize = 16)
ax[0].plot(df_weekly_bot_edits['week_n'], df_weekly_bot_edits['bot_edits'], 'o-',label='Year 2021')
ax[0].plot(df2_weekly_bot_edits_pre_1_year['week_n'], df2_weekly_bot_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax[0].plot(df2_weekly_bot_edits_pre_2_years['week_n'], df2_weekly_bot_edits_pre_2_years['edits'], ':',label='Year 2019')


ax[0].set_xlim(1,52)
ax[0].set_ylim(0,80000)
ax[0].vlines(40, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(thousand_formatter)
ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Weekly Edits by Registered Non-bot Editors',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Edits',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,80000)
ax[1].vlines(40, 0, 80000, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_weekly_nonbot_edits['week_n'], df_weekly_nonbot_edits['nonbot_edits'], 'o-',label='Year 2021')
ax[1].plot(df2_weekly_nonbot_edits_pre_1_year['week_n'], df2_weekly_nonbot_edits_pre_1_year['edits'], 'o--',label='Year 2020')
ax[1].plot(df2_weekly_nonbot_edits_pre_2_years['week_n'], df2_weekly_nonbot_edits_pre_2_years['edits'], ':',label='Year 2019')

ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(thousand_formatter)

plt.show()

Number of blocks

In [53]:
query_block_weekly='''
SELECT WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss'))) AS week_n, 
COUNT(1) AS blocks
FROM wmf_raw.mediawiki_logging
WHERE snapshot ='{SNAPSHOT}' AND log_type = 'block'  AND wiki_db='ptwiki'
AND  log_action IN ('block','reblock')
AND substr(log_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY WEEKofyear(from_unixtime(UNIX_timestamp(log_timestamp,'yyyyMMddHHmmss')))
ORDER BY week_n
LIMIT 100000
'''
In [54]:
df_block_weekly=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=start_date_YYYYMMDD, END_YYYYMMDD=end_date_YYYYMMDD))
In [55]:
df_block_weekly_pre_1_year=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=lastyear_start_date_YYYYMMDD, END_YYYYMMDD=lastyear_end_date_YYYYMMDD))
In [56]:
df_block_weekly_pre_2_years=hive.run(query_block_weekly.format(SNAPSHOT=snapshot, START_YYYYMMDD=last2years_start_date_YYYYMMDD, END_YYYYMMDD=last2years_end_date_YYYYMMDD))
In [57]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Ptwiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Blocks',  fontsize = 16)
ax[0].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021')
ax[0].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020')
ax[0].plot(df_block_weekly_pre_2_years['week_n'], df_block_weekly_pre_2_years['blocks'], ':',label='Year 2019')
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,14000)
ax[0].vlines(40, 0, 140000, colors='k', linestyles='dashdot', label='Turned off')
ax[0].legend(loc='upper right')

ax[0].yaxis.set_major_formatter(thousand_formatter)

ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')



ax[1].set_title('Ptwiki Weekly Blocks',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Blocks',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,4000)
ax[1].vlines(40, 0, 4000, colors='k', linestyles='dashdot', label='Turned off')
ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax3.tick_params(length=0)
ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_block_weekly['week_n'], df_block_weekly['blocks'], 'o-',label='Year 2021')
ax[1].plot(df_block_weekly_pre_1_year['week_n'], df_block_weekly_pre_1_year['blocks'], 'o--',label='Year 2020')
ax[1].legend(loc='upper right')


plt.show()