In [1]:
%%capture --no-stderr
%set_env http_proxy=http://webproxy.eqiad.wmnet:8080
%set_env https_proxy=https://webproxy.eqiad.wmnet:8080
%set_env no_proxy=localhost,127.0.0.1
In [2]:
# to fix matplotlib warning on the default path (/nonexistent/.config/matplotlib) is not a writable directory
import os 
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/"
In [3]:
# import wmf data package
import wmfdata
wmfdata.utils.insert_code_toggle()
In [4]:
from wmfdata import hive, mariadb
import pandas as pd
from datetime import datetime, timedelta, date
In [5]:
now = pd.Timestamp.utcnow()
today=now.date()
this_monday=(today - timedelta(days=today.weekday())).strftime("%Y-%m-%d")
month_1st_day= datetime.today().replace(day=1).strftime("%Y-%m-%d")
In [6]:
start_date= '2021-01-04'
end_date=this_monday
yr='2021'
In [7]:
weekly_edits_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
GROUP BY WEEKOFYEAR(rev_timestamp) 
'''
In [8]:
df_weekly_edits_2021=hive.run(weekly_edits_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [9]:
weekly_edits_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        snapshot = '{SNAPSHOT}'
GROUP BY weekofyear(event_timestamp) 
'''
In [10]:
start_date_last_year = '2020-01-01'
end_date_last_year='2020-12-31'
In [11]:
if datetime.today().month >2 :
    snapshot = datetime.today().replace(month=datetime.today().month-2).strftime("%Y-%m")
else:
    snapshot= datetime.today().replace(month=12, year=datetime.today().year-1).strftime("%Y-%m")
In [12]:
df2_weekly_edits_2020=hive.run(weekly_edits_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
@ticker.FuncFormatter
def million_formatter(x, pos):
    return "%d M" % round(x/1000000)
@ticker.FuncFormatter
def thousand_formatter(x, pos):
    return "%d K" % round(x/1000)

Weekly edits trend

In [14]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Weekly Edits',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)
ax.set_ylabel('Edits',  fontsize = 16)
ax.plot(df_weekly_edits_2021['week_n'], df_weekly_edits_2021['edits'], 'o-',label='Year 2021')
ax.plot(df2_weekly_edits_2020['week_n'], df2_weekly_edits_2020['edits'], 'o--',label='Year 2020')

ax.set_xlim(1,52)
ax.set_ylim(2000000,20000000)
ax.legend(loc='upper right')

ax.yaxis.set_major_formatter(million_formatter)

ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
#plt.savefig("1_weekly_edits.png")

plt.show()

Edits by bot editors and non-bot editors

In [15]:
weekly_edits_bot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(*) AS bot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null
AND  ( performer.user_is_bot=true OR performer.user_text regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [16]:
weekly_edits_nonbot_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n,  COUNT(*) AS nonbot_edits
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
AND performer.user_id IS not null
AND  ( performer.user_is_bot=false AND performer.user_text not regexp "^.*bot([^a-z].*$|$)")
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [17]:
df_weekly_bot_edits_2021=hive.run(weekly_edits_bot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [18]:
df_weekly_nonbot_edits_2021=hive.run(weekly_edits_nonbot_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [19]:
weekly_edits_bot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND
        (size(event_user_is_bot_by_historical) > 0 OR size(event_user_is_bot_by) > 0  
        OR event_user_text  regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}'
GROUP BY weekofyear(event_timestamp) 
'''
In [20]:
df2_weekly_bot_edits_2020=hive.run(weekly_edits_bot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [21]:
weekly_edits_nonbot_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false AND 
        (size(event_user_is_bot_by_historical) = 0 AND size(event_user_is_bot_by) = 0  
        AND event_user_text not regexp "^.*bot([^a-z].*$|$)" )
        AND snapshot = '{SNAPSHOT}'
GROUP BY weekofyear(event_timestamp) 
'''
In [22]:
df2_weekly_nonbot_edits_2020=hive.run(weekly_edits_nonbot_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [23]:
fig, ax = plt.subplots(nrows=2, ncols=1,figsize=(16,12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Weekly Bot Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Edits',  fontsize = 16)
ax[0].plot(df_weekly_bot_edits_2021['week_n'], df_weekly_bot_edits_2021['bot_edits'], 'o-',label='Year 2021')
ax[0].plot(df2_weekly_bot_edits_2020['week_n'], df2_weekly_bot_edits_2020['edits'], 'o--',label='Year 2020')


ax[0].set_xlim(1,52)
ax[0].set_ylim(0,14000000)
ax[0].legend(loc='upper right')
ax[0].yaxis.set_major_formatter(million_formatter)
ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Weekly Non-bot Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Edits',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,14000000)

ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')

ax[1].plot(df_weekly_nonbot_edits_2021['week_n'], df_weekly_nonbot_edits_2021['nonbot_edits'], 'o-',label='Year 2021')
ax[1].plot(df2_weekly_nonbot_edits_2020['week_n'], df2_weekly_nonbot_edits_2020['edits'], 'o--',label='Year 2020')
ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(million_formatter)
plt.savefig("2_bot_nonbot.png")
plt.show()

Edits by project families

In [24]:
weekly_edits_family_query='''
SELECT w.database_group , WEEKOFYEAR(rev_timestamp) AS week_n,  COUNT(*) AS edits
FROM event_sanitized.mediawiki_revision_create AS rv
INNER JOIN canonical_data.wikis AS w ON rv.`database` = w.database_code AND w.database_group IN ('wikipedia','wikidata','commons')
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
GROUP BY w.database_group , WEEKOFYEAR(rev_timestamp) 
ORDER BY  w.database_group , week_n
LIMIT 1000
'''
In [25]:
df_weekly_edits_by_family_2021=hive.run(weekly_edits_family_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [26]:
weekly_edits_family_query_2='''
SELECT w.database_group ,
       weekofyear(event_timestamp) AS week_n, COUNT(revision_id) AS edits
FROM wmf.mediawiki_history AS h
INNER JOIN canonical_data.wikis AS w ON h.wiki_db = w.database_code AND w.database_group IN ('wikipedia','wikidata','commons')
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        snapshot = '{SNAPSHOT}'
GROUP BY w.database_group , weekofyear(event_timestamp) 
ORDER BY w.database_group, week_n
LIMIT 1000
'''
In [27]:
df_weekly_edits_by_family_2020=hive.run(weekly_edits_family_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [28]:
df_weekly_non_wikidata_2021= df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] !='wikidata'].groupby(['week_n']).sum().reset_index()
In [29]:
df_weekly_non_wikidata_2020=df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] !='wikidata'].groupby(['week_n']).sum().reset_index()
In [30]:
fig, ax = plt.subplots(nrows=4, ncols=1,figsize=(16,24))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax[0].set_title('Weekly Wikipedia Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[0].set_xlabel('Week',  fontsize = 16)
ax[0].set_ylabel('Edits',  fontsize = 16)
ax[0].set_xlim(1,52)
ax[0].set_ylim(0,20000000)
ax[0].plot(df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='wikipedia']['week_n'], df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='wikipedia']['edits'], 'o-',label='Wikipedia Year 2021')
ax[0].plot(df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='wikipedia']['week_n'], df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='wikipedia']['edits'], 'o--',label='Wikipedia Year 2020')
ax[0].plot(df_weekly_edits_2021['week_n'], df_weekly_edits_2021['edits'], 'o-',label='All Projects Year 2021')
ax[0].yaxis.set_major_formatter(million_formatter)
ax[0].legend(loc='upper right')

ax2 = ax[0].twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax2.tick_params(length=0)

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')

ax[1].set_title('Weekly Wikidata Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[1].set_xlabel('Week',fontsize = 16)
ax[1].set_ylabel('Edits',fontsize = 16)

ax[1].set_xlim(1,52)
ax[1].set_ylim(0,20000000)

ax3=ax[1].twiny()
ax3.set_xlim(0,12)

ax3.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax3.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])

ax3.tick_params(length=0)
#ax3.set_xlabel('Month',fontsize=16)
ax3.xaxis.set_minor_locator(miloc)
ax3.grid(axis='x', which='minor')
ax[1].plot(df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='wikidata']['week_n'], df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='wikidata']['edits'], 'o-',label='Wikidata Year 2021')
ax[1].plot(df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='wikidata']['week_n'], df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='wikidata']['edits'], 'o--',label='Wikidata Year 2020')
ax[1].plot(df_weekly_edits_2021['week_n'], df_weekly_edits_2021['edits'], 'o-',label='All Projects Year 2021')
ax[1].legend(loc='upper right')
ax[1].yaxis.set_major_formatter(million_formatter)              
               
ax[2].set_title('Weekly Wikicommons Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[2].set_xlabel('Week',fontsize = 16)
ax[2].set_ylabel('Edits',fontsize = 16)

ax[2].set_xlim(1,52)
ax[2].set_ylim(0,20000000)

ax4=ax[2].twiny()
ax4.set_xlim(0,12)

ax4.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax4.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax4.tick_params(length=0)

ax4.xaxis.set_minor_locator(miloc)
ax4.grid(axis='x', which='minor')
ax[2].plot(df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='commons']['week_n'], df_weekly_edits_by_family_2021[df_weekly_edits_by_family_2021['database_group'] =='commons']['edits'], 'o-',label='Wikicommons Year 2021')
ax[2].plot(df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='commons']['week_n'], df_weekly_edits_by_family_2020[df_weekly_edits_by_family_2020['database_group'] =='commons']['edits'], 'o--',label='Wikicommons Year 2020')
ax[2].plot(df_weekly_edits_2021['week_n'], df_weekly_edits_2021['edits'], 'o-',label='All Projects Year 2021')
ax[2].legend(loc='upper right')             
ax[2].yaxis.set_major_formatter(million_formatter)   
    
ax[3].set_title('Weekly Non-wikidata Edits',fontweight="bold",fontsize = 16,y=1.08)
ax[3].set_xlabel('Week',fontsize = 16)
ax[3].set_ylabel('Edits',fontsize = 16)

ax[3].set_xlim(1,52)
ax[3].set_ylim(0,20000000)

ax5=ax[3].twiny()
ax5.set_xlim(0,12)
ax5.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax5.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax5.tick_params(length=0)

ax5.xaxis.set_minor_locator(miloc)
ax5.grid(axis='x', which='minor')
ax[3].plot(df_weekly_non_wikidata_2021['week_n'], df_weekly_non_wikidata_2021['edits'], 'o-',label='Non-wikidata Year 2021')
ax[3].plot(df_weekly_non_wikidata_2020['week_n'], df_weekly_non_wikidata_2020['edits'], 'o--',label='Non-wikidata Year 2020')
ax[3].plot(df_weekly_edits_2021['week_n'], df_weekly_edits_2021['edits'], 'o-',label='All Projects Year 2021')
ax[3].legend(loc='upper right')     
ax[3].yaxis.set_major_formatter(million_formatter)   
plt.savefig("3_projects_edits.png")
plt.show()

Weekly active editors (registered users)

Note:
The definition of weekly editors is different from that of monthly editors. Monthly active editors is defined as the number of registered users who made at least 5 content edits across all projects in the given month (Product data dictionary). When defining weekly editors, considering the threshold of 5 content edits per week would be too high, it is defined as the number of registered users who made at least 1 content edit across all projects in the given week, which represents an approximately equivalent frequency to 5 edits per month for monthly active editors. Given weekly editors definition adopts a lower threshold , it captures less frequent editors, resulting in sum of weekly editors being higher than that of monthly editors for the same time period.

In [31]:
weekly_user_editors_query='''
SELECT tmp.week_n, COUNT(tmp.user_id) AS user_editors
FROM
(SELECT  WEEKOFYEAR(rev_timestamp) AS week_n, `database`, performer.user_id AS user_id, max(performer.user_text) AS user_text
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
AND performer.user_id IS NOT null 
AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252) 
GROUP BY   WEEKOFYEAR(rev_timestamp), `database`, performer.user_id 
)AS tmp
GROUP BY tmp.week_n
'''
In [32]:
df_weekly_user_editors_2021=hive.run(weekly_user_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [33]:
weekly_user_editors_query='''
SELECT tmp.week_n,  SUM(tmp.wiki_user_editors) AS user_editors
FROM
(SELECT
       weekofyear(event_timestamp) AS week_n,wiki_db, COUNT(DISTINCT event_user_id) wiki_user_editors
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = false  AND page_namespace_is_content
        AND snapshot = '{SNAPSHOT}'
GROUP BY weekofyear(event_timestamp) ,wiki_db
) AS tmp
GROUP BY tmp.week_n
'''
In [34]:
df2_weekly_user_editors_2020=hive.run(weekly_user_editors_query.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [35]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Weekly User Editors',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)


ax.plot(df_weekly_user_editors_2021['week_n'], df_weekly_user_editors_2021['user_editors'], 'o-',label='Year 2021')
ax.plot(df2_weekly_user_editors_2020['week_n'], df2_weekly_user_editors_2020['user_editors'], 'o--',label='Year 2020')
ax.set_xlim(1,52)
ax.set_ylim(50000,220000)
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
                    

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.savefig("4_user_editors.png")
plt.show()

Weekly unique IPs for non-logged-in editors

In [36]:
weekly_ip_editors_query='''
SELECT WEEKOFYEAR(rev_timestamp) AS week_n, COUNT(DISTINCT performer.user_text) AS ip_editors
FROM event_sanitized.mediawiki_revision_create
WHERE year='{YEAR_YYYY}' AND  (rev_timestamp >= '{START_YYYY_MM_DD}' AND rev_timestamp < '{END_YYYY_MM_DD}') 
AND performer.user_id IS null 
AND page_namespace IN (0,6,12,640,100,102,104,106,108,110,112,114,124,146,250,252) 
GROUP BY  WEEKOFYEAR(rev_timestamp) 
'''
In [37]:
df_weekly_ip_editors_2021=hive.run(weekly_ip_editors_query.format(YEAR_YYYY=yr, START_YYYY_MM_DD=start_date, END_YYYY_MM_DD=end_date))
In [38]:
weekly_ip_editors_query_2='''
SELECT
       weekofyear(event_timestamp) AS week_n, COUNT(DISTINCT event_user_text) AS ip_editors
FROM wmf.mediawiki_history
WHERE
        event_entity = 'revision' AND
        event_type = 'create' AND
        DATE(event_timestamp) >= '{START_YYYY_MM_DD}' AND DATE(event_timestamp) <= '{END_YYYY_MM_DD}' AND
        event_user_is_anonymous = true  AND page_namespace_is_content
        AND snapshot = '{SNAPSHOT}'
GROUP BY weekofyear(event_timestamp) 
'''
In [39]:
df2_weekly_ip_editors_2020=hive.run(weekly_ip_editors_query_2.format(SNAPSHOT=snapshot, START_YYYY_MM_DD=start_date_last_year, END_YYYY_MM_DD=end_date_last_year))
In [40]:
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(16,6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Weekly Unique IPs for Non-logged-in Editors',fontweight="bold",fontsize = 16,y=1.08)
ax.set_xlabel('Week',  fontsize = 16)

ax.plot(df_weekly_ip_editors_2021['week_n'], df_weekly_ip_editors_2021['ip_editors'], 'o-',label='Year 2021')
ax.plot(df2_weekly_ip_editors_2020['week_n'], df2_weekly_ip_editors_2020['ip_editors'], 'o--',label='Year 2020')
ax.set_xlim(1,52)
ax.set_ylim(50000,220000)
ax.legend(loc='upper right')
ax.yaxis.set_major_formatter(thousand_formatter)
ax2 = ax.twiny()  # instantiate a second axes that shares the same x-axis
ax2.set_xlim(0,12)

ax2.set_xticks([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5])
ax2.set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month',fontsize=16)
                    

miloc = plt.MultipleLocator(1)

ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
plt.savefig("5_ip_editors.png")
plt.show()
In [41]:
print("This dashboard was generated on {} at {}. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback, please email jwang@wikimedia.org or product-analytics@wikimedia.org.".format(today.strftime("%Y-%m-%d"), now.strftime("%H:%M %Z")))
This dashboard was generated on 2021-12-13 at 17:30 UTC. It is maintained by Jennifer Wang, Product Analytics. If you have questions or feedback, please email jwang@wikimedia.org or product-analytics@wikimedia.org.
In [49]: