Special:Investigate has been deployed on all wikis. This dashboard monitors the adoption, user experience, and feature usage of this new feature. T257539
Adoption: Comparing usage for the new tool versus the old tool
User experience
Feature usage
User reliance on external IP tools
Other metrics we'd like to track after we have developed a 'Done' button
Schema: eventlogging data
Deployed on itwiki and svwiki on Sept 24th.
Deploying on all projects on Oct 8.
%%capture
%set_env http_proxy=http://webproxy.eqiad.wmnet:8080
%set_env https_proxy=https://webproxy.eqiad.wmnet:8080
%set_env no_proxy=localhost,127.0.0.1
import wmfdata
wmfdata.utils.insert_code_toggle()
from wmfdata import hive, mariadb
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from datetime import datetime, timedelta, date
# Timestamps anchoring every query window in this notebook.
now = pd.Timestamp.utcnow()
today = now.date()

# Raw-data weeks start on Sunday, so the most recently *completed* week
# ends on a Saturday; weekday()+2 steps back to that Saturday.
# (NOTE(review): when run on a Sunday this skips yesterday's Saturday and
# uses the one a week earlier — presumably to avoid partially-loaded data;
# confirm this is intentional.)
days_back = today.weekday() + 2
last_Saturday = (today - timedelta(days=days_back)).strftime("%Y%m%d")

# First calendar day of the current month, e.g. '2021-10-01'.
month_1st_day = date.today().replace(day=1).strftime("%Y-%m-%d")

# Current-year window: Jan 1 2021 through the last completed week.
start_wmf_raw_date = '20210101'
end_wmf_raw_date = last_Saturday

# Year-over-year baseline: all of 2020.
lastyear_start_wmf_raw_date = '20200101'
lastyear_end_wmf_raw_date = '20201231'
# All public content-project wikis (plus Commons, Wikidata, Meta, etc.)
# from the canonical wiki registry, indexed by database code. This index
# drives the per-wiki MariaDB queries below.
wikis =hive.run(
'''
SELECT database_code, database_group
FROM canonical_data.wikis
WHERE
database_group in (
"commons", "incubator", "foundation", "mediawiki", "meta", "sources",
"species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
"wikisource", "wikiversity", "wikivoyage", "wiktionary"
)
'''
).sort_values("database_code").set_index("database_code")
# Wikis that have their own local CheckUsers. Checks on any wiki NOT in
# this list are attributed to global checkusers (stewards) below.
local_CU_list = [
    'metawiki', 'commonswiki', 'specieswiki', 'wikidatawiki',
    'arwiki', 'bnwiki', 'cawiki', 'cswiki', 'dawiki', 'dewiki',
    'enwiki', 'eswiki', 'fawiki', 'fiwiki', 'frwiki', 'hewiki',
    'hrwiki', 'huwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki',
    'mlwiki', 'nlwiki', 'plwiki', 'ptwiki', 'ruwiki', 'simplewiki',
    'slwiki', 'srwiki', 'svwiki', 'trwiki', 'ukwiki', 'viwiki',
    'enwikibooks', 'enwiktionary',
]
# Per-wiki MariaDB query template: weekly counts of checks recorded in
# cu_log, split into Special:Investigate vs the classic CheckUser tool.
# Placeholders: {wiki} = database name, {admin} = local/global label,
# {START_YYYYMMDD}/{END_YYYYMMDD} = inclusive date range on the
# cul_timestamp prefix. '%U' formats the week number with Sunday as the
# first day of the week.
checkuser_requests_query='''
SELECT
DATE_FORMAT(from_unixtime(UNIX_timestamp(cul_timestamp)), '%U') AS week_n,
'{wiki}' AS wiki_db,'{admin}' AS admin_type,
CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END AS tool,
COUNT(cul_id) AS requests
FROM cu_log
WHERE substr(cul_timestamp,1,8) BETWEEN '{START_YYYYMMDD}' AND '{END_YYYYMMDD}'
GROUP BY DATE_FORMAT(from_unixtime(UNIX_timestamp(cul_timestamp)), '%U'),
CASE WHEN cul_type='investigate' THEN 'investigate' ELSE 'checkuser' END
ORDER BY wiki_db, week_n
LIMIT 10000
'''
# Shared column schema for the request-count frames built below.
columns = ['week_n', 'wiki_db', 'admin_type', 'tool', 'requests']
df_admin_request_lastyear = pd.DataFrame(columns=columns)

# Per-wiki 2021 request counts. Wikis in local_CU_list are checked by their
# own local checkusers; all other wikis are attributed to global checkusers.
# DataFrame.append was removed in pandas 2.0, so collect the per-wiki
# frames in a list and concatenate once at the end.
_frames = []
for w in wikis.index.values:
    admin = 'local_checkuser' if w in local_CU_list else 'global_checkuser'
    try:
        _frames.append(mariadb.run(
            checkuser_requests_query.format(
                wiki=w, admin=admin,
                START_YYYYMMDD=start_wmf_raw_date,
                END_YYYYMMDD=end_wmf_raw_date),
            w))
    except ValueError:
        # Wikis without a cu_log table (or otherwise failing queries) are
        # skipped, matching the original best-effort behavior.
        pass
df_admin_request = (pd.concat(_frames, ignore_index=True)
                    if _frames else pd.DataFrame(columns=columns))
# Same per-wiki request counts for 2020, used as the year-over-year
# baseline in the charts below. (DataFrame.append was removed in pandas
# 2.0, so gather the per-wiki frames and concatenate once.)
_frames_prev = []
for w in wikis.index.values:
    admin = 'local_checkuser' if w in local_CU_list else 'global_checkuser'
    try:
        _frames_prev.append(mariadb.run(
            checkuser_requests_query.format(
                wiki=w, admin=admin,
                START_YYYYMMDD=lastyear_start_wmf_raw_date,
                END_YYYYMMDD=lastyear_end_wmf_raw_date),
            w))
    except ValueError:
        # Skip wikis whose query fails (e.g. no cu_log table) — best effort,
        # matching the original behavior.
        pass
if _frames_prev:
    df_admin_request_lastyear = pd.concat(_frames_prev, ignore_index=True)
# --- Figure 1: weekly check requests, split by local vs global checkusers ---
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 12))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)

# Minor-tick locator shared by every month-overlay axis in this dashboard
# (also used by the figure further below).
miloc = plt.MultipleLocator(1)

# (tool value from cu_log, line color, display name) for the two tools.
_SERIES = (('checkuser', 'green', 'Checkuser'),
           ('investigate', 'orange', 'Special:Investigate'))

def _plot_check_requests(axis, frame, admin_type, year, fmt):
    """Plot weekly request totals for both tools for one admin type."""
    for tool, color, label in _SERIES:
        mask = (frame['tool'] == tool) & (frame['admin_type'] == admin_type)
        axis.plot(frame[mask].groupby(['week_n'])['requests'].agg('sum'),
                  fmt, color=color, label='{} {}'.format(year, label))

def _add_month_axis(week_axis, xlabel=None):
    """Overlay a Jan-Dec month scale on a week-numbered x axis."""
    top = week_axis.twiny()  # second x axis sharing the same plot area
    top.set_xlim(0, 12)
    # Set tick positions before labels so the fixed labels line up with the
    # mid-month tick locations (the reverse order is fragile and triggers a
    # FixedFormatter warning in matplotlib).
    top.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5, 5.5,
                    6.5, 7.5, 8.5, 9.5, 10.5, 11.5])
    top.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    top.tick_params(length=0)
    if xlabel is not None:
        top.set_xlabel(xlabel, fontsize=16)
    top.xaxis.set_minor_locator(miloc)
    top.grid(axis='x', which='minor')
    return top

# Top panel: wikis that have their own local checkusers.
ax[0].set_title('Check Requests on Wikis With Local Checkusers',
                fontweight="bold", fontsize=16, y=1.08)
ax[0].set_xlabel('Week', fontsize=16)
ax[0].set_ylabel('Requests', fontsize=16)
_plot_check_requests(ax[0], df_admin_request, 'local_checkuser', '2021', 'o-')
_plot_check_requests(ax[0], df_admin_request_lastyear, 'local_checkuser', '2020', 'o--')
ax[0].set_xlim(1, 52)
ax[0].set_ylim(0, 4000)
ax[0].legend(loc='upper right')
ax2 = _add_month_axis(ax[0], xlabel='Month')

# Bottom panel: wikis checked by global checkusers (no month label, as before).
ax[1].set_title('Check Requests on Wikis Without Local Checkusers',
                fontweight="bold", fontsize=16, y=1.08)
ax[1].set_xlabel('Week', fontsize=16)
ax[1].set_ylabel('Requests', fontsize=16)
ax[1].set_xlim(1, 52)
ax[1].set_ylim(0, 1000)
ax3 = _add_month_axis(ax[1])
_plot_check_requests(ax[1], df_admin_request, 'global_checkuser', '2021', 'o-')
_plot_check_requests(ax[1], df_admin_request_lastyear, 'global_checkuser', '2020', 'o--')
ax[1].legend(loc='upper right')
#plt.savefig("adoption.png")
plt.show()
# Top 10 Special:Investigate rows, most recent weeks first, highest request
# counts first within a week.
df_admin_request[df_admin_request['tool']=='investigate'].sort_values(by=['week_n', 'requests'], ascending=False).head(10)
# The 10 rows at the bottom when sorting weeks ascending and requests
# descending — i.e. the lowest-traffic rows of the most recent weeks.
df_admin_request[df_admin_request['tool']=='investigate'].sort_values(by=['week_n', 'requests'], ascending=[True, False ]).tail(10)
# Left-join weekly Special:Investigate totals onto a complete 0-52 week
# index so weeks with no activity appear as NaN gaps instead of being
# silently skipped in the line chart.
week_list = list(range(0, 53))
df_week_n = pd.DataFrame(week_list, columns=['week_n'])

def _weekly_investigate_totals(frame):
    """Sum Special:Investigate requests per week, aligned to df_week_n."""
    totals = (frame[frame['tool'] == 'investigate']
              .groupby(['week_n'])['requests'].agg('sum')
              .to_frame().reset_index())
    # week_n comes back from MariaDB as a string; cast for the merge.
    totals['week_n'] = totals['week_n'].astype(int)
    # Same join key on both sides, so `on=` replaces left_on/right_on.
    return pd.merge(df_week_n, totals, how='left', on='week_n')

df_to_draw = _weekly_investigate_totals(df_admin_request)
df_to_draw_pre = _weekly_investigate_totals(df_admin_request_lastyear)
# --- Figure 2: Special:Investigate submissions across all wikis ---
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6))
fig.tight_layout(pad=1.08, w_pad=0.5, h_pad=6)
ax.set_title('Special Investigate Requests on All Wikis',
             fontweight="bold", fontsize=16, y=1.08)
ax.set_xlabel('Week', fontsize=16)
ax.set_ylabel('Requests', fontsize=16)
ax.plot(df_to_draw['week_n'], df_to_draw['requests'], 'o-', label='2021')
ax.plot(df_to_draw_pre['week_n'], df_to_draw_pre['requests'], 'o--', label='2020')
ax.set_xlim(1, 52)
ax.set_ylim(0, 1000)
ax.grid(axis='x', which='minor')
# Secondary x axis overlaying months on the week numbers.
ax2 = ax.twiny()
ax2.set_xlim(0, 12)
# Set tick positions before labels so each month label sits at the
# mid-month tick (labels-first is fragile and triggers a FixedFormatter
# warning in matplotlib).
ax2.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5, 5.5,
                6.5, 7.5, 8.5, 9.5, 10.5, 11.5])
ax2.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax2.tick_params(length=0)
ax2.set_xlabel('Month', fontsize=16)
ax2.xaxis.set_minor_locator(miloc)
ax2.grid(axis='x', which='minor')
ax.legend(loc='upper right')
#plt.savefig("adoption2.png")
plt.show()
# Distribution of how many targets (accounts/IPs) were entered per
# Special:Investigate submission, excluding testwiki.
query_targetsCount = '''
SELECT event.targetsCount
FROM event.specialinvestigate
WHERE year =2021 and wiki!='testwiki' AND event.action='submit'
'''
df_targetsCount = hive.run(query_targetsCount)
plt.figure(figsize=(18, 6))
# Hive lower-cases column names, hence 'targetscount'.
plt.hist(df_targetsCount['targetscount'], bins=50, histtype='bar')
# Title typo fixed: "Sumbit" -> "Submit".
plt.title('Distribution of # of Users Investigated per Submit',
          fontweight="bold", fontsize=16, y=1.08)
plt.xlabel('# of Users', fontsize=16)
plt.ylabel('Investigation Count', fontsize=16)
#plt.savefig("users_investigated.png")
plt.show()
# Distribution of how many result rows each investigation returned.
query_result_count = '''
SELECT event.resultscount AS result_count
FROM event.specialinvestigate
WHERE year =2021
AND event.resultscount IS not NULL
'''
df_result_count=hive.run(query_result_count)
plt.figure(figsize=(18, 6))
plt.hist(df_result_count['result_count'], bins=50, histtype='bar')
plt.title('Distribution of # of Result Returned',fontweight="bold",fontsize = 16,y=1.08)
plt.xlabel('# of results',fontsize = 16)
plt.ylabel('Investigation Count',fontsize = 16)
#plt.savefig("results.png")
plt.show()
# Frequency table: for each exact result count, how many investigations
# returned that many results.
query_result_count_hist = '''
SELECT event.resultscount AS results, COUNT(1) AS investigate
FROM event.specialinvestigate
WHERE year =2021
AND event.resultscount IS not NULL
GROUP BY event.resultscount
'''
df_result_count_hist=hive.run(query_result_count_hist)
# Show the 20 most common result-count values.
df_result_count_hist.sort_values(by=['investigate'], ascending=False).head(20)
# Number of 2021 events with the resultsIncomplete flag set — presumably
# checks whose result set was truncated; confirm against the
# SpecialInvestigate schema.
query_result_incomplete_count = '''
SELECT COUNT(1) AS resultsIncomplete_count
FROM event.specialinvestigate
WHERE year =2021 and event.resultsIncomplete
'''
df_result_incomplete_count=hive.run(query_result_incomplete_count)
df_result_incomplete_count.head()
# Distribution of per-investigation query time.
query_queryTime = '''
SELECT event.queryTime
FROM event.specialinvestigate
WHERE year =2021 AND event.queryTime IS not null
'''
df_queryTime=hive.run(query_queryTime)
plt.figure(figsize=(18, 6))
# Hive lower-cases column names, hence 'querytime'.
plt.hist(df_queryTime['querytime'], bins=50, histtype='bar')
plt.title('Distribution of Query Time',fontweight="bold",fontsize = 16,y=1.08)
# Units of queryTime (ms vs s) are not visible here — confirm against schema.
plt.xlabel('Query Time',fontsize = 16)
plt.ylabel('Investigation Count',fontsize = 16)
#plt.savefig("querytime.png")
plt.show()
# The 10 slowest recorded queries.
df_queryTime.sort_values (by='querytime', ascending=False).head(10)
tab name | value in schema |
---|---|
Default landing tab (IPs/User Agents) | NaN |
IPs/User Agents | compare |
Account Information | preliminary-check |
Timeline | timeline |
# Clicks per tab: counts of 'query'/'submit' events grouped by which tab
# was active (see the tab-name mapping table above; NULL = default
# landing tab).
query_tab_clicks='''
SELECT event.tab, count(1) AS tab_clicks
FROM event.specialinvestigate
WHERE year =2021 AND event.action IN ('query','submit')
GROUP BY event.tab
'''
df_tab_clicks=hive.run(query_tab_clicks)
df_tab_clicks
# Total number of 'pin' actions in 2021 (presumably pinning rows in the
# results table for comparison — confirm against the feature docs).
query_pins='''
SELECT COUNT(1) AS pins
FROM event.specialinvestigate
WHERE year =2021
AND event.action = 'pin'
'''
df_pins=hive.run(query_pins)
df_pins
# Total number of 'block' actions initiated from Special:Investigate in 2021.
query_blocks='''
SELECT COUNT(1) AS blocks
FROM event.specialinvestigate
WHERE year =2021
AND event.action = 'block'
'''
df_blocks=hive.run(query_blocks)
df_blocks
# Sum of targetsCount over 'block' actions — i.e. the total number of
# targets blocked via the tool (each block event can cover several targets).
query_blocked_users='''
SELECT SUM(event.targetsCount) AS blocked_users
FROM event.specialinvestigate
WHERE year =2021
AND event.action = 'block'
'''
df_blocked_users=hive.run(query_blocked_users)
df_blocked_users
# Usage counts of external IP tools launched from Special:Investigate
# ('tool' actions), most-used first — measures reliance on external
# IP tools (see the notebook intro).
query_tool_usage='''
SELECT event.tool, COUNT(1) tool_usage
FROM event.specialinvestigate
WHERE year =2021 AND event.action='tool'
GROUP BY event.tool
ORDER BY tool_usage DESC
LIMIT 100000
'''
df_tool_usage=hive.run(query_tool_usage)
df_tool_usage
# Footer: generation timestamp plus maintainer contact information.
print(
    f"This dashboard was generated on {today.strftime('%Y-%m-%d')} at "
    f"{now.strftime('%H:%M %Z')}. It is maintained by Jennifer Wang, "
    "Product Analytics. If you have questions or feedback, please email "
    "jwang@wikimedia.org or product-analytics@wikimedia.org."
)