https://phabricator.wikimedia.org/T221891
In this analysis, we use the ORES draft topic model to get the topics of articles viewed on English Wikipedia in March 2019.
The outcome topics are the mid-level categories of WikiProject directory (see the hierarchy).
# Notebook preamble: inject a small jQuery snippet that hides all input cells
# by default and adds a button to toggle them, so readers see the narrative
# and outputs without the raw code.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this notebook is by default hidden for easier reading.
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code"></form>
''')
%load_ext sql_magic
# Spark/Hive session setup: queries in %%read_sql cells below run through this
# YARN-backed HiveContext.
import findspark, os
os.environ['SPARK_HOME'] = '/usr/lib/spark2';
findspark.init()
import pyspark
import pyspark.sql
conf = pyspark.SparkConf().setMaster("yarn") # Use master yarn here if you are going to query large datasets.
# Executor/driver sizing for scanning a full month of wmf.pageview_hourly.
conf.set('spark.executor.memory', '8g')
conf.set('spark.yarn.executor.memoryOverhead', '1024')
conf.set('spark.executor.cores', '4')
conf.set('spark.dynamicAllocation.maxExecutors', '32')
conf.set('spark.driver.memory', '4g')
# maxResultSize is raised to 10g because the top-1M-pages result set is
# collected back to the driver as a pandas DataFrame.
conf.set('spark.driver.maxResultSize', '10g')
conf.set('spark.logConf', True)
sc = pyspark.SparkContext(conf=conf)
spark_hive = pyspark.sql.HiveContext(sc)
# Point sql_magic at the HiveContext created above.
%config SQL.conn_name = 'spark_hive'
import requests
import pandas as pd
import json
import matplotlib.pyplot as plt
%%read_sql enwiki_pageviews -d
-- Total March 2019 page views per article-namespace page on English
-- Wikipedia, counting human traffic only (agent_type = 'user' excludes
-- bots/spiders). Result lands in the pandas DataFrame `enwiki_pageviews`.
select page_id, sum(view_count) as pageviews
from wmf.pageview_hourly
where year=2019 and month=3
and namespace_id = 0
and project = 'en.wikipedia'
and agent_type = 'user'
group by page_id
# Share of total views per page, then rank pages by views (descending).
enwiki_pageviews['proportion'] = enwiki_pageviews['pageviews'] / enwiki_pageviews['pageviews'].sum()
enwiki_pageviews = enwiki_pageviews.sort_values(by='pageviews', ascending=False)
print('Total page views: ' + str(enwiki_pageviews.pageviews.sum()))
# Fixed typo in the printed label ("unqiue" -> "unique").
print('Number of unique pages: ' + str(enwiki_pageviews.shape[0]))
# Positional slice works because the frame was just sorted by pageviews.
print('Top 1M pages account for ' + str(round(enwiki_pageviews.proportion[:1000000].sum() * 100, 2)) + '% of total page views.')
# Page views distribution
# NOTE(review): iloc[1:1000] skips row 0 (the single most-viewed page,
# presumably Main_Page) and plots 999 rows, while the title says
# "Top 1000 Pages" — confirm whether the off-by-one exclusion is intended.
distr_plot = enwiki_pageviews.iloc[1:1000].hist(column='pageviews', bins=100, grid=False, figsize=(15,8))
plt.title('Page Views Distribution of the Top 1000 Pages')
plt.xlabel('Page Views')
plt.ylabel('Number of Pages')
%%read_sql enwiki_pv_rev -d
-- Attach page titles and latest revision ids to the top 1M most-viewed
-- pages. The rev_id (page_latest) is what ORES scores, so pages that do not
-- match a live, non-redirect article row in p get NULL rev_ids here.
with v as (
-- Top 1M article-namespace pages by March 2019 human page views.
select page_id, sum(view_count) as pageviews
from wmf.pageview_hourly
where year=2019 and month=3
and namespace_id = 0
and project = 'en.wikipedia'
and agent_type = 'user'
group by page_id
order by pageviews desc
limit 1000000
), p as (
-- Current enwiki article pages (no redirects) from the 2019-03 snapshot.
select page_id, page_title, page_latest
from wmf_raw.mediawiki_page
where wiki_db = 'enwiki'
and snapshot = '2019-03'
and page_id is not null
and page_namespace = 0
and not page_is_redirect
)
-- Left join keeps every top-viewed page even when no title/revision matches.
select v.page_id, p.page_title, p.page_latest as rev_id, v.pageviews
from v left join p on v.page_id=p.page_id
# Save rev_id into a json file
# One {"rev_id": ...} object per line (JSON Lines), the input format expected
# by `ores score_revisions`; rows with NaN rev_id (pages with no match in the
# page table) are dropped, and the float NaN-carrier column is cast back to int.
enwiki_pv_rev[['rev_id']].dropna().astype('int64').to_json(path_or_buf='input_rev_id.json', orient='records', lines=True)
%%bash
# Score every revision with the ORES drafttopic model via the bulk CLI,
# 4 requests in parallel; output is one JSON object per input line, written
# to output_drafttopic_enwiki.json for parsing below.
ores score_revisions https://ores.wikimedia.org "cxie@wikimedia.org analyzing article topics" enwiki drafttopic --parallel-requests=4 --input=input_rev_id.json > output_drafttopic_enwiki.json
# Get topic from ORES draft topic output
def get_pred_topic_best(input_json):
    """Return the highest-probability drafttopic label from one ORES response.

    Parameters
    ----------
    input_json : dict
        One parsed line of `ores score_revisions` output; expected shape is
        {'score': {'drafttopic': {'score': {'probability': {label: prob}}}}}.

    Returns
    -------
    str
        The label with the largest probability, or 'Unknown' when the
        response is malformed (missing keys) or has no probabilities.
    """
    try:
        topics = input_json['score']['drafttopic']['score']['probability']
        # max() with a key is O(n); the original sorted the whole dict just to
        # take the first element. An empty dict raises ValueError here (the
        # original raised IndexError), mapping to 'Unknown' either way.
        best = max(topics, key=topics.get)
    except (ValueError, KeyError):
        best = 'Unknown'
    return best
# Parse the ORES bulk-scoring output (one JSON object per line) into a
# rev_id -> topic DataFrame.
# Fix: DataFrame.append inside a loop is O(n^2) (it copies the frame each
# iteration) and was removed in pandas 2.0 — accumulate plain rows in a list
# and build the frame once instead.
rows = []
with open('output_drafttopic_enwiki.json') as json_file:
    for line in json_file:
        try:
            ores_results = json.loads(line)
            rows.append([ores_results['rev_id'], get_pred_topic_best(ores_results)])
        except ValueError:
            # Print unparseable lines for inspection instead of aborting.
            print(line)
topic_df = pd.DataFrame(rows, columns=['rev_id', 'topic'])
topic_df.rev_id = topic_df.rev_id.astype(int)
# Attach topics to the page-view table; pages whose revision was not scored
# (NaN rev_id, or no matching row in topic_df) fall back to 'Unknown'.
enwiki_pv_rev_topic = enwiki_pv_rev.merge(topic_df, how = 'left', on='rev_id')
enwiki_pv_rev_topic['topic'] = enwiki_pv_rev_topic['topic'].fillna(value='Unknown')
# Proportion is relative to the top-1M-pages total, not to all enwiki views.
enwiki_pv_rev_topic['proportion']= enwiki_pv_rev_topic['pageviews']/enwiki_pv_rev_topic['pageviews'].sum()
# Preview: 50 most-viewed pages with their predicted topics.
enwiki_pv_rev_topic[['page_title','pageviews','proportion','topic']].sort_values(by='pageviews', ascending=False).reset_index(drop=True).head(50)
The table below shows the page views by topic for the top 1M pages viewed in March 2019 on English Wikipedia. Each topic's corresponding proportion of the total page views of the top 1M pages is also calculated. The Main page is excluded from this table, so the proportions do not sum to 100%.
# Page views and view share per mid-level topic, Main_Page excluded.
# Fix: selecting multiple groupby columns with a bare tuple
# (['pageviews', 'proportion'] inside single brackets) was deprecated in
# pandas 0.25 and removed in 2.0 — a list selection needs double brackets.
enwiki_pv_rev_topic_summary = (enwiki_pv_rev_topic[enwiki_pv_rev_topic.page_title != 'Main_Page']
                               .groupby('topic', as_index=False)[['pageviews', 'proportion']]
                               .sum()
                               .sort_values(by='pageviews', ascending=False))
enwiki_pv_rev_topic_summary
The table below aggregates the table above by WikiProject Directory (broad topics).
# Roll mid-level topics up to the WikiProject Directory top level: the broad
# topic is the prefix before the first '.' (e.g. 'Culture.Media' -> 'Culture').
enwiki_pv_rev_topic_summary['broad topic'] = enwiki_pv_rev_topic_summary.topic.str.split(pat=".", n=1, expand=True)[0]
# Fix: multi-column groupby selection needs a list in double brackets; the
# tuple form ['pageviews', 'proportion'] was removed in pandas 2.0.
enwiki_pv_rev_topic_summary.groupby('broad topic', as_index=False)[['pageviews', 'proportion']].sum().sort_values(by='pageviews', ascending=False)