import pandas as pd
import numpy as np
import wmfdata as wmf
import ast
from wmfdata import spark, mariadb
import json
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 50)
You are using Wmfdata v2.0.0, but v2.0.1 is available. To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`. To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.
spark.run("DESC event.mediawiki_editattempt_block")
col_name | data_type | comment | |
---|---|---|---|
0 | _schema | string | None |
1 | block_expiry | string | None |
2 | block_id | string | None |
3 | block_scope | string | None |
4 | block_type | string | None |
5 | country_code | string | None |
6 | database | string | None |
7 | dt | string | None |
8 | http | struct<client_ip:string,has_cookies:boolean,me... | None |
9 | interface | string | None |
10 | meta | struct<domain:string,dt:string,id:string,reque... | None |
11 | page_id | bigint | None |
12 | page_is_redirect | boolean | None |
13 | page_namespace | bigint | None |
14 | page_title | string | None |
15 | performer | struct<user_edit_count:bigint,user_groups:arra... | None |
16 | platform | string | None |
17 | rev_id | bigint | None |
18 | geocoded_data | map<string,string> | None |
19 | user_agent_map | map<string,string> | None |
20 | is_wmf_domain | boolean | None |
21 | normalized_host | struct<project_class:string,project:string,qua... | None |
22 | datacenter | string | None |
23 | year | bigint | None |
24 | month | bigint | None |
25 | day | bigint | None |
26 | hour | bigint | None |
27 | # Partition Information | ||
28 | # col_name | data_type | comment |
29 | datacenter | string | None |
30 | year | bigint | None |
31 | month | bigint | None |
32 | day | bigint | None |
33 | hour | bigint | None |
query = '''
SELECT
performer.user_id,
block_id,
interface,
block_scope,
country_code,
user_agent_map,
meta.id,
http.client_ip
FROM event.mediawiki_editattempt_block
WHERE year = 2023
AND month = 10
AND day = 1
AND user_agent_map.browser_family = "bingbot"
AND database = "enwiki"
AND block_type IN ("ip", "range")
'''
edit_attempt_block = spark.run(query)
print('Number of edit attempt block events: %d' % edit_attempt_block['block_id'].count())
Number of edit attempt block events: 73424
ip_and_range_blocks_query = '''
SELECT
actor_name,
ipblocks.ipb_id,
ipblocks.ipb_by_actor,
ipblocks.ipb_expiry,
comment_text
FROM ipblocks
JOIN actor ON actor.actor_id = ipblocks.ipb_by_actor
JOIN comment ON comment.comment_id = ipblocks.ipb_reason_id
WHERE ipb_user = 0
'''
# Note, the comment_text is for a local block. The edit_attempt_block schema logs
# both local and global blocks, and in case of a composite block, it's impossible
# to determine if the block ID is global/local, because the IDs are incremented
# and the same ID can appear in the centralauth.globalblock table as well as ipblocks,
# while referring to completely different block types.
ip_and_range_blocks = mariadb.run(ip_and_range_blocks_query, 'enwiki')
# `block_id` can be a list of blocks when the block object is an instance of
# CompositeBlock. e.g. the the user blocked by a local, enwiki block against an
# IP, and the same block exists globally in centralauth.globalblock.
# Transform the edit attempt block data to create new rows for each block_id in a list of block IDs.
edit_attempt_block_transformed = edit_attempt_block.copy()
print('Number of rows before transformation: %d' % edit_attempt_block_transformed['block_id'].count())
edit_attempt_block_transformed['block_id'] = edit_attempt_block_transformed['block_id'].apply(ast.literal_eval)
edit_attempt_block_transformed = edit_attempt_block_transformed.explode('block_id')
print('Number of rows after transformation: %d' % edit_attempt_block_transformed['block_id'].count())
# The number of rows roughly doubles. This is plausible if we look at edit_attempt_block['block_id'].value_counts(),
# which shows that the most commonly occurring block IDs are lists.
Number of rows before transformation: 73424 Number of rows after transformation: 163071
intersection = pd.merge(edit_attempt_block_transformed, ip_and_range_blocks, left_on='block_id', right_on='ipb_id')
useragent = pd.json_normalize(intersection.user_agent_map)
intersection = pd.concat([intersection.drop('user_agent_map', axis=1), useragent], axis=1)
print('Number of edit attempt block events: %d' % intersection['id'].count())
intersection.groupby(['interface']).size()
Number of edit attempt block events: 89966
interface wikieditor 89966 dtype: int64
intersection.groupby('comment_text').size().sort_values(ascending=False)
comment_text {{colocationwebhost}} <!-- MS Azure --> 39199 {{colocationwebhost}} <!-- Microsoft Azure --> 17336 {{webhostblock}} <!-- Microsoft Azure (MICROSOFT-CORP-MSN-AS-BLOCK, US, AS8075) --> 16508 {{colocationwebhost}} <!-- MSFT-GFS (MICROSOFT-CORP-MSN-AS-BLOCK, AS8075) --> 11989 {{colocationwebhost}} <!-- MICROSOFT-GLOBAL-NET (MICROSOFT-CORP-MSN-AS-BLOCK, AS8075) --> 4519 {{webhostblock}} <!-- Gigabit Hosting Sdn Bhd (AS55720) (GIGABIT-MY Gigabit Hosting Sdn Bhd, MY, AS55720) --> 239 {{webhostblock}} <!-- Avast VPN (AVAST-AS-DC, CZ, AS198605) --> 80 {{webhostblock}} <!-- ServeTheWorld AS (AS34989) (SERVETHEWORLD-AS, NO, AS34989) --> 39 {{Colocationwebhost}} <!-- Amazon Web Services --> 17 {{webhostblock}} <!-- ColoCrossing (SERVER-MANIA, CA, AS55286) --> 9 {{CDNblock}} <!-- Cloudflare --> 7 {{Colocationwebhost}} <!-- Serverion --> 4 {{webhostblock}} <!-- Serverion (AS-SERVERION Serverion B.V., NL, AS213035) --> 4 {{colocationwebhost}} <!-- DigitalOcean, proxies --> 3 {{colocationwebhost}} <!-- AL-3 (Alibaba (US) Technology Co., Ltd., AS45102) --> 3 {{Colocationwebhost}} <!-- G-Core Labs --> 3 {{Colocationwebhost}} <!-- Microsoft Azure --> 2 {{colocationwebhost}} <!-- DIGITALOCEAN-165-232-32-0 (DIGITALOCEAN-ASN, AS14061) --> 1 {{webhostblock}} <!-- DIGITALOCEAN-ASN, US (AS14061) --> 1 {{webhostblock}} <!-- OVH-200141d00000 (OVH SAS, AS16276) (OVH, FR, AS16276) --> 1 {{Colocationwebhost}} <!-- Cloudflare --> 1 {{webhostblock}}: <!-- GoDaddy.com --> 1 dtype: int64
intersection.groupby('country_code').size().sort_values(ascending=False)
country_code US 89631 MY 128 Unknown 112 NO 39 ZA 32 DE 15 HK 3 JP 3 NL 3 dtype: int64
intersection.groupby(['comment_text','client_ip']).size().sort_values(ascending=False).nlargest(n=25)
comment_text client_ip {{colocationwebhost}} <!-- MS Azure --> 127.0.0.1 3931 {{colocationwebhost}} <!-- Microsoft Azure --> 127.0.0.1 1799 {{webhostblock}} <!-- Microsoft Azure (MICROSOFT-CORP-MSN-AS-BLOCK, US, AS8075) --> 127.0.0.1 1639 {{colocationwebhost}} <!-- MS Azure --> 52.167.144.24 1333 52.167.144.238 1327 52.167.144.231 1272 52.167.144.145 1264 52.167.144.192 1263 52.167.144.195 1216 52.167.144.181 1201 ... 52.167.144.210 931 {{colocationwebhost}} <!-- Microsoft Azure --> 40.77.167.70 924 {{colocationwebhost}} <!-- MS Azure --> 52.167.144.221 923 {{colocationwebhost}} <!-- MICROSOFT-GLOBAL-NET (MICROSOFT-CORP-MSN-AS-BLOCK, AS8075) --> 207.46.13.154 922 {{webhostblock}} <!-- Microsoft Azure (MICROSOFT-CORP-MSN-AS-BLOCK, US, AS8075) --> 207.46.13.154 922 {{colocationwebhost}} <!-- MS Azure --> 52.167.144.21 914 52.167.144.222 898 {{colocationwebhost}} <!-- Microsoft Azure --> 40.77.167.25 892 {{colocationwebhost}} <!-- MSFT-GFS (MICROSOFT-CORP-MSN-AS-BLOCK, AS8075) --> 157.55.39.51 839 {{webhostblock}} <!-- Microsoft Azure (MICROSOFT-CORP-MSN-AS-BLOCK, US, AS8075) --> 157.55.39.51 839 Length: 25, dtype: int64