https://phabricator.wikimedia.org/T214628
This analysis use the editing (MobileWikiAppEdit) and account creating (MobileWikiAppCreateAccount) event logging table to find out how many users got block error messages, thus we are only counting users who opt in to share their usage report with us -- the real number should be larger. We didn't see any error messages relating to block for Android on both tables, which may be a data bug on Android.
Caveat: Since we didn't send block ID/wiki user ID to the eventlogging table, we can only join the eventlogging data with Ipblocks table and Logging table by IP addresses to check whether an error message is an IP block. Therefore, the block type we're talking about here are defined as:
Caveat: Since the the error messages in MobileWikiAppCreateAccount table are translated, any message that contain the word "IP" will be taken as a IP block error message. Also, since we didn't send block ID/wiki user ID to the eventlogging table, we can only join the eventlogging data with Ipblocks table and Logging table by IP addresses to check whether an error message is an IP block.
%load_ext sql_magic
import findspark, os
os.environ['SPARK_HOME'] = '/usr/lib/spark2';
findspark.init()
import pyspark
import pyspark.sql
conf = pyspark.SparkConf() # Use master yarn here if you are going to query large datasets.
sc = pyspark.SparkContext(conf=conf)
spark_hive = pyspark.sql.HiveContext(sc)
%config SQL.conn_name = 'spark_hive'
Querying MobileWikiAppEdit table by parsing the localized error messages (all messages are in English).
%%read_sql edit_block
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
useragent.os_family,
case when event.errortext = "You have been blocked from editing." then event.errortext
when event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then event.errortext
when event.errortext like "%Your IP address is in a range which has been blocked%" then "Your IP address is in a range which has been blocked."
else "other" end as error_msg
from event.mobilewikiappedit
where (year=2019
or (year=2018 and month>10)
)
and event.action='error'
and event.errortext is not null
group by useragent.os_family,
case when event.errortext = "You have been blocked from editing." then event.errortext
when event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then event.errortext
when event.errortext like "%Your IP address is in a range which has been blocked%" then "Your IP address is in a range which has been blocked."
else "other" end
Android didn't see any error messages that contains the word "block"...
%%read_sql edit_block_by_type
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end as block_type
from event.mobilewikiappedit left join blocked_ip on mobilewikiappedit.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and useragent.os_family = "iOS"
and event.action='error'
and event.errortext like "%block%"
group by
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end
We only listed those that has 5+ IP addresses blocked.
%%read_sql edit_block_by_type_country
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
geocoded_data['country'] as country,
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end as block_type
from event.mobilewikiappedit left join blocked_ip on mobilewikiappedit.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and useragent.os_family = "iOS"
and event.action='error'
and event.errortext like "%block%"
group by geocoded_data['country'],
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end
having n_ip > 5
order by n_ip desc
We only listed those that has 5+ IP addresses blocked. Please note that the value of wiki may not be the wiki of the article being edited -- we just fixed this bug in v6.2
%%read_sql edit_block_by_type_wiki
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
wiki,
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end as block_type
from event.mobilewikiappedit left join blocked_ip on mobilewikiappedit.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and useragent.os_family = "iOS"
and event.action='error'
and event.errortext like "%block%"
group by wiki,
case when event.errortext like "%Your IP address is in a range which has been blocked%" then "IP range block"
when event.errortext = "You have been blocked from editing." and blocked_ip is not null then "IP block"
when (event.errortext = "You have been blocked from editing." and not event.anon) or event.errortext = "Your IP address has been blocked automatically, because it was used by a blocked user." then "ID block"
else "Unknown" end
having n_ip > 5
order by n_ip desc
Querying MobileWikiAppCreateAccount table by parsing the localized error messages. Since the the error messages here are translated, the matching is fuzzy, i.e. any message that contain the word "IP" will be taken as a IP block message.
%%read_sql registration_block
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
useragent.os_family,
case when event.errortext like "%IP%" and blocked_ip is null then "Unconfirmed IP block"
when event.errortext like "%IP%" and blocked_ip is not null then "IP block"
else "other" end as error_msg
from event.mobilewikiappcreateaccount left join blocked_ip on mobilewikiappcreateaccount.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and event.action='error'
and event.errortext is not null
group by useragent.os_family,
case when event.errortext like "%IP%" and blocked_ip is null then "Unconfirmed IP block"
when event.errortext like "%IP%" and blocked_ip is not null then "IP block"
else "other" end
Android's n_users is 0 because didn't send any app install ID to the eventlogging table...
We only listed those that has 5+ IP addresses blocked.
%%read_sql registration_block_by_type_country
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
geocoded_data['country'] as country,
IF(blocked_ip is not null, "IP block", "Unconfirmed IP block") as block_type
from event.mobilewikiappcreateaccount left join blocked_ip on mobilewikiappcreateaccount.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and useragent.os_family = "iOS"
and event.action='error'
and event.errortext like "%IP%"
group by geocoded_data['country'],
IF(blocked_ip is not null, "IP block", "Unconfirmed IP block")
having n_ip > 5
order by n_ip desc
We only listed those that has 5+ IP addresses blocked.
%%read_sql registration_block_by_type_wiki
with blocked_ip as (
select distinct ip as blocked_ip
from (
select distinct log_title as ip
from wmf_raw.mediawiki_logging
where snapshot='2018-12'
and log_type = "block"
and log_action in ("block", "reblock")
and log_namespace=2
and substr(log_timestamp, 0, 4) > 2016
union
select distinct ipb_address as ip
from wmf_raw.mediawiki_ipblocks
where mediawiki_ipblocks.snapshot='2019-01'
) as tbl
)
select count(distinct coalesce(event.app_install_id, event.appInstallID)) as n_users,
count(distinct ip) as n_ip,
wiki,
IF(blocked_ip is not null, "IP block", "Unconfirmed IP block") as block_type
from event.mobilewikiappcreateaccount left join blocked_ip on mobilewikiappcreateaccount.ip=blocked_ip.blocked_ip
where (year=2019
or (year=2018 and month>10)
)
and useragent.os_family = "iOS"
and event.action='error'
and event.errortext like "%IP%"
group by wiki,
IF(blocked_ip is not null, "IP block", "Unconfirmed IP block")
having n_ip > 5
order by n_ip desc