Get Q4 TSS2:
Get daily SERP, search, user count:
#Count daily SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
cat("Fetching webrequest data from ", as.character(date), "\n")
clause_data <- wmf::date_clause(date)
query <- paste("USE wmf;
SELECT
date,
access_method,
agent_type,
COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
COUNT(*) AS n_serp, -- a lot of NULL searchToken, so don't count DISTINCT CONCAT(client_ip, user_agent, searchToken)
COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
FROM (
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
client_ip,
user_agent,
agent_type,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search') AS query,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken') AS searchToken -- a lot of NULL
FROM webrequest",
clause_data$date_clause,
"AND webrequest_source = 'text'
AND is_pageview
-- flag for pageviews that are search results pages
AND page_id IS NULL
AND (
uri_path = '/wiki/Special:Search'
OR (
uri_path = '/w/index.php'
AND (
LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search')) > 0
OR LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken')) > 0
)
)
)
) AS serp
GROUP BY date, access_method, agent_type;")
results <- wmf::query_hive(query)
return(results)
}))
readr::write_rds(daily_serp, "data/quarterly_metrics/daily_serp.rds", "gz")
# Local
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_serp.rds data/")
Daily pageviews from SERP, count search and user:
#Count daily Pageviews from SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_pv_from_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
cat("Fetching webrequest data from ", as.character(date), "\n")
clause_data <- wmf::date_clause(date)
query <- paste("USE wmf;
SELECT
date,
access_method,
agent_type,
COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
COUNT(*) AS n_pv,
COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
FROM (
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
client_ip,