Data

Get Q4 TSS2:

Get daily SERP, search, user count:

#Count daily SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
  cat("Fetching webrequest data from ", as.character(date), "\n")
  clause_data <- wmf::date_clause(date)
  query <- paste("USE wmf;
    SELECT
      date,
      access_method,
      agent_type, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
      COUNT(*) AS n_serp, -- a lot of NULL searchToken, so don't count DISTINCT CONCAT(client_ip, user_agent, searchToken)
      COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
    FROM (
      SELECT 
        CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
        access_method,
        client_ip, 
        user_agent,
        agent_type,
        PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search') AS query,
        PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken') AS searchToken -- a lot of NULL
      FROM webrequest",
    clause_data$date_clause,
    "AND webrequest_source = 'text'
    AND is_pageview
    -- flag for pageviews that are search results pages
    AND page_id IS NULL
    AND (
      uri_path = '/wiki/Special:Search'
      OR (
        uri_path = '/w/index.php'
        AND (
          LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search')) > 0
          OR LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken')) > 0
        )
      )
    )
  ) AS serp
  GROUP BY date, access_method, agent_type;") 
  results <- wmf::query_hive(query)
  return(results)
}))
readr::write_rds(daily_serp, "data/quarterly_metrics/daily_serp.rds", "gz")
# Local
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_serp.rds data/")

Daily pageviews from SERP, count search and user:

#Count daily Pageviews from SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_pv_from_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
  cat("Fetching webrequest data from ", as.character(date), "\n")
  clause_data <- wmf::date_clause(date)
  query <- paste("USE wmf;
    SELECT
      date,
      access_method,
      agent_type, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
      COUNT(*) AS n_pv, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
    FROM (
      SELECT 
        CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
        access_method,
        client_ip,