Data

Get Q4 TestSearchSatisfaction2 (TSS2) eventlogging data:
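
The chunk below is only a minimal sketch of how the raw TSS2 events might be pulled, assuming wmf::mysql_read is available for querying the EventLogging replicas; the schema revision, field names, and filters are assumptions, not the exact query used, and tss2_searches (used later) is a per-search summary derived from these events.

# Sketch only: the schema revision and field names are assumptions.
tss2_table <- "TestSearchSatisfaction2_16909631" # placeholder revision
start_date <- as.Date("2017-04-01")
end_date <- as.Date("2017-06-30")
tss2_q4 <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
  cat("Fetching eventlogging data from ", as.character(date), "\n")
  query <- paste0("SELECT
      LEFT(timestamp, 8) AS date,
      timestamp,
      event_searchSessionId AS session_id,
      event_pageViewId AS serp_id,
      event_action AS event,
      event_position,
      wiki
    FROM ", tss2_table, "
    WHERE LEFT(timestamp, 8) = '", format(date, "%Y%m%d"), "'
      AND event_source = 'fulltext';")
  wmf::mysql_read(query, "log")
}))
tss2_q4$date <- as.Date(tss2_q4$date, format = "%Y%m%d")
readr::write_rds(tss2_q4, "data/quarterly_metrics/tss2_q4.rds", "gz")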

Get daily SERP, search, and user counts:

# Count daily SERPs
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
  cat("Fetching webrequest data from ", as.character(date), "\n")
  clause_data <- wmf::date_clause(date)
  query <- paste("USE wmf;
    SELECT
      date,
      access_method,
      agent_type, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
      COUNT(*) AS n_serp, -- a lot of NULL searchToken, so don't count DISTINCT CONCAT(client_ip, user_agent, searchToken)
      COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
    FROM (
      SELECT 
        CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
        access_method,
        client_ip, 
        user_agent,
        agent_type,
        PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search') AS query,
        PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken') AS searchToken -- a lot of NULL
      FROM webrequest",
    clause_data$date_clause,
    "AND webrequest_source = 'text'
    AND is_pageview
    -- flag for pageviews that are search results pages
    AND page_id IS NULL
    AND (
      uri_path = '/wiki/Special:Search'
      OR (
        uri_path = '/w/index.php'
        AND (
          LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search')) > 0
          OR LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken')) > 0
        )
      )
    )
  ) AS serp
  GROUP BY date, access_method, agent_type;") 
  results <- wmf::query_hive(query)
  return(results)
}))
readr::write_rds(daily_serp, "data/quarterly_metrics/daily_serp.rds", "gz")
# Local
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_serp.rds data/")

Get daily pageviews from SERPs, counting searches and users:

# Count daily pageviews from SERPs
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_pv_from_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
  cat("Fetching webrequest data from ", as.character(date), "\n")
  clause_data <- wmf::date_clause(date)
  query <- paste("USE wmf;
    SELECT
      date,
      access_method,
      agent_type, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
      COUNT(*) AS n_pv, 
      COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
    FROM (
      SELECT 
        CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
        access_method,
        client_ip, 
        user_agent,
        agent_type,
        PARSE_URL(referer, 'QUERY', 'search') AS query,
        PARSE_URL(referer, 'QUERY', 'searchToken') AS searchToken
      FROM webrequest",
    clause_data$date_clause,
    "AND webrequest_source = 'text'
    AND is_pageview
    -- only those that have been referred by a search results page:
    AND referer_class = 'internal'
    AND (
      LENGTH(PARSE_URL(referer, 'QUERY', 'search')) > 0
      OR LENGTH(PARSE_URL(referer, 'QUERY', 'searchToken')) > 0
    )
  ) AS pv
  GROUP BY date, access_method, agent_type;")  
  results <- wmf::query_hive(query)
  return(results)
}))
readr::write_rds(daily_pv_from_serp, "data/quarterly_metrics/daily_pv_from_serp.rds", "gz")
# Local
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_pv_from_serp.rds data/")

Completion Rate

absolute searches → results shown (non-zero results) → clickthroughs

The first two graphs use webrequest data, which counts the absolute number of full-text searches on all platforms. The median non-zero results rate is 81.3% and the median clickthrough rate is 75.5%, which is suspiciously high; we haven't found the reason yet.

daily_search_automata <- daily_serp_no_access %>%
  group_by(date) %>%
  summarise(n_search = sum(n_search))

zrr_with_automata <- polloi::read_dataset("discovery/metrics/search/cirrus_query_breakdowns_with_automata.tsv", col_types = "Dcd") %>%
  dplyr::filter(!is.na(query_type), !is.na(rate)) %>%
  mutate(date = lubridate::ymd(date),
         query_type = if_else(grepl("full.?text", query_type, ignore.case = TRUE), "fulltext", query_type)) %>%
  filter(date >= as.Date("2017-04-23"), date < as.Date("2017-07-01"), query_type == "fulltext")

daily_clicked_search_automata <- daily_pv_from_serp_no_access %>%
  group_by(date) %>%
  summarise(clicked_search = sum(n_search))

daily_comp_rate <- daily_search_automata %>%
  mutate(search_with_result = n_search * (1 - zrr_with_automata$rate)) %>% # assumes zrr_with_automata rows align with daily_search_automata by date
  inner_join(daily_clicked_search_automata, by = "date")
  
daily_comp_rate %>%
  rename(`All Searches` = n_search, `Searches with Results` = search_with_result, `Searches with Clicks` = clicked_search) %>%
  gather(key=Group, value = count, -date) %>%
  ggplot(aes(x=date, y=count, colour=Group)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels=polloi::compress, name = "Number of Searches") +
  ggtitle("Fulltext Searches on All Platforms", subtitle = "Webrequest data") +
  theme(legend.position = "bottom") +
  annotate("segment", x = as.Date("2017-07-01"), xend = as.Date("2017-07-01"), 
           y = 3000000, yend = 4000000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
  annotate("text", x = as.Date("2017-07-05"), y = 3500000, label = "81.3%") +
  annotate("segment", x = as.Date("2017-07-01"), xend = as.Date("2017-07-01"), 
           y = 2300000, yend = 2900000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
  annotate("text", x = as.Date("2017-07-05"), y = 2600000, label = "75.5%") 

daily_comp_rate <- daily_comp_rate %>%
  mutate(ctr = clicked_search/search_with_result)

daily_comp_rate %>%
ggplot(aes(x=date, y=ctr)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Clickthrough Rate") +
  ggtitle("Clickthrough Rate for Fulltext Searches on All Platforms", subtitle = "Webrequest data") +
  theme(legend.position = "bottom")
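
For reference, the medians quoted above can be computed directly from daily_comp_rate (a quick check, assuming the objects from the previous chunks):

median(daily_comp_rate$search_with_result / daily_comp_rate$n_search) # non-zero results rate, ~0.813
median(daily_comp_rate$ctr) # clickthrough rate, ~0.755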

Instead, I suggest we use eventlogging data, as shown below. The median non-zero results rate is 77.6% and the median clickthrough rate is 32.3%.

tss2_searches %>%
  group_by(date) %>%
  summarise(`All Searches` = n(), 
            `Searches with Results` = sum(`got same-wiki results`, na.rm = TRUE), 
            `Searches with Clicks` = sum(`same-wiki clickthrough`, na.rm = TRUE)) %>%
  gather(key=Group, value = count, -date) %>%
  ggplot(aes(x=date, y=count, colour=Group)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels=polloi::compress, name = "Number of Searches") +
  ggtitle("Fulltext Searches on Desktop", subtitle = "Eventlogging data") +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
               linetype = "dashed", color = "black") +
  geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
             linetype = "dashed", color = "black") + 
  geom_vline(xintercept = as.numeric(as.Date("2017-06-28")),
             linetype = "dashed", color = "black") + 
  annotate("text", x = as.Date("2017-04-26"), y = 15000, label = "sample rate changed", angle = 90) +
  # annotate("text", x = as.Date("2017-06-16"), y = 15000, label = "sister search deployed", angle = 90) +
  # annotate("text", x = as.Date("2017-06-29"), y = 15000, label = "started to collect data", angle = 90) +
  annotate("segment", x = as.Date("2017-08-01"), xend = as.Date("2017-08-01"), 
           y = 20000, yend = 30000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
  annotate("text", x = as.Date("2017-08-07"), y = 25000, label = "77.6%") +
  annotate("segment", x = as.Date("2017-08-01"), xend = as.Date("2017-08-01"), 
           y = 7000, yend = 19000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
  annotate("text", x = as.Date("2017-08-07"), y = 13000, label = "32.3%") +
  labs(caption = "*On April 25th, we changed the sample rates for several projects. On 2017-06-15 we deployed the sister search feature to all Wikipedia in all languages and started to collect eventlogging data on 2017-06-29.")
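
The medians annotated above can be computed from the same per-day summary. The clickthrough denominator here (searches with results) mirrors the definition used for the webrequest data; that this matches the quoted 32.3% is an assumption:

tss2_searches %>%
  group_by(date) %>%
  summarise(nonzero_rate = sum(`got same-wiki results`, na.rm = TRUE) / n(),
            ctr = sum(`same-wiki clickthrough`, na.rm = TRUE) /
              sum(`got same-wiki results`, na.rm = TRUE)) %>%
  summarise(median(nonzero_rate), median(ctr)) # ~77.6% and ~32.3%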

Return Rate

Users may click back to the search page directly after they click through to an article (within 10 minutes). We computed two kinds of return rates: the rate of returning to the same search page, and the rate of returning to make a different search.

The median rate of return to the same search is 24%, and the median rate of return to make a different search is 26% (after the sample rate change).

returnRate_to_same_search <- tss2_q4 %>%
  group_by(date, serp_id) %>%
  filter(sum(grepl("click", event)) > 0) %>% # Among search with at least 1 click
  arrange(timestamp) %>%
  mutate(n_click_cumsum = cumsum(grepl("click", event))) %>%
  filter(n_click_cumsum > 0) %>% # delete serp before first click
  summarise(comeback = "searchResultPage" %in% event | any(n_click_cumsum > 1)) %>% # came back to the same SERP or made another click
  group_by(date) %>%
  summarise(return_to_same_search = sum(comeback)/length(unique(serp_id))*100)

returnRate_to_other_search <-  tss2_q4 %>%
  group_by(date, session_id) %>%
  filter(sum(grepl("click", event)) > 0) %>% # Among session with at least 1 click
  arrange(timestamp) %>%
  mutate(n_click_cumsum = cumsum(grepl("click", event))) %>%
  filter(n_click_cumsum > 0) %>% # delete serp before first click
  summarise(another_search = length(unique(serp_id)) > 1) %>% # comeback to make another search
  group_by(date) %>%
  summarise(return_to_make_another_search = sum(another_search) / length(unique(session_id))*100)
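
The medians quoted above (after the April 25th sample rate change) can be checked with:

returnRate_to_same_search %>%
  inner_join(returnRate_to_other_search, by = "date") %>%
  filter(date > as.Date("2017-04-25")) %>%
  summarise(median(return_to_same_search), median(return_to_make_another_search)) # ~24% and ~26%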

returnRate_to_same_search %>%
  inner_join(returnRate_to_other_search, by = "date") %>%
  rename(`Return to make different search` = return_to_make_another_search,
         `Return to the same search page` = return_to_same_search) %>%
  gather(key=Type, value = rate, -date) %>%
  ggplot(aes(x=date, y=rate, colour=Type)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Return Rate (%)") +
  geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
             linetype = "dashed", color = "black") +
  annotate("text", x = as.Date("2017-04-26"), y = 20, label = "sample rate changed", angle = 90) +
  ggtitle("Return rate after users clickthrough on search engine result pages", subtitle = "Full-text search on desktop") +
  labs(caption = "*On April 25th, we changed the sample rates for several projects.") +
  theme(legend.position = "bottom")

Load time on SERP

Before the sister search feature was deployed, the median load time was 680 ms; after deployment it increased to 807 ms. Since the feature was deployed just over a month ago, we don't have enough data to raise the alarm.

desktop_loadtime <- polloi::read_dataset("discovery/metrics/search/desktop_load_times.tsv", col_types = "Dddd") %>%
  dplyr::filter(!is.na(Median), date >= "2017-01-01")
desktop_loadtime %>%
  ggplot(aes(x=date, y=Median)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels=polloi::compress, name = "Load time (ms)") +
  geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
             linetype = "dashed", color = "black") +
  annotate("text", x = as.Date("2017-06-16"), y = 630, label = "sister search deployed", angle = 90) +
  ggtitle("Desktop full-text search result load times", subtitle = "Median of everyday in 2017") +
  labs(caption = "*On 2017-06-15 we deployed the sister search feature to all Wikipedia in all languages.")
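
The before/after medians quoted above can be checked as follows (counting the deployment day itself as "after" is an assumption):

desktop_loadtime %>%
  mutate(period = if_else(date < as.Date("2017-06-15"), "before deployment", "after deployment")) %>%
  group_by(period) %>%
  summarise(median_load_time = median(Median)) # ~680 ms before vs ~807 ms after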

Dwell time on SERP

Has dwell time on the SERP increased because users are reading the snippets? This analysis depends on T170468, which we haven't been able to work on yet; we will present it next time.
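
Once the check-in data is available, one possible approach is sketched below; the "checkin" event and the event_checkin column are assumptions about the eventual data, not a committed method:

# Hypothetical sketch: approximate SERP dwell time by the last check-in seen on each SERP.
tss2_q4 %>%
  filter(event == "checkin") %>%
  group_by(date, serp_id) %>%
  summarise(dwell_time = max(event_checkin)) %>% # seconds; a lower bound on time on the SERP
  group_by(date) %>%
  summarise(median_dwell_time = median(dwell_time))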

Click position

On both desktop and the mobile app, more than 80% of clicks are on the first 3 positions. (Probably not very interesting, but worth mentioning verbally.)

safe_ordinals <- function(x) {
  na_mask <- is.na(x)
  output <- rep(NA_character_, length(x))
  output[!na_mask] <- vapply(x[!na_mask], toOrdinal::toOrdinal, "")
  return(output)
}

tss2_q4 %>%
  filter(event == "click") %>%
  mutate(event_position = case_when(
    event_position >= 4 & event_position < 20 ~ "5th - 20th",
    event_position >= 20 ~ "21st+",
    TRUE ~ safe_ordinals(event_position + 1)
  )) %>%
  group_by(date, event_position) %>%
  summarise(n_click = n()) %>%
  ungroup %>%
  group_by(date) %>%
  mutate(prop = n_click/sum(n_click)*100) %>%
  ggplot(aes(x=date, y=prop, colour=event_position)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Proportion of clicks (%)", trans = scales::sqrt_trans()) +
  ggtitle("Proportion of Clicks on Nth Result on Desktop", subtitle="April 1st - June 30th") +
  theme(legend.position = "bottom")

Mobile App: see http://discovery.wmflabs.org/metrics/#app_click_position

SERP visits: desktop vs. mobile web vs. API

fulltext_api <- polloi::read_dataset("discovery/metrics/search/search_api_usage.tsv", col_types = "Dci") %>%
  dplyr::filter(!is.na(api), !is.na(calls), api == "cirrus", date >= "2017-04-23", date <= "2017-06-30") %>%
  dplyr::distinct(date, api, .keep_all = TRUE) %>%
  rename(access_method=api, n_serp=calls) %>%
  mutate(access_method = "api")

daily_serp %>%
  filter(access_method != "mobile app") %>%
  group_by(date, access_method) %>%
  summarise(n_serp = sum(n_serp)) %>%
  bind_rows(fulltext_api) %>%
  ggplot(aes(x=date, y=n_serp, colour=access_method)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels=polloi::compress, name = "Number of SERPs", trans = scales::sqrt_trans()) +
  ggtitle("Number of SERPs by access methods") +
  theme(legend.position = "bottom")

daily_serp %>%
  filter(access_method != "mobile app") %>%
  group_by(date, access_method) %>%
  summarise(n_serp = sum(n_serp)) %>%
  ggplot(aes(x=date, y=n_serp, colour=access_method)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels=polloi::compress, name = "Number of SERPs") +
  scale_color_manual(values=scales::hue_pal()(3)[2:3]) + 
  ggtitle("Number of SERPs on desktop and mobile web") +
  theme(legend.position = "bottom")

Clickthrough rate by project

Among the top 5 projects with the most searches, Wikidata has the highest clickthrough rate, while Wikimedia Commons has the lowest.

top_project <- tss2_searches %>%
  cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%
  group_by(project) %>%
  summarise(n_search = n()) %>%
  top_n(5, n_search)

tss2_searches %>%
  cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%  
  filter(project %in% top_project$project) %>%
  group_by(date, project) %>%
  summarise(ctr = sum(`same-wiki clickthrough`, na.rm = TRUE)/n()) %>%
  ggplot(aes(x=date, y=ctr, colour=project)) + 
  geom_line(size=1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Clickthrough Rate") +
  ggtitle("Clickthrough rate by top 5 projects", subtitle = "Full-text search on desktop") +
  theme(legend.position = "bottom") +
  geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
             linetype = "dashed", color = "black") +
  annotate("text", x = as.Date("2017-04-26"), y = 0.2, label = "sample rate changed", angle = 90) +
  labs(caption = "*On April 25th, we changed the sample rates for several projects.")