Get Q4 TSS2 (TestSearchSatisfaction2) eventlogging data:
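The TSS2 events live in the EventLogging MySQL database. A minimal sketch of the fetch, assuming wmf::mysql_read() and the TestSearchSatisfaction2 table (the schema revision suffix and field names below are assumptions and should be checked against the active schema), could look like:
# Sketch only: fetch Q4 TestSearchSatisfaction2 (TSS2) events from the EventLogging "log" database.
# The table revision suffix is a placeholder -- substitute whichever revision was active in Q4.
tss2_query <- "SELECT
  LEFT(timestamp, 8) AS date,
  timestamp,
  wiki,
  event_searchSessionId AS session_id,
  event_pageViewId AS serp_id,
  event_action AS event,
  event_position
FROM TestSearchSatisfaction2_16270835
WHERE LEFT(timestamp, 6) IN ('201704', '201705', '201706')
  AND event_source = 'fulltext'"
tss2_q4 <- wmf::mysql_read(tss2_query, "log") %>%
  mutate(date = lubridate::ymd(date), timestamp = lubridate::ymd_hms(timestamp))
The tss2_searches data frame used below would then be an aggregation of these events to one row per search; that step is not reproduced here.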
Get daily SERP, search, and user counts:
#Count daily SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
cat("Fetching webrequest data from ", as.character(date), "\n")
clause_data <- wmf::date_clause(date)
query <- paste("USE wmf;
SELECT
date,
access_method,
agent_type,
COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
COUNT(*) AS n_serp, -- a lot of NULL searchToken, so don't count DISTINCT CONCAT(client_ip, user_agent, searchToken)
COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
FROM (
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
client_ip,
user_agent,
agent_type,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search') AS query,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken') AS searchToken -- a lot of NULL
FROM webrequest",
clause_data$date_clause,
"AND webrequest_source = 'text'
AND is_pageview
-- search results pages: pageviews with no page_id and a search-related URI
AND page_id IS NULL
AND (
uri_path = '/wiki/Special:Search'
OR (
uri_path = '/w/index.php'
AND (
LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search')) > 0
OR LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken')) > 0
)
)
)
) AS serp
GROUP BY date, access_method, agent_type;")
results <- wmf::query_hive(query)
return(results)
}))
readr::write_rds(daily_serp, "data/quarterly_metrics/daily_serp.rds", "gz")
# Run locally to copy the file from the stat machine
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_serp.rds data/")
Get daily pageviews from SERPs, with search and user counts:
#Count daily Pageviews from SERP
start_date <- as.Date("2017-04-22")
end_date <- as.Date("2017-06-30")
daily_pv_from_serp <- do.call(rbind, lapply(seq(start_date, end_date, "day"), function(date) {
cat("Fetching webrequest data from ", as.character(date), "\n")
clause_data <- wmf::date_clause(date)
query <- paste("USE wmf;
SELECT
date,
access_method,
agent_type,
COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
COUNT(*) AS n_pv,
COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user
FROM (
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
client_ip,
user_agent,
agent_type,
PARSE_URL(referer, 'QUERY', 'search') AS query,
PARSE_URL(referer, 'QUERY', 'searchToken') AS searchToken
FROM webrequest",
clause_data$date_clause,
"AND webrequest_source = 'text'
AND is_pageview
-- only those that have been referred by a search results page:
AND referer_class = 'internal'
AND (
LENGTH(PARSE_URL(referer, 'QUERY', 'search')) > 0
OR LENGTH(PARSE_URL(referer, 'QUERY', 'searchToken')) > 0
)
) AS pv
GROUP BY date, access_method, agent_type;")
results <- wmf::query_hive(query)
return(results)
}))
readr::write_rds(daily_pv_from_serp, "data/quarterly_metrics/daily_pv_from_serp.rds", "gz")
# Run locally to copy the file from the stat machine
system("scp chelsyx@stat5:~/data/quarterly_metrics/daily_pv_from_serp.rds data/")
Absolute searches -> results shown (non-ZRR) -> clickthroughs
The first two graphs use webrequest data, which counts the absolute number of full-text searches on all platforms. The median non-zero result rate is 81.3% and the median clickthrough rate is 75.5%, which is implausibly high. We have not yet determined the reason.
daily_search_automata <- daily_serp_no_access %>%
group_by(date) %>%
summarise(n_search = sum(n_search))
zrr_with_automata <- polloi::read_dataset("discovery/metrics/search/cirrus_query_breakdowns_with_automata.tsv", col_types = "Dcd") %>%
dplyr::filter(!is.na(query_type), !is.na(rate)) %>%
mutate(date = lubridate::ymd(date),
query_type = if_else(grepl("full.?text", query_type, ignore.case = TRUE), "fulltext", query_type)) %>%
filter(date >= as.Date("2017-04-23"), date < as.Date("2017-07-01"), query_type == "fulltext")
daily_clicked_search_automata <- daily_pv_from_serp_no_access %>%
group_by(date) %>%
summarise(clicked_search = sum(n_search))
daily_comp_rate <- daily_search_automata %>%
mutate(search_with_result = n_search * (1 - zrr_with_automata$rate)) %>% # assumes zrr_with_automata rows align with daily_search_automata by date
inner_join(daily_clicked_search_automata, by = "date")
daily_comp_rate %>%
rename(`All Searches` = n_search, `Searches with Results` = search_with_result, `Searches with Clicks` = clicked_search) %>%
gather(key=Group, value = count, -date) %>%
ggplot(aes(x=date, y=count, colour=Group)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Number of Searches") +
ggtitle("Fulltext Searches on All Platforms", subtitle = "Webrequest data") +
theme(legend.position = "bottom") +
annotate("segment", x = as.Date("2017-07-01"), xend = as.Date("2017-07-01"),
y = 3000000, yend = 4000000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
annotate("text", x = as.Date("2017-07-05"), y = 3500000, label = "81.3%") +
annotate("segment", x = as.Date("2017-07-01"), xend = as.Date("2017-07-01"),
y = 2300000, yend = 2900000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
annotate("text", x = as.Date("2017-07-05"), y = 2600000, label = "75.5%")
daily_comp_rate <- daily_comp_rate %>%
mutate(ctr = clicked_search/search_with_result)
daily_comp_rate %>%
ggplot(aes(x=date, y=ctr)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Clickthrough Rate") +
ggtitle("Clickthrough Rate for Fulltext Searches on All Platforms", subtitle = "Webrequest data") +
theme(legend.position = "bottom")
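For reference, the medians quoted above can be recovered from daily_comp_rate as constructed here (a sketch):
# Sketch: medians behind the 81.3% and 75.5% figures quoted in the text
median(daily_comp_rate$search_with_result / daily_comp_rate$n_search) # non-zero result rate, ~0.813
median(daily_comp_rate$ctr) # clickthrough rate, ~0.755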
Instead, I suggest we use EventLogging data, as below. The median non-zero result rate is 77.6% and the median clickthrough rate is 32.3%.
tss2_searches %>%
group_by(date) %>%
summarise(`All Searches` = n(),
`Searches with Results` = sum(`got same-wiki results`, na.rm = TRUE),
`Searches with Clicks` = sum(`same-wiki clickthrough`, na.rm = TRUE)) %>%
gather(key=Group, value = count, -date) %>%
ggplot(aes(x=date, y=count, colour=Group)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Number of Searches") +
ggtitle("Fulltext Searches on Desktop", subtitle = "Eventlogging data") +
theme(legend.position = "bottom") +
geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
linetype = "dashed", color = "black") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
linetype = "dashed", color = "black") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-28")),
linetype = "dashed", color = "black") +
annotate("text", x = as.Date("2017-04-26"), y = 15000, label = "sample rate changed", angle = 90) +
# annotate("text", x = as.Date("2017-06-16"), y = 15000, label = "sister search deployed", angle = 90) +
# annotate("text", x = as.Date("2017-06-29"), y = 15000, label = "started to collect data", angle = 90) +
annotate("segment", x = as.Date("2017-08-01"), xend = as.Date("2017-08-01"),
y = 20000, yend = 30000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
annotate("text", x = as.Date("2017-08-07"), y = 25000, label = "77.6%") +
annotate("segment", x = as.Date("2017-08-01"), xend = as.Date("2017-08-01"),
y = 7000, yend = 19000, arrow=arrow(ends="both", angle=90, length=unit(.2,"cm"))) +
annotate("text", x = as.Date("2017-08-07"), y = 13000, label = "32.3%") +
labs(caption = "*On April 25th, we changed the sample rates for several projects. On 2017-06-15 we deployed the sister search feature to Wikipedia in all languages, and started to collect EventLogging data on 2017-06-29.")
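For reference, the 77.6% and 32.3% medians can be recomputed from the same tss2_searches data used in the plot above (a sketch, assuming the column names used throughout this report):
# Sketch: daily rates from eventlogging data, then medians across days
tss2_searches %>%
  group_by(date) %>%
  summarise(nonzero_rate = sum(`got same-wiki results`, na.rm = TRUE) / n(),
            clickthrough_rate = sum(`same-wiki clickthrough`, na.rm = TRUE) / n()) %>%
  summarise(median_nonzero = median(nonzero_rate), # ~0.776
            median_ctr = median(clickthrough_rate)) # ~0.323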
Users may click back to the search page directly after they click through to an article (within 10 minutes). We computed two kinds of return rates:
The median rate of return to the same search is 24%, and the median rate of return to make a different search is 26% (after the sample rate change).
returnRate_to_same_search <- tss2_q4 %>%
group_by(date, serp_id) %>%
filter(sum(grepl("click", event)) > 0) %>% # only searches (SERPs) with at least one click
arrange(timestamp) %>%
mutate(n_click_cumsum = cumsum(grepl("click", event))) %>%
filter(n_click_cumsum > 0) %>% # drop events before the first click
summarise(comeback = "searchResultPage" %in% event | sum(n_click_cumsum > 1)) %>% # returned to the same SERP or made another click
group_by(date) %>%
summarise(return_to_same_search = sum(comeback)/length(unique(serp_id))*100)
returnRate_to_other_search <- tss2_q4 %>%
group_by(date, session_id) %>%
filter(sum(grepl("click", event)) > 0) %>% # only sessions with at least one click
arrange(timestamp) %>%
mutate(n_click_cumsum = cumsum(grepl("click", event))) %>%
filter(n_click_cumsum > 0) %>% # drop events before the first click
summarise(another_search = length(unique(serp_id)) > 1) %>% # returned to make another search
group_by(date) %>%
summarise(return_to_make_another_search = sum(another_search) / length(unique(session_id))*100)
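The 24% and 26% medians quoted above come from these daily series, restricted to dates after the 2017-04-25 sample rate change (a sketch):
# Sketch: medians behind the 24% / 26% figures, after the sample rate change
median(returnRate_to_same_search$return_to_same_search[returnRate_to_same_search$date > as.Date("2017-04-25")])
median(returnRate_to_other_search$return_to_make_another_search[returnRate_to_other_search$date > as.Date("2017-04-25")])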
returnRate_to_same_search %>%
inner_join(returnRate_to_other_search, by = "date") %>%
rename(`Return to make different search` = return_to_make_another_search,
`Return to the same search page` = return_to_same_search) %>%
gather(key=Type, value = rate, -date) %>%
ggplot(aes(x=date, y=rate, colour=Type)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Return Rate (%)") +
geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
linetype = "dashed", color = "black") +
annotate("text", x = as.Date("2017-04-26"), y = 20, label = "sample rate changed", angle = 90) +
ggtitle("Return rate after users clickthrough on search engine result pages", subtitle = "Full-text search on desktop") +
labs(caption = "*On April 25th, we changed the sample rates for several projects.") +
theme(legend.position = "bottom")
daily_pv_from_serp %>%
group_by(date, access_method) %>%
summarise(n_pv = sum(n_pv)) %>%
ggplot(aes(x=date, y=n_pv, colour=access_method)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Pageviews") +
ggtitle("Pageviews from full-text search") +
theme(legend.position = "bottom")
Pageviews from full-text search on mobile web increased for three weeks in June. This increase does not appear to come from bots, since we also saw an increase in the number of users who clicked through on mobile web:
daily_pv_from_serp %>%
filter(access_method=="mobile web") %>%
rename(Pageviews = n_pv, `Number of Searches` = n_search, `Number of Users` = n_user) %>%
gather(key=Metrics, value = counts, -c(date, access_method, agent_type)) %>%
ggplot(aes(x=date, y=counts, colour=Metrics)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Counts") +
ggtitle("Mobile web traffic from full-text search") +
theme(legend.position = "bottom")
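To back up the "not bots" reading, a quick breakdown of the same data by agent_type (a sketch) shows whether the June bump comes from rows tagged as users or as spiders:
# Sketch: split mobile web pageviews from SERPs by agent type (user vs. spider)
daily_pv_from_serp %>%
  filter(access_method == "mobile web") %>%
  group_by(date, agent_type) %>%
  summarise(n_pv = sum(n_pv)) %>%
  ggplot(aes(x = date, y = n_pv, colour = agent_type)) +
  geom_line(size = 1.2) +
  scale_x_date(name = "Date") +
  scale_y_continuous(labels = polloi::compress, name = "Pageviews") +
  ggtitle("Mobile web pageviews from full-text search, by agent type") +
  theme(legend.position = "bottom")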
Wiktionary saw the largest absolute pageview increase from sister search (about 150%), but Wikiversity saw the largest proportional increase, although the proportion is still small (around 0.13%).
sister_search_traffic <- polloi::read_dataset("discovery/metrics/search/sister_search_traffic.tsv", col_types = "Dcccli") %>%
dplyr::mutate(
project = polloi::capitalize_first_letter(project),
access_method = polloi::capitalize_first_letter(access_method)
)
sister_search_traffic %>%
group_by(date, project) %>%
summarise(pageviews = sum(pageviews)) %>%
ggplot(aes(x=date, y=pageviews, colour=project)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Pageviews") +
ggtitle("Traffic to sister projects from Wikipedia SERPs, broken down by project") +
theme(legend.position = "bottom") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
linetype = "dashed", color = "black") +
labs(caption = "*On 2017-06-15 we deployed the sister search feature to Wikipedia in all languages.")
sis_pv_total <- readr::read_rds("data/project_pv.rds") %>%
filter(!(project %in% c("simple wikiquote", "simple wikibooks")), date <= "2017-07-23",
access_method != "mobile app") %>%
dplyr::mutate(
project = polloi::capitalize_first_letter(project),
access_method = polloi::capitalize_first_letter(access_method),
date = lubridate::ymd(date)
) %>%
rename(total_pv = pageviews)
sister_search_data <- sister_search_traffic %>%
group_by(date, access_method, project) %>%
summarise(pageviews = sum(pageviews)) %>%
right_join(sis_pv_total, by = c("date", "access_method", "project")) %>%
mutate(pageviews = ifelse(is.na(pageviews), 0, pageviews))
sister_search_data %>%
group_by(date, project) %>%
summarise(total_pv = sum(total_pv), pageviews = sum(pageviews), proportion = pageviews/total_pv*100) %>%
ggplot(aes(x=date, y=proportion, colour=project)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Proportion of Pageviews (%)") +
ggtitle("Proportion of pageviews from Wikipedia SERPs, broken down by project") +
theme(legend.position = "bottom") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
linetype = "dashed", color = "black") +
labs(caption = "*On 2017-06-15 we deployed the sister search feature to Wikipedia in all languages.")
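As a rough check on the numbers quoted above, the post-deployment share per project can be summarised as follows (a sketch; since sister search traffic was essentially zero before deployment, the ~0.13% quoted for Wikiversity should appear at or near the top of this ranking):
# Sketch: median share of each project's pageviews coming from Wikipedia SERPs, after deployment
sister_search_data %>%
  filter(date >= as.Date("2017-06-15")) %>%
  group_by(date, project) %>%
  summarise(proportion = sum(pageviews) / sum(total_pv) * 100) %>%
  group_by(project) %>%
  summarise(median_proportion = median(proportion)) %>%
  arrange(desc(median_proportion))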
Compared to the same-wiki clickthrough rate on Wikipedia, the sister search clickthrough rate is very low: the median sister search clickthrough rate is 0.8%, while the median same-wiki clickthrough rate is 26%. There was a dip between June 15th and June 29th; we suspect it was caused by our EventLogging implementation, and further investigation is needed.
tss2_searches %>%
cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%
filter(project == "Wikipedia") %>%
group_by(date) %>%
summarise(`Same-Wiki` = sum(`same-wiki clickthrough`, na.rm = TRUE)/n(), `Sister Search` = sum(`sister clickthrough`, na.rm = TRUE)/n()) %>%
gather(key = Group, value = ctr, -date) %>%
ggplot(aes(x=date, y=ctr, colour=Group)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Clickthrough Rate", trans = scales::sqrt_trans()) +
ggtitle("Clickthrough Rate on Wikipedia, Same-Wiki Clicks vs Sister Search Clicks", subtitle = "Full-text search on desktop") +
theme(legend.position = "bottom") +
geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
linetype = "dashed", color = "black") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
linetype = "dashed", color = "black") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-28")),
linetype = "dashed", color = "black") +
annotate("text", x = as.Date("2017-06-16"), y = 0.03, label = "deployed", angle = 90) +
annotate("text", x = as.Date("2017-06-29"), y = 0.03, label = "started to collect data", angle = 90) +
annotate("text", x = as.Date("2017-04-26"), y = 0.03, label = "sample rate changed", angle = 90) +
labs(caption = "*On 2017-06-15 we deployed the sister search feature to Wikipedia in all languages, and started to collect EventLogging data on 2017-06-29.")
Before sister search was deployed, the median load time was 680 ms; after deployment it increased to 807 ms. Since the deployment was only a little over a month ago, we don't have enough data to raise the alarm.
desktop_loadtime <- polloi::read_dataset("discovery/metrics/search/desktop_load_times.tsv", col_types = "Dddd") %>%
dplyr::filter(!is.na(Median), date >= "2017-01-01")
desktop_loadtime %>%
ggplot(aes(x=date, y=Median)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Load time (ms)") +
geom_vline(xintercept = as.numeric(as.Date("2017-06-15")),
linetype = "dashed", color = "black") +
annotate("text", x = as.Date("2017-06-16"), y = 630, label = "sister search deployed", angle = 90) +
ggtitle("Desktop full-text search result load times", subtitle = "Daily medians in 2017") +
labs(caption = "*On 2017-06-15 we deployed the sister search feature to Wikipedia in all languages.")
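The 680 ms and 807 ms figures quoted above can be recovered by splitting the daily medians at the deployment date (a sketch):
# Sketch: median of the daily median load times before vs. after the 2017-06-15 deployment
median(desktop_loadtime$Median[desktop_loadtime$date < as.Date("2017-06-15")]) # ~680 ms
median(desktop_loadtime$Median[desktop_loadtime$date >= as.Date("2017-06-15")]) # ~807 ms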
Has it increased because users are reading snippets? This depends on T170468, which we haven't been able to work on yet; we will present it next time.
On both desktop and the mobile app, more than 80% of clicks are on the first three positions (probably not very interesting, but worth mentioning verbally).
safe_ordinals <- function(x) {
na_mask <- is.na(x)
output <- rep(NA, length(x))
output[!na_mask] <- vapply(x[!na_mask], toOrdinal::toOrdinal, "")
return(output)
}
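For example, safe_ordinals() maps positions to ordinal labels while passing NA through (it requires the toOrdinal package):
# Example usage
safe_ordinals(c(1, 2, 3, NA))
# [1] "1st" "2nd" "3rd" NA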
tss2_q4 %>%
filter(event == "click") %>%
mutate(event_position = case_when(
event_position >= 4 & event_position < 20 ~ "5th - 20th",
event_position >= 20 ~ "21st+",
TRUE ~ safe_ordinals(event_position + 1)
)) %>%
group_by(date, event_position) %>%
summarise(n_click = n()) %>%
ungroup %>%
group_by(date) %>%
mutate(prop = n_click/sum(n_click)*100) %>%
ggplot(aes(x=date, y=prop, colour=event_position)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Proportion of clicks (%)", trans = scales::sqrt_trans()) +
ggtitle("Proportion of Clicks on Nth Result on Desktop", subtitle="April 1st - June 30th") +
theme(legend.position = "bottom")
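The "more than 80% in the first three positions" figure for desktop can be checked directly on the raw clicks (a sketch; event_position is zero-indexed, so the first three results are positions 0-2):
# Sketch: overall share of desktop clicks landing on the first three results
tss2_q4 %>%
  filter(event == "click") %>%
  summarise(prop_top3 = mean(event_position <= 2, na.rm = TRUE) * 100)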
Mobile App: see http://discovery.wmflabs.org/metrics/#app_click_position
fulltext_api <- polloi::read_dataset("discovery/metrics/search/search_api_usage.tsv", col_types = "Dci") %>%
dplyr::filter(!is.na(api), !is.na(calls), api == "cirrus", date >= "2017-04-23", date <= "2017-06-30") %>%
dplyr::distinct(date, api, .keep_all = TRUE) %>%
rename(access_method=api, n_serp=calls) %>%
mutate(access_method = "api")
daily_serp %>%
filter(access_method != "mobile app") %>%
group_by(date, access_method) %>%
summarise(n_serp = sum(n_serp)) %>%
bind_rows(fulltext_api) %>%
ggplot(aes(x=date, y=n_serp, colour=access_method)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Number of SERPs", trans = scales::sqrt_trans()) +
ggtitle("Number of SERPs by access methods") +
theme(legend.position = "bottom")
daily_serp %>%
filter(access_method != "mobile app") %>%
group_by(date, access_method) %>%
summarise(n_serp = sum(n_serp)) %>%
ggplot(aes(x=date, y=n_serp, colour=access_method)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(labels=polloi::compress, name = "Number of SERPs") +
scale_color_manual(values=scales::hue_pal()(3)[2:3]) +
ggtitle("Number of SERPs on desktop and mobile web") +
theme(legend.position = "bottom")
Among the top five projects with the most searches, Wikidata has the highest clickthrough rate, while Wikimedia Commons has the lowest.
top_project <- tss2_searches %>%
cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%
group_by(project) %>%
summarise(n_search = n()) %>%
top_n(5, n_search)
tss2_searches %>%
cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%
filter(project %in% top_project$project) %>%
group_by(date, project) %>%
summarise(ctr = sum(`same-wiki clickthrough`, na.rm = TRUE)/n()) %>%
ggplot(aes(x=date, y=ctr, colour=project)) +
geom_line(size=1.2) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Clickthrough Rate") +
ggtitle("Clickthrough rate by top 5 projects", subtitle = "Full-text search on desktop") +
theme(legend.position = "bottom") +
geom_vline(xintercept = as.numeric(as.Date("2017-04-25")),
linetype = "dashed", color = "black") +
annotate("text", x = as.Date("2017-04-26"), y = 0.2, label = "sample rate changed", angle = 90) +
labs(caption = "*On April 25th, we changed the sample rates for several projects.")
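The ranking claimed above (Wikidata highest, Wikimedia Commons lowest) corresponds to the median daily clickthrough rate per project (a sketch):
# Sketch: median daily clickthrough rate for the top 5 projects by search volume
tss2_searches %>%
  cbind(polloi::parse_wikiid(tss2_searches$wiki)) %>%
  filter(project %in% top_project$project) %>%
  group_by(date, project) %>%
  summarise(ctr = sum(`same-wiki clickthrough`, na.rm = TRUE) / n()) %>%
  group_by(project) %>%
  summarise(median_ctr = median(ctr)) %>%
  arrange(desc(median_ctr))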