Created
September 20, 2016 21:16
-
-
Save milimetric/e77e22a736cef4c973a26667a3e94d8c to your computer and use it in GitHub Desktop.
This is a step-by-step query / command guide to getting pageviews for a particular wiki project. We will productionize this and make it part of the pageview API.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Amir's query to get all pages in WikiProject_Medicine, executed on analytics-store | |
select page_title | |
from (select tl_from, rd_from | |
from templatelinks | |
left join | |
redirect on rd_from = tl_from | |
and rd_title = 'WikiProject_Medicine' | |
and (rd_interwiki is null or rd_interwiki = '') | |
and rd_namespace = 10 | |
inner join | |
page on tl_from = page_id | |
where tl_namespace = 10 | |
and tl_title = 'WikiProject_Medicine' | |
order by tl_from | |
) temp_backlink_range | |
inner join | |
page on tl_from = page_id | |
where page_namespace = 1 | |
order by page_title | |
; | |
-- output to a file on stat1003 by: | |
/* | |
mysql --defaults-file=/etc/mysql/conf.d/research-client.cnf -hanalytics-store.eqiad.wmnet -Denwiki -s < t.sql > wiki-project-medicine-pages.tsv | |
*/ | |
-- scp-ed to stat1004 | |
-- put in hadoop with: | |
-- hdfs dfs -put wiki-project-medicine-pages.tsv /user/milimetric/wiki-project-medicine-pages/ | |
-- create a hive table: | |
/* | |
create external table milimetric.wikiproject_medicine_page | |
(page_title string) | |
location 'hdfs://analytics-hadoop/user/milimetric/wiki-project-medicine-pages' | |
; | |
*/ | |
-- and create another hive table for the intermediary counts per page: | |
/* | |
create external table milimetric.wikiproject_medicine_page_counts | |
( | |
page_title string, | |
view_count bigint, | |
year int, | |
month int | |
) | |
location 'hdfs://analytics-hadoop/user/milimetric/wiki-project-medicine-page-counts' | |
; | |
*/ | |
-- select total and top pageviews for these articles | |
insert into milimetric.wikiproject_medicine_page_counts | |
select p.page_title, | |
sum(coalesce(w.view_count, 0)) as view_count, | |
w.year, | |
w.month | |
from milimetric.wikiproject_medicine_page p | |
inner join | |
wmf.pageview_hourly w on w.page_title = p.page_title | |
where w.year=2016 | |
and w.month in (7, 8) | |
group by p.page_title, year, month | |
; | |
-- example ways to get data out: | |
-- top 100 pages in August: | |
select page_title, | |
view_count | |
from milimetric.wikiproject_medicine_page_counts | |
where year=2016 | |
and month=8 | |
order by view_count desc | |
limit 100 | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment