Created
February 22, 2019 16:51
-
-
Save milimetric/7e8614d21550c0c23d86945ff463817b to your computer and use it in GitHub Desktop.
example query to mess around with
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use wmf; | |
-- new data | |
select coalesce(c.country, g.country_code) as country, | |
sum(edit_count) as edits, | |
sum(namespace_zero_edit_count) as namespace_zero_edits | |
from geoeditors_edits_monthly g | |
inner join | |
(select distinct dbname | |
from wmf_raw.mediawiki_project_namespace_map | |
where hostname like '%.wikipedia.%' | |
and snapshot='2019-01' | |
) w on w.dbname = g.wiki_db | |
left join | |
(select distinct country, country_code | |
from pageview_hourly | |
where year=2019 and month=2 and day=10 and hour = 15 | |
) c on g.country_code = c.country_code | |
where month in ('2018-11', '2018-12') | |
group by g.country_code, c.country | |
--having sum(edit_count) >= 16000 | |
order by country | |
limit 100000 | |
; | |
-- new data lower bound based on editors | |
select country, | |
sum(if(activity_level = '1 to 4', distinct_editors, 0)) + | |
(5 * sum(if(activity_level = '5 to 99', distinct_editors, 0))) + | |
(100 * sum(if(activity_level = '100 or more', distinct_editors, 0))) as lower_bound | |
from geoeditors_monthly g | |
left join | |
(select distinct country, country_code | |
from pageview_hourly | |
where year=2019 and month=2 and day=10 and hour = 15 | |
) c on g.country_code = c.country_code | |
where month in ('2018-11', '2018-12') | |
group by country | |
having sum(distinct_editors) >= 1000 | |
order by country | |
limit 1000 | |
; | |
-- playing around with an example of 'AL' and different constants to see the estimate | |
select sum(edits) from geoeditors_edits_monthly where country_code = 'AL' and month='2018-12'; | |
select (3 * sum(if(activity_level = '1 to 4', distinct_editors, 0))) + | |
(10 * sum(if(activity_level = '5 to 99', distinct_editors, 0))) + | |
(110 * sum(if(activity_level = '100 or more', distinct_editors, 0))) as lower_bound | |
from geoeditors_monthly g | |
where month in ('2018-12') | |
and country_code = 'AL' | |
; | |
-- raw data deleted after 90 days is: | |
in geoeditors_daily | |
-- old data | |
select country, | |
sum(edits) edits | |
from geowiki_archive_edits_country | |
where start like '2017-%-01' | |
group by country | |
having sum(edits) >= 40000 | |
order by country | |
limit 100000 | |
; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment