Last active
December 15, 2015 12:49
-
-
Save milimetric/5262726 to your computer and use it in GitHub Desktop.
Which skin do Wikipedia Editors use? Only look at editors with 5 or more edits over the past 30 days.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*swp |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Problem 1: Dispelling the Ryan query vs. our query myth */ | |
-- This is Ryan's query | |
select count(*) | |
from (select rc_user, | |
count(*) as revs | |
from enwiki.recentchanges | |
where rc_namespace = 0 | |
and rc_type < 2 | |
group by 1 | |
having revs >= 5) as t; | |
-- it counts 32623 | |
-- This is a simple count of our version | |
select count(*) | |
from (select user.user_id, | |
count(*) as edits_in_last_30_days | |
from recentchanges | |
inner join | |
user on user.user_id = recentchanges.rc_user | |
where recentchanges.rc_namespace = 0 | |
and recentchanges.rc_type < 2 | |
group by user_id | |
having edits_in_last_30_days >= 5 | |
) as active_editors | |
left join | |
user_properties on user_properties.up_user = active_editors.user_id | |
where user_properties.up_property = 'skin'; | |
-- it counts 32619 which must mean there are some people not in the user table. I think this is more valid then, even if the difference is small | |
-- Conclusion to Problem 1: It doesn't seem that our inner query is causing any problems | |
/* Problem 2: Finding out what gets lost when we LEFT join our inner query to user_properties. Hint: nothing should get lost! :) */ | |
-- First try the query without the group by, or the coalesce which I might have incorrectly assumed works like T-SQL | |
select count(*) as skin_users | |
from (select user.user_id, | |
count(*) as edits_in_last_30_days | |
from recentchanges | |
inner join | |
user on user.user_id = recentchanges.rc_user | |
where recentchanges.rc_namespace = 0 | |
and recentchanges.rc_type < 2 | |
and recentchanges.rc_user > 0 | |
group by user_id | |
having edits_in_last_30_days >= 5 | |
) as active_editors | |
left join | |
user_properties on user_properties.up_user = active_editors.user_id | |
where user_properties.up_property = 'skin' | |
-- counts 6037!! This means something's wrong with the left join or where clause | |
-- Correct! It was the problem I solved months ago but forgot: the where clause is wrong | |
select count(*) as skin_users | |
from (select user.user_id, | |
count(*) as edits_in_last_30_days | |
from recentchanges | |
inner join | |
user on user.user_id = recentchanges.rc_user | |
where recentchanges.rc_namespace = 0 | |
and recentchanges.rc_type < 2 | |
and recentchanges.rc_user > 0 | |
group by user_id | |
having edits_in_last_30_days >= 5 | |
) as active_editors | |
left join | |
user_properties on user_properties.up_user = active_editors.user_id | |
and user_properties.up_property = 'skin' | |
-- so now this counts 32619 as expected | |
/* Problem 3: The original problem */ | |
-- Query that *now* works thanks to the fix above | |
select coalesce(up_value, 'default') as skin, | |
count(*) as skin_users | |
from (select user.user_id, | |
count(*) as edits_in_last_30_days | |
from recentchanges | |
inner join | |
user on user.user_id = recentchanges.rc_user | |
where recentchanges.rc_namespace = 0 | |
and recentchanges.rc_type < 2 | |
and recentchanges.rc_user > 0 | |
group by user_id | |
having edits_in_last_30_days >= 5 | |
) as active_editors | |
left join | |
user_properties on user_properties.up_user = active_editors.user_id | |
and user_properties.up_property = 'skin' | |
group by skin; | |
-- returns this very sensible set of results: | |
/* | |
+-------------+------------+ | |
| skin | skin_users | | |
+-------------+------------+ | |
| | 2525 | | |
| 0 | 32 | | |
| chick | 21 | | |
| cologneblue | 104 | | |
| default | 26582 | | |
| modern | 329 | | |
| monobook | 2810 | | |
| myskin | 8 | | |
| nostalgia | 15 | | |
| simple | 21 | | |
| standard | 98 | | |
| vector | 74 | | |
+-------------+------------+ | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment