Skip to content

Instantly share code, notes, and snippets.

@callison-burch
Created January 13, 2010 02:27
Show Gist options
  • Save callison-burch/275868 to your computer and use it in GitHub Desktop.
Save callison-burch/275868 to your computer and use it in GitHub Desktop.
def query_category_members(category, language='en', limit=100):
    """
    action=query, list=categorymembers
    Returns the titles of the members of a category, up to the specified limit.

    category -- category title sent as cmtitle, e.g. 'Category:Featured articles'
    language -- language code handed to _run_query
    limit    -- maximum number of titles to return; a limit <= 0 returns []
                without issuing any request

    Titles are requested in pages of at most 500 (cmlimit). Further pages are
    fetched until `limit` titles have been collected or the response carries
    no continuation data.
    """
    if limit <= 0:
        # Nothing to collect, and a non-positive cmlimit is not a valid request.
        return []
    query_args = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': category,
        'format': 'json',
        'cmlimit': min(limit, 500),
    }
    members = []
    while True:
        response = _run_query(query_args, language)
        members.extend(
            member['title'] for member in response['query']['categorymembers']
        )
        if len(members) >= limit:
            # Enough titles collected; do not request a page we would discard.
            break
        if 'continue' in response:
            # Current continuation format: echo every returned key back
            # in the next request.
            query_args.update(response['continue'])
        elif 'query-continue' in response:
            # Legacy continuation format.
            legacy = response['query-continue']['categorymembers']
            query_args['cmcontinue'] = legacy['cmcontinue']
        else:
            break
    return members[:limit]
# Fetch the titles in 'Category:Featured articles', capped at 10000 titles.
featured_articles = query_category_members('Category:Featured articles', limit=10000)
# Number of titles that came back.
len(featured_articles)
2739
featured_articles[2500:2525]
['Treatment of multiple sclerosis', 'Treehouse of Horror (series)', 'Trembling Before G-d', 'Marcus Trescothick', 'Trial by Jury', 'Triceratops', 'Stephen Trigg', 'Sarah Trimmer', u'Triptych, May\u2013June 1973', 'Triton (moon)', 'Michael Tritter', 'Tropic Thunder', 'Tropical Storm Barry (2001)', 'Tropical Storm Bill (2003)', 'Tropical Storm Bonnie (2004)', 'Tropical Storm Chantal (2001)', 'Tropical Storm Hermine (1998)', 'Tropical Storm Keith (1988)', 'Tropical cyclone', 'Harry Trott', 'Harry S. Truman', 'Hugh Trumble', 'Trump International Hotel and Tower (Chicago)', 'Truthiness', 'Tryon Creek']
import calendar
import datetime
def query_page_view_stats(title, language='en', start_date=None, end_date=None):
    """
    Queries stats.grok.se for the daily page views for wikipedia articles

    title      -- article title; UTF-8 encoded and URL-quoted into the request
    language   -- language code placed in the request URL
    start_date -- datetime.date to start from; defaults to yesterday and is
                  clamped to no earlier than 2007-01-01
    end_date   -- datetime.date to stop before (exclusive); defaults to today
                  and is clamped to no later than today

    Returns a dict with 'monthly_views' (the decoded JSON response of each
    month queried, keyed by 'YYYYMM') and 'total_views' (the sum of the
    'total_views' field of those responses). One request is made per calendar
    month in the range; an empty range makes no request.
    """
    # Resolve the defaults at call time; defaults written in the signature
    # would be frozen at the moment the function was defined.
    today = datetime.date.today()
    if start_date is None:
        start_date = today - datetime.timedelta(1)
    if end_date is None:
        end_date = today

    stats_api_url = 'http://stats.grok.se/json/%s/%s/%s'
    earliest_date = datetime.date(2007, 1, 1)
    query_date = max(start_date, earliest_date)
    end_date = min(end_date, today)
    total_views = 0
    stats = {'monthly_views': {}}
    while query_date < end_date:
        query_date_str = query_date.strftime("%Y%m")
        url = stats_api_url % (language, query_date_str, urllib.quote(title.encode('utf-8')))
        search_results = urllib.urlopen(url)
        try:
            month_stats = simplejson.loads(search_results.read())
        finally:
            # Always release the connection, even if decoding fails.
            search_results.close()
        total_views += month_stats['total_views']
        stats['monthly_views'][query_date_str] = month_stats
        # Move to the first day of the next month. Adding the month length to
        # a mid-month date could overshoot and skip a whole month
        # (e.g. Jan 31 + 31 days lands in March).
        days_in_month = calendar.monthrange(query_date.year, query_date.month)[1]
        query_date = query_date.replace(day=1) + datetime.timedelta(days_in_month)
    stats['total_views'] = total_views
    return stats
# Page-view stats for a single article, using the function's default date range.
stats = query_page_view_stats('Barack Obama')
# Echo the returned dict.
stats
{'monthly_views': {'201001': {'daily_views': [16512, 16161, 17048, 18596, 20376, 22108, 25204, 21432, 21127, 18302, 24222, 24275], 'total_views': 245363, 'month': '201001', 'title': 'Barack_Obama'}}, 'total_views': 245363}
stats['total_views']
245363
page_views = {}
for i in range(0, len(featured_articles)):
article = featured_articles[i]
stats = query_page_view_stats(article)
page_views[article] = stats['total_views']
print i, page_views[article], article
0 5370 0.999...
1 2567 1 − 2 + 3 − 4 + · · ·
2 246 Tropical Depression Ten (2005)
3 143 Tropical Depression Ten (2007)
4 1474 1080° Snowboarding
5 844 13th Airborne Division (United States)
6 491 1880 Republican National Convention
7 6410 1896 Summer Olympics
8 817 1923 FA Cup Final
9 429 1926 World Series
10 1795 1928 Okeechobee hurricane
11 4946 1930 FIFA World Cup
12 300 1933 Atlantic hurricane season
13 157 1941 Atlantic hurricane season
14 181 1941 Florida hurricane
15 466 1956 FA Cup Final
...
2728 44001 Frank Zappa
2729 7325 Abu Musab al-Zarqawi
2730 1401 Zhang Heng
2731 862 Zhou Tong (archer)
2732 45541 Zinc
2733 22784 Preity Zinta
2734 3520 Zion National Park
2735 140 Otto Julius Zobel
2736 17219 Zodiac (film)
2737 249 Nikita Zotov
2738 3598 Huldrych Zwingli
from operator import itemgetter
for page_view in sorted(page_views.iteritems(), key=itemgetter(1), reverse=True):
count = page_view[1]
if(count < 100000): break
print count, page_view[0]
421991 Lost (TV series)
388691 Michael Jackson
338038 India
262797 Global warming
254757 House (TV series)
245363 Barack Obama
242602 The Beatles
231525 Canada
223870 Mariah Carey
213183 The Simpsons
209978 Australia
207670 Japan
198658 William Shakespeare
195270 New York City
173199 Germany
169023 PlayStation 3
161429 Angelina Jolie
153913 Batman
151068 Emma Watson
139615 Asperger syndrome
139507 Metallica
138863 Anne Frank
133621 Islam
132207 Israel
129504 Jackie Chan
123318 Elizabeth I of England
120062 Earth
119571 Michael Jordan
119241 M249 light machine gun
114333 Wii
110718 4chan
107469 The Notorious B.I.G.
106497 Solar System
103212 The Wire
101126 Schizophrenia
100212 Turkey
100033 DNA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment