Created
January 13, 2010 02:27
-
-
Save callison-burch/275868 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def query_category_members(category, language='en', limit=100):
    """
    action=query,list=categorymembers
    Returns the titles of the members of a category, up to the specified limit

    category -- full category title, e.g. 'Category:Featured articles'
    language -- language code, passed through to _run_query
    limit    -- maximum number of titles to return; a value of zero or
                less returns an empty list without issuing any request

    Results are requested in pages of at most 500 titles (the 'cmlimit'
    argument). Paging continues while the response carries a
    'query-continue' entry and fewer than `limit` titles were collected.
    """
    if limit <= 0:
        return []
    query_args = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': category,
        'format': 'json',
        'cmlimit': min(limit, 500),
    }
    members = []
    while True:
        response = _run_query(query_args, language)
        for member in response['query']['categorymembers']:
            members.append(member['title'])
        # Stop once `limit` titles are in hand: comparing with `>=` avoids
        # fetching one more page whose results would only be sliced away.
        if 'query-continue' not in response or len(members) >= limit:
            break
        # Resume the listing where the previous page ended.
        continuation = response['query-continue']['categorymembers']
        query_args['cmcontinue'] = continuation['cmcontinue']
    # The last page may overshoot the limit, so trim the surplus.
    return members[0:limit]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| featured_articles = query_category_members('Category:Featured articles', limit=10000) | |
| len(featured_articles) | |
| 2739 | |
| featured_articles[2500:2525] | |
| ['Treatment of multiple sclerosis', 'Treehouse of Horror (series)', 'Trembling Before G-d', 'Marcus Trescothick', 'Trial by Jury', 'Triceratops', 'Stephen Trigg', 'Sarah Trimmer', u'Triptych, May\u2013June 1973', 'Triton (moon)', 'Michael Tritter', 'Tropic Thunder', 'Tropical Storm Barry (2001)', 'Tropical Storm Bill (2003)', 'Tropical Storm Bonnie (2004)', 'Tropical Storm Chantal (2001)', 'Tropical Storm Hermine (1998)', 'Tropical Storm Keith (1988)', 'Tropical cyclone', 'Harry Trott', 'Harry S. Truman', 'Hugh Trumble', 'Trump International Hotel and Tower (Chicago)', 'Truthiness', 'Tryon Creek'] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import calendar | |
| import datetime | |
def query_page_view_stats(title, language='en', start_date=None, end_date=None):
    """
    Queries stats.grok.se for the daily page views for wikipedia articles

    title      -- article title; it is UTF-8 encoded and URL-quoted
                  before being placed in the request URL
    language   -- language code placed in the request URL
    start_date -- datetime.date at which the range starts; defaults to
                  yesterday and is clamped to no earlier than 2007-01-01
    end_date   -- datetime.date at which the range ends (exclusive);
                  defaults to today and is clamped to no later than today

    Returns a dict with two keys:
      'monthly_views' -- maps each queried month ('YYYYMM') to the decoded
                         JSON response for that month
      'total_views'   -- sum of the 'total_views' field of every response

    One request is made per calendar month that overlaps the range
    [start_date, end_date). Requests are keyed by month only, so the
    figures are whatever the service reports for each month as a whole.
    """
    # Resolve the defaults at call time; a date computed in the signature
    # would be frozen at import and go stale in a long-running process.
    today = datetime.date.today()
    if start_date is None:
        start_date = today - datetime.timedelta(1)
    if end_date is None:
        end_date = today

    stats_api_url = 'http://stats.grok.se/json/%s/%s/%s'
    earliest_date = datetime.date(2007, 1, 1)
    query_date = max(start_date, earliest_date)
    end_date = min(end_date, today)

    total_views = 0
    stats = {'monthly_views': {}}
    if query_date < end_date:
        # Walk month by month from the first day of the starting month.
        # Stepping by the month's length from day 1 always lands on the
        # first day of the next month, so no month in the range is skipped.
        month_start = query_date.replace(day=1)
        while month_start < end_date:
            month_key = month_start.strftime("%Y%m")
            url = stats_api_url % (language, month_key,
                                   urllib.quote(title.encode('utf-8')))
            response = urllib.urlopen(url)
            try:
                month_stats = simplejson.loads(response.read())
            finally:
                # Release the connection even if reading or parsing fails.
                response.close()
            total_views += month_stats['total_views']
            stats['monthly_views'][month_key] = month_stats
            days_in_month = calendar.monthrange(month_start.year,
                                                month_start.month)[1]
            month_start = month_start + datetime.timedelta(days_in_month)
    stats['total_views'] = total_views
    return stats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| stats = query_page_view_stats('Barack Obama') | |
| stats | |
| {'monthly_views': {'201001': {'daily_views': [16512, 16161, 17048, 18596, 20376, 22108, 25204, 21432, 21127, 18302, 24222, 24275], 'total_views': 245363, 'month': '201001', 'title': 'Barack_Obama'}}, 'total_views': 245363} | |
| stats['total_views'] | |
| 245363 | |
# Collect the total view count of every featured article, keyed by title.
page_views = {}
for i, article in enumerate(featured_articles):
    stats = query_page_view_stats(article)
    page_views[article] = stats['total_views']
    # Progress line: position, view count, title (space separated).
    print('%s %s %s' % (i, page_views[article], article))
| 0 5370 0.999... | |
| 1 2567 1 − 2 + 3 − 4 + · · · | |
| 2 246 Tropical Depression Ten (2005) | |
| 3 143 Tropical Depression Ten (2007) | |
| 4 1474 1080° Snowboarding | |
| 5 844 13th Airborne Division (United States) | |
| 6 491 1880 Republican National Convention | |
| 7 6410 1896 Summer Olympics | |
| 8 817 1923 FA Cup Final | |
| 9 429 1926 World Series | |
| 10 1795 1928 Okeechobee hurricane | |
| 11 4946 1930 FIFA World Cup | |
| 12 300 1933 Atlantic hurricane season | |
| 13 157 1941 Atlantic hurricane season | |
| 14 181 1941 Florida hurricane | |
| 15 466 1956 FA Cup Final | |
| ... | |
| 2728 44001 Frank Zappa | |
| 2729 7325 Abu Musab al-Zarqawi | |
| 2730 1401 Zhang Heng | |
| 2731 862 Zhou Tong (archer) | |
| 2732 45541 Zinc | |
| 2733 22784 Preity Zinta | |
| 2734 3520 Zion National Park | |
| 2735 140 Otto Julius Zobel | |
| 2736 17219 Zodiac (film) | |
| 2737 249 Nikita Zotov | |
| 2738 3598 Huldrych Zwingli |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from operator import itemgetter | |
# List every article with at least 100,000 views, most viewed first.
for title, count in sorted(page_views.items(), key=itemgetter(1), reverse=True):
    # Counts are in descending order, so the first one below the
    # threshold means no later entry can qualify either.
    if count < 100000:
        break
    print('%s %s' % (count, title))
| 421991 Lost (TV series) | |
| 388691 Michael Jackson | |
| 338038 India | |
| 262797 Global warming | |
| 254757 House (TV series) | |
| 245363 Barack Obama | |
| 242602 The Beatles | |
| 231525 Canada | |
| 223870 Mariah Carey | |
| 213183 The Simpsons | |
| 209978 Australia | |
| 207670 Japan | |
| 198658 William Shakespeare | |
| 195270 New York City | |
| 173199 Germany | |
| 169023 PlayStation 3 | |
| 161429 Angelina Jolie | |
| 153913 Batman | |
| 151068 Emma Watson | |
| 139615 Asperger syndrome | |
| 139507 Metallica | |
| 138863 Anne Frank | |
| 133621 Islam | |
| 132207 Israel | |
| 129504 Jackie Chan | |
| 123318 Elizabeth I of England | |
| 120062 Earth | |
| 119571 Michael Jordan | |
| 119241 M249 light machine gun | |
| 114333 Wii | |
| 110718 4chan | |
| 107469 The Notorious B.I.G. | |
| 106497 Solar System | |
| 103212 The Wire | |
| 101126 Schizophrenia | |
| 100212 Turkey | |
| 100033 DNA |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment