Skip to content

Instantly share code, notes, and snippets.

@choplin
Created November 10, 2011 18:35
Show Gist options
  • Save choplin/1355697 to your computer and use it in GitHub Desktop.
Save choplin/1355697 to your computer and use it in GitHub Desktop.
一番酒が飲まれている言語を調べる
import urllib2
import chardet
from pyquery import PyQuery as pq
import re
import sys
# First page of the Togetter collection being analysed; also sent as the
# Referer header on every request made by get_content().
base_url = 'http://togetter.com/li/212550'
# Endpoint for the follow-up pages of the same collection; main() appends the
# page number (starting at 2) to this prefix.
more_url = 'http://togetter.com/api/moreTweets/212550?page='
def get_content(url):
    """Fetch *url* and return its raw body together with a charset guess.

    The request carries ``base_url`` as its Referer header.

    Args:
        url: Address to download.

    Returns:
        A dict with the keys ``'url'`` (the requested address) and ``'data'``
        (the body exactly as read from the response), plus every key of the
        dict returned by ``chardet.detect(data)``; ``main`` reads
        ``'encoding'`` from it.
    """
    req = urllib2.Request(url)
    req.add_header('Referer', base_url)
    response = urllib2.urlopen(req)
    try:
        # One read() yields the same bytes as joining readlines().
        data = response.read()
    finally:
        # Release the connection even when reading fails; the response
        # object was previously left for the garbage collector to close.
        response.close()
    guess = chardet.detect(data)
    return dict(url=url, data=data, **guess)
def parse(data):
    """Count extracted languages and poster names in one page of markup.

    Args:
        data: Decoded HTML text of a page.

    Returns:
        ``{'lang': {...}, 'name': {...}}`` where each inner dict maps a
        value to the number of times it occurred on this page.
    """
    doc = pq(data)

    lang_counts = {}
    for tweet_node in doc.find('li.list_item div.tweet'):
        found = get_language(pq(tweet_node).text())
        if not found:
            # Tweets without the expected phrase (or with an empty capture)
            # are not counted.
            continue
        lang_counts[found] = lang_counts.get(found, 0) + 1

    name_counts = {}
    for name_node in doc.find('li.list_item a.status_name'):
        poster = pq(name_node).text()
        name_counts[poster] = name_counts.get(poster, 0) + 1

    return {'lang': lang_counts, 'name': name_counts}
def get_language(tweet):
    """Extract the text between the marker phrases of a tweet.

    The pattern looks for U+6708 U+306F ("tsuki wa"), then lazily captures
    everything up to the first following U+3067 ("de").

    Args:
        tweet: Text of a single tweet.

    Returns:
        The captured text (possibly an empty string), or ``None`` when the
        tweet does not contain the phrase.
    """
    # The pattern is written with \u escapes so the source stays pure ASCII:
    # Python 2 refuses to compile a module containing non-ASCII bytes unless
    # it carries a PEP 263 coding declaration. The runtime string is
    # identical to the literal Japanese text.
    m = re.search(u'\u6708\u306f(.*?)\u3067', tweet)
    if m:
        return m.group(1)
    return None
def sum(total, new):
    """Add the counters of *new* into *total* and return *total*.

    Both arguments are dicts with ``'lang'`` and ``'name'`` entries, each
    mapping a key to a count. *total* is updated in place.

    NOTE: this name shadows the builtin ``sum`` within the module.
    """
    for section in ('lang', 'name'):
        for key, count in new[section].items():
            total[section][key] = total[section].get(key, 0) + count
    return total
def show(total):
    """Print the language tally and then the name tally to stdout.

    Each entry is written as ``<key>\\t<count>`` with the key encoded as
    UTF-8; the two sections are separated by a dashed rule.

    Args:
        total: Dict with ``'lang'`` and ``'name'`` count mappings.
    """
    def dump(counts):
        # One tab-separated line per entry, in the dict's own order.
        for key, count in counts.items():
            print('\t'.join([key.encode('utf8'), str(count)]))

    # A single parenthesised argument prints exactly like the Python 2
    # print statement used elsewhere in this script.
    print('language:\n')
    dump(total['lang'])
    print('------------------------------------------\n')
    print('name:\n')
    dump(total['name'])
def main():
    """Download every page of the collection, tally it, and print the result.

    The first page comes from ``base_url``; further pages are requested from
    ``more_url`` with page numbers 2, 3, ... until a response body is exactly
    an empty ``<ul>`` list.
    """
    end_marker = '<ul>\n</ul>\n'

    def tally(response):
        # Decode with the charset detected for this response, then count.
        text = response['data'].decode(response['encoding'])
        return parse(text)

    total = sum({'lang': {}, 'name': {}}, tally(get_content(base_url)))

    page = 2
    response = get_content(more_url + str(page))
    while response['data'] != end_marker:
        total = sum(total, tally(response))
        page += 1
        response = get_content(more_url + str(page))

    show(total)


# Run the scrape only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment