Created
April 19, 2010 08:50
-
-
Save kekssw/370855 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from get_google_lang import * | |
from debug import ishell | |
import sys | |
from urllib import urlencode | |
from urllib2 import urlopen | |
import sqlite3 | |
from datetime import datetime | |
# TODO: further helper functions are still to be developed here
def get_juick_page(page_num, showAll = False):
    """Download one page of the juick.com "last" listing and parse it.

    page_num -- value sent as the ``page`` query parameter.
    showAll  -- when true, ``show=all`` is added to the query string.

    Returns whatever parse_juick() returns for the downloaded page.
    Errors raised by urlopen() or parse_juick() propagate to the caller.
    """
    args = {'page': page_num}
    if showAll:
        args['show'] = 'all'
    url = "http://juick.com/last?" + urlencode(args)
    page = urlopen(url)
    try:
        return parse_juick(page)
    finally:
        # Release the HTTP response even when parsing raises.
        page.close()
def parse_juick(page, smallest_collected = [sys.maxint]): # emulation of smallest_collected as static variable
    """Extract message records from a juick listing page.

    page -- markup (or file-like object) handed to BeautifulSoup.
    smallest_collected -- one-element list holding the lowest message id
        accepted so far.  The default list is intentionally shared between
        calls (it plays the role of a static variable), so a message whose
        id is not below that value is skipped.

    Returns a list of dicts, one per accepted ``<li class="liav">`` element,
    with the keys:
        'mid'       -- integer id parsed from the link inside <small>
        'uname'     -- text of the first anchor in <big>, first char dropped
        'timestamp' -- value of the first attribute of the <span> in <small>
        'lang'      -- two-letter code taken from a '*lang:' tag, or ''
        'tags'      -- remaining anchors' text (first char dropped,
                       lower-cased) joined by spaces, or ''
        'message'   -- inner markup of the <div>, decoded from UTF-8
        'cleaned'   -- text of the <div> after its anchors are removed

    Note: removing the anchors modifies the parsed tree in place.
    """
    #print "DBG:Entered get_juick_page with smallest_collected=%d" % smallest_collected[0]
    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(page)
    messages = []
    # iterates over messages
    for li in soup.findAll('li', attrs={"class": "liav"}):
        try:
            this_mid = int(li.small.a.string[1:])
        except (AttributeError, TypeError, ValueError):
            # Missing <small>/<a>, anchor without text, or non-numeric id:
            # this element is not a message entry.
            continue
        # Did we already get this post, so it should be skipped?
        if this_mid >= smallest_collected[0]:
            continue
        smallest_collected[0] = this_mid

        new_message = {'mid': this_mid}
        anchors = li.big.findAll('a')
        new_message['uname'] = anchors[0].string[1:]
        new_message['timestamp'] = li.small.span.attrs[0][1]
        new_message['lang'] = ''

        # Every anchor after the first one is treated as a tag.
        tags = anchors[1:]
        # NOTE(review): '*lang:' is six characters long but the slice starts
        # at index 7, so one extra character after the prefix is dropped --
        # confirm this matches the real tag format.
        lg = [tag.string[7:].lower() for tag in tags
              if tag.string and tag.string.startswith('*lang:')]
        if lg and len(lg[0]) == 2:
            new_message['lang'] = lg[0]
        new_message['tags'] = " ".join([tag.string[1:].lower()
                                        for tag in tags if tag.string])

        # Render the full markup first: the anchors are removed right after.
        body = li.div
        new_message['message'] = body.renderContents().decode('utf8')
        # cut anchors off
        for anchor in body.findAll('a'):
            anchor.extract()
        # get cleaned message (no tags; unicode encoded)
        new_message['cleaned'] = ''.join(body.findAll(text=True))
        messages.append(new_message)
    #print "DBG:Leaving get_juick_page with smallest_collected=%d" % smallest_collected[0]
    return messages
if __name__ == "__main__":
    # Usage: script [[start_page] pages_count] [all]
    #   pages_count -- how many listing pages to fetch (default 1)
    #   start_page  -- 1-based number of the first page (default 1)
    #   all         -- trailing keyword: fetch with show=all and store into
    #                  latest.db instead of popular.db
    start_page = 0
    pages_count = 1
    cli_args = sys.argv[1:]
    showAll = cli_args[-1:] == ["all"]
    if showAll:
        # Drop the keyword so that only numeric arguments remain; otherwise
        # a lone "all" would be passed to int() below.
        cli_args = cli_args[:-1]
    if len(cli_args) > 1:
        start_page = int(cli_args[0]) - 1
        pages_count = int(cli_args[1])
    elif cli_args:
        pages_count = int(cli_args[0])

    # Order of the values stored for each message row.
    columns = ('mid', 'uname', 'timestamp', 'message', 'cleaned',
               'collected', 'lang', 'lang_initial', 'verified', 'tags')
    insert_sql = 'INSERT INTO messages VALUES (%s)' % ",".join('?' * len(columns))

    # TODO: Here we should get MaxPostNumber from DB to use it as a boundary in 'update mode'
    conn = sqlite3.connect('latest.db' if showAll else 'popular.db')
    try:
        curr = conn.cursor()
        for page in xrange(start_page, start_page + pages_count):
            print("------ Page #%d [%d of %d] ------" % (page + 1, page + 1 - start_page, pages_count))
            messages = get_juick_page(page + 1, showAll)
            # TODO: Check if there a post with numbers greater than MaxPostNumber on this page.
            # If so, truncate messages for only new info being stored, set LastPage flag.
            for msg in messages:
                if msg['lang']:
                    # Language was already taken from a tag
                    msg['verified'] = 2
                    print("Language by tag detected: #%(mid)d - %(lang)s" % msg)
                else:
                    # Only the first 128 characters of the cleaned text are sent
                    detected = detect_lang(msg['cleaned'][:128].encode('utf8'))
                    # Also, could be detected by original message:
                    # detected = detect_lang(msg['message'].renderContents())
                    msg['lang'] = parse_lang(detected)
                    msg['verified'] = 0 # We need manual verification
                msg['lang_initial'] = msg['lang']
                # The stamp is labelled GMT, so it is taken from the UTC clock
                msg['collected'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S GMT")
                print("%(mid)d by %(uname)s: %(lang)s /%(tags)s/" % msg)
            # Store this page's messages and commit once per page
            rows = [tuple(msg[name] for name in columns) for msg in messages]
            curr.executemany(insert_sql, rows)
            conn.commit()
            # TODO: Exit if LastPage flag is set
        curr.close()
    finally:
        # Close the database even when fetching, detection or an insert fails
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment