Skip to content

Instantly share code, notes, and snippets.

Created April 19, 2010 08:50
Show Gist options
  • Save kekssw/370855 to your computer and use it in GitHub Desktop.
Save kekssw/370855 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from get_google_lang import *
from debug import ishell
import sys
from urllib import urlencode
from urllib2 import urlopen
import sqlite3
from datetime import datetime
# TODO: Some function to be developed here
def get_juick_page(page_num, showAll = False):
    """Download one listing page and return the messages parsed from it.

    page_num -- page number sent as the ``page`` query parameter.
    showAll  -- when true, ``show=all`` is added to the query string.

    Returns whatever parse_juick() returns for the downloaded page.
    """
    args = {'page': page_num}
    if showAll:
        args['show'] = 'all'
    # NOTE(review): the base URL literal is empty in this copy of the source,
    # so urlopen() receives only the bare query string. Presumably the
    # endpoint was lost when the file was copied -- restore it before use.
    url = "" + urlencode(args)
    page = urlopen(url)
    try:
        return parse_juick(page)
    finally:
        # Release the HTTP connection even when parsing fails.
        page.close()
def parse_juick(page, smallest_collected = [sys.maxint]): # emulation of smallest_collected as static variable
    """Extract messages from one listing page.

    page -- markup (or file-like object) passed straight to BeautifulSoup.
    smallest_collected -- one-element list holding the lowest message id
        collected so far. The mutable default is deliberate: it is shared
        between calls, so later pages skip messages whose id is not lower
        than anything already collected.

    Returns a list of dicts, one per newly seen ``<li class="liav">`` item,
    with the keys 'mid', 'uname', 'timestamp', 'lang', 'tags', 'message'
    and 'cleaned'.
    """
    from BeautifulSoup import BeautifulSoup

    lang_prefix = '*lang:'
    soup = BeautifulSoup(page)
    messages = []
    for li in soup.findAll('li', attrs={"class": "liav"}):
        # The id text carries one leading marker character before the number.
        this_mid = int(li.small.a.string[1:])
        # Already collected on an earlier page/call: skip it.
        if this_mid >= smallest_collected[0]:
            continue
        smallest_collected[0] = this_mid

        anchors = li.big.findAll('a')
        new_message = {
            'mid': this_mid,
            # First anchor is the author; its first character is a marker.
            'uname': anchors[0].string[1:],
            # Value of the first attribute on the <span> inside <small>.
            'timestamp': li.small.span.attrs[0][1],
            'lang': '',
            'tags': '',
        }

        # Every anchor after the author is treated as a tag.
        tags = anchors[1:]
        if tags:
            # Slice by the prefix length so "*lang:ru" yields "ru"; the
            # strip() also tolerates a blank after the colon.
            # NOTE(review): assumes tags look like "*lang:xx" -- confirm
            # against real page markup.
            lg = [tag.string[len(lang_prefix):].strip().lower()
                  for tag in tags
                  if tag.string and tag.string.startswith(lang_prefix)]
            # Only a two-letter code is accepted as a language.
            if lg and len(lg[0]) == 2:
                new_message['lang'] = lg[0]
            new_message['tags'] = " ".join([tag.string[1:].lower()
                                            for tag in tags if tag.string])

        body = li.div
        # Render the raw markup first: the anchors are cut out of this same
        # node just below.
        new_message['message'] = body.renderContents().decode('utf8')
        # cut anchors off
        for anchor in body.findAll('a'):
            anchor.extract()
        # get cleaned message (no tags; unicode encoded)
        new_message['cleaned'] = ''.join(body.findAll(text=True))

        messages.append(new_message)
    return messages
def parse_cli_args(argv):
    """Turn the command line into (start_page, pages_count, showAll).

    Accepted forms (a trailing "all" may follow any of them):
        prog                -> one page, starting at the first page
        prog COUNT          -> COUNT pages, starting at the first page
        prog START COUNT    -> COUNT pages, starting at 1-based page START

    argv -- full argument list, program name at index 0.

    Returns a tuple (start_page, pages_count, showAll) where start_page is
    0-based. Raises ValueError when a numeric argument is not an integer.
    """
    args = list(argv[1:])
    showAll = bool(args) and args[-1] == "all"
    if showAll:
        # Drop the flag first so a bare "prog all" is never fed to int().
        args.pop()
    start_page = 0
    pages_count = 1
    if len(args) >= 2:
        start_page = int(args[0]) - 1
        pages_count = int(args[1])
    elif args:
        pages_count = int(args[0])
    return start_page, pages_count, showAll


if __name__ == "__main__":
    start_page, pages_count, showAll = parse_cli_args(sys.argv)
    # TODO: Here we should get MaxPostNumber from DB to use it as a boundary in 'update mode'
    conn = sqlite3.connect('latest.db' if showAll else 'popular.db')
    try:
        curr = conn.cursor()
        for page in xrange(start_page, start_page + pages_count):
            print("------ Page #%d [%d of %d] ------" % (page + 1, page + 1 - start_page, pages_count))
            messages = get_juick_page(page + 1, showAll)
            # TODO: Check if there a post with numbers greater than MaxPostNumber on this page.
            # If so, truncate messages for only new info being stored, set LastPage flag.
            for msg in messages:
                if msg['lang']:
                    # Language already known from a tag: keep it.
                    msg['verified'] = 2
                    print("Language by tag detected: #%(mid)d - %(lang)s" % msg)
                else:
                    # No tag: detect from the first 128 characters of the
                    # cleaned text.
                    # Also, could be detected by original message:
                    # json = detect_lang(msg['message'].renderContents())
                    detection = detect_lang(msg['cleaned'][:128].encode('utf8'))
                    msg['lang'] = parse_lang(detection)
                    msg['verified'] = 0 # We need manual verification
                msg['lang_initial'] = msg['lang']
                # Current timestamp; TODO: check GMT offset
                # NOTE(review): the time argument was missing in this copy of
                # the source; UTC "now" is used because the format string
                # labels the value as GMT -- confirm.
                msg['collected'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S GMT")
                print("%(mid)d by %(uname)s: %(lang)s /%(tags)s/" % msg)
            # Here we should store messages to DB
            for msg in messages:
                # NOTE(review): this list was truncated after msg['mid'] in
                # this copy of the source. Restore the remaining values in
                # the column order of the `messages` table; until then the
                # INSERT fails unless the table has exactly one column.
                fields = [msg['mid']]
                curr.execute('INSERT INTO messages VALUES (%s)' % ",".join('?' * len(fields)), fields)
            # Persist this page's rows; uncommitted inserts are discarded
            # when the connection is closed.
            conn.commit()
            # TODO: Exit if LastPage flag is set
    finally:
        conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment