Created January 26, 2011 17:59
Counts daily posts on craigslist, does recursive page navigation, and uses multiprocessing to suck up all the bandwidth
#!/usr/bin/env python
import urllib2
import psycopg2
from time import strftime
from datetime import date, timedelta, datetime
from geopy import geocoders
from BeautifulSoup import BeautifulSoup, Tag
from multiprocessing import Pool, cpu_count

conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
cur = conn.cursor()

# `today` matches the craigslist masthead format ("Wed Jan 26"); the DB row is
# stamped with yesterday's date and the current time.
today = (date.today()).strftime("%a %b %d")  # think west, think yesterday
sql_today = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
sql_time = (datetime.now()).strftime("%H:%M:%S")

def updateStats(id, url, old_count, nesting_level):
    # Counts today's posts for one city, following paginated index pages
    # (/sss, then /sss/index100.html, index200.html, ...) recursively.
    stat_count = old_count
    if nesting_level == 0:
        try:
            soup = BeautifulSoup(urllib2.urlopen(url).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)
    else:
        try:
            soup = BeautifulSoup(urllib2.urlopen("%s/index%d.html" % (url, nesting_level)).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)

    # Each <h4> is a date separator; the listings for that date follow it as <p> siblings.
    masthead = soup.findAll('h4')
    if len(masthead) > 0:
        if masthead[0].contents[0].strip() != today and len(masthead[0].contents[0].strip()) > 3:
            print "quitting at level ", nesting_level, masthead[0].contents[0].strip(), today, url
            return (id, stat_count)

        start = masthead[0].nextSibling
        natural_break = False
        while start:
            if isinstance(start, Tag):
                if start.name == "p":
                    stat_count += 1
                if start.name == "h4":
                    # Reached the next date separator, so today's listings are done.
                    natural_break = True
                    break
            start = start.nextSibling

        # Today's listings may continue on the next page; recurse into it.
        if soup.find('a', {'href': 'index%s.html' % (nesting_level + 100)}) and not natural_break:
            print "nesting in", nesting_level + 100
            return updateStats(id, url, stat_count, nesting_level + 100)

    print url
    return (id, stat_count)

# Build one (id, url, starting count, nesting level) task per city.
cur.execute("select * from craigs_statistics;")
city = cur.fetchone()
city_list = []
while city:
    city_list.append((city[0], city[3] + "/sss", 0, 0))
    city = cur.fetchone()

''' # uniprocessing random fluke test
r = 46
print updateStats(city_list[r][0], city_list[r][1], city_list[r][2], city_list[r][3])
'''

# Fetch all cities in parallel, then write one row per city for yesterday's date.
pool = Pool(processes=10)
results = [pool.apply_async(updateStats, a) for a in city_list]
results = [r.get() for r in results]

for id, count in results:
    # Parameterized query so psycopg2 handles quoting.
    cur.execute("INSERT INTO craigs_timed(city_id_id, date, time, count) VALUES (%s,%s,%s,%s);",
                (id, sql_today, sql_time, count))

conn.commit()
print "done"
Hmm, this code could be made much more efficient by doing away with the recursive navigation entirely: if we run it frequently and use the timestamp / id from the list page itself to increment our counts, that's one visit per city.
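A minimal sketch of that single-visit idea, assuming each listing link's href ends in a numeric post id (e.g. /sss/123456789.html) that increases over time, and that a last-seen id is persisted per city; count_new_posts, last_seen_id, and the id pattern are illustrative assumptions, not part of the script above.

import re
import urllib2
from BeautifulSoup import BeautifulSoup

POST_ID_RE = re.compile(r'/(\d+)\.html$')

def count_new_posts(url, last_seen_id):
    # One fetch per city: read only the first listing page, no index<N>.html recursion.
    soup = BeautifulSoup(urllib2.urlopen(url + "/sss").read())
    ids = []
    for link in soup.findAll('a', href=True):
        m = POST_ID_RE.search(link['href'])
        if m:
            ids.append(int(m.group(1)))
    # Assuming post ids grow over time, anything above the last recorded id is new.
    new_count = len([i for i in ids if i > last_seen_id])
    newest_seen = max(ids) if ids else last_seen_id
    return new_count, newest_seen

Run on a frequent schedule, each pass would add new_count to the day's running total and store newest_seen back for the city, keeping the cost at one request per city per run.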