@ideamonk
Created January 26, 2011 17:59
Counts daily posts on craigslist, recursively navigates paginated result pages, and uses multiprocessing to suck up all the bandwidth
#!/usr/bin/env python
# Counts the day's posts for every craigslist city stored in the database.
import urllib2
import psycopg2
from time import strftime
from datetime import date, timedelta, datetime
from geopy import geocoders
from BeautifulSoup import BeautifulSoup
from multiprocessing import Pool, cpu_count

conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
cur = conn.cursor()

# Craigslist date headings look like "Wed Jan 26"; store yesterday's date for the SQL row.
today = (date.today()).strftime("%a %b %d")  # think west, think yesterday
sql_today = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
sql_time = (datetime.now()).strftime("%H:%M:%S")


def updateStats(id, url, old_count, nesting_level):
    """Count today's posts on a city's listing page, recursing into
    index<N>.html pagination until a heading for an older day shows up."""
    stat_count = old_count

    # Fetch the first page, or the paginated page for deeper nesting levels.
    if nesting_level == 0:
        try:
            soup = BeautifulSoup(urllib2.urlopen(url).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)
    else:
        try:
            soup = BeautifulSoup(urllib2.urlopen("%s/index%d.html" % (url, nesting_level)).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)

    masthead = soup.findAll('h4')  # date headings that separate each day's posts
    if len(masthead) > 0:
        heading = masthead[0].contents[0].strip()
        if heading != today and len(heading) > 3:
            # First heading is an older day - nothing left to count on this page.
            print "quitting at level ", nesting_level, heading, today, url
            return (id, stat_count)

        # Count <p> siblings (posts) until the next <h4> (the previous day) appears.
        start = masthead[0].nextSibling
        natural_break = False
        while start:
            if str(type(start)) == "<class 'BeautifulSoup.Tag'>":
                if start.name == "p":
                    stat_count += 1
                if start.name == "h4":
                    natural_break = True
                    break
            start = start.nextSibling

        # If today's posts spill onto the next page (index100.html, index200.html, ...),
        # recurse into it.
        if soup.find('a', {'href': 'index%s.html' % (nesting_level + 100)}) and not natural_break:
            print "nesting in", nesting_level + 100
            return updateStats(id, url, stat_count, nesting_level + 100)

    print url
    return (id, stat_count)


# Build (id, listing url, starting count, nesting level) tuples for every city.
cur.execute("select * from craigs_statistics;")
city = cur.fetchone()
city_list = []
while city:
    city_list.append((city[0], city[3] + "/sss", 0, 0))
    city = cur.fetchone()

''' # uniprocessing random fluke test
r = 46
print updateStats(city_list[r][0], city_list[r][1], city_list[r][2], city_list[r][3])
'''

# Fetch cities in parallel, then write one row per city for this run.
pool = Pool(processes=10)
results = [pool.apply_async(updateStats, a) for a in city_list]
results = [r.get() for r in results]

for id, count in results:
    cur.execute("INSERT into craigs_timed(city_id_id, date, time, count) VALUES (%d,'%s','%s',%d);" % (id, sql_today, sql_time, count))
conn.commit()

print "done"
@ideamonk (Author) commented:
Hmm, this code could be made much more efficient. If we run it frequently, we can do away with recursive page navigation entirely and instead use the timestamp / id from the list page itself to increment our counts; that's one visit per city.
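For what it's worth, here is a minimal sketch of that incremental idea, not part of the script above: it assumes a per-city last-seen post id kept in the database and that each listing link ends in a numeric post id (e.g. .../1234567890.html). count_new_posts and POST_ID_RE are hypothetical names introduced only for illustration.

import re
import urllib2
from BeautifulSoup import BeautifulSoup

# Assumed: craigslist listing links end in a numeric post id, e.g. /sss/1234567890.html
POST_ID_RE = re.compile(r'/(\d+)\.html$')

def count_new_posts(url, last_seen_id):
    """Count posts newer than last_seen_id on a city's listing page.
    Returns (new_count, newest_id) so the caller can persist newest_id."""
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    new_count = 0
    newest_id = last_seen_id
    for p in soup.findAll('p'):
        link = p.find('a', href=POST_ID_RE)
        if not link:
            continue
        post_id = int(POST_ID_RE.search(link['href']).group(1))
        if post_id > last_seen_id:
            new_count += 1
            newest_id = max(newest_id, post_id)
    return new_count, newest_id

Storing the returned newest_id back per city would keep each hourly run to a single request per city, with no recursion into index100.html pages.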

@ideamonk (Author) commented:
This was used to do an hourly visualization of craigslist activity across the US :D for profit, phun and discovery

all in a day on craigslist
