Skip to content

Instantly share code, notes, and snippets.

@ideamonk
Created January 26, 2011 17:59
Show Gist options
  • Select an option

  • Save ideamonk/797119 to your computer and use it in GitHub Desktop.

Select an option

Save ideamonk/797119 to your computer and use it in GitHub Desktop.
Counts daily posts on craigslist, recursively navigates result pages, and uses multiprocessing to suck up all the bandwidth
#!/usr/bin/env python
import urllib2
import psycopg2
from time import strftime
from datetime import date, timedelta, datetime
from geopy import geocoders
from BeautifulSoup import BeautifulSoup
from multiprocessing import Pool, cpu_count
# Shared PostgreSQL connection and cursor, opened at import time and used by
# the driver code below to read the city list and write the counts.
# NOTE(review): credentials are hard-coded in the DSN string; consider moving
# them to configuration or the environment.
conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
cur = conn.cursor()

# Date heading text (e.g. "Wed Jan 26") that updateStats compares against the
# first <h4> on each listing page.
# NOTE(review): the original "think yesterday" remark sat on this line, yet
# only sql_today subtracts a day -- confirm which of the two is intended.
today = date.today().strftime("%a %b %d") # think west, think yesterday

# Date (yesterday, ISO format) and wall-clock time stored with every row
# inserted into craigs_timed.
sql_today = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
sql_time = datetime.now().strftime("%H:%M:%S")
def updateStats(id, url, old_count, nesting_level):
    """Count the postings listed under the first date heading of a listing.

    Fetches ``url`` (or ``url/index<nesting_level>.html`` when
    ``nesting_level`` is non-zero), locates the first ``<h4>`` heading and
    counts the ``<p>`` siblings that follow it, stopping at the next
    ``<h4>``.  When no second heading was reached and the page links to
    ``index<nesting_level + 100>.html``, counting continues on that page.

    Args:
        id: Identifier handed back unchanged in the result tuple.
        url: Base URL of the listing.
        old_count: Count carried in from earlier pages.
        nesting_level: Page offset; 0 selects the base URL itself.

    Returns:
        A ``(id, count)`` tuple.  On a fetch/parse failure, or when the first
        heading is longer than three characters and differs from the
        module-level ``today`` string, the count accumulated so far is
        returned.
    """
    # Imported locally so this block does not depend on an edit to the
    # file-level imports; the module already imports BeautifulSoup.
    from BeautifulSoup import Tag

    stat_count = old_count

    # Iterate over result pages instead of recursing once per page.
    while True:
        if nesting_level == 0:
            page_url = url
        else:
            page_url = "%s/index%d.html" % (url, nesting_level)

        try:
            response = urllib2.urlopen(page_url)
            try:
                soup = BeautifulSoup(response.read())
            finally:
                # Always release the HTTP connection.
                response.close()
        except Exception:
            # Best-effort scrape: keep whatever was counted so far.  Unlike a
            # bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
            print("Fetch FAIL")
            return (id, stat_count)

        masthead = soup.findAll('h4')
        if not masthead:
            # No date heading at all: nothing to count on this page.
            print(url)
            return (id, stat_count)

        # Text of the first heading; treated as empty when the heading has no
        # children or its first child is a nested tag rather than a string.
        contents = masthead[0].contents
        first = contents[0] if contents else ""
        heading = "" if isinstance(first, Tag) else first.strip()

        if heading != today and len(heading) > 3:
            # The double space reproduces the original print-statement output.
            print("quitting at level  %s %s %s %s"
                  % (nesting_level, heading, today, url))
            return (id, stat_count)

        # Walk the siblings after the heading; each <p> is one posting and
        # the next <h4> ends the section being counted.
        natural_break = False
        node = masthead[0].nextSibling
        while node is not None:
            if isinstance(node, Tag):
                if node.name == "p":
                    stat_count += 1
                elif node.name == "h4":
                    natural_break = True
                    break
            node = node.nextSibling

        next_level = nesting_level + 100
        has_next_page = soup.find('a', {'href': 'index%s.html' % next_level})
        if natural_break or not has_next_page:
            print(url)
            return (id, stat_count)

        # The section ran to the end of the page: continue on the next page.
        print("nesting in %d" % next_level)
        nesting_level = next_level
# Driver: read the cities, count postings for each one in parallel, then
# store one row per city in craigs_timed.
# Guarded so that worker processes which re-import this module (spawn-based
# platforms) do not re-run the driver themselves.
if __name__ == "__main__":
    cur.execute("select * from craigs_statistics;")
    # Column 0 is passed to updateStats as the id and column 3 as the base
    # URL; "/sss" is appended to it, and both counters start at 0.
    city_list = [(city[0], city[3] + "/sss", 0, 0) for city in cur.fetchall()]

    # uniprocessing random fluke test
    # r=46
    # print updateStats(city_list[r][0],city_list[r][1],city_list[r][2],city_list[r][3])

    pool = Pool(processes=10)
    try:
        results = [pool.apply_async(updateStats, a) for a in city_list]
        results = [r.get() for r in results]
    finally:
        # Release the worker processes even if a task raised.
        pool.close()
        pool.join()

    for city_id, count in results:
        # Let the driver bind the values instead of formatting them into SQL.
        cur.execute(
            "INSERT into craigs_timed(city_id_id, date, time, count) "
            "VALUES (%s, %s, %s, %s);",
            (city_id, sql_today, sql_time, count),
        )
    conn.commit()
    print("done")
@ideamonk

Copy link
Copy Markdown
Author

Hmm, this code could be made much more efficient: if we run it frequently and use the timestamp / id from the list page itself to increment our counts, we can do away with any kind of recursive navigation entirely. That is just one visit per city.

@ideamonk

Copy link
Copy Markdown
Author

This was used to do an hourly visualization of craigslist activity across US :D for profit, phun and discovery

all in a day on craigslist

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment