Created January 26, 2011 17:59
Counts daily posts on craigslist, does recursive page navigation, and uses multiprocessing to suck up all the bandwidth
#!/usr/bin/env python
import urllib2
import psycopg2
from time import strftime
from datetime import date, timedelta, datetime
from geopy import geocoders
from BeautifulSoup import BeautifulSoup, Tag
from multiprocessing import Pool, cpu_count

conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
cur = conn.cursor()

# `today` matches the craigslist masthead format ("Wed Jan 26"); the DB row is
# stamped with yesterday's date and the current time.
today = (date.today()).strftime("%a %b %d")  # think west, think yesterday
sql_today = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
sql_time = (datetime.now()).strftime("%H:%M:%S")

def updateStats(id, url, old_count, nesting_level):
    # Counts today's posts for one city, following paginated index pages
    # (/sss, then /sss/index100.html, index200.html, ...) recursively.
    stat_count = old_count
    if nesting_level == 0:
        try:
            soup = BeautifulSoup(urllib2.urlopen(url).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)
    else:
        try:
            soup = BeautifulSoup(urllib2.urlopen("%s/index%d.html" % (url, nesting_level)).read())
        except:
            print "Fetch FAIL"
            return (id, stat_count)

    # Each <h4> is a date separator; the listings for that date follow it as <p> siblings.
    masthead = soup.findAll('h4')
    if len(masthead) > 0:
        if masthead[0].contents[0].strip() != today and len(masthead[0].contents[0].strip()) > 3:
            print "quitting at level ", nesting_level, masthead[0].contents[0].strip(), today, url
            return (id, stat_count)

        start = masthead[0].nextSibling
        natural_break = False
        while start:
            if isinstance(start, Tag):
                if start.name == "p":
                    stat_count += 1
                if start.name == "h4":
                    # Reached the next date separator, so today's listings are done.
                    natural_break = True
                    break
            start = start.nextSibling

        # Today's listings may continue on the next page; recurse into it.
        if soup.find('a', {'href': 'index%s.html' % (nesting_level + 100)}) and not natural_break:
            print "nesting in", nesting_level + 100
            return updateStats(id, url, stat_count, nesting_level + 100)

    print url
    return (id, stat_count)

# Build one (id, url, starting count, nesting level) task per city.
cur.execute("select * from craigs_statistics;")
city = cur.fetchone()
city_list = []
while city:
    city_list.append((city[0], city[3] + "/sss", 0, 0))
    city = cur.fetchone()

''' # uniprocessing random fluke test
r = 46
print updateStats(city_list[r][0], city_list[r][1], city_list[r][2], city_list[r][3])
'''

# Fetch all cities in parallel, then write one row per city for yesterday's date.
pool = Pool(processes=10)
results = [pool.apply_async(updateStats, a) for a in city_list]
results = [r.get() for r in results]

for id, count in results:
    # Parameterized query so psycopg2 handles quoting.
    cur.execute("INSERT INTO craigs_timed(city_id_id, date, time, count) VALUES (%s,%s,%s,%s);",
                (id, sql_today, sql_time, count))

conn.commit()
print "done"
Hmm, this code could be made much more efficient by doing away with the recursive navigation entirely: if we run it frequently and use the timestamp / id from the list page itself to increment our counts, that's one visit per city.
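A minimal sketch of that single-visit idea, assuming each listing link's href ends in a numeric post id (e.g. /sss/123456789.html) that increases over time, and that a last-seen id is persisted per city; count_new_posts, last_seen_id, and the id pattern are illustrative assumptions, not part of the script above.

import re
import urllib2
from BeautifulSoup import BeautifulSoup

POST_ID_RE = re.compile(r'/(\d+)\.html$')

def count_new_posts(url, last_seen_id):
    # One fetch per city: read only the first listing page, no index<N>.html recursion.
    soup = BeautifulSoup(urllib2.urlopen(url + "/sss").read())
    ids = []
    for link in soup.findAll('a', href=True):
        m = POST_ID_RE.search(link['href'])
        if m:
            ids.append(int(m.group(1)))
    # Assuming post ids grow over time, anything above the last recorded id is new.
    new_count = len([i for i in ids if i > last_seen_id])
    newest_seen = max(ids) if ids else last_seen_id
    return new_count, newest_seen

Run on a frequent schedule, each pass would add new_count to the day's running total and store newest_seen back for the city, keeping the cost at one request per city per run.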