bmander · August 12, 2013 23:17
diff --git a/onionvoices.py b/onionvoices.py
 from urllib2 import urlopen
 from bs4 import BeautifulSoup
 import time

 def get_voice_details( soup, i ):
     """Return the ``(name, occupation)`` pair for response number ``i``.

     soup -- parsed page; only its ``find_all`` method is used here
             (a BeautifulSoup document in this script).
     i    -- integer response number; selects the first
             ``<li class="response-<i>">`` element.

     Raises IndexError when the ``li`` or its ``p.occupation`` child is
     missing, and ValueError when that paragraph does not have exactly
     three children.
     """
     resptag = soup.find_all("li", class_="response-%d"%i)[0]

     # Calling a tag searches inside it (BeautifulSoup shorthand for
     # find_all).  The paragraph is expected to hold exactly three nodes:
     # name text, a separator node (presumably a <br/>), occupation text.
     name, br, occupation = resptag("p",class_="occupation")[0].contents

     # The name text ends with a dash as its last word; drop that word and
     # normalise the whitespace between the remaining ones.
     name = " ".join( name.split()[:-1] )
     occupation = occupation.strip()

     return name,occupation


 def get_voices( path ):
     """Fetch one page and scrape its three voices plus the next path to visit.

     path -- site-relative path; it is appended to DOMAIN to build the URL.

     Returns ``(voices, last_voices_path)`` where ``voices`` is a list of
     the ``(name, occupation)`` tuples for responses 1, 2 and 3, and
     ``last_voices_path`` is the ``href`` of the first tag whose
     ``data-ct_section_name`` attribute equals ``"previous_recirc"``
     (None if that tag carries no ``href``).

     Raises IndexError when the page has no such tag; errors from
     ``urlopen`` and ``get_voice_details`` propagate unchanged.
     """
     from contextlib import closing

     # urllib2 responses are not context managers on Python 2, so closing()
     # is used to make sure the connection is released once the body is read.
     with closing( urlopen( DOMAIN+path ) ) as response:
         html = response.read()

     soup = BeautifulSoup(html)

     voices = [ get_voice_details( soup, n ) for n in (1, 2, 3) ]

     last_voices_tag = soup.find_all(lambda tag:tag.get("data-ct_section_name")=="previous_recirc")[0]
     last_voices_path = last_voices_tag.get("href")

     return voices, last_voices_path

 DOMAIN = "http://www.theonion.com"
 START_PATH = "/articles/smarter-women-less-likely-to-have-kids,33440/"
 MAX_PAGES = 500  # upper bound on the number of pages fetched in one run

 # Crawl driver: starting at START_PATH, scrape each page's voices and then
 # follow the page's "previous_recirc" link, for at most MAX_PAGES pages.
 # Output files (both truncated on start):
 #   voices -- one "name,occupation" line per voice, UTF-8 encoded
 #   paths  -- one visited path per line
 # The with-block closes both files even if a page fails to fetch or parse.
 with open("paths","w") as pathsfp, open("voices","w") as voicesfp:
     cur_path = START_PATH
     for i in range(MAX_PAGES):
         print(cur_path)
         voices, last_path = get_voices( cur_path )

         for name, occupation in voices:
             voicesfp.write( name.encode("utf8") + "," + occupation.encode("utf8") + "\n" )
         # Flush after every page so a crash mid-crawl keeps what was scraped.
         voicesfp.flush()

         pathsfp.write( "%s\n"%cur_path )
         pathsfp.flush()

         cur_path = last_path
         if not cur_path:
             # The link had no href: nothing left to follow, so stop cleanly
             # instead of failing on DOMAIN+None in the next iteration.
             break

         # Pause half a second before requesting the next page.
         time.sleep(0.5)
	from urllib2 import urlopen
	from bs4 import BeautifulSoup
	import time

	def get_voice_details( soup, i ):
	    """Return the ``(name, occupation)`` pair for response number ``i``.

	    soup -- parsed page; only its ``find_all`` method is used here
	            (a BeautifulSoup document in this script).
	    i    -- integer response number; selects the first
	            ``<li class="response-<i>">`` element.

	    Raises IndexError when the ``li`` or its ``p.occupation`` child is
	    missing, and ValueError when that paragraph does not have exactly
	    three children.
	    """
	    resptag = soup.find_all("li", class_="response-%d"%i)[0]

	    # Calling a tag searches inside it (BeautifulSoup shorthand for
	    # find_all).  The paragraph is expected to hold exactly three nodes:
	    # name text, a separator node (presumably a <br/>), occupation text.
	    name, br, occupation = resptag("p",class_="occupation")[0].contents

	    # The name text ends with a dash as its last word; drop that word and
	    # normalise the whitespace between the remaining ones.
	    name = " ".join( name.split()[:-1] )
	    occupation = occupation.strip()

	    return name,occupation


	def get_voices( path ):
	    """Fetch one page and scrape its three voices plus the next path to visit.

	    path -- site-relative path; it is appended to DOMAIN to build the URL.

	    Returns ``(voices, last_voices_path)`` where ``voices`` is a list of
	    the ``(name, occupation)`` tuples for responses 1, 2 and 3, and
	    ``last_voices_path`` is the ``href`` of the first tag whose
	    ``data-ct_section_name`` attribute equals ``"previous_recirc"``
	    (None if that tag carries no ``href``).

	    Raises IndexError when the page has no such tag; errors from
	    ``urlopen`` and ``get_voice_details`` propagate unchanged.
	    """
	    from contextlib import closing

	    # urllib2 responses are not context managers on Python 2, so closing()
	    # is used to make sure the connection is released once the body is read.
	    with closing( urlopen( DOMAIN+path ) ) as response:
	        html = response.read()

	    soup = BeautifulSoup(html)

	    voices = [ get_voice_details( soup, n ) for n in (1, 2, 3) ]

	    last_voices_tag = soup.find_all(lambda tag:tag.get("data-ct_section_name")=="previous_recirc")[0]
	    last_voices_path = last_voices_tag.get("href")

	    return voices, last_voices_path

	DOMAIN = "http://www.theonion.com"
	START_PATH = "/articles/smarter-women-less-likely-to-have-kids,33440/"
	MAX_PAGES = 500  # upper bound on the number of pages fetched in one run

	# Crawl driver: starting at START_PATH, scrape each page's voices and then
	# follow the page's "previous_recirc" link, for at most MAX_PAGES pages.
	# Output files (both truncated on start):
	#   voices -- one "name,occupation" line per voice, UTF-8 encoded
	#   paths  -- one visited path per line
	# The with-block closes both files even if a page fails to fetch or parse.
	with open("paths","w") as pathsfp, open("voices","w") as voicesfp:
	    cur_path = START_PATH
	    for i in range(MAX_PAGES):
	        print(cur_path)
	        voices, last_path = get_voices( cur_path )

	        for name, occupation in voices:
	            voicesfp.write( name.encode("utf8") + "," + occupation.encode("utf8") + "\n" )
	        # Flush after every page so a crash mid-crawl keeps what was scraped.
	        voicesfp.flush()

	        pathsfp.write( "%s\n"%cur_path )
	        pathsfp.flush()

	        cur_path = last_path
	        if not cur_path:
	            # The link had no href: nothing left to follow, so stop cleanly
	            # instead of failing on DOMAIN+None in the next iteration.
	            break

	        # Pause half a second before requesting the next page.
	        time.sleep(0.5)