Skip to content

Instantly share code, notes, and snippets.

@bmander
Created August 12, 2013 23:17
Show Gist options
  • Save bmander/6216282 to your computer and use it in GitHub Desktop.
Save bmander/6216282 to your computer and use it in GitHub Desktop.
A scraper that fetches the names and occupations from The Onion's American Voices.
from urllib2 import urlopen
from bs4 import BeautifulSoup
import time
def get_voice_details(soup, i):
    """Return the ``(name, occupation)`` pair for response number ``i``.

    Looks up the first ``<li>`` whose class is ``response-<i>`` and, inside
    it, the first ``<p class="occupation">``. That paragraph must have
    exactly three children: the name text, a separator node (presumably a
    ``<br>`` tag -- it is ignored), and the occupation text.

    Args:
        soup: Parsed document exposing ``find_all``.
        i: Integer index of the response to extract.

    Returns:
        A ``(name, occupation)`` tuple. The name has its final
        whitespace-separated word removed and the remaining words joined by
        single spaces; the occupation is stripped of surrounding whitespace.

    Raises:
        IndexError: if no matching ``<li>`` or ``<p>`` element is found.
        ValueError: if the paragraph does not have exactly three children.
    """
    response_items = soup.find_all("li", class_="response-%d" % i)
    occupation_paragraphs = response_items[0].find_all("p", class_="occupation")
    raw_name, _separator, raw_occupation = occupation_paragraphs[0].contents

    # The name text ends with a trailing dash token; drop that last word.
    name_words = raw_name.split()[:-1]
    return " ".join(name_words), raw_occupation.strip()
def get_voices(path):
    """Fetch one article page and return its voices plus the previous page's path.

    Args:
        path: Site-relative article path; it is appended to ``DOMAIN`` to
            form the URL that is downloaded.

    Returns:
        A tuple ``(voices, last_voices_path)``. ``voices`` is a list of three
        ``(name, occupation)`` tuples, one for each of responses 1, 2 and 3.
        ``last_voices_path`` is the ``href`` attribute of the first element
        whose ``data-ct_section_name`` attribute equals ``"previous_recirc"``.

    Raises:
        IndexError: if a response is missing or no element carries the
            ``previous_recirc`` section name.
    """
    response = urlopen(DOMAIN + path)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails part-way;
        # try/finally is used because urllib2 responses are not context managers.
        response.close()

    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pinning one would make parsing reproducible across machines -- confirm
    # against the target markup before changing.
    soup = BeautifulSoup(html)

    voices = [get_voice_details(soup, i) for i in (1, 2, 3)]

    def is_previous_link(tag):
        return tag.get("data-ct_section_name") == "previous_recirc"

    last_voices_tag = soup.find_all(is_previous_link)[0]
    last_voices_path = last_voices_tag.get("href")
    return voices, last_voices_path
# Base URL that get_voices() prepends to every article path.
DOMAIN = "http://www.theonion.com"
# Article the crawl starts from; every page supplies the path of the one before it.
START_PATH = "/articles/smarter-women-less-likely-to-have-kids,33440/"
# Upper bound on the number of pages visited in a single run.
MAX_PAGES = 500

# The with-statement closes both output files even when a fetch or parse
# raises in the middle of the crawl (the handles were previously never closed).
with open("paths", "w") as pathsfp, open("voices", "w") as voicesfp:
    cur_path = START_PATH
    for i in range(MAX_PAGES):
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(cur_path)
        voices, last_path = get_voices(cur_path)

        # One "name,occupation" line per voice, UTF-8 encoded.
        for name, occupation in voices:
            voicesfp.write(
                "%s,%s\n" % (name.encode("utf8"), occupation.encode("utf8"))
            )
        # Flushed once per page, presumably so partial results survive an
        # interrupted run.
        voicesfp.flush()

        # Record the path only after its voices have been written out.
        pathsfp.write("%s\n" % cur_path)
        pathsfp.flush()

        cur_path = last_path
        # Pause half a second between requests.
        time.sleep(0.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment