# Scrape some Obama speeches from americanrhetoric.com
from bs4 import BeautifulSoup
import requests
import urllib
# If these imports fail, install the packages from a terminal:
#   pip install beautifulsoup4 requests   (or: python -m pip install beautifulsoup4 requests)
url = "http://www.americanrhetoric.com/barackobamaspeeches.htm"
response = requests.get(url)
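# Optional sanity check (just a minimal sketch): raise early if the page didn't load, e.g.
#   response.raise_for_status()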
soup = BeautifulSoup(response.text, 'html.parser')  # explicit parser avoids bs4's "no parser specified" warning
"""
Example MP3 URL patterns observed on the page (one full URL, then relative paths):
http://www.americanrhetoric.com/mp3clipsXE/barackobama/barackobamanursesassociationARXE.mp3
mp3clipsXE/barackobama/barackobamaguantanimobayclosingARXE.mp3
mp3clipsXE/barackobama/barackobamaISILupdate02-25-16ARXE.mp3
mp3clipsXE/barackobama/barackobamarecoveryact2016ARXE.mp3
"""
stub = "http://www.americanrhetoric.com/"
files = []  # collect every MP3 link on the page
for a in soup.findAll('a'):
    href = a.get('href', '')  # .get() so anchors without an href don't raise KeyError
    if '.mp3' in href:
        files.append(stub + href)  # hrefs are relative paths, so prepend the site root

somefiles = files[:10]  # first 10 files to test the download (use `files` for all of them)
print len(files)
# Download the files, saving each one under its original filename (the last path segment).
# The try/except sits inside the loop so a single failed download doesn't kill the script;
# the error is printed and the loop moves on to the next file.
for x in somefiles:
    try:
        urllib.urlretrieve(x, x.split("/")[-1])
    except Exception as e:
        print str(e)
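# To run from a terminal (assuming the script is saved as, say, scrape_speeches.py;
# the filename is just an example):
#   python scrape_speeches.py
# The MP3s are written to the current working directory under their original filenames.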