Last active
May 4, 2016 03:44
-
-
Save jrjames83/cd0dc9eb058cff5b0396d789c6eb144f to your computer and use it in GitHub Desktop.
scrape some obama speeches
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib

import requests
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.parse import urljoin, urlsplit
except ImportError:  # Python 2
    from urlparse import urljoin, urlsplit
# If one of the imports above fails, install the missing module from a terminal:
#   python -m pip install <module name>    (or: pip install <module name>)
# Index page that links to every speech recording.
url = "http://www.americanrhetoric.com/barackobamaspeeches.htm"

# The MP3 hrefs on the index page are relative to the site root, e.g.
#   mp3clipsXE/barackobama/barackobamaguantanimobayclosingARXE.mp3
#   mp3clipsXE/barackobama/barackobamaISILupdate02-25-16ARXE.mp3
#   mp3clipsXE/barackobama/barackobamarecoveryact2016ARXE.mp3
# and resolve to full URLs such as
#   http://www.americanrhetoric.com/mp3clipsXE/barackobama/barackobamanursesassociationARXE.mp3
stub = "http://www.americanrhetoric.com/"

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def mp3_urls(hrefs, base=stub):
    """Return an absolute URL for every href that points at an MP3 file.

    Args:
        hrefs: Iterable of raw ``href`` attribute values. ``None`` and empty
            values are skipped.
        base: URL that relative hrefs are resolved against.

    Returns:
        A list of absolute URLs, in the order the hrefs were given.
    """
    # urljoin leaves already-absolute hrefs alone and handles a leading "/",
    # where plain string concatenation would produce a broken URL.
    return [
        urljoin(base, href.strip())
        for href in hrefs
        if href and '.mp3' in href
    ]


def local_filename(file_url):
    """Return the file name that *file_url* should be saved under.

    The name is the last segment of the URL's path, so it does not depend on
    how deeply the file is nested and it ignores any query string.

    Raises:
        ValueError: If the URL's path does not end in a file name.
    """
    name = urlsplit(file_url).path.rsplit("/", 1)[-1]
    if not name:
        raise ValueError("no file name in URL: %s" % file_url)
    return name


def find_mp3_urls(page_url=url, base=stub):
    """Fetch *page_url* and return the absolute URL of every MP3 it links to.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute; indexing those
    # with a['href'] would raise KeyError.
    anchors = soup.find_all('a', href=True)
    return mp3_urls((a['href'] for a in anchors), base)


def download(file_url, filename=None):
    """Stream *file_url* to disk and return the name of the file written.

    Args:
        file_url: Absolute URL of the file to fetch.
        filename: Destination path. Defaults to the URL's own file name,
            written to the current working directory.

    Raises:
        requests.RequestException: On a network failure or an HTTP error
            status (checked before anything is written, so an error page is
            never saved under an .mp3 name).
        ValueError: If no file name can be derived from the URL.
        IOError: If the destination file cannot be written.
    """
    if filename is None:
        filename = local_filename(file_url)
    response = requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT)
    try:
        response.raise_for_status()
        with open(filename, "wb") as out:
            # Stream in chunks so a whole recording is never held in memory.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out.write(chunk)
    finally:
        response.close()
    return filename


def main(limit=10):
    """Print how many MP3s were found, then download the first *limit*.

    Args:
        limit: Number of files to download. The default of 10 is a small
            test batch; pass ``None`` to download every file found.
    """
    files = find_mp3_urls()
    print(len(files))
    somefiles = files[:limit]  # files[:None] is the whole list
    for file_url in somefiles:
        # Handle errors per file: one failed download is reported and the
        # loop carries on with the remaining files.
        try:
            download(file_url)
        except (requests.RequestException, IOError, OSError, ValueError) as e:
            print(str(e))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment