-
-
Save kastnerkyle/f51df60c8d7780945eb8b3d0e480c413 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Based on code from Laurent Dinh (laurent-dinh) | |
# Author: Kyle Kastner | |
# License: BSD 3-Clause | |
import requests | |
import time | |
import random | |
from bs4 import BeautifulSoup | |
import urllib | |
import os | |
import io | |
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py | |
def download_file(url, filename):
    """Stream the resource at ``url`` into the local file ``filename``.

    The body is fetched in 1024-byte chunks so the whole payload is never
    held in memory at once.

    Args:
        url: address to fetch with an HTTP GET.
        filename: path of the file to (over)write in binary mode.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status, so an error page is never saved as if it were the file.
    """
    # NOTE the stream=True parameter: the body is only pulled while iterating.
    response = requests.get(url, stream=True)
    try:
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()
# Module-level opener used by save_content to retrieve the audio files.
url_opener = urllib.FancyURLopener()


def save_content(link):
    """Save the transcript and audio of the speech page at ``link``.

    The page is fetched and parsed; the first anchor whose href contains
    ".mp3" gives the audio URL, and the first div whose id contains
    "transcript" gives the text. The transcript is written as UTF-8 to
    ``txt/<basename>.txt`` and the audio is retrieved to
    ``mp3/<basename>.mp3``, where ``<basename>`` is the last path component
    of the mp3 URL up to its first dot. Both directories are created in the
    current working directory when missing.

    Args:
        link: URL of a single speech page (e.g. on http://millercenter.org/).

    Raises:
        requests.exceptions.HTTPError: if the page request returns 4xx/5xx.
        IndexError: if the page has no .mp3 link or no transcript div.
    """
    response = requests.get(link)
    response.raise_for_status()
    # Iterating a Response yields fixed-size byte chunks, not lines, so the
    # body is split into real lines before stripping each one.
    content = b"\n".join(
        [line.strip() for line in response.content.splitlines()])
    soup = BeautifulSoup(content, "html.parser")

    mp3_links = [elem.get("href") for elem in soup.find_all("a")
                 if elem.get("href") is not None
                 and ".mp3" in elem.get("href")]
    if not mp3_links:
        raise IndexError("no .mp3 link found on %s" % link)
    mp3_link = mp3_links[0]

    transcript = [elem for elem in soup.find_all("div")
                  if elem.get("id") is not None
                  and "transcript" in elem.get("id")]
    if not transcript:
        raise IndexError("no transcript div found on %s" % link)

    def text_with_newlines(elem):
        # Flatten elem to text: every string node is stripped and appended,
        # every <br> becomes a newline, all other tags contribute nothing.
        parts = []
        for e in elem.descendants:
            if isinstance(e, basestring):
                parts.append(e.strip())
            elif e.name == 'br':
                parts.append(u'\n')
        text = u''.join(parts)
        # Put a space in front of every literal backslash-u sequence.
        return text.replace(u'\\u', u' \\u')

    r = text_with_newlines(transcript[0])
    # Drop the first 10 characters -- presumably the "Transcript" heading
    # that opens the div; TODO confirm against a live page.
    r = r[10:]

    basename = mp3_link.split("/")[-1].split(".")[0]
    save_txt = "txt/" + basename + ".txt"
    save_mp3 = "mp3/" + basename + ".mp3"
    if not os.path.exists("mp3"):
        os.mkdir("mp3")
    if not os.path.exists("txt"):
        os.mkdir("txt")
    with io.open(save_txt, "w", encoding="utf-8") as f:
        f.write(r)
    url_opener.retrieve(mp3_link, save_mp3)
# Driver: read the saved speech index page and download every audio speech.
# speech_main_page.html was fetched with wget from
# http://millercenter.org/president/speeches/
with open("speech_main_page.html", "r") as f:
    lines = f.readlines()
content = "\n".join([line.strip() for line in lines])
soup = BeautifulSoup(content, "html.parser")
# Keep the href of every anchor whose class attribute contains "audio".
audio_links = [elem.get("href") for elem in soup.find_all("a")
               if elem.get("class") is not None
               and "audio" in elem.get("class")]
for n, al in enumerate(sorted(audio_links)):
    # Random pause of up to 5 seconds between consecutive page requests.
    time.sleep(random.random() * 5)
    print("Starting download...")
    try:
        print("Downloading %s" % al)
        link = "http://millercenter.org" + al
        save_content(link)
    except Exception as e:
        # Best effort: report the failing entry and carry on with the rest.
        print("Failed %i: %s!" % (n, al))
        print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment