Skip to content

Instantly share code, notes, and snippets.

@kastnerkyle
Forked from laurent-dinh/dl_vine.py
Last active January 18, 2017 19:53
Show Gist options
  • Save kastnerkyle/f51df60c8d7780945eb8b3d0e480c413 to your computer and use it in GitHub Desktop.
Save kastnerkyle/f51df60c8d7780945eb8b3d0e480c413 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# Based on code from Laurent Dinh (laurent-dinh)
# Author: Kyle Kastner
# License: BSD 3-Clause
import requests
import time
import random
from bs4 import BeautifulSoup
import urllib
import os
import io
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, filename):
    """Stream the resource at ``url`` into the local file ``filename``.

    The body is fetched in 1024-byte chunks so that a large file is never
    held in memory in full.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    filename : str
        Destination path; opened in binary write mode, so an existing file
        is overwritten.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # NOTE the stream=True parameter: the body is only downloaded while
    # iter_content is being consumed below.
    r = requests.get(url, stream=True)
    try:
        # Fail before creating the file instead of saving an error page
        # under the requested name.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # Release the connection even when the transfer fails midway.
        r.close()
# Shared opener; save_content calls its retrieve() method to fetch each MP3.
# NOTE(review): FancyURLopener is exposed on the top-level urllib module only
# in Python 2 -- confirm the interpreter this script is meant to run on.
url_opener = urllib.FancyURLopener()
def _text_with_newlines(elem):
    """Flatten the parse tree below ``elem`` into plain text.

    Every text node is stripped of surrounding whitespace and appended,
    each ``<br>`` tag contributes a newline, and all other tags (``<p>``
    included) contribute nothing themselves. Finally a space is inserted
    in front of every literal backslash-plus-``u`` pair in the result.
    """
    # Imported locally because the module itself only imports BeautifulSoup.
    from bs4 import NavigableString

    pieces = []
    for node in elem.descendants:
        if isinstance(node, NavigableString):
            pieces.append(node.strip())
        elif node.name == 'br':
            pieces.append('\n')
    # The two-character pattern cannot overlap itself, so one replace() is
    # equivalent to scanning the text position by position.
    return ''.join(pieces).replace('\\u', ' \\u')


def save_content(link):
    """Download one speech page and save its transcript and MP3 audio.

    The page at ``link`` is parsed for the first anchor whose ``href``
    contains ``.mp3`` and for the first ``<div>`` whose ``id`` contains
    ``transcript``. The transcript text is written to ``txt/<name>.txt``
    (UTF-8) and the audio is retrieved to ``mp3/<name>.mp3``, where
    ``<name>`` is the MP3 file name up to its first dot. Both directories
    are created in the current working directory when missing.

    Parameters
    ----------
    link : str
        URL of a speech page (the script uses pages under
        http://millercenter.org/).

    Raises
    ------
    requests.HTTPError
        If the page request returns a 4xx/5xx status code.
    IndexError
        If the page has no ``.mp3`` link or no transcript ``<div>``.
    """
    response = requests.get(link)
    response.raise_for_status()
    # Iterating a Response object yields fixed-size byte chunks rather than
    # lines, which would inject newlines in the middle of tags and words;
    # split the downloaded body on real line boundaries instead.
    content = b"\n".join(
        line.strip() for line in response.content.splitlines())
    soup = BeautifulSoup(content, "html.parser")

    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    mp3_links = [href for href in hrefs
                 if href is not None and ".mp3" in href]
    if not mp3_links:
        raise IndexError("no .mp3 link found on %s" % link)
    mp3_link = mp3_links[0]

    transcripts = [div for div in soup.find_all("div")
                   if div.get("id") is not None
                   and "transcript" in div.get("id")]
    if not transcripts:
        raise IndexError("no transcript <div> found on %s" % link)

    text = _text_with_newlines(transcripts[0])
    # Drop the first ten characters of the extracted text.
    # NOTE(review): presumably the "Transcript" heading -- confirm.
    text = text[10:]

    basename = mp3_link.split("/")[-1].split(".")[0]
    save_txt = "txt/" + basename + ".txt"
    save_mp3 = "mp3/" + basename + ".mp3"
    for directory in ("mp3", "txt"):
        if not os.path.exists(directory):
            os.mkdir(directory)

    with io.open(save_txt, "w", encoding="utf-8") as f:
        f.write(text)
    url_opener.retrieve(mp3_link, save_mp3)
# speech_main_page.html is a local copy of the index page, fetched beforehand
# (with wget) from http://millercenter.org/president/speeches/
with open("speech_main_page.html", "r") as f:
    content = "\n".join(line.strip() for line in f)
soup = BeautifulSoup(content, "html.parser")

# Collect the href of every anchor carrying the "audio" class. Anchors
# without an href are skipped (they could never be downloaded), and
# duplicates are dropped so that no speech is fetched twice.
audio_links = sorted(set(
    elem.get("href") for elem in soup.find_all("a")
    if elem.get("href") is not None
    and elem.get("class") is not None
    and "audio" in elem.get("class")))

for n, al in enumerate(audio_links):
    # Random pause of up to five seconds before each request.
    time.sleep(random.random() * 5)
    print("Starting download...")
    try:
        print("Downloading %s" % al)
        link = "http://millercenter.org" + al
        save_content(link)
    except Exception as e:
        # Best effort: report the failure and carry on with the next speech.
        print("Failed %i: %s!" % (n, al))
        print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment