kastnerkyle · January 18, 2017 19:53
diff --git a/dl_presidents.py b/dl_presidents.py
 #!/usr/bin/env python
 # Based on code from Laurent Dinh (laurent-dinh)
 # Author: Kyle Kastner
 # License: BSD 3-Clause

 import requests
 import time
 import random
 from bs4 import BeautifulSoup
 import urllib
 import os
 import io

 # http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
 def download_file(url, filename):
    """Stream the resource at ``url`` into the local file ``filename``.

    The body is fetched with ``stream=True`` and written in 1024-byte
    chunks, so the whole download is never held in memory at once.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    filename : str
        Path of the file to create or overwrite (opened in binary mode).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status. Checked before the
        output file is opened, so an error page is never saved as the file.
    """
    # NOTE the stream=True parameter: the body is only pulled from the
    # socket as iter_content() is consumed.
    r = requests.get(url, stream=True)
    try:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # Release the connection even when the write loop fails part-way.
        r.close()


 # Module-level URL opener; save_content() calls its retrieve() method to
 # download each mp3 file to disk.
 url_opener = urllib.FancyURLopener()

 def save_content(link):
    """Save the transcript and the mp3 recording of one speech page.

    Fetches ``link`` (a speech page on http://millercenter.org/), takes the
    first ``<a>`` whose href contains ".mp3" and the first ``<div>`` whose id
    contains "transcript", and writes two files relative to the current
    working directory:

    * ``txt/<basename>.txt`` -- the transcript text, UTF-8 encoded
    * ``mp3/<basename>.mp3`` -- the audio, fetched with ``url_opener``

    ``<basename>`` is the last path component of the mp3 URL up to its first
    ".". The ``txt`` and ``mp3`` directories are created when missing.

    Parameters
    ----------
    link : str
        Absolute URL of the speech page.

    Raises
    ------
    requests.exceptions.HTTPError
        If the page request returns a 4xx/5xx status.
    IndexError
        If the page has no ".mp3" link or no transcript ``<div>``.
    """
    page = requests.get(link)
    # Stop here on an HTTP error instead of scraping the error page.
    page.raise_for_status()
    # Parse the complete body. Iterating a Response yields fixed-size byte
    # chunks rather than lines, so stripping and re-joining those pieces
    # would insert newlines and drop spaces in the middle of the markup.
    soup = BeautifulSoup(page.content, "html.parser")

    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
    mp3_links = [href for href in hrefs
                 if href is not None and ".mp3" in href]
    if not mp3_links:
        raise IndexError("no .mp3 link found on %s" % link)
    mp3_link = mp3_links[0]

    transcript = [div for div in soup.find_all("div")
                  if div.get("id") is not None and "transcript" in div.get("id")]
    if not transcript:
        raise IndexError("no transcript div found on %s" % link)

    # Re-parse only the transcript markup so the walk below sees nothing else.
    sp = BeautifulSoup(str(transcript[0]), "html.parser")

    def text_with_newlines(elem):
        """Flatten ``elem`` to text, emitting a newline for every <br>."""
        pieces = []
        for e in elem.recursiveChildGenerator():
            if isinstance(e, basestring):
                # Text node: surrounding whitespace is dropped.
                pieces.append(e.strip())
            elif e.name == 'br':
                pieces.append(u'\n')
            # Every other tag (including <p>) contributes no text itself.

        # u"".join always yields unicode, which io.open(...).write() needs on
        # Python 2 even when the transcript turns out to be empty.
        text = u"".join(pieces)
        # Put a space in front of every literal backslash-u sequence.
        return text.replace('\\u', ' \\u')

    r = text_with_newlines(sp)
    # Drop the first 10 characters -- presumably a leading "Transcript"
    # heading inside the div; TODO confirm against a live page.
    r = r[10:]

    basename = mp3_link.split("/")[-1].split(".")[0]
    save_txt = "txt/" + basename + ".txt"
    save_mp3 = "mp3/" + basename + ".mp3"

    for directory in ("mp3", "txt"):
        if not os.path.exists(directory):
            os.mkdir(directory)

    with io.open(save_txt, "w", encoding="utf-8") as f:
        f.write(r)

    url_opener.retrieve(mp3_link, save_mp3)



 # speech_main_page.html is a locally saved copy (fetched with wget) of
 # http://millercenter.org/president/speeches/
 with open("speech_main_page.html", "r") as f:
    lines = f.readlines()
 content = "\n".join([line.strip() for line in lines])
 soup = BeautifulSoup(content, "html.parser")
 # Keep the href of every <a> that carries the "audio" class; each href is a
 # site-relative path to one speech page.
 audio_links = [elem.get("href") for elem in soup.find_all("a")
                if elem.get("class") is not None and "audio" in elem.get("class")]

 for n, al in enumerate(sorted(audio_links)):
    # Random pause of up to 5 seconds before each page -- presumably to
    # avoid hammering the server.
    time.sleep(random.random() * 5)
    print("Starting download...")
    try:
        print("Downloading %s" % al)
        link = "http://millercenter.org" + al
        save_content(link)
    except Exception as e:
        # Best effort: report the failing entry and carry on with the rest.
        print("Failed %i: %s!" % (n, al))
        print(e)
	#!/usr/bin/env python
	# Based on code from Laurent Dinh (laurent-dinh)
	# Author: Kyle Kastner
	# License: BSD 3-Clause

	import requests
	import time
	import random
	from bs4 import BeautifulSoup
	import urllib
	import os
	import io

	# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
	def download_file(url, filename):
	    """Stream the resource at ``url`` into the local file ``filename``.

	    The body is fetched with ``stream=True`` and written in 1024-byte
	    chunks, so the whole download is never held in memory at once.

	    Parameters
	    ----------
	    url : str
	        Address passed to ``requests.get``.
	    filename : str
	        Path of the file to create or overwrite (opened in binary mode).

	    Raises
	    ------
	    requests.exceptions.HTTPError
	        If the server answers with a 4xx/5xx status. Checked before the
	        output file is opened, so an error page is never saved as the file.
	    """
	    # NOTE the stream=True parameter: the body is only pulled from the
	    # socket as iter_content() is consumed.
	    r = requests.get(url, stream=True)
	    try:
	        r.raise_for_status()
	        with open(filename, 'wb') as f:
	            for chunk in r.iter_content(chunk_size=1024):
	                if chunk:  # filter out keep-alive new chunks
	                    f.write(chunk)
	    finally:
	        # Release the connection even when the write loop fails part-way.
	        r.close()


	# Module-level URL opener; save_content() calls its retrieve() method to
	# download each mp3 file to disk.
	url_opener = urllib.FancyURLopener()

	def save_content(link):
	    """Save the transcript and the mp3 recording of one speech page.

	    Fetches ``link`` (a speech page on http://millercenter.org/), takes the
	    first ``<a>`` whose href contains ".mp3" and the first ``<div>`` whose id
	    contains "transcript", and writes two files relative to the current
	    working directory:

	    * ``txt/<basename>.txt`` -- the transcript text, UTF-8 encoded
	    * ``mp3/<basename>.mp3`` -- the audio, fetched with ``url_opener``

	    ``<basename>`` is the last path component of the mp3 URL up to its first
	    ".". The ``txt`` and ``mp3`` directories are created when missing.

	    Parameters
	    ----------
	    link : str
	        Absolute URL of the speech page.

	    Raises
	    ------
	    requests.exceptions.HTTPError
	        If the page request returns a 4xx/5xx status.
	    IndexError
	        If the page has no ".mp3" link or no transcript ``<div>``.
	    """
	    page = requests.get(link)
	    # Stop here on an HTTP error instead of scraping the error page.
	    page.raise_for_status()
	    # Parse the complete body. Iterating a Response yields fixed-size byte
	    # chunks rather than lines, so stripping and re-joining those pieces
	    # would insert newlines and drop spaces in the middle of the markup.
	    soup = BeautifulSoup(page.content, "html.parser")

	    hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
	    mp3_links = [href for href in hrefs
	                 if href is not None and ".mp3" in href]
	    if not mp3_links:
	        raise IndexError("no .mp3 link found on %s" % link)
	    mp3_link = mp3_links[0]

	    transcript = [div for div in soup.find_all("div")
	                  if div.get("id") is not None and "transcript" in div.get("id")]
	    if not transcript:
	        raise IndexError("no transcript div found on %s" % link)

	    # Re-parse only the transcript markup so the walk below sees nothing else.
	    sp = BeautifulSoup(str(transcript[0]), "html.parser")

	    def text_with_newlines(elem):
	        """Flatten ``elem`` to text, emitting a newline for every <br>."""
	        pieces = []
	        for e in elem.recursiveChildGenerator():
	            if isinstance(e, basestring):
	                # Text node: surrounding whitespace is dropped.
	                pieces.append(e.strip())
	            elif e.name == 'br':
	                pieces.append(u'\n')
	            # Every other tag (including <p>) contributes no text itself.

	        # u"".join always yields unicode, which io.open(...).write() needs on
	        # Python 2 even when the transcript turns out to be empty.
	        text = u"".join(pieces)
	        # Put a space in front of every literal backslash-u sequence.
	        return text.replace('\\u', ' \\u')

	    r = text_with_newlines(sp)
	    # Drop the first 10 characters -- presumably a leading "Transcript"
	    # heading inside the div; TODO confirm against a live page.
	    r = r[10:]

	    basename = mp3_link.split("/")[-1].split(".")[0]
	    save_txt = "txt/" + basename + ".txt"
	    save_mp3 = "mp3/" + basename + ".mp3"

	    for directory in ("mp3", "txt"):
	        if not os.path.exists(directory):
	            os.mkdir(directory)

	    with io.open(save_txt, "w", encoding="utf-8") as f:
	        f.write(r)

	    url_opener.retrieve(mp3_link, save_mp3)



	# speech_main_page.html is a locally saved copy (fetched with wget) of
	# http://millercenter.org/president/speeches/
	with open("speech_main_page.html", "r") as f:
	    lines = f.readlines()
	content = "\n".join([line.strip() for line in lines])
	soup = BeautifulSoup(content, "html.parser")
	# Keep the href of every <a> that carries the "audio" class; each href is a
	# site-relative path to one speech page.
	audio_links = [elem.get("href") for elem in soup.find_all("a")
	               if elem.get("class") is not None and "audio" in elem.get("class")]

	for n, al in enumerate(sorted(audio_links)):
	    # Random pause of up to 5 seconds before each page -- presumably to
	    # avoid hammering the server.
	    time.sleep(random.random() * 5)
	    print("Starting download...")
	    try:
	        print("Downloading %s" % al)
	        link = "http://millercenter.org" + al
	        save_content(link)
	    except Exception as e:
	        # Best effort: report the failing entry and carry on with the rest.
	        print("Failed %i: %s!" % (n, al))
	        print(e)