Download SEC compressed filings from the EDGAR database
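The script below enumerates the quarterly directories under EDGAR's Oldloads archive for 1996 through 2017, scrapes each directory listing for links to compressed (.gz) filing archives, and downloads them in parallel on a thread pool, caching files locally so re-runs skip anything already fetched.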
from multiprocessing.dummy import Pool  # thread pool; the work is I/O-bound
from urllib.request import urlretrieve, urlopen  # Python 3 (was `from urllib import ...` in Python 2)
from bs4 import BeautifulSoup
import os
from datetime import datetime

startDate = datetime.strptime('01.01.1996', '%d.%m.%Y')
endDate = datetime.strptime('01.01.2018', '%d.%m.%Y')

CACHE_DIR = "cache"
URL = 'https://www.sec.gov/Archives/edgar/Oldloads'
QUARTERS = ["QTR1", "QTR2", "QTR3", "QTR4"]
YEARS = [str(yr) for yr in range(startDate.year, endDate.year)]


def generate_all_urls():
    """Yield the URL of every compressed (.gz) archive in each quarterly directory."""
    for year in YEARS:
        for qtr in QUARTERS:
            url = "/".join([URL, year, qtr]) + "/"
            # Parse the directory listing and collect links to .gz archives.
            soup = BeautifulSoup(urlopen(url), "lxml")
            for link in soup.find_all('a', href=True):
                lnk = link.get('href')
                if ".gz" in lnk:
                    yield url + lnk


def write_to_file(url):
    """Download one archive into CACHE_DIR, skipping files already cached."""
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    name = url.split("/")[-1]
    filename = CACHE_DIR + "/" + name
    if not os.path.isfile(filename):
        print("Retrieving {}".format(url))
        urlretrieve(url, filename)
        print("Finished downloading {}".format(url))


# Fetch the archives in parallel on a thread pool.
result = Pool().map(write_to_file, generate_all_urls())
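Before launching the full download, it can help to sanity-check the scrape by previewing a few of the generated URLs. The sketch below is not part of the original gist; it simply slices the generator with itertools.islice. Note also that SEC.gov has since begun requiring a declared User-Agent header on automated requests, so bare urlopen/urlretrieve calls may be rejected by the server.

from itertools import islice

# Print the first five .gz archive URLs without downloading anything.
for u in islice(generate_all_urls(), 5):
    print(u)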