Download SEC compressed filings from the EDGAR database
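The script below enumerates the quarterly directories under EDGAR's Oldloads archive for 1996 through 2017, scrapes each directory listing for links to compressed (.gz) filing archives, and downloads them in parallel on a thread pool, caching files locally so re-runs skip anything already fetched.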
from multiprocessing.dummy import Pool  # thread pool; the work is I/O-bound
from urllib.request import urlretrieve, urlopen  # Python 3 (was `from urllib import ...` in Python 2)
from bs4 import BeautifulSoup
import os
from datetime import datetime

startDate = datetime.strptime('01.01.1996', '%d.%m.%Y')
endDate = datetime.strptime('01.01.2018', '%d.%m.%Y')

CACHE_DIR = "cache"
URL = 'https://www.sec.gov/Archives/edgar/Oldloads'
QUARTERS = ["QTR1", "QTR2", "QTR3", "QTR4"]
YEARS = [str(yr) for yr in range(startDate.year, endDate.year)]


def generate_all_urls():
    """Yield the URL of every compressed (.gz) archive in each quarterly directory."""
    for year in YEARS:
        for qtr in QUARTERS:
            url = "/".join([URL, year, qtr]) + "/"
            # Parse the directory listing and collect links to .gz archives.
            soup = BeautifulSoup(urlopen(url), "lxml")
            for link in soup.find_all('a', href=True):
                lnk = link.get('href')
                if ".gz" in lnk:
                    yield url + lnk


def write_to_file(url):
    """Download one archive into CACHE_DIR, skipping files already cached."""
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    name = url.split("/")[-1]
    filename = CACHE_DIR + "/" + name
    if not os.path.isfile(filename):
        print("Retrieving {}".format(url))
        urlretrieve(url, filename)
        print("Finished downloading {}".format(url))


# Fetch the archives in parallel on a thread pool.
result = Pool().map(write_to_file, generate_all_urls())
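Before launching the full download, it can help to sanity-check the scrape by previewing a few of the generated URLs. The sketch below is not part of the original gist; it simply slices the generator with itertools.islice. Note also that SEC.gov has since begun requiring a declared User-Agent header on automated requests, so bare urlopen/urlretrieve calls may be rejected by the server.

from itertools import islice

# Print the first five .gz archive URLs without downloading anything.
for u in islice(generate_all_urls(), 5):
    print(u)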