@redwrasse
Created October 25, 2017 05:10
Download SEC compressed filings from the EDGAR database

The script below walks the quarterly directory listings under EDGAR's Oldloads archive for 1996 through 2017, scrapes every link to a .gz archive with BeautifulSoup, and downloads the archives in parallel into a local cache directory, skipping any file already present.

from multiprocessing.dummy import Pool           # thread pool behind the multiprocessing API
from urllib.request import urlretrieve, urlopen  # Python 3; these lived in urllib in Python 2
from bs4 import BeautifulSoup
import os
from datetime import datetime

# Filing archives are organized by year and quarter; cover 1996 through 2017.
startDate = datetime.strptime('01.01.1996', '%d.%m.%Y')
endDate = datetime.strptime('01.01.2018', '%d.%m.%Y')

CACHE_DIR = "cache"
URL = 'https://www.sec.gov/Archives/edgar/Oldloads'
QUARTERS = ["QTR1", "QTR2", "QTR3", "QTR4"]
YEARS = [str(yr) for yr in range(startDate.year, endDate.year)]


def generate_all_urls():
    """Yield the URL of every .gz archive listed in each year/quarter directory."""
    for year in YEARS:
        for qtr in QUARTERS:
            url = "/".join([URL, year, qtr]) + "/"
            soup = BeautifulSoup(urlopen(url), "lxml")
            for link in soup.find_all('a', href=True):
                lnk = link.get('href')
                if ".gz" in lnk:
                    yield url + lnk


def write_to_file(url):
    """Download one archive into CACHE_DIR, skipping files already cached."""
    os.makedirs(CACHE_DIR, exist_ok=True)  # no-op if the directory exists
    name = url.split("/")[-1]
    filename = os.path.join(CACHE_DIR, name)
    if not os.path.isfile(filename):
        print("Retrieving {}".format(url))
        urlretrieve(url, filename)
        print("Finished downloading {}".format(url))


result = Pool().map(write_to_file, generate_all_urls())
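
One hardening note: the SEC's EDGAR fair-access guidance asks automated clients to identify themselves with a descriptive User-Agent header and to keep request rates modest. Below is a minimal sketch of a politer driver, replacing the final Pool().map(...) line above; the contact string and the pool size of 4 are assumptions, not part of the original gist.

from urllib.request import build_opener, install_opener

# Install a global opener so both urlopen() and urlretrieve() send a
# descriptive User-Agent (the contact string below is a placeholder).
opener = build_opener()
opener.addheaders = [("User-Agent", "research-downloader your-email@example.com")]
install_opener(opener)

# Cap the thread pool instead of defaulting to one thread per CPU,
# to stay well under EDGAR's request-rate limits (4 is an assumption).
result = Pool(4).map(write_to_file, generate_all_urls())

Wrapping this driver code in an if __name__ == "__main__": guard also keeps the module importable without kicking off the full download.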