-
-
Save dougvk/8499335 to your computer and use it in GitHub Desktop.
import re
from cPickle import dump

from requests import get

# Ticker symbols to resolve; each one is substituted into URL's CIK parameter.
DEFAULT_TICKERS = ['goog', 'aapl']
URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
# Captures the 10-digit number that follows "CIK=" in the returned page.
CIK_RE = re.compile(r'.*CIK=(\d{10}).*')

# Build {lower-cased ticker: CIK string}; tickers whose page yields no match
# are simply left out.
cik_dict = {}
for ticker in DEFAULT_TICKERS:
    results = CIK_RE.findall(get(URL.format(ticker)).content)
    if results:
        cik_dict[str(ticker).lower()] = str(results[0])

# Pickle output is binary data, so the file is opened in 'wb'; the with-block
# closes it even if dump() raises.
with open('cik_dict', 'wb') as f:
    dump(cik_dict, f)
Curious if anyone has tried to port this into Python3? Regardless, thank you Doug!
It works in python 3, just with a tiny change:
f = requests.get(URL.format(ticker), stream = True);
results = CIK_RE.findall(f.text)
Some tickers don't work, though, e.g., BRK.A (Buffett's company). Does anyone have a solution?
I want to convert this into a dataframe and it keeps spitting out the following
{'goog': '0001652044', 'aapl': '0000320193'}
0
goog 0001652044
aapl 0000320193
How do I rename my columns?
I tried df = df.rename(columns = {"0": "CIK"})
but that didn't work.
Here is the version of the code that I used:
import re
from _pickle import dump

import pandas as pd
import requests

# Ticker symbols to resolve; each one is substituted into URL's CIK parameter.
DEFAULT_TICKERS = ['goog', 'aapl']
URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
CIK_RE = re.compile(r'.CIK=(\d{10}).')

# Build {lower-cased ticker: CIK string}; tickers whose page yields no match
# are left out.
cik_dict = {}
for ticker in DEFAULT_TICKERS:
    f = requests.get(URL.format(ticker), stream=True)
    results = CIK_RE.findall(f.text)
    if results:
        cik_dict[str(ticker).lower()] = str(results[0])
print(cik_dict)

# One row per ticker (the dict keys become the index).
df = pd.DataFrame.from_dict(cik_dict, orient='index')
# The single value column is labelled with the integer 0, so the rename key
# must be the int 0. The string "0" matches no column, and rename() then
# silently returns the frame unchanged.
df = df.rename(columns={0: "CIK"})
print(df)
Thank you, everyone; your comments and code were a huge help.
for 3.6:
import re, requests
def getCIKs(TICKERS):
    """Look up the SEC EDGAR CIK number for each ticker symbol.

    Requests the EDGAR company-browse page for every ticker and extracts the
    10-digit number following ``CIK=`` from the first line of the response
    text that contains one. Tickers whose page has no such line are left out
    of the result. No file is written.

    Args:
        TICKERS: Iterable of ticker symbols, e.g. ``['wmt', 'amzn']``.

    Returns:
        Dict mapping each upper-cased ticker to its CIK as a string without
        the leading zero padding, e.g. ``{'WMT': '104169'}``. The dict is
        also printed before it is returned.
    """
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    cik_dict = {}
    for ticker in TICKERS:
        response = requests.get(URL.format(ticker), stream=True)
        # search() stops at the first match, which is the only one used.
        match = CIK_RE.search(response.text)
        if match:
            # int() drops the zero padding of the 10-digit capture.
            cik_dict[str(ticker).upper()] = str(int(match.group(1)))
    print(cik_dict)
    return cik_dict
getCIKs(['wmt','amzn','nflx'])
# returns:
# {'WMT': '104169', 'AMZN': '1018724', 'NFLX': '1065280'}
@Micah81 replacing print(cik_dict) with return(cik_dict) worked better for me.
Today you have to set a user agent:
import re, requests
headers = {"user-agent": "Safari"}
def getCIKs(TICKERS):
    """Look up the SEC EDGAR CIK number for each ticker symbol.

    Requests the EDGAR company-browse page for every ticker, sending the
    module-level ``headers`` (which carry the user agent), and extracts the
    10-digit number following ``CIK=`` from the first line of the response
    text that contains one. Tickers whose page has no such line are left out
    of the result. No file is written.

    Args:
        TICKERS: Iterable of ticker symbols, e.g. ``['wmt', 'amzn']``.

    Returns:
        Dict mapping each upper-cased ticker to its CIK as a string without
        the leading zero padding, e.g. ``{'WMT': '104169'}``.
    """
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    cik_dict = {}
    for ticker in TICKERS:
        response = requests.get(URL.format(ticker), headers=headers, stream=True)
        # search() stops at the first match, which is the only one used.
        match = CIK_RE.search(response.text)
        if match:
            # int() drops the zero padding of the 10-digit capture.
            cik_dict[str(ticker).upper()] = str(int(match.group(1)))
    return cik_dict
getCIKs(['wmt','amzn','nflx'])
# returns:
# {'WMT': '104169', 'AMZN': '1018724', 'NFLX': '1065280'}
It fails today because the SEC has since changed its website.
Doug your code just saved me a bunch of time! Thanks man