Skip to content

Instantly share code, notes, and snippets.

@sleepynate
Created August 25, 2011 15:21
Show Gist options
  • Save sleepynate/1170912 to your computer and use it in GitHub Desktop.
Save sleepynate/1170912 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Parses the U.S. Securities and Exchange Commission website for info on
# Form 4/A filings.
from urllib import urlopen
import re
class SECParser:
    """Scrapes SEC EDGAR index pages for Form 4/A filings and extracts a
    handful of identifying fields from each filing's XML document.

    Uses the module-level VERBOSE flag (set in __main__) to control
    per-match progress output.
    """

    # Tags whose lines parseXML() keeps, checked in this order.
    _XML_TAGS = ("issuerName", "issuerTradingSymbol", "rptOwnerName",
                 "officerTitle")

    # Compiled once instead of re-compiling per scanned line.
    # NOTE: the dot in "index\.htm" is escaped; the original unescaped
    # "." would have matched any character in that position.
    _INDEX_LINK_RE = re.compile(r"Archives/edgar/data/.*index\.htm")
    _XML_LINK_RE = re.compile(r"Archives/edgar/data/[/0-9]+/[a-zA-Z0-9\-_]+\.xml")

    def __init__(self, page):
        """Sets up a parser pointed at an SEC EDGAR index page URL."""
        self.SEC_URL = page

    def retrieveIndex(self, url):
        """Opens the index resource, be it from the SEC or locally.

        Returns the open url resource; scanIndex() closes it.
        """
        print("Using index from: " + url)
        edgar_resource = urlopen(url)
        print('Retrieved webpage')
        return edgar_resource

    def scanIndex(self, file):
        """Scans the index page for links to 4/A filing pages.

        `file` is a readable resource; it is closed here after scanning.
        Returns a list of absolute URLs to the individual filing pages.
        """
        listings = 0
        good_lines = []
        for line in file.readlines():
            found = self._INDEX_LINK_RE.search(line)
            if found:
                listings += 1
                if VERBOSE:
                    print("String match: " + found.group(0))
                good_lines.append("http://sec.gov/" + found.group(0))
        print(str(listings) + " listings scanned")
        file.close()
        return good_lines

    def xmlLinksFromFilings(self, link_array):
        """Pulls links to the XML documents from the individual filing
        pages; duplicates are reported (when VERBOSE) and skipped.

        Each opened page is closed before moving to the next one.
        """
        links_to_xml = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        scanned_resources = 0
        for link in link_array:
            resource = urlopen(link)
            if VERBOSE:
                print("Opened resource: " + link)
            for line in resource.readlines():
                xml_link = self._XML_LINK_RE.search(line)
                if not xml_link:
                    continue
                if VERBOSE:
                    print("XML Link found: " + xml_link.group(0))
                link_string = "http://sec.gov/" + xml_link.group(0)
                if link_string in seen:
                    if VERBOSE:
                        print("XML link already in array")
                else:
                    seen.add(link_string)
                    links_to_xml.append(link_string)
            resource.close()
            scanned_resources += 1
            print("%3d Resources scanned so far" % scanned_resources)
        return links_to_xml

    def processXMLLinks(self, links_array):
        """Retrieves each XML document and runs it through parseXML().

        Returns a list of per-filing field lists (array of arrays).
        """
        full_info_from_listings = []
        for link in links_array:
            if VERBOSE:
                print("Parsing: " + link)
            resource = urlopen(link)
            try:
                full_info_from_listings.append(self.parseXML(resource))
            finally:
                # Fix: the original never closed this handle (leak).
                resource.close()
        return full_info_from_listings

    def parseXML(self, text):
        """Reads through `text` for lines containing the tags in
        _XML_TAGS and returns the matching lines, stripped, in order.

        A line containing several tags is appended once per tag, which
        matches the original chain of independent `if` tests.
        """
        file_data = []
        for line in text.readlines():
            for tag in self._XML_TAGS:
                if tag in line:
                    file_data.append(line.strip())
        return file_data

    def xmlOutput(self, aoa):
        """Takes an array of arrays of EDGAR data and prints each entry
        in a readable fashion, one field per line. Returns None."""
        for entry in aoa:
            print("\n".join(entry))
if __name__ == '__main__':
    VERBOSE = False
    # EDGAR "current events" query: the 100 most recent Form 4/A filings.
    url = ('http://sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=4%2Fa'
           '&owner=include&count=100&action=getcurrent')
    # Set up the parser pointed at the EDGAR index page.
    parsy = SECParser(url)
    # Open the index resource.
    index = parsy.retrieveIndex(parsy.SEC_URL)
    # Pull links to individual filing listings from the index page.
    form4_links = parsy.scanIndex(index)
    # Pull links to the XML versions of filings from the HTML pages.
    xml_links = parsy.xmlLinksFromFilings(form4_links)
    # Fix: xmlOutput() prints its output and returns None, so the
    # original `print parsy.xmlOutput(...)` printed a stray "None".
    parsy.xmlOutput(parsy.processXMLLinks(xml_links))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment