#!/usr/bin/env python
#
# Parses the U.S. Securities and Exchange Commission website for info on
# Form 4/A filings.

from urllib import urlopen
import re

# Verbosity flag used throughout SECParser; defining it at module level avoids a
# NameError if the class is imported rather than run as a script.
VERBOSE = False


class SECParser:
    def __init__(self, page):
        """Sets up a parser pointed at an SEC EDGAR index page."""
        self.SEC_URL = page
        return

    def retrieveIndex(self, url):
        """Gets the index, be it from the SEC or locally"""
        print "Using index from: " + url
        edgar_resource = urlopen(url)
        print 'Retrieved webpage'
        return edgar_resource
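
    # Note (assumption, not in the original gist): urllib.urlopen() also accepts
    # "file://" URLs, so a saved copy of the index page can be parsed offline, e.g.:
    #   index = parsy.retrieveIndex("file:///tmp/edgar_index.html")
    # The path above is purely illustrative.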

    def scanIndex(self, file):
        """Scans the page's index for links to 4/A filings"""
        listings = 0
        good_lines = []
        for line in file.readlines():
            found = re.search("Archives/edgar/data/.*index.htm", line)
            if found:
                listings += 1
                if VERBOSE == True:
                    print "String match: " + line[int(found.start()):int(found.end())]
                good_lines.append("http://sec.gov/" + line[int(found.start()):int(found.end())])
        print str(listings) + " listings scanned"
        file.close()
        return good_lines
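
    # The regex above matches relative index links of the form
    #   Archives/edgar/data/<CIK>/<accession-number>-index.htm
    # (placeholders illustrative), which are then prefixed with "http://sec.gov/"
    # to build absolute URLs.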

    def xmlLinksFromFilings(self, link_array):
        """Pulls links to the XML documents from the individual filing
        pages found in the index."""
        links_to_xml = []
        scanned_resources = 0
        for link in link_array:
            resource = urlopen(link)
            if VERBOSE == True:
                print "Opened resource: " + link
            for line in resource.readlines():
                xml_link = re.search(r"Archives/edgar/data/[/0-9]+/[a-zA-Z0-9\-_]+\.xml", line)
                if xml_link:
                    if VERBOSE == True:
                        print "XML Link found: " + line[int(xml_link.start()):int(xml_link.end())]
                    link_string = "http://sec.gov/" + line[int(xml_link.start()):int(xml_link.end())]
                    if link_string in links_to_xml:
                        if VERBOSE == True:
                            print "XML link already in array"
                    else:
                        links_to_xml.append(link_string)
            resource.close()
            scanned_resources += 1
            print "%3d Resources scanned so far" % scanned_resources
        return links_to_xml
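
    # Design note (sketch, not part of the original gist): the "already in array"
    # check above is a linear scan per candidate link; a set would deduplicate in
    # constant time, e.g.:
    #   seen = set()
    #   if link_string not in seen:
    #       seen.add(link_string)
    #       links_to_xml.append(link_string)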

    def processXMLLinks(self, links_array):
        """Takes an array of links to XML documents in the format used
        by EDGAR, retrieves each one, and calls parseXML() to return an
        array of arrays packed with the pertinent data"""
        full_info_from_listings = []
        for link in links_array:
            if VERBOSE == True:
                print "Parsing: " + link
            data = self.parseXML(urlopen(link))
            full_info_from_listings.append(data)
        return full_info_from_listings

    def parseXML(self, text):
        """Reads through text to find relevant tags, then places them in
        an array to return"""
        file_data = []
        for line in text.readlines():
            if "issuerName" in line:
                file_data.append(line.strip())
            if "issuerTradingSymbol" in line:
                file_data.append(line.strip())
            if "rptOwnerName" in line:
                file_data.append(line.strip())
            if "officerTitle" in line:
                file_data.append(line.strip())
        return file_data

    def xmlOutput(self, aoa):
        """Takes an array of arrays of EDGAR data and outputs them in a
        readable fashion"""
        for entry in aoa:
            print "\n".join(entry)
# End of class SECParser()
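

# A minimal sketch (assumption, not part of the original gist): parseXML() returns
# whole tagged lines such as "<issuerName>EXAMPLE CO</issuerName>"; a helper along
# these lines could reduce them to bare values. The name stripTags and the sample
# line are hypothetical.
def stripTags(tagged_line):
    """Removes XML tags from a single line, returning only the text content."""
    return re.sub(r"<[^>]+>", "", tagged_line).strip()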


if __name__ == '__main__':
    VERBOSE = False
    url = 'http://sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=4%2Fa&owner=include&count=100&action=getcurrent'

    parsy = SECParser(url)
    #^ Set up class with a link to EDGAR
    index = parsy.retrieveIndex(parsy.SEC_URL)
    #^ open index resource
    form4_links = parsy.scanIndex(index)
    #^ pass resource to pull links to listings from index page
    xml_links = parsy.xmlLinksFromFilings(form4_links)
    #^ pull links to XML versions of filings from HTML filing pages
    parsy.xmlOutput(parsy.processXMLLinks(xml_links))
    #^ retrieve each XML filing and print the extracted fields
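
    # Usage sketch (assumption, not in the original): to debug a single filing,
    # one could set VERBOSE = True above and parse just the first XML link:
    #   parsy.xmlOutput([parsy.parseXML(urlopen(xml_links[0]))])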