Skip to content

Instantly share code, notes, and snippets.

@sleepynate
Created August 25, 2011 15:21
Show Gist options
  • Save sleepynate/1170912 to your computer and use it in GitHub Desktop.
Save sleepynate/1170912 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Parses the U.S. Securities and Exchange Commission website for info on
# Form 4/A filings.
from urllib import urlopen
import re
class SECParser:
    """Scrapes SEC EDGAR index pages for Form 4/A filings and extracts a
    handful of identifying fields from each filing's XML document.

    Uses the module-level VERBOSE flag (set in __main__) to control
    per-match progress output.
    """

    # Tags whose lines parseXML() keeps, checked in this order.
    _XML_TAGS = ("issuerName", "issuerTradingSymbol", "rptOwnerName",
                 "officerTitle")

    # Compiled once instead of re-compiling per scanned line.
    # NOTE: the dot in "index\.htm" is escaped; the original unescaped
    # "." would have matched any character in that position.
    _INDEX_LINK_RE = re.compile(r"Archives/edgar/data/.*index\.htm")
    _XML_LINK_RE = re.compile(r"Archives/edgar/data/[/0-9]+/[a-zA-Z0-9\-_]+\.xml")

    def __init__(self, page):
        """Sets up a parser pointed at an SEC EDGAR index page URL."""
        self.SEC_URL = page

    def retrieveIndex(self, url):
        """Opens the index resource, be it from the SEC or locally.

        Returns the open url resource; scanIndex() closes it.
        """
        print("Using index from: " + url)
        edgar_resource = urlopen(url)
        print('Retrieved webpage')
        return edgar_resource

    def scanIndex(self, file):
        """Scans the index page for links to 4/A filing pages.

        `file` is a readable resource; it is closed here after scanning.
        Returns a list of absolute URLs to the individual filing pages.
        """
        listings = 0
        good_lines = []
        for line in file.readlines():
            found = self._INDEX_LINK_RE.search(line)
            if found:
                listings += 1
                if VERBOSE:
                    print("String match: " + found.group(0))
                good_lines.append("http://sec.gov/" + found.group(0))
        print(str(listings) + " listings scanned")
        file.close()
        return good_lines

    def xmlLinksFromFilings(self, link_array):
        """Pulls links to the XML documents from the individual filing
        pages; duplicates are reported (when VERBOSE) and skipped.

        Each opened page is closed before moving to the next one.
        """
        links_to_xml = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        scanned_resources = 0
        for link in link_array:
            resource = urlopen(link)
            if VERBOSE:
                print("Opened resource: " + link)
            for line in resource.readlines():
                xml_link = self._XML_LINK_RE.search(line)
                if not xml_link:
                    continue
                if VERBOSE:
                    print("XML Link found: " + xml_link.group(0))
                link_string = "http://sec.gov/" + xml_link.group(0)
                if link_string in seen:
                    if VERBOSE:
                        print("XML link already in array")
                else:
                    seen.add(link_string)
                    links_to_xml.append(link_string)
            resource.close()
            scanned_resources += 1
            print("%3d Resources scanned so far" % scanned_resources)
        return links_to_xml

    def processXMLLinks(self, links_array):
        """Retrieves each XML document and runs it through parseXML().

        Returns a list of per-filing field lists (array of arrays).
        """
        full_info_from_listings = []
        for link in links_array:
            if VERBOSE:
                print("Parsing: " + link)
            resource = urlopen(link)
            try:
                full_info_from_listings.append(self.parseXML(resource))
            finally:
                # Fix: the original never closed this handle (leak).
                resource.close()
        return full_info_from_listings

    def parseXML(self, text):
        """Reads through `text` for lines containing the tags in
        _XML_TAGS and returns the matching lines, stripped, in order.

        A line containing several tags is appended once per tag, which
        matches the original chain of independent `if` tests.
        """
        file_data = []
        for line in text.readlines():
            for tag in self._XML_TAGS:
                if tag in line:
                    file_data.append(line.strip())
        return file_data

    def xmlOutput(self, aoa):
        """Takes an array of arrays of EDGAR data and prints each entry
        in a readable fashion, one field per line. Returns None."""
        for entry in aoa:
            print("\n".join(entry))
if __name__ == '__main__':
    VERBOSE = False
    # EDGAR "current events" query: the 100 most recent Form 4/A filings.
    url = ('http://sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=4%2Fa'
           '&owner=include&count=100&action=getcurrent')
    # Set up the parser pointed at the EDGAR index page.
    parsy = SECParser(url)
    # Open the index resource.
    index = parsy.retrieveIndex(parsy.SEC_URL)
    # Pull links to individual filing listings from the index page.
    form4_links = parsy.scanIndex(index)
    # Pull links to the XML versions of filings from the HTML pages.
    xml_links = parsy.xmlLinksFromFilings(form4_links)
    # Fix: xmlOutput() prints its output and returns None, so the
    # original `print parsy.xmlOutput(...)` printed a stray "None".
    parsy.xmlOutput(parsy.processXMLLinks(xml_links))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment