Created April 20, 2011 13:33
First pass at scraping the FAPIIS site, just to see if it's possible.
""" | |
First pass at scraping the FAPIIS site, just to see if it's possible. | |
This seems to work, but it's impossible to know whether the data it's | |
returning will be accurate since FAPIIS doesn't currently contain any | |
data. | |
This method requires knowing the company's DUNS number, though it's | |
likely possible to back this up a step to allow for searching by name. | |
""" | |
import urllib | |
import urllib2 | |
import lxml.html | |
def get_summary_page(duns): | |
url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do' | |
data = {'org.apache.struts.taglib.html.TOKEN': '', | |
'action': 'showReportsSummary', | |
'rctrID': duns, # DUNS number | |
'rctrName': '', | |
'nameOption': '', | |
'searchctrName': 'This can really be anything', | |
'searchduns': '', | |
'cageCode': '', | |
'sequence': '', | |
'inputsequence': '', | |
} | |
req = urllib2.Request(url, data=urllib.urlencode(data)) | |
response = urllib2.urlopen(req) | |
return response.read() | |
def parse_summary_page(page): | |
doc = lxml.html.fromstring(page) | |
try: | |
table = doc.cssselect('#listdata')[0] | |
except IndexError: | |
return | |
for row in table.cssselect('tr')[2:]: | |
cells = row.cssselect('td') | |
try: | |
link = row.cssselect('input')[0].attrib['onclick'].strip() | |
report_type = cells[1].text_content().strip() | |
count = cells[2].text_content().strip() | |
except IndexError: | |
continue | |
print link | |
print report_type | |
print count | |
def _main(): | |
duns = 192835515 | |
page = get_summary_page(duns) | |
parse_summary_page(page) | |
if __name__ == '__main__': | |
_main() |
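
As the docstring notes, it should be possible to back the search up a step and look companies up by name rather than by DUNS number. A speculative sketch is below, reusing the same form fields; get_summary_page_by_name is hypothetical, and the values passed for 'rctrName', 'nameOption' and 'searchctrName' are guesses that would need to be checked against the live search form.

def get_summary_page_by_name(name):
    # Hypothetical name-based search: fills rctrName instead of rctrID.
    # Field values are unverified guesses based on the form fields above.
    url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do'
    data = {'org.apache.struts.taglib.html.TOKEN': '',
            'action': 'showReportsSummary',
            'rctrID': '',
            'rctrName': name,
            'nameOption': '',  # inspect the search form for the real option values
            'searchctrName': name,
            'searchduns': '',
            'cageCode': '',
            'sequence': '',
            'inputsequence': '',
            }
    req = urllib2.Request(url, data=urllib.urlencode(data))
    return urllib2.urlopen(req).read()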
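
Separately, urllib2 exists only in Python 2. If this were ported, the request step maps onto urllib.request and urllib.parse roughly as in the sketch below, assuming the endpoint and form fields are unchanged (which is not verified here).

import urllib.parse
import urllib.request


def get_summary_page_py3(duns):
    # Python 3 equivalent of the request step; same endpoint and form fields as above.
    url = 'https://www.fapiis.gov/fapiis/fapiis/govt/adversereportsearch.do'
    data = {'org.apache.struts.taglib.html.TOKEN': '',
            'action': 'showReportsSummary',
            'rctrID': str(duns),
            'rctrName': '',
            'nameOption': '',
            'searchctrName': 'This can really be anything',
            'searchduns': '',
            'cageCode': '',
            'sequence': '',
            'inputsequence': '',
            }
    body = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, data=body)
    with urllib.request.urlopen(req) as response:
        return response.read()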