Last active
December 17, 2016 23:04
-
-
Save cstrouse/ea546d3764454a364ca1 to your computer and use it in GitHub Desktop.
FDA advisory item scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import jinja2 | |
import shutil | |
import os | |
from bs4 import BeautifulSoup | |
# FDA import-alert scraper.
#
# Downloads the import-alert index page, follows the link in every alert row
# to its detail page, and renders one HTML file per alert into OUTPUT_DIR
# using the "ia_items.tpl" template found in the "tpl" directory.
# OUTPUT_DIR is deleted and recreated on every run.
#
# The code below avoids Python 2-only constructs so it runs unchanged on
# Python 2 and Python 3.

BASE_URL = "http://www.accessdata.fda.gov/cms_ia/"
OUTPUT_DIR = "build"
# requests waits indefinitely unless a timeout is given explicitly.
REQUEST_TIMEOUT = 30

# Compiled once instead of on every loop iteration.
ROW_CLASS = re.compile("row.*")
VENDOR_CLASS = re.compile("textbody_level[1-3]")


def _fetch_soup(url):
    """Download *url* and return its body parsed with the lxml parser.

    Raises a ``requests.RequestException`` subclass on network failure or on
    a 4xx/5xx response, so an error page is never parsed as if it were data.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


def _cell_text(cell):
    """Return the first text node inside *cell*, stripped ("" if it has none)."""
    text = cell.find(text=True)
    return text.strip() if text is not None else ""


env = jinja2.Environment(loader=jinja2.FileSystemLoader("tpl"))
# Loaded before the old output is removed: a missing template then fails the
# run without destroying the previous build.
template = env.get_template("ia_items.tpl")

if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

index_page = _fetch_soup(BASE_URL + "iapublishdate.html")

for alert in index_page.find_all("tr", {"class": ROW_CLASS}):
    cols = alert.find_all("td")
    link = cols[3].find("a", href=True) if len(cols) > 3 else None
    if link is None:
        # Not an alert row (too few cells or no detail link): nothing to scrape.
        print("Skipped an index row without an alert link")
        continue

    alert_number = _cell_text(cols[0])
    detail_url = BASE_URL + link["href"]

    try:
        detail_page = _fetch_soup(detail_url)
    except requests.RequestException as exc:
        print("Skipped advisory #{:s}: {}".format(alert_number, exc))
        continue

    details = detail_page.find_all("div", {"class": "iabody"})
    if len(details) < 5:
        # Indices 1-4 are read below; a shorter list means an unexpected layout.
        print("Skipped advisory #{:s}: unexpected detail page layout".format(alert_number))
        continue

    vendors = detail_page.find_all("div", {"class": VENDOR_CLASS})

    advisory_item = {
        "alert_number": alert_number,
        "alert_type": _cell_text(cols[1]),
        "publish_date": _cell_text(cols[2]),
        "alert_name": _cell_text(cols[3]),
        "detail_url": detail_url,
        "reason": details[1].text,
        "guidance": details[2].text,
        "product_description": details[3].text,
        "charge": details[4].text,
        # Tag.decode() yields the tag's markup as unicode text on both
        # Python 2 and 3 (str(tag) followed by .decode() only works on 2).
        "vendors": "".join(tag.decode() for tag in vendors),
    }

    # The template iterates over `advisory_items`, so the single alert is
    # passed under that name as a one-element list; passing it as `a`
    # leaves the loop with nothing to iterate and produces an empty page.
    html = template.render(advisory_items=[advisory_item])

    out_path = os.path.join(OUTPUT_DIR, "{:s}.html".format(alert_number))
    # Binary mode + explicit encode behaves identically on Python 2 and 3.
    with open(out_path, "wb") as out_file:
        out_file.write(html.encode("utf-8"))

    print("Processed advisory #{:s}".format(alert_number))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{# Renders one "ia_item" block for every entry in `advisory_items`.
   Each entry is looked up by the keys used below: alert_number,
   publish_date, alert_name, reason, guidance, product_description,
   charge and vendors.
   NOTE(review): `vendors` is placed inside its own div and looks like it is
   meant to carry markup; whether it is escaped depends on the environment's
   autoescape setting - confirm. -#}
{% for a in advisory_items %}
<div class="ia_item">
  <p><b>Import Alert #</b> {{ a["alert_number"] }}</p>
  <p><b>Publish Date:</b> {{ a["publish_date"] }}</p>
  <p><b>Import Alert Name:</b> {{ a["alert_name"] }}</p>
  <p><b>Reason for Alert:</b> {{ a["reason"] }}</p>
  <p><b>Guidance:</b> {{ a["guidance"] }}</p>
  <p><b>Product Description:</b> {{ a["product_description"] }}</p>
  <p><b>Charge:</b> {{ a["charge"] }}</p>
  <div class="ia_item_vendors">{{ a["vendors"] }}</div>
</div>
{% endfor %}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment