Skip to content

Instantly share code, notes, and snippets.

@cstrouse
Last active December 17, 2016 23:04
Show Gist options
  • Save cstrouse/ea546d3764454a364ca1 to your computer and use it in GitHub Desktop.
Save cstrouse/ea546d3764454a364ca1 to your computer and use it in GitHub Desktop.
FDA advisory item scraper
import requests
import re
import jinja2
import shutil
import os
from bs4 import BeautifulSoup
# Scrape the FDA import-alert index, fetch each alert's detail page and
# render one static HTML file per alert into OUTPUT_DIR.
BASE_URL = "http://www.accessdata.fda.gov/cms_ia/"
OUTPUT_DIR = "build"
# Seconds to wait for the server. requests applies no timeout by default and
# would otherwise block forever on a stalled connection.
REQUEST_TIMEOUT = 30

env = jinja2.Environment(loader=jinja2.FileSystemLoader("tpl"))


def _fetch_soup(url):
    """Download *url* and return its body parsed by BeautifulSoup (lxml).

    Raises requests.HTTPError on a 4xx/5xx response so that an error page is
    never parsed as if it were alert data.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "lxml")


# Rebuild the output directory from scratch on every run.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

# Loop-invariant: load the template once instead of once per alert.
template = env.get_template("ia_items.tpl")

index_soup = _fetch_soup(BASE_URL + "iapublishdate.html")
alerts = index_soup.find_all("tr", {"class": re.compile("row.*")})

for alert in alerts:
    cols = alert.find_all("td")
    link = cols[3].find("a", href=True) if len(cols) >= 4 else None
    alert_number = cols[0].find(text=True) if cols else None
    if link is None or alert_number is None:
        # Without four cells, an alert number and a detail link the row
        # cannot be processed; skip it rather than die on IndexError/TypeError.
        print("Skipped index row without an alert number or detail link")
        continue

    alert_type = cols[1].find(text=True)
    publish_date = cols[2].find(text=True)
    alert_name = cols[3].find(text=True)
    detail_url = BASE_URL + link["href"]

    # Use a separate name so the index document is not shadowed.
    detail_soup = _fetch_soup(detail_url)
    details = detail_soup.find_all("div", {"class": "iabody"})
    if len(details) < 5:
        # details[1]..details[4] are read below.
        print("Skipped advisory #{:s}: expected 5 iabody sections, found {:d}"
              .format(alert_number, len(details)))
        continue

    vendors = detail_soup.find_all(
        "div", {"class": re.compile("textbody_level[1-3]")})
    # Tag.decode() returns the markup as unicode text on both Python 2 and
    # Python 3, replacing the Python 2-only str(tag).decode("utf8").
    vendors_clean = "".join(tag.decode() for tag in vendors)

    advisory_items = {
        "alert_number": alert_number,
        "alert_type": alert_type,
        "publish_date": publish_date,
        "alert_name": alert_name,
        "detail_url": detail_url,
        "reason": details[1].text,
        "guidance": details[2].text,
        "product_description": details[3].text,
        "charge": details[4].text,
        "vendors": vendors_clean,
    }

    # ia_items.tpl loops over a sequence named `advisory_items`; passing only
    # `a` left that name undefined and produced an empty page. `a` is still
    # supplied for any template that reads it directly.
    html = template.render(a=advisory_items, advisory_items=[advisory_items])

    out_path = os.path.join(OUTPUT_DIR, "{:s}.html".format(alert_number))
    # Binary mode plus explicit UTF-8 encoding behaves the same on Python 2
    # and 3; the context manager closes the file even if the write fails.
    with open(out_path, "wb") as out_file:
        out_file.write(html.encode("utf-8"))

    print("Processed advisory #{:s}".format(alert_number))
{# Renders one block per import alert in `advisory_items`. Scraped text fields
   are escaped with `e` so stray "<" or "&" cannot break the page; `vendors`
   is pre-built HTML and is emitted as markup. #}
{% for a in advisory_items %}
<div class="ia_item">
  <p><b>Import Alert #</b> {{ a["alert_number"]|e }}</p>
  <p><b>Publish Date:</b> {{ a["publish_date"]|e }}</p>
  <p><b>Import Alert Name:</b> {{ a["alert_name"]|e }}</p>
  <p><b>Reason for Alert:</b> {{ a["reason"]|e }}</p>
  <p><b>Guidance:</b> {{ a["guidance"]|e }}</p>
  <p><b>Product Description:</b> {{ a["product_description"]|e }}</p>
  <p><b>Charge:</b> {{ a["charge"]|e }}</p>
  <div class="ia_item_vendors">{{ a["vendors"]|safe }}</div>
</div>
{% endfor %}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment