@jeremyjbowers
Created March 21, 2014 16:11
A scraper for dead coal miners. Requires a virtualenv with requests and beautifulsoup4 installed. Run it like this: ./coal.py
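A minimal setup sketch, assuming virtualenv and pip are available: create and activate a virtualenv, run pip install requests beautifulsoup4, and chmod +x coal.py so the shebang line can do its job.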
#!/usr/bin/env python
import json
from bs4 import BeautifulSoup
import requests
BASE_URL = "http://bpldb.bplonline.org/db/formProc/coalmine?firstname=&search_lastname_parameter=starts&lastname=&title=coalmine&mine=&race=&search_firstname_parameter=starts&year=&report=&cause=&occupation=&atnum=%s"
def get_count():
    """
    Get the number of miners -- and thus the number of pages.
    Returns a single integer: the total number of miners.
    """
    r = requests.get(BASE_URL % 0)
    soup = BeautifulSoup(r.content, 'html.parser')
    count = soup.select('#dbResultsCount')[0].text
    return int(count.split('Viewing results 1 to 30 of ')[1])
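# For example, if the results header read "Viewing results 1 to 30 of 4500"
# (a hypothetical total, not the live count), get_count() would return 4500.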
def calculate_pages(count):
    """
    Calculate the number of pages based on the number of miners.
    We know there are 30 per page, and the URL paginates by miner number.
    Returns a list of (page_number, miner_number) tuples.
    """
    pages = []
    current_page = 0
    # Start at 0 so the first page (atnum=0, results 1 to 30) is included.
    miner_count = 0
    while miner_count < count:
        pages.append((current_page, miner_count))
        current_page += 1
        miner_count += 30
    return pages
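# Worked example: with count = 95, the loop yields
# [(0, 0), (1, 30), (2, 60), (3, 90)], i.e. atnum offsets covering
# results 1-30, 31-60, 61-90 and 91-95.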
def request_page(miner_count):
    """
    Grabs the page from the Web site.
    Returns a string of HTML.
    """
    r = requests.get(BASE_URL % miner_count)
    return r.content
def parse_page(html):
    """
    Parses the page's HTML into a list of miners,
    with a dictionary for each one.
    Returns a list of dictionaries.
    """
    soup = BeautifulSoup(html, 'html.parser')
    miners = soup.select('div.footer_inner_element')
    miner_list = []
    fields = ['name', 'mine', 'cause_of_death', 'occupation',
              'race', 'report_year', 'date_string', 'notes']
    for miner in miners:
        cells = miner.select('div.spacing_five')
        miner_dict = {}
        for index, field in enumerate(fields):
            miner_dict[field] = cells[index].contents[2].strip().encode('utf-8')
        miner_list.append(miner_dict)
    return miner_list
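# Each dictionary mirrors the eight labeled rows in a result card. An
# illustrative record (field values are hypothetical, not from the site):
# {'name': 'John Smith', 'mine': 'Pratt No. 1', 'cause_of_death': 'Roof fall',
#  'occupation': 'Miner', 'race': 'W', 'report_year': '1912',
#  'date_string': 'January 3, 1912', 'notes': ''}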
def write_json(miner_list):
    """
    Writes a JSON file based on a list.
    """
    with open('miners.json', 'wb') as writefile:
        writefile.write(json.dumps(miner_list))
def main():
    """
    What are we going to do here?
    1. Get the count.
    2. Figure out the pages we need to request.
    3. Request those pages.
    4. Parse each page.
    5. Make a long list of miners.
    6. Write the list of miners to JSON.
    7. Profit??
    """
    count = get_count()
    print "Starting up!"
    print "Downloading %s miners." % count
    pages = calculate_pages(count)
    miner_list = []
    for page, miner_count in pages:
        print "Getting page %s." % (int(page) + 1)
        html = request_page(miner_count)
        miners = parse_page(html)
        miner_list = miner_list + miners
    print "Writing %s miners to JSON." % len(miner_list)
    write_json(miner_list)
if __name__ == "__main__":
    main()
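A minimal sketch of consuming the output, assuming the scrape finished and miners.json sits in the working directory (Python 2, to match the script):

import json

with open('miners.json') as readfile:
    miners = json.load(readfile)

print "Loaded %s miners." % len(miners)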