Created March 21, 2014 16:11
A scraper for dead coal miners. Requires a virtualenv with requests and beautifulsoup4 installed. Run it like this: ./coal.py
#!/usr/bin/env python
import json
from bs4 import BeautifulSoup
import requests

BASE_URL = "http://bpldb.bplonline.org/db/formProc/coalmine?firstname=&search_lastname_parameter=starts&lastname=&title=coalmine&mine=&race=&search_firstname_parameter=starts&year=&report=&cause=&occupation=&atnum=%s"
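# atnum is the offset of the first result on a page; the database serves 30 miners per page.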
def get_count():
    """
    Get the number of miners -- thus the number of pages.
    Returns a single integer which represents the total number of miners.
    """
    r = requests.get(BASE_URL % 0)
    soup = BeautifulSoup(r.content, 'html.parser')
    count = soup.select('#dbResultsCount')[0].text
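    # The element's text reads "Viewing results 1 to 30 of <total>"; split off the trailing total.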
    return int(count.split('Viewing results 1 to 30 of ')[1])
def calculate_pages(count):
    """
    Calculate the number of pages based on the number of miners.
    We know there are 30 per page.
    The URL paginates based on miner number.
    Returns a list of (page number, starting miner number) tuples.
    """
    pages = []
    current_page = 0
    # Start at miner 0 so the first page of results (atnum=0) is included.
    miner_count = 0
    while (miner_count < count):
        pages.append((current_page, miner_count))
        current_page += 1
        miner_count += 30
    return pages
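# For illustration (hypothetical count): calculate_pages(65) returns
# [(0, 0), (1, 30), (2, 60)] -- three pages starting at miners 0, 30 and 60.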
def request_page(miner_count):
    """
    Grabs the page from the Web site.
    Returns a string of HTML.
    """
    r = requests.get(BASE_URL % miner_count)
    return r.content
def parse_page(html):
    """
    Parses the page's HTML to a list of miners.
    The list of miners contains a dictionary for each one.
    Returns a list of dictionaries.
    """
    soup = BeautifulSoup(html, 'html.parser')
    miners = soup.select('div.footer_inner_element')
    miner_list = []
    for miner in miners:
        miner_dict = {}
        miner_dict['name'] = miner.select('div.spacing_five')[0].contents[2].strip().encode('utf-8')
        miner_dict['mine'] = miner.select('div.spacing_five')[1].contents[2].strip().encode('utf-8')
        miner_dict['cause_of_death'] = miner.select('div.spacing_five')[2].contents[2].strip().encode('utf-8')
        miner_dict['occupation'] = miner.select('div.spacing_five')[3].contents[2].strip().encode('utf-8')
        miner_dict['race'] = miner.select('div.spacing_five')[4].contents[2].strip().encode('utf-8')
        miner_dict['report_year'] = miner.select('div.spacing_five')[5].contents[2].strip().encode('utf-8')
        miner_dict['date_string'] = miner.select('div.spacing_five')[6].contents[2].strip().encode('utf-8')
        miner_dict['notes'] = miner.select('div.spacing_five')[7].contents[2].strip().encode('utf-8')
        miner_list.append(miner_dict)
    return miner_list
def write_json(miner_list):
    """
    Writes a JSON file based on a list.
    """
    with open('miners.json', 'wb') as writefile:
        writefile.write(json.dumps(miner_list))
def main():
    """
    What are we going to do here?
    1. Get the count.
    2. Figure out the pages we need to request.
    3. Request those pages.
    4. Parse each page.
    5. Make a long list of miners.
    6. Write the list of miners to JSON.
    7. Profit??
    """
    count = get_count()
    print "Starting up!"
    print "Downloading %s miners." % count
    pages = calculate_pages(count)
    miner_list = []
    for page, miner_count in pages:
        print "Getting page %s." % (int(page) + 1)
        html = request_page(miner_count)
        miners = parse_page(html)
        miner_list = miner_list + miners
    print "Writing %s miners to JSON." % len(miner_list)
    write_json(miner_list)
if __name__ == "__main__":
    main()
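The script writes a single JSON array with one object per miner. A minimal sketch for inspecting the output after a run, assuming coal.py has already produced miners.json in the working directory:

import json
# Read back the file written by write_json().
with open('miners.json') as readfile:
    miners = json.load(readfile)
print "Loaded %s miners." % len(miners)
# Each record carries the fields scraped in parse_page(), e.g. name and cause_of_death.
print miners[0]['name'], miners[0]['cause_of_death']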