Created
June 29, 2012 21:00
-
-
Save csessig86/3020607 to your computer and use it in GitHub Desktop.
This Python scraper pulls information off a PDF after it has been converted to HTML. The PDF is a weekly arrest log provided by the Waterloo Police Department. The information is then written to a CSV file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We will be using the Python library Beautiful Soup
# to scrape the information
import io
import re
import urllib2

from bs4 import BeautifulSoup
# Note: This arrest log is available at:
# http://chrisessig.com/arrestlog.PDF
# It was taken from the Waterloo Police Department's website:
# http://www.waterloopolice.com/images/arrestlog.PDF
#
# Use PDFtoHTML to convert the PDF into HTML before running this script.
# For Mac users, go to command line and type: brew install pdftohtml
# Command to convert PDF pages to HTML pages: pdftohtml -c arrestlog.pdf

# Folder holding the HTML pages produced by pdftohtml
BASE_URL = 'file:///Users/Essig/Desktop/crime_map/'
# The arrest log index, which links to the pages with arrest records on them
INDEX_URL = BASE_URL + 'arrestlog_ind.html'
# CSV where we'll save our data
OUTPUT_PATH = 'crime_data.csv'

# We're accounting for up to 10 charges per person
MAX_CHARGES = 10
# Placeholder written to a charge column the person has no charge for
NO_CHARGE = u'x'
# Text of the div that precedes each person's list of charges
CHARGE_MARKER = 'Charge Description'

# Note: names are scraped as "LAST, FIRST" and their comma is left alone,
# which is what fills the separate last_name / first_name columns.
CSV_COLUMNS = [
    "last_name", "first_name", "age", "address", "city", "arrest_date",
    "location", "charge_one", "charge_two", "charge_three", "charge_four",
    "charge_five", "charge_six", "charge_seven", "charge_eight",
    "charge_nine", "charge_ten",
]

# "Name: " is included in the divs containing names of those arrested.
# Basic format: "Name: Bob Smith"
NAME_REGEX = re.compile('.*Name: *')
# Exactly two digits; the script relies on ages being the only such field
AGE_REGEX = re.compile(r'\d{2}$')
# Format of the arrest dates: 00/00/2012
ARREST_DATE_REGEX = re.compile(r'\d{2}/\d{2}/\d{4}$')
# All charges have style 'left:167' (how far to the left they are indented),
# preceded by any number of letters and digits
CHARGES_CSS_REGEX = re.compile(r'[a-z]*\d*\;left:167')


def group_charges(charges_div, max_charges=MAX_CHARGES):
    """Split a page's flat list of charge texts into one list per person.

    ``charges_div`` looks like::

        ['Charge Description', 'SALE/MFG: MARIJUANA', 'DRUG STAMP VIOL',
         'Charge Description', 'SALE/MFG: MARIJUANA', ...]

    Each 'Charge Description' entry starts a new person; everything up to
    the next marker (or the end of the list) is that person's charges.
    Entries before the first marker are ignored.

    Returns a list with one entry per marker. Every entry is a list of
    exactly ``max_charges`` strings: the person's charges in order, padded
    with 'x'. A person with more than ``max_charges`` charges triggers a
    printed warning and keeps only the first ``max_charges``, so the rows
    of everyone after them stay aligned.
    """
    starts = [i for i, text in enumerate(charges_div) if text == CHARGE_MARKER]
    # The end of the list acts as the theoretical next marker, which closes
    # off the last person's charges.
    ends = starts[1:] + [len(charges_div)]

    grouped = []
    for start, end in zip(starts, ends):
        charges = list(charges_div[start + 1:end])
        if len(charges) > max_charges:
            print("WARNING: LOTS OF CHARGES FOR ONE PERSON. ADD MORE OPTIONS!!!!")
            charges = charges[:max_charges]
        charges.extend([NO_CHARGE] * (max_charges - len(charges)))
        grouped.append(charges)
    return grouped


def clean_field(value):
    """Replace commas with ' -' so the value can't break the CSV columns."""
    return value.replace(",", " -")


def format_row(name, age, address, city, arrest_date, location, charges):
    """Build one CSV line (including the trailing newline) for one arrest.

    ``name`` is the raw div text; its "Name: " label is removed and its
    comma is kept on purpose (see CSV_COLUMNS). ``charges`` is the list of
    charge strings for this person, already padded to the column count.
    """
    # Funky symbols in addresses will screw up Google, so '&' and '/'
    # become the word 'and'.
    street = clean_field(location).replace("&", "and").replace("/", " and ")
    fields = [
        name.replace("Name: ", ""),
        clean_field(age),
        clean_field(address),
        clean_field(city),
        clean_field(arrest_date),
        street + " Waterloo IA",
    ]
    fields.extend(clean_field(charge) for charge in charges)
    return u",".join(fields) + u"\n"


def load_soup(url):
    """Open ``url``, parse it with Beautiful Soup and close the handle."""
    page = urllib2.urlopen(url)
    try:
        return BeautifulSoup(page)
    finally:
        page.close()


def extract_page_rows(page_soup):
    """Return the CSV lines for every arrest found on one record page."""
    divs = page_soup.find_all('div')

    # Divs containing "Name: "; the label itself is stripped in format_row
    names = [div.get_text() for div in divs if NAME_REGEX.match(div.get_text())]

    # Ages, and the addresses held in the div right after each age
    age_divs = [div for div in divs if AGE_REGEX.match(div.get_text())]
    ages = [div.get_text() for div in age_divs]
    addresses = [div.find_next('div').get_text() for div in age_divs]

    # City is the div after the div with the word 'Age:' in it
    cities = [div.find_next('div').get_text()
              for div in page_soup.find_all('div', text="Age:")]

    # Dates are in the divs previous to divs containing
    # "WATERLOO POLICE DEPARTMENT"; keep only those shaped like a date
    arrest_dates = []
    for div in page_soup.find_all('div', text="WATERLOO POLICE DEPARTMENT"):
        previous_text = div.find_previous('div').get_text()
        if ARREST_DATE_REGEX.match(previous_text):
            arrest_dates.append(previous_text)

    # Location is the div after the div with 'Arresting Agency:' in it
    locations = [div.find_next('div').get_text()
                 for div in page_soup.find_all('div', text="Arresting Agency:")]

    # Charges are the trickiest to pull: several people have multiple
    # charges, so the 'Charge Description' markers are kept in this list
    # and used to divide the charges up by person.
    charges_div = [tag.get_text()
                   for tag in page_soup.find_all(attrs={'style': CHARGES_CSS_REGEX})]
    charges = group_charges(charges_div)

    rows = []
    # Index by position rather than zip() so a page whose field lists
    # don't line up fails loudly instead of silently dropping arrests.
    for x in range(len(names)):
        rows.append(format_row(names[x], ages[x], addresses[x], cities[x],
                               arrest_dates[x], locations[x], charges[x]))
    return rows


def main():
    """Scrape every record page linked from the index into the CSV file."""
    # 'with' closes the file even if scraping fails part-way; UTF-8 keeps
    # non-ASCII characters in the scraped text from aborting the write.
    with io.open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        # Add headers
        f.write(u",".join(CSV_COLUMNS) + u"\n")

        # PDFtoHTML gives us several pages of arrests, depending on how
        # many people were arrested, so go through each linked page.
        index_soup = load_soup(INDEX_URL)
        for a in index_soup.find_all('a', href=True):
            page_soup = load_soup(BASE_URL + a['href'])
            for row in extract_page_rows(page_soup):
                f.write(row)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment