Skip to content

Instantly share code, notes, and snippets.

@csessig86
Created June 29, 2012 21:00
Show Gist options
  • Save csessig86/3020607 to your computer and use it in GitHub Desktop.
Save csessig86/3020607 to your computer and use it in GitHub Desktop.
This Python scraper pulls information off a PDF after it's been converted to HTML. The PDF is a weekly arrest log provided by the Waterloo Police Department. The information is then put into a CSV file.
# We will be using the Python library Beautiful Soup
# To scrape the information
import urllib2
from bs4 import BeautifulSoup
import re
# Note: This arrest log is available at:
# http://chrisessig.com/arrestlog.PDF
# It was taken from the Waterloo Police Department's website:
# http://www.waterloopolice.com/images/arrestlog.PDF
# Open the output CSV that the scraped arrest records are written to.
f = open('crime_data.csv', 'w')
# Column names for the header row, in the order the columns are written.
# There is room for up to ten charges per person.
header_fields = [
    "last_name",
    "first_name",
    "age",
    "address",
    "city",
    "arrest_date",
    "location",
    "charge_one",
    "charge_two",
    "charge_three",
    "charge_four",
    "charge_five",
    "charge_six",
    "charge_seven",
    "charge_eight",
    "charge_nine",
    "charge_ten",
]
f.write(",".join(header_fields) + "\n")
# The PDF has to be converted to HTML with PDFtoHTML before it can be scraped.
# For Mac users, go to the command line and type: brew install pdftohtml
# Command to convert the PDF pages to HTML pages: pdftohtml -c arrestlog.pdf
# URL of the arrest log index, which includes links to several pages
# with arrest records on them.
url = 'file:///Users/Essig/Desktop/crime_map/arrestlog_ind.html'
page = urllib2.urlopen(url)
# BeautifulSoup reads the whole document while building the tree, so the
# handle can be released as soon as parsing finishes (or fails).
# NOTE(review): no parser is named here, so bs4 chooses one itself -- confirm
# the result is the same on every machine this runs on.
try:
    soup = BeautifulSoup(page)
finally:
    page.close()
# Folder holding the HTML pages that PDFtoHTML produced.
PAGE_DIR = 'file:///Users/Essig/Desktop/crime_map/'

# Every row is written with exactly this many charge columns.
MAX_CHARGES = 10
# Placeholder written to a charge column the person has no charge for.
NO_CHARGE = 'x'
# A div with exactly this text comes before each person's list of charges.
CHARGE_HEADER = 'Charge Description'

# Matches text that has "Name: " on its first line.
NAME_REGEX = re.compile(r'.*Name: *')
# Matches text that is exactly two digits. Ages are assumed to be the only
# field that looks like this.
AGE_REGEX = re.compile(r'\d{2}$')
# Matches the format of the arrest dates: 00/00/2012
ARREST_DATE_REGEX = re.compile(r'\d{2}/\d{2}/\d{4}$')
# Matches a style attribute containing ';left:167', optionally preceded by
# lowercase letters and digits. The charge text is assumed to be the only
# content positioned at that left offset.
CHARGES_STYLE_REGEX = re.compile(r'[a-z]*\d*\;left:167')


def _csv_safe(value):
    """Return *value* with commas replaced so it stays in one CSV column."""
    return value.replace(",", " -")


def _split_charges(charges_div):
    """Group a page's flat list of charge texts into one list per person.

    charges_div looks like:
        'Charge Description', 'SALE/MFG: MARIJUANA', 'DRUG STAMP VIOL',
        'Charge Description', 'SALE/MFG: MARIJUANA', ...
    Each 'Charge Description' starts a new person. Everything up to the next
    one (or the end of the list) is that person's charges. Anything before
    the first 'Charge Description' is ignored.

    Returns a list with one entry per person. Each entry is a list of exactly
    MAX_CHARGES strings, padded with NO_CHARGE. A person with more than
    MAX_CHARGES charges triggers a printed warning and keeps only the first
    MAX_CHARGES, so the rows of later people stay lined up.
    """
    starts = [index for index, text in enumerate(charges_div)
              if text == CHARGE_HEADER]
    # The end of the list acts as the theoretical next 'Charge Description',
    # which closes off the last person on the page.
    ends = starts[1:] + [len(charges_div)]

    per_person = []
    for start, end in zip(starts, ends):
        charges = charges_div[start + 1:end]
        if len(charges) > MAX_CHARGES:
            print("WARNING: LOTS OF CHARGES FOR ONE PERSON. ADD MORE OPTIONS!!!!")
            charges = charges[:MAX_CHARGES]
        padding = [NO_CHARGE] * (MAX_CHARGES - len(charges))
        per_person.append(charges + padding)
    return per_person


def _page_rows(new_soup):
    """Return the CSV fields for every arrest record on one parsed page.

    Each row is a list of strings: name, age, address, city, arrest date,
    location and MAX_CHARGES charge columns.

    Raises ValueError if any field was found fewer times than names were,
    because the fields could then no longer be matched up by position.
    """
    names = []
    ages = []
    addresses = []
    for div in new_soup.find_all('div'):
        text = div.get_text()
        if NAME_REGEX.match(text):
            # "Name: " is stripped later, when the row is assembled.
            names.append(text)
        if AGE_REGEX.match(text):
            ages.append(text)
            # The address is in the next div after the div with the age.
            addresses.append(div.find_next('div').get_text())

    # City is in the div after the div with the text 'Age:'.
    city = [div.find_next('div').get_text()
            for div in new_soup.find_all('div', text="Age:")]

    # Dates are in the div before each "WATERLOO POLICE DEPARTMENT" div.
    # Only text that actually looks like a date is kept.
    arrest_date = []
    for div in new_soup.find_all('div', text="WATERLOO POLICE DEPARTMENT"):
        previous_text = div.find_previous('div').get_text()
        if ARREST_DATE_REGEX.match(previous_text):
            arrest_date.append(previous_text)

    # Location is in the div after the div with the text 'Arresting Agency:'.
    location = [div.find_next('div').get_text()
                for div in new_soup.find_all('div', text="Arresting Agency:")]

    # Charges are the trickiest to pull, since people can have several.
    # The 'Charge Description' entries are kept in this list so the charges
    # can be divided up by person.
    charges_div = [tag.get_text()
                   for tag in new_soup.find_all(attrs={'style': CHARGES_STYLE_REGEX})]
    charges = _split_charges(charges_div)

    # The fields are matched up purely by position, so each one must have
    # been found at least as many times as there are names.
    columns = [
        ('ages', ages),
        ('addresses', addresses),
        ('city', city),
        ('arrest_date', arrest_date),
        ('location', location),
        ('charges', charges),
    ]
    short = [label for label, values in columns if len(values) < len(names)]
    if short:
        raise ValueError(
            "page has %d names but fewer entries for: %s"
            % (len(names), ", ".join(short)))

    rows = []
    for name, age, address, town, date, place, person_charges in zip(
            names, ages, addresses, city, arrest_date, location, charges):
        # Funky symbols in locations will screw up Google, so '&' and '/'
        # are spelled out as 'and'.
        place = _csv_safe(place).replace("&", "and").replace("/", " and ")
        # NOTE(review): the name keeps its commas, unlike every other field.
        # Presumably a "Last, First" value is meant to fill both the
        # last_name and first_name columns -- confirm against the data.
        row = [
            name.replace("Name: ", ""),
            _csv_safe(age),
            _csv_safe(address),
            _csv_safe(town),
            _csv_safe(date),
            place + " Waterloo IA",
        ]
        row.extend(_csv_safe(charge) for charge in person_charges)
        rows.append(row)
    return rows


try:
    # PDFtoHTML gives us several pages of arrests, depending on how many
    # people were arrested. Go through each record page linked from the index.
    for a in soup.find_all('a', href=True):
        # Create the URL for this page and pass the page to Beautiful Soup.
        new_url = PAGE_DIR + a['href']
        new_page = urllib2.urlopen(new_url)
        try:
            new_soup = BeautifulSoup(new_page)
        finally:
            new_page.close()
        # Each page gets its own set of records; write them to the CSV.
        for row in _page_rows(new_soup):
            f.write(",".join(row) + "\n")
finally:
    # Always a good idea to close, even when a page could not be scraped.
    f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment