Skip to content

Instantly share code, notes, and snippets.

@headquarters
Last active January 27, 2020 13:11
Show Gist options
  • Select an option

  • Save headquarters/d7c183628cd55dad148fcb24911813db to your computer and use it in GitHub Desktop.

Select an option

Save headquarters/d7c183628cd55dad148fcb24911813db to your computer and use it in GitHub Desktop.
Collect City of Raleigh Police Department vehicle crash data.
require 'bundler/setup'
require 'date'
require 'csv'
require 'nokogiri'
require "net/http"
require "uri"
require 'work_queue'
# Output CSV and the Raleigh PD crash-report endpoint.
CSV_FILENAME = "crash_data.csv"
URL = URI.parse("http://crash.raleighpd.org/default.php")

# Date range to scrape. Crash reports are at least 3 days old.
start_date = Date.new(2010, 1, 1)
end_date   = Date.new(2014, 12, 31)

# Column order for rows written to the output CSV.
headers = %w[report_number report_url date driver_names street_names]
# Append-mode log for scrape failures (shared by all worker threads).
log = File.open("log.txt", "a+")

# Write the CSV header row once, on the first run only; later runs
# append data rows to the existing file.
unless File.exist?(CSV_FILENAME)
  CSV.open(CSV_FILENAME, 'ab') do |csv_writer|
    csv_writer << headers
  end
end

# NOTE(review): removed a dead `csv = CSV.read(CSV_FILENAME)` here —
# the local was never used and re-read the whole file for nothing.

# Queue up to 10 worker threads with 20 tasks each
wq = WorkQueue.new 10, 20
# Guards the shared CSV file and failure log across worker threads.
# Without it, concurrent appends from the 10 workers can interleave
# and corrupt rows.
io_mutex = Mutex.new

(start_date..end_date).each do |date|
  puts "Queueing #{date}..."
  wq.enqueue_b do
    begin
      # The search form expects MM/DD/YY (strftime "%D").
      form_data = { :date => date.strftime("%D") }
      response = Net::HTTP::post_form(URL, form_data)
      doc = Nokogiri::HTML(response.body)

      # Result-table rows; drop(1) skips the header row and safely
      # yields [] when the table is missing or empty (the original
      # slice(1, size) returned nil for an empty set and crashed on
      # the subsequent .each).
      table_rows = doc.css("#mainContent table tr").drop(1)
      table_rows.each do |row|
        cells = row.css("td")
        link = cells[0].css("a").first
        csv_row = [
          link.content,     # report_number
          link["href"],     # report_url
          date,
          cells[2].content, # driver_names
          cells[3].content  # street_names
        ]
        io_mutex.synchronize do
          CSV.open(CSV_FILENAME, 'ab') { |csv_writer| csv_writer << csv_row }
        end
      end
      puts "Done fetching data for #{date}."
      # Throttle to avoid burning up their server.
      sleep(0.2)
    rescue => e
      # Log and keep going; one bad day must not kill the whole scrape.
      # (The original's `ensure; next` silently discarded exceptions
      # raised after the fetch, so parse failures were never logged.)
      io_mutex.synchronize do
        log.write("Failed to access or parse the response for #{date}. #{e.message} \n")
      end
    end
  end
end
# Block until every queued fetch job has drained from the work queue.
wq.join
puts "Done fetching data."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment