Last active
January 27, 2020 13:11
-
-
Save headquarters/d7c183628cd55dad148fcb24911813db to your computer and use it in GitHub Desktop.
Collect City of Raleigh Police Department vehicle crash data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect City of Raleigh Police Department vehicle crash data.
#
# For every date in the configured range the script POSTs the date to the
# crash-report page, scrapes the result table, and appends one row per report
# to CSV_FILENAME. Failures are appended to LOG_FILENAME and the run continues
# with the next date.
require 'bundler/setup'
require 'date'
require 'csv'
require 'nokogiri'
require "net/http"
require "uri"
require 'work_queue'

CSV_FILENAME = "crash_data.csv"
LOG_FILENAME = "log.txt"
URL = URI.parse("http://crash.raleighpd.org/default.php")

CSV_HEADERS = [
  "report_number",
  "report_url",
  "date",
  "driver_names",
  "street_names"
].freeze

# Pause after each successful request so the remote server is not hammered.
REQUEST_PAUSE_SECONDS = 0.2

# Crash reports are at least 3 days old
start_date = Date.new(2010, 1, 1)
end_date = Date.new(2014, 12, 31)

# Builds the CSV rows for one day's result page.
#
# doc  - parsed Nokogiri document of the response body.
# date - the Date that was queried; stored verbatim in each row.
#
# Returns an Array of rows ordered like CSV_HEADERS. Rows that lack a report
# link or the driver/street cells are skipped instead of aborting the day.
def crash_rows(doc, date)
  rows = []
  # drop(1) removes the header row. Unlike NodeSet#slice(1, size) it always
  # yields a list, even when the selector matched nothing (slice appears to
  # return nil in that case, which used to blow up on `.each`).
  doc.css("#mainContent table tr").drop(1).each do |row|
    cells = row.css("td")
    next if cells.length < 4

    link = cells[0].css("a").first
    next if link.nil?

    rows << [link.content, link["href"], date, cells[2].content, cells[3].content]
  end
  rows
end

# Start a fresh CSV with a header line; an existing file is appended to as-is.
unless File.exist?(CSV_FILENAME)
  CSV.open(CSV_FILENAME, 'ab') do |csv_writer|
    csv_writer << CSV_HEADERS
  end
end

# Serializes writes to the CSV and the log, which are shared by all workers.
write_lock = Mutex.new

File.open(LOG_FILENAME, "a+") do |log|
  # Up to 10 worker threads. NOTE(review): per the work_queue README the second
  # argument caps the number of queued tasks (enqueue_b blocks when the queue
  # is full) rather than "tasks per thread" -- confirm for the installed gem.
  wq = WorkQueue.new 10, 20

  (start_date..end_date).each do |date|
    puts "Queueing #{date}..."
    wq.enqueue_b do
      begin
        # %D formats the date as mm/dd/yy.
        response = Net::HTTP.post_form(URL, :date => date.strftime("%D"))
        rows = crash_rows(Nokogiri::HTML(response.body), date)

        # One open per day (not per row), guarded so rows from different
        # threads cannot interleave.
        unless rows.empty?
          write_lock.synchronize do
            CSV.open(CSV_FILENAME, 'ab') do |csv_writer|
              rows.each { |csv_row| csv_writer << csv_row }
            end
          end
        end

        puts "Done fetching data for #{date}."
        sleep(REQUEST_PAUSE_SECONDS)
      rescue => e
        # Keep going with the remaining dates, but record every failure,
        # including errors raised while scraping or writing.
        write_lock.synchronize do
          log.write("Failed to access or parse the response for #{date}. #{e.message} \n")
          log.flush
        end
      end
    end
  end

  # Wait for all queued tasks before the log file is closed.
  wq.join
end

puts "Done fetching data."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment