Created
July 18, 2014 14:11
-
-
Save malev/f3f60ef64de8fd652191 to your computer and use it in GitHub Desktop.
Turbot scraper example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'json' | |
require 'mechanize' | |
require 'pdf-reader' | |
require 'turbotlib' | |
# Page that PDFGetter fetches; the links inside its "li.mediafilelist" items
# are the documents the scraper downloads. Frozen so the constant cannot be
# mutated in place.
SOURCE_URL = "http://www.cityofchicago.org/city/en/depts/doit/supp_info/list_of_contractors.html".freeze
# Collects the URLs of the files linked from the "li.mediafilelist" items of
# the page at SOURCE_URL.
class PDFGetter
  # Convenience entry point: builds an instance (which fetches the page) and
  # returns its links.
  #
  # @return [Array<String>] absolute URLs, in document order
  def self.call
    new.call
  end

  # Fetches SOURCE_URL with Mechanize and keeps the page's parser so that
  # #call can run CSS queries against it.
  def initialize
    agent = Mechanize.new
    @doc = agent.get(SOURCE_URL).parser
  end

  # @return [String] scheme and host prepended to site-relative hrefs
  def host
    "http://www.cityofchicago.org"
  end

  # @return [Array<String>] one absolute URL per matching anchor that has an
  #   href; anchors without an href are skipped
  def call
    @doc.css("li.mediafilelist a").map do |element|
      absolute_url(element.attributes["href"])
    end.compact
  end

  private

  # Turns an href attribute into an absolute URL.
  #
  # @param href_attribute [#text, nil] attribute node, or nil when the anchor
  #   has no href
  # @return [String, nil] nil when there is no href
  def absolute_url(href_attribute)
    return nil unless href_attribute

    href = href_attribute.text
    # An href that is already absolute must not get the host glued in front
    # of it ("http://hosthttp://...").
    href =~ %r{\Ahttps?://}i ? href : host + href
  end
end
# Parses the text of one PDF page that is laid out as a fixed-width table with
# "VENDOR NAME" and "VENDOR ADDRESS" column headers, and renders every table
# row as one line of JSON.
class DataPage
  attr_reader :text, :structure, :pdf_url

  # @param text [String] extracted text of a single page
  # @param pdf_url [String] URL the page was read from; emitted as source_url
  def initialize(text, pdf_url)
    @text = text
    @pdf_url = pdf_url
    @structure = generate_structure
  end

  # Serialises the page's rows.
  #
  # @return [String] one JSON object per row, each followed by "\n"; the
  #   empty string when the page has no rows
  def call
    sampled_at = Time.now
    data.map do |vendor_name, vendor_address|
      datum = {
        vendor_name: vendor_name,
        vendor_address: vendor_address,
        source_url: pdf_url,
        sample_date: sampled_at
      }
      JSON.dump(datum) + "\n"
    end.join
  end

  # Cuts every data line at the column offsets found in the header line.
  #
  # @return [Array<Array(String, String)>] [vendor_name, vendor_address]
  #   pairs, whitespace-trimmed; [] when the page has no header line
  def data
    return [] unless structure

    _header_index, name_start, address_start = structure
    data_lines.each_with_object([]) do |line, rows|
      vendor_name = line[name_start...address_start]
      vendor_address = line[address_start..-1]
      # A slice is nil when the line is too short to reach the column;
      # such lines are not table rows.
      rows << [vendor_name.strip, vendor_address.strip] if vendor_name && vendor_address
    end
  end

  # Lines between the header line and the last line of the page (both
  # excluded), without blank lines.
  #
  # @return [Array<String>] [] when the page has no header line
  def data_lines
    return [] unless structure

    body = text.split("\n")[(structure.first + 1)..-2] || []
    # Only trailing whitespace is removed: the column offsets were measured
    # on the unstripped header line, so leading whitespace must be kept for
    # the slices in #data to line up.
    body.map { |line| line.rstrip }.reject { |line| line.empty? }
  end

  # Locates the header line and its column offsets.
  #
  # @return [Array(Integer, Integer, Integer), nil] [index of the header
  #   line, offset of "VENDOR NAME", offset of "VENDOR ADDRESS"], or nil when
  #   no line of the page is a header line
  def generate_structure
    text.split("\n").each_with_index do |line, index|
      next unless vendors_line?(line)

      return [index, vendor_name_position(line), vendor_address_position(line)]
    end
    # Without this, the method would return the array of lines that
    # each_with_index evaluates to.
    nil
  end

  # @return [Integer, nil] offset of "VENDOR NAME" in line, nil if absent
  def vendor_name_position(line)
    line =~ /VENDOR NAME/
  end

  # @return [Integer, nil] offset of "VENDOR ADDRESS" in line, nil if absent
  def vendor_address_position(line)
    line =~ /VENDOR ADDRESS/
  end

  # @return [Integer, nil] truthy when line holds both column headers,
  #   separated only by non-word characters
  def vendors_line?(line)
    line =~ /VENDOR NAME\W+VENDOR ADDRESS/
  end

  # @return [Integer, nil] truthy when the stripped line contains "Page",
  #   one non-word character and a number
  def page_line?(line)
    line.strip =~ /Page\W\d+/
  end
end
# Entry point: download every document listed by PDFGetter and print the rows
# of each page as JSON lines on stdout. A document that fails is logged and
# skipped; the remaining documents are still processed.
Turbotlib.log("Starting scrape...")

PDFGetter.call.each do |pdf_url|
  begin
    # URI#open (from open-uri) instead of Kernel#open: the URL comes from a
    # scraped page, and Kernel#open would run a command for a string starting
    # with "|". The block form also closes the download when done.
    URI.parse(pdf_url).open do |io|
      PDF::Reader.new(io).pages.each do |page|
        output = DataPage.new(page.text, pdf_url).call
        # Pages without rows produce "", which puts would print as a blank line.
        puts output unless output.empty?
      end
    end
  rescue => e
    Turbotlib.log("#{pdf_url} has failed!!")
    Turbotlib.log("#{e.class}: #{e.message}")
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment