Skip to content

Instantly share code, notes, and snippets.

@malev
Created July 18, 2014 14:11
Show Gist options
  • Save malev/f3f60ef64de8fd652191 to your computer and use it in GitHub Desktop.
Save malev/f3f60ef64de8fd652191 to your computer and use it in GitHub Desktop.
Turbot scraper example
require 'open-uri'
require 'json'
require 'mechanize'
require 'pdf-reader'
require 'turbotlib'
# Page that is scraped for links to the PDF files (fetched by PDFGetter).
SOURCE_URL = 'http://www.cityofchicago.org/city/en/depts/doit/supp_info/list_of_contractors.html'
# Collects the URLs of the files linked from the SOURCE_URL page.
class PDFGetter
  # Convenience entry point: builds a getter and returns the result of #call.
  #
  # @return [Array<String>] see #call
  def self.call
    new.call
  end

  # Fetches SOURCE_URL with a fresh Mechanize agent and keeps the page's
  # parser object so that #call can query it with CSS selectors.
  def initialize
    page = Mechanize.new.get(SOURCE_URL)
    @doc = page.parser
  end

  # Prefix prepended to every href found on the page.
  #
  # @return [String]
  def host
    "http://www.cityofchicago.org"
  end

  # Builds one URL per anchor matched by the "li.mediafilelist a" selector.
  #
  # @return [Array<String>] host followed by the anchor's href text
  def call
    anchors = @doc.css("li.mediafilelist a")
    anchors.map do |anchor|
      href = anchor.attributes["href"].text
      "#{host}#{href}"
    end
  end
end
# Parses the text of one PDF page that is laid out as a fixed-width table with
# "VENDOR NAME" and "VENDOR ADDRESS" column headers, and renders every table
# row as a line of JSON.
class DataPage
  attr_reader :text, :structure, :pdf_url

  # @param text [String] text of a single page
  # @param pdf_url [String] URL the page was read from; copied into each record
  def initialize(text, pdf_url)
    @text = text
    @structure = generate_structure
    @pdf_url = pdf_url
  end

  # Serializes every vendor row of the page.
  #
  # @return [String] one JSON object per row, each terminated by "\n";
  #   "" when the page has no header line or no usable rows
  def call
    data.map do |vendor_name, vendor_address|
      datum = {
        vendor_name: vendor_name,
        vendor_address: vendor_address,
        source_url: pdf_url,
        sample_date: Time.now
      }
      JSON.dump(datum) + "\n"
    end.join
  end

  # Cuts each data line at the column offsets recorded in #structure.
  #
  # @return [Array<Array(String, String)>] [name, address] pairs, both stripped.
  #   Lines too short to reach the address column are skipped.
  def data
    return [] unless structure

    name_start = structure[1]
    address_start = structure[2]
    data_lines.each_with_object([]) do |line, rows|
      vendor_name = line[name_start...address_start]
      vendor_address = line[address_start..-1]
      next unless vendor_name && vendor_address

      rows << [vendor_name.strip, vendor_address.strip]
    end
  end

  # Non-blank lines between the header line and the last line of the page.
  # The last line is always dropped (presumably a page footer - TODO confirm).
  #
  # Lines keep their leading whitespace: the column offsets in #structure are
  # measured on the unstripped header line, so stripping the left side of a
  # row would shift its columns.
  #
  # @return [Array<String>] right-stripped lines; [] when there is no header
  def data_lines
    return [] unless structure

    candidates = text.split("\n")[(structure[0] + 1)..-2] || []
    candidates.each_with_object([]) do |line, lines|
      trimmed = line.rstrip
      lines << trimmed unless trimmed.empty?
    end
  end

  # Locates the header line and the start offsets of its two columns.
  #
  # @return [Array(Integer, Integer, Integer), nil]
  #   [header line index, name column offset, address column offset],
  #   or nil when no line of the page matches the header pattern
  def generate_structure
    text.split("\n").each_with_index do |line, index|
      next unless vendors_line?(line)

      return [
        index,
        vendor_name_position(line),
        vendor_address_position(line)
      ]
    end
    # Without this the method would return the array of lines that
    # each_with_index evaluates to.
    nil
  end

  # @return [Integer, nil] offset of "VENDOR NAME" in line, nil if absent
  def vendor_name_position(line)
    line =~ /VENDOR NAME/
  end

  # @return [Integer, nil] offset of "VENDOR ADDRESS" in line, nil if absent
  def vendor_address_position(line)
    line =~ /VENDOR ADDRESS/
  end

  # @return [Integer, nil] truthy when line holds both column headers,
  #   separated only by non-word characters
  def vendors_line?(line)
    line =~ /VENDOR NAME\W+VENDOR ADDRESS/
  end

  # @return [Integer, nil] truthy when the stripped line contains "Page",
  #   one non-word character and at least one digit
  def page_line?(line)
    line.strip =~ /Page\W\d+/
  end
end
Turbotlib.log("Starting scrape...")

# Download every PDF returned by PDFGetter and print the JSON lines DataPage
# produces for each of its pages. A failure while handling one PDF is logged
# and the loop continues with the next URL.
PDFGetter.call.each do |pdf_url|
  begin
    # URI.open comes from open-uri (Ruby 2.5+). Kernel#open stopped fetching
    # URLs in Ruby 3.0, which made every download fail inside the rescue below.
    # The block form closes the downloaded IO once the PDF has been processed.
    URI.open(pdf_url) do |io|
      reader = PDF::Reader.new(io)
      reader.pages.each do |page|
        puts DataPage.new(page.text, pdf_url).call
      end
    end
  rescue StandardError => e
    # Keep the cause in the log so a failed PDF can be diagnosed.
    Turbotlib.log("#{pdf_url} has failed!! (#{e.class}: #{e.message})")
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment