malev · July 18, 2014 14:11
diff --git a/scraper.rb b/scraper.rb
 require 'open-uri'
 require 'json'
 require 'mechanize'
 require 'pdf-reader'
 require 'turbotlib'


 # Page fetched by PDFGetter; the `li.mediafilelist a` links found on it are
 # the documents that get downloaded and parsed. Frozen so the shared constant
 # cannot be mutated in place.
 SOURCE_URL = "http://www.cityofchicago.org/city/en/depts/doit/supp_info/list_of_contractors.html".freeze


 # Collects the URLs of the files linked from the page at SOURCE_URL.
 #
 # Usage: PDFGetter.call # => array of URL strings
 class PDFGetter
   # Builds a getter (which fetches the page) and returns its link list.
   #
   # @return [Array<String>] see #call
   def self.call
     new.call
   end

   # Fetches SOURCE_URL with Mechanize and keeps the page's parser object,
   # which #call queries with a CSS selector.
   def initialize
     agent = Mechanize.new
     @doc = agent.get(SOURCE_URL).parser
   end

   # @return [String] scheme and host prepended to site-relative links
   def host
     "http://www.cityofchicago.org"
   end

   # Returns one URL per `li.mediafilelist a` anchor in the document.
   #
   # Anchors with a missing or empty href are skipped instead of raising, and
   # hrefs that are already absolute are returned untouched rather than being
   # glued onto the host.
   #
   # @return [Array<String>]
   def call
     hrefs = @doc.css("li.mediafilelist a").map { |link| link["href"] }
     hrefs.compact.reject(&:empty?).map do |href|
       href.start_with?("http://", "https://") ? href : host + href
     end
   end
 end

 # Extracts vendor rows from the text of one PDF page and serialises them as
 # newline-delimited JSON.
 #
 # The page is expected to contain a header line matching
 # "VENDOR NAME ... VENDOR ADDRESS". The character offsets of the two headings
 # on that line are used as column boundaries for the lines that follow it.
 class DataPage
   attr_reader :text, :structure, :pdf_url

   # @param text [String] text of a single page
   # @param pdf_url [String] URL recorded as source_url on every row
   def initialize(text, pdf_url)
     @text = text
     @structure = generate_structure
     @pdf_url = pdf_url
   end

   # Serialises every vendor row of the page.
   #
   # @return [String] one JSON document per row, each terminated by "\n";
   #   an empty string when the page yields no rows
   def call
     json_lines = data.map do |name, address|
       record = {
         vendor_name: name,
         vendor_address: address,
         source_url: pdf_url,
         sample_date: Time.now
       }
       JSON.dump(record) + "\n"
     end
     json_lines.join
   end

   # Slices each data line at the header-derived column offsets.
   #
   # Lines too short to reach the address column are skipped.
   #
   # @return [Array<Array(String, String)>] [vendor_name, vendor_address] pairs
   def data
     return [] unless structure

     _header_index, name_column, address_column = structure
     data_lines.each_with_object([]) do |line, rows|
       name = line[name_column...address_column]
       address = line[address_column..-1]
       rows << [name.strip, address.strip] if name && address
     end
   end

   # Non-blank lines between the header line and the last line of the page
   # (the last line is always dropped).
   #
   # Only trailing whitespace is removed: the column offsets were measured on
   # the untrimmed header line, so leading whitespace has to stay in place
   # for the slices in #data to line up.
   #
   # @return [Array<String>]
   def data_lines
     return [] unless structure

     body = text.split("\n")[(structure[0] + 1)..-2] || []
     body.map(&:rstrip).reject(&:empty?)
   end

   # Locates the header line.
   #
   # @return [Array(Integer, Integer, Integer), nil] header line index, offset
   #   of "VENDOR NAME" and offset of "VENDOR ADDRESS" on that line; nil when
   #   the page has no header line
   def generate_structure
     text.split("\n").each_with_index do |line, index|
       next unless vendors_line?(line)

       return [index, vendor_name_position(line), vendor_address_position(line)]
     end
     nil
   end

   # @return [Integer, nil] offset of "VENDOR NAME" in +line+
   def vendor_name_position(line)
     line =~ /VENDOR NAME/
   end

   # @return [Integer, nil] offset of "VENDOR ADDRESS" in +line+
   def vendor_address_position(line)
     line =~ /VENDOR ADDRESS/
   end

   # @return [Integer, nil] truthy when +line+ holds both column headings
   def vendors_line?(line)
     line =~ /VENDOR NAME\W+VENDOR ADDRESS/
   end

   # @return [Integer, nil] truthy when +line+ contains "Page", one non-word
   #   character and a digit
   def page_line?(line)
     line.strip =~ /Page\W\d+/
   end
 end

 Turbotlib.log("Starting scrape...")

 # Download every document listed by PDFGetter and print the vendor rows of
 # each page to stdout. A failure is logged and the scrape carries on with the
 # next page or document.
 PDFGetter.call.each do |pdf_url|
   begin
     # URI#open (provided by open-uri) is used instead of Kernel#open: the URL
     # comes from a scraped page, and Kernel#open would run a command for a
     # string starting with "|". The block form closes the download when done.
     URI.parse(pdf_url).open do |io|
       PDF::Reader.new(io).pages.each_with_index do |page, index|
         begin
           rows = DataPage.new(page.text, pdf_url).call
           # `puts ""` would emit a blank line into the JSON-lines output.
           puts rows unless rows.empty?
         rescue StandardError => e
           # Keep going: one unparsable page should not drop the pages after it.
           Turbotlib.log("#{pdf_url} page #{index + 1} has failed!! (#{e.class}: #{e.message})")
         end
       end
     end
   rescue StandardError => e
     Turbotlib.log("#{pdf_url} has failed!! (#{e.class}: #{e.message})")
   end
 end
	require 'open-uri'
	require 'json'
	require 'mechanize'
	require 'pdf-reader'
	require 'turbotlib'


	# Page fetched by PDFGetter; the `li.mediafilelist a` links found on it are
	# the documents that get downloaded and parsed. Frozen so the shared constant
	# cannot be mutated in place.
	SOURCE_URL = "http://www.cityofchicago.org/city/en/depts/doit/supp_info/list_of_contractors.html".freeze


	# Collects the URLs of the files linked from the page at SOURCE_URL.
	#
	# Usage: PDFGetter.call # => array of URL strings
	class PDFGetter
	  # Builds a getter (which fetches the page) and returns its link list.
	  #
	  # @return [Array<String>] see #call
	  def self.call
	    new.call
	  end

	  # Fetches SOURCE_URL with Mechanize and keeps the page's parser object,
	  # which #call queries with a CSS selector.
	  def initialize
	    agent = Mechanize.new
	    @doc = agent.get(SOURCE_URL).parser
	  end

	  # @return [String] scheme and host prepended to site-relative links
	  def host
	    "http://www.cityofchicago.org"
	  end

	  # Returns one URL per `li.mediafilelist a` anchor in the document.
	  #
	  # Anchors with a missing or empty href are skipped instead of raising, and
	  # hrefs that are already absolute are returned untouched rather than being
	  # glued onto the host.
	  #
	  # @return [Array<String>]
	  def call
	    hrefs = @doc.css("li.mediafilelist a").map { |link| link["href"] }
	    hrefs.compact.reject(&:empty?).map do |href|
	      href.start_with?("http://", "https://") ? href : host + href
	    end
	  end
	end

	# Extracts vendor rows from the text of one PDF page and serialises them as
	# newline-delimited JSON.
	#
	# The page is expected to contain a header line matching
	# "VENDOR NAME ... VENDOR ADDRESS". The character offsets of the two headings
	# on that line are used as column boundaries for the lines that follow it.
	class DataPage
	  attr_reader :text, :structure, :pdf_url

	  # @param text [String] text of a single page
	  # @param pdf_url [String] URL recorded as source_url on every row
	  def initialize(text, pdf_url)
	    @text = text
	    @structure = generate_structure
	    @pdf_url = pdf_url
	  end

	  # Serialises every vendor row of the page.
	  #
	  # @return [String] one JSON document per row, each terminated by "\n";
	  #   an empty string when the page yields no rows
	  def call
	    json_lines = data.map do |name, address|
	      record = {
	        vendor_name: name,
	        vendor_address: address,
	        source_url: pdf_url,
	        sample_date: Time.now
	      }
	      JSON.dump(record) + "\n"
	    end
	    json_lines.join
	  end

	  # Slices each data line at the header-derived column offsets.
	  #
	  # Lines too short to reach the address column are skipped.
	  #
	  # @return [Array<Array(String, String)>] [vendor_name, vendor_address] pairs
	  def data
	    return [] unless structure

	    _header_index, name_column, address_column = structure
	    data_lines.each_with_object([]) do |line, rows|
	      name = line[name_column...address_column]
	      address = line[address_column..-1]
	      rows << [name.strip, address.strip] if name && address
	    end
	  end

	  # Non-blank lines between the header line and the last line of the page
	  # (the last line is always dropped).
	  #
	  # Only trailing whitespace is removed: the column offsets were measured on
	  # the untrimmed header line, so leading whitespace has to stay in place
	  # for the slices in #data to line up.
	  #
	  # @return [Array<String>]
	  def data_lines
	    return [] unless structure

	    body = text.split("\n")[(structure[0] + 1)..-2] || []
	    body.map(&:rstrip).reject(&:empty?)
	  end

	  # Locates the header line.
	  #
	  # @return [Array(Integer, Integer, Integer), nil] header line index, offset
	  #   of "VENDOR NAME" and offset of "VENDOR ADDRESS" on that line; nil when
	  #   the page has no header line
	  def generate_structure
	    text.split("\n").each_with_index do |line, index|
	      next unless vendors_line?(line)

	      return [index, vendor_name_position(line), vendor_address_position(line)]
	    end
	    nil
	  end

	  # @return [Integer, nil] offset of "VENDOR NAME" in +line+
	  def vendor_name_position(line)
	    line =~ /VENDOR NAME/
	  end

	  # @return [Integer, nil] offset of "VENDOR ADDRESS" in +line+
	  def vendor_address_position(line)
	    line =~ /VENDOR ADDRESS/
	  end

	  # @return [Integer, nil] truthy when +line+ holds both column headings
	  def vendors_line?(line)
	    line =~ /VENDOR NAME\W+VENDOR ADDRESS/
	  end

	  # @return [Integer, nil] truthy when +line+ contains "Page", one non-word
	  #   character and a digit
	  def page_line?(line)
	    line.strip =~ /Page\W\d+/
	  end
	end

	Turbotlib.log("Starting scrape...")

	# Download every document listed by PDFGetter and print the vendor rows of
	# each page to stdout. A failure is logged and the scrape carries on with the
	# next page or document.
	PDFGetter.call.each do |pdf_url|
	  begin
	    # URI#open (provided by open-uri) is used instead of Kernel#open: the URL
	    # comes from a scraped page, and Kernel#open would run a command for a
	    # string starting with "|". The block form closes the download when done.
	    URI.parse(pdf_url).open do |io|
	      PDF::Reader.new(io).pages.each_with_index do |page, index|
	        begin
	          rows = DataPage.new(page.text, pdf_url).call
	          # `puts ""` would emit a blank line into the JSON-lines output.
	          puts rows unless rows.empty?
	        rescue StandardError => e
	          # Keep going: one unparsable page should not drop the pages after it.
	          Turbotlib.log("#{pdf_url} page #{index + 1} has failed!! (#{e.class}: #{e.message})")
	        end
	      end
	    end
	  rescue StandardError => e
	    Turbotlib.log("#{pdf_url} has failed!! (#{e.class}: #{e.message})")
	  end
	end