Created
May 2, 2013 19:26
-
-
Save kennym/5504712 to your computer and use it in GitHub Desktop.
Craigslist scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'airbrake' | |
require 'debugger' | |
module Craigslist
  # Scrapes Craigslist category listing pages and stores the postings it
  # finds as CraigslistEntry records.
  module Scraper
    # Pagination limit: stay shallow in development/test, go deeper otherwise.
    MAX_DEPTH = (Rails.env.development? || Rails.env.test?) ? 1 : 5

    # Kicks off a scrape run for +category+: creates a fresh Result record
    # and walks the category's listing pages inside a single transaction.
    def self.scrape_category(category)
      agent = Mechanize.new do |settings|
        settings.user_agent_alias = "Linux Firefox"
        settings.read_timeout = 1200
      end
      ActiveRecord::Base.transaction do
        result = category.results.new
        result.save!
        scrape(category.get_full_url, result, agent, category)
      end
    end

    # Scrapes one listing page at +url+, creating CraigslistEntry records
    # attached to +result+, then recurses into the "next page" link until
    # MAX_DEPTH pages have been visited.
    #
    # url      - listing page URL to fetch
    # result   - Result record new entries are attached to
    # agent    - shared Mechanize agent
    # category - category being scraped (used for dedup and the base URL)
    # depth    - current recursion depth, 0-based
    def self.scrape(url, result, agent, category, depth = 0)
      base_url = category.get_full_url
      return unless depth < MAX_DEPTH

      Rails.logger.debug("Scraping page #{depth + 1}")
      page = agent.get(url)
      page.search(".//p[@class='row']/a").each do |link|
        # Parse posting URL and id. (Renamed from `url` — the original
        # shadowed the method parameter.)
        posting_url = link.xpath("@href").text
        posting_id = link["href"].split("/")[-1].split(".html")[0]

        # If there already is an existing craigslist entry for this URL
        # and category, skip it.
        next unless CraigslistEntry.where(:url => posting_url,
                                          :category_id => category).empty?

        agent.get(posting_url)

        # Handle flagged or removed postings.
        next if agent.page.body.include?("This posting has been flagged for removal.")
        next if agent.page.body.include?("This posting has been deleted by its author.")

        reply_line = agent.page.search(".//a[@href[contains(.,'mailto')]]").text

        result.craigslist_entries.create(
          :posting_id => posting_id,
          :text => link.text,
          :url => posting_url,
          :email => reply_line,
          :published => parse_posting_date(agent.page, posting_url),
          :category_id => category.id
        )
        Rails.logger.debug("Created new craigslist entry #{posting_url}")
      end

      next_page = page.search(".//p[@id='nextpage']//a[@href[contains(.,'index')]]")
      unless next_page.empty?
        scrape(base_url + next_page.xpath("@href").text, result, agent, category, depth + 1)
      end
    end

    # Extracts the posting date from a posting page. Reports to Airbrake and
    # falls back to Time.now when no date markup is present; when the date
    # text cannot be parsed, reports and returns the raw (comma-stripped)
    # string, matching the original behavior.
    def self.parse_posting_date(page, url)
      raw =
        if !page.search(".//div[@class[contains(.,'postingdate')]]/time").empty?
          page.search(".//div[@class[contains(.,'postingdate')]]/time").text
        elsif !page.search(".//p[@class[contains(.,'postinginfo')]]/date").empty?
          page.search(".//p[@class[contains(.,'postinginfo')]]/date").text
        end

      # BUG FIX: the original called .split(",") on the date BEFORE its nil
      # check, so a missing date raised NoMethodError and the Time.now
      # fallback was unreachable. Check for nil first.
      if raw.nil?
        Airbrake.notify(Exception, {:error_message => "Date not found for #{url}"})
        return Time.now
      end

      normalized = raw.split(",").join(" ")
      begin
        Date.parse(normalized)
      rescue => ex
        Airbrake.notify(ex, {:error_message => "Date: #{normalized}"})
        normalized
      end
    end
    private_class_method :parse_posting_date

    # Scrapes craigslist.org's site directory for every US city.
    #
    # Returns three values:
    #   us_cities         - { state name => [[city name, subdomain], ...] }
    #   cities_subdomains - { subdomain => city name }
    #   city_districts    - { subdomain => [{"name" => ..., "path" => ...}] }
    #
    # NOTE(review): this fetches every city's homepage to discover district
    # sublinks, so it issues one HTTP request per city.
    def self.get_us_cities
      agent = Mechanize.new
      page = agent.get("http://www.craigslist.org/about/sites/")
      cities_subdomains = {}
      city_districts = {}
      us_cities = {}
      page.parser.css(".colmask").first.css("ul").each do |list|
        # The element preceding each <ul> holds the state name.
        state_name = list.previous_element.text
        us_cities[state_name] = []
        list.css("a").each do |city|
          subdomain = city["href"].split("//")[1].split(".")[0]
          name = city.text.strip
          us_cities[state_name] << [name, subdomain]
          cities_subdomains[subdomain] = name
          city_page = agent.get("http://#{subdomain}.craigslist.org")
          next if city_page.search(".sublinks").empty?
          city_districts[subdomain] = []
          city_page.search(".sublinks").css("a").each do |district_link|
            city_districts[subdomain] << {"name" => district_link["title"],
                                          "path" => district_link["href"]}
          end
        end
      end
      return us_cities, cities_subdomains, city_districts
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment