Skip to content

Instantly share code, notes, and snippets.

@kennym
Created May 2, 2013 19:26
Show Gist options
  • Save kennym/5504712 to your computer and use it in GitHub Desktop.
Save kennym/5504712 to your computer and use it in GitHub Desktop.
Craigslist scraper
require 'airbrake'
require 'debugger'
module Craigslist
module Scraper
# Maximum number of listing pages that scrape follows for one category:
# 1 in the development and test environments, 5 otherwise.
MAX_DEPTH = (Rails.env.development? || Rails.env.test?) ? 1 : 5
# Scrapes the listing pages of +category+ into a newly saved result.
#
# A Mechanize agent is set up with the "Linux Firefox" user agent alias and a
# read_timeout of 1200. Inside one ActiveRecord::Base.transaction block a
# result is built from category.results, saved with save!, and handed to
# scrape together with category.get_full_url as the starting URL.
#
# category - object responding to #results and #get_full_url.
#
# Returns the value of the ActiveRecord::Base.transaction call.
def self.scrape_category(category)
  browser = Mechanize.new do |config|
    config.user_agent_alias = "Linux Firefox"
    config.read_timeout = 1200
  end

  ActiveRecord::Base.transaction do
    fresh_result = category.results.new
    fresh_result.save!
    start_url = category.get_full_url
    scrape(start_url, fresh_result, browser, category)
  end
end
# Scrapes one listing page of a category and recurses into the next page.
#
# Every posting link on the page (".//p[@class='row']/a") is skipped when a
# CraigslistEntry with the same URL and category id already exists, or when
# the posting page says it was flagged or deleted. Any other posting is
# fetched and stored through result.craigslist_entries.create. If the listing
# has a "next page" link the method calls itself with depth + 1; once depth
# reaches MAX_DEPTH nothing is fetched any more.
#
# url      - String URL of the listing page to fetch.
# result   - record whose craigslist_entries association receives the entries.
# agent    - Mechanize agent; #get must return a page responding to #search
#            and #body.
# category - category record; its #id and #get_full_url are used.
# depth    - number of listing pages already followed (default 0).
#
# Returns nil.
def self.scrape(url, result, agent, category, depth=0)
  return unless depth < MAX_DEPTH

  Rails.logger.debug("Scraping page #{depth + 1}")
  listing = agent.get(url)

  listing.search(".//p[@class='row']/a").each do |link|
    # Parse Posting URL. A dedicated local keeps the +url+ parameter intact.
    posting_url = link["href"].to_s
    next if posting_url.empty?

    posting_id = posting_url.split("/").last.to_s.split(".html").first

    # If there already is a craigslist entry with this URL in the category,
    # then skip it. The id is passed (not the record) to match the create
    # call below.
    known = CraigslistEntry.where(:url => posting_url,
                                  :category_id => category.id)
    next unless known.empty?

    posting = agent.get(posting_url)

    # Handle flagged or removed postings
    body = posting.body
    next if body.include?("This posting has been flagged for removal.")
    next if body.include?("This posting has been deleted by its author.")

    reply_line = posting.search(".//a[@href[contains(.,'mailto')]]").text

    result.craigslist_entries.create(
      :posting_id => posting_id,
      :text => link.text,
      :url => posting_url,
      :email => reply_line,
      :published => extract_posting_date(posting, posting_url),
      :category_id => category.id
    )
    Rails.logger.debug("Created new craigslist entry #{posting_url}")
  end

  next_page = listing.search(".//p[@id='nextpage']//a[@href[contains(.,'index')]]")
  return if next_page.empty?

  # The "next" href is appended to the category's base URL.
  scrape(category.get_full_url + next_page.xpath("@href").text,
         result, agent, category, depth + 1)
end

# Internal helper for scrape: reads the posting date from a posting page.
#
# Looks for a <time> under a "postingdate" div first, then for a <date> under
# a "postinginfo" paragraph. Commas in the text are replaced by spaces before
# the text is handed to Date.parse.
#
# page - posting page responding to #search.
# url  - String posting URL, used only in the Airbrake messages.
#
# Returns the parsed Date. When no date element exists or its text cannot be
# parsed, Airbrake is notified and the current time is returned instead, so
# the entry is still stored.
def self.extract_posting_date(page, url)
  nodes = page.search(".//div[@class[contains(.,'postingdate')]]/time")
  nodes = page.search(".//p[@class[contains(.,'postinginfo')]]/date") if nodes.empty?

  if nodes.empty?
    Airbrake.notify(Exception, {:error_message => "Date not found for #{url}"})
    return Time.now
  end

  date_text = nodes.text.split(",").join(" ")
  begin
    Date.parse(date_text)
  rescue ArgumentError => ex
    Airbrake.notify(ex, {:error_message => "Date: #{date_text}"})
    Time.now
  end
end
# Builds lookup tables of Craigslist sites from the "about/sites" page.
#
# Walks the <ul> lists inside the first ".colmask" element of
# http://www.craigslist.org/about/sites/, treating the text of the element
# preceding each list as the state name. The front page of every linked city
# is fetched as well, to collect the links under its ".sublinks" element when
# it has one.
#
# Returns a three-element Array:
#   [0] Hash of state name => Array of [city name, subdomain] pairs
#   [1] Hash of subdomain  => city name
#   [2] Hash of subdomain  => Array of {"name" => title, "path" => href}
#       hashes, present only for cities whose page has ".sublinks"
def self.get_us_cities
  browser = Mechanize.new
  sites_page = browser.get("http://www.craigslist.org/about/sites/")

  by_state = {}
  names_by_subdomain = {}
  districts_by_subdomain = {}

  sites_page.parser.css(".colmask").first.css("ul").each do |state_list|
    state = state_list.previous_element.text
    by_state[state] = []

    state_list.css("a").each do |anchor|
      # The subdomain is the first host label of the city link.
      subdomain = anchor["href"].split("//")[1].split(".")[0]
      city_name = anchor.text.strip
      by_state[state] << [city_name, subdomain]
      names_by_subdomain[subdomain] = city_name

      sublinks = browser.get("http://#{subdomain}.craigslist.org").search(".sublinks")
      next if sublinks.empty?

      districts_by_subdomain[subdomain] = sublinks.css("a").map do |district|
        { "name" => district["title"], "path" => district["href"] }
      end
    end
  end

  [by_state, names_by_subdomain, districts_by_subdomain]
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment