Created
May 2, 2013 19:26
-
-
Save kennym/5504712 to your computer and use it in GitHub Desktop.
Craigslist scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'airbrake' | |
require 'debugger' | |
module Craigslist
  # Scrapes Craigslist category listing pages and stores the postings it
  # finds as CraigslistEntry records.
  module Scraper
    # Pagination limit: stay shallow in development/test, go deeper otherwise.
    MAX_DEPTH = (Rails.env.development? || Rails.env.test?) ? 1 : 5

    # Kicks off a scrape run for +category+: creates a fresh Result record
    # and walks the category's listing pages inside a single transaction.
    def self.scrape_category(category)
      agent = Mechanize.new do |settings|
        settings.user_agent_alias = "Linux Firefox"
        settings.read_timeout = 1200
      end
      ActiveRecord::Base.transaction do
        result = category.results.new
        result.save!
        scrape(category.get_full_url, result, agent, category)
      end
    end

    # Scrapes one listing page at +url+, creating CraigslistEntry records
    # attached to +result+, then recurses into the "next page" link until
    # MAX_DEPTH pages have been visited.
    #
    # url      - listing page URL to fetch
    # result   - Result record new entries are attached to
    # agent    - shared Mechanize agent
    # category - category being scraped (used for dedup and the base URL)
    # depth    - current recursion depth, 0-based
    def self.scrape(url, result, agent, category, depth = 0)
      base_url = category.get_full_url
      return unless depth < MAX_DEPTH

      Rails.logger.debug("Scraping page #{depth + 1}")
      page = agent.get(url)
      page.search(".//p[@class='row']/a").each do |link|
        # Parse posting URL and id. (Renamed from `url` — the original
        # shadowed the method parameter.)
        posting_url = link.xpath("@href").text
        posting_id = link["href"].split("/")[-1].split(".html")[0]

        # If there already is an existing craigslist entry for this URL
        # and category, skip it.
        next unless CraigslistEntry.where(:url => posting_url,
                                          :category_id => category).empty?

        agent.get(posting_url)

        # Handle flagged or removed postings.
        next if agent.page.body.include?("This posting has been flagged for removal.")
        next if agent.page.body.include?("This posting has been deleted by its author.")

        reply_line = agent.page.search(".//a[@href[contains(.,'mailto')]]").text

        result.craigslist_entries.create(
          :posting_id => posting_id,
          :text => link.text,
          :url => posting_url,
          :email => reply_line,
          :published => parse_posting_date(agent.page, posting_url),
          :category_id => category.id
        )
        Rails.logger.debug("Created new craigslist entry #{posting_url}")
      end

      next_page = page.search(".//p[@id='nextpage']//a[@href[contains(.,'index')]]")
      unless next_page.empty?
        scrape(base_url + next_page.xpath("@href").text, result, agent, category, depth + 1)
      end
    end

    # Extracts the posting date from a posting page. Reports to Airbrake and
    # falls back to Time.now when no date markup is present; when the date
    # text cannot be parsed, reports and returns the raw (comma-stripped)
    # string, matching the original behavior.
    def self.parse_posting_date(page, url)
      raw =
        if !page.search(".//div[@class[contains(.,'postingdate')]]/time").empty?
          page.search(".//div[@class[contains(.,'postingdate')]]/time").text
        elsif !page.search(".//p[@class[contains(.,'postinginfo')]]/date").empty?
          page.search(".//p[@class[contains(.,'postinginfo')]]/date").text
        end

      # BUG FIX: the original called .split(",") on the date BEFORE its nil
      # check, so a missing date raised NoMethodError and the Time.now
      # fallback was unreachable. Check for nil first.
      if raw.nil?
        Airbrake.notify(Exception, {:error_message => "Date not found for #{url}"})
        return Time.now
      end

      normalized = raw.split(",").join(" ")
      begin
        Date.parse(normalized)
      rescue => ex
        Airbrake.notify(ex, {:error_message => "Date: #{normalized}"})
        normalized
      end
    end
    private_class_method :parse_posting_date

    # Scrapes craigslist.org's site directory for every US city.
    #
    # Returns three values:
    #   us_cities         - { state name => [[city name, subdomain], ...] }
    #   cities_subdomains - { subdomain => city name }
    #   city_districts    - { subdomain => [{"name" => ..., "path" => ...}] }
    #
    # NOTE(review): this fetches every city's homepage to discover district
    # sublinks, so it issues one HTTP request per city.
    def self.get_us_cities
      agent = Mechanize.new
      page = agent.get("http://www.craigslist.org/about/sites/")
      cities_subdomains = {}
      city_districts = {}
      us_cities = {}
      page.parser.css(".colmask").first.css("ul").each do |list|
        # The element preceding each <ul> holds the state name.
        state_name = list.previous_element.text
        us_cities[state_name] = []
        list.css("a").each do |city|
          subdomain = city["href"].split("//")[1].split(".")[0]
          name = city.text.strip
          us_cities[state_name] << [name, subdomain]
          cities_subdomains[subdomain] = name
          city_page = agent.get("http://#{subdomain}.craigslist.org")
          next if city_page.search(".sublinks").empty?
          city_districts[subdomain] = []
          city_page.search(".sublinks").css("a").each do |district_link|
            city_districts[subdomain] << {"name" => district_link["title"],
                                          "path" => district_link["href"]}
          end
        end
      end
      return us_cities, cities_subdomains, city_districts
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment