jhubert · December 19, 2015 06:59
diff --git a/Gemfile b/Gemfile
 source 'https://rubygems.org'

 gem 'nokogiri'
diff --git a/Rakefile b/Rakefile
 require 'uri'

 # Category listing URL; filled in with [city, category].
 URL_PATTERN = "http://%s.craigslist.org/%s/".freeze
 # Category search URL; filled in with [city, 'search', category, query].
 SEARCH_URL_PATTERN = "#{URL_PATTERN}%s?query=%s".freeze

 # Runs scraper.rb once for every configured target.
 task :scrape do
   # You can add multiple targets here, either as complete URLs,
   # as a [city, category] array, or as a [city, category, query] array
   targets = [
     'http://sfbay.craigslist.org/moa/',
     ['sfbay', 'ctd'], # generates a category url => http://sfbay.craigslist.org/ctd/
     ['sfbay', 'ctd', 'ford'] # generates a category search url => http://sfbay.craigslist.org/search/ctd?query=ford
   ]

   targets.each do |target|
     url =
       if !target.is_a?(Array)
         # Anything that is not an array is passed through as a complete URL.
         target
       elsif target.length == 2
         URL_PATTERN % target
       elsif target.length == 3
         city, category, query = target
         # Build a fresh argument list instead of inserting into the target
         # array, and escape the query so spaces and symbols stay valid in
         # the generated URL.
         SEARCH_URL_PATTERN % [city, 'search', category, URI.encode_www_form_component(query)]
       else
         raise ArgumentError,
               "target must be a URL, [city, category] or [city, category, query]: #{target.inspect}"
       end

     ruby "scraper.rb", url
   end
 end
diff --git a/scraper.rb b/scraper.rb
 require 'nokogiri' # fast html parser
 require 'open-uri' #
 require 'csv'

 # Get the URL from the first argument
 url = ARGV[0]

 if url.nil? || url == ''
   puts "Usage: ruby scraper.rb http://sfbay.craigslist.org/moa/"
   exit 0
 end

 # Returns the text content of a node, or nil when the node is missing.
 # Used instead of overriding Nokogiri::XML::Node#to_s globally, so the
 # library's own to_s keeps its normal behavior.
 def node_text(node)
   node && node.content.to_s
 end

 # Get the name of the city from the URL (the host label right before ".c").
 # The city is left nil when the URL does not contain one.
 city_match = url.scan(/\/([\w]+)\.c/).last
 city = city_match && city_match.first

 # Open up the URL. URI.open is called explicitly: Kernel#open no longer
 # fetches URLs through open-uri on Ruby 3.0+, and it would treat an
 # argument starting with "|" as a command to run.
 doc = Nokogiri::HTML(URI.open(url))

 # Open up the CSV file in append mode so repeated runs accumulate rows
 CSV.open('results.csv', 'a+') do |csv|
   # Find all the rows in the doc
   doc.css('.row').each do |row|
     # A row may have no category link; look it up once and guard against nil.
     category = row.at_css('a.gc')

     csv << [
       row.attr('data-pid'),                                       # pid
       city,                                                       # city
       node_text(row.at_css('.price')).to_s.tr('$', ''),           # price
       node_text(row.at_css('.pl a')),                             # title
       category && category.attr('data-cat'),                      # category key
       node_text(category),                                        # category name
       node_text(row.at_css('.pnr small')).to_s.tr('()', '').strip # location - we remove ( ) and surrounding spaces
     ]
   end
 end
	require 'uri'

	# Category listing URL; filled in with [city, category].
	URL_PATTERN = "http://%s.craigslist.org/%s/".freeze
	# Category search URL; filled in with [city, 'search', category, query].
	SEARCH_URL_PATTERN = "#{URL_PATTERN}%s?query=%s".freeze

	# Runs scraper.rb once for every configured target.
	task :scrape do
	  # You can add multiple targets here, either as complete URLs,
	  # as a [city, category] array, or as a [city, category, query] array
	  targets = [
	    'http://sfbay.craigslist.org/moa/',
	    ['sfbay', 'ctd'], # generates a category url => http://sfbay.craigslist.org/ctd/
	    ['sfbay', 'ctd', 'ford'] # generates a category search url => http://sfbay.craigslist.org/search/ctd?query=ford
	  ]

	  targets.each do |target|
	    url =
	      if !target.is_a?(Array)
	        # Anything that is not an array is passed through as a complete URL.
	        target
	      elsif target.length == 2
	        URL_PATTERN % target
	      elsif target.length == 3
	        city, category, query = target
	        # Build a fresh argument list instead of inserting into the target
	        # array, and escape the query so spaces and symbols stay valid in
	        # the generated URL.
	        SEARCH_URL_PATTERN % [city, 'search', category, URI.encode_www_form_component(query)]
	      else
	        raise ArgumentError,
	              "target must be a URL, [city, category] or [city, category, query]: #{target.inspect}"
	      end

	    ruby "scraper.rb", url
	  end
	end
	require 'nokogiri' # fast html parser
	require 'open-uri' #
	require 'csv'

	# Get the URL from the first argument
	url = ARGV[0]

	if url.nil? || url == ''
	  puts "Usage: ruby scraper.rb http://sfbay.craigslist.org/moa/"
	  exit 0
	end

	# Returns the text content of a node, or nil when the node is missing.
	# Used instead of overriding Nokogiri::XML::Node#to_s globally, so the
	# library's own to_s keeps its normal behavior.
	def node_text(node)
	  node && node.content.to_s
	end

	# Get the name of the city from the URL (the host label right before ".c").
	# The city is left nil when the URL does not contain one.
	city_match = url.scan(/\/([\w]+)\.c/).last
	city = city_match && city_match.first

	# Open up the URL. URI.open is called explicitly: Kernel#open no longer
	# fetches URLs through open-uri on Ruby 3.0+, and it would treat an
	# argument starting with "|" as a command to run.
	doc = Nokogiri::HTML(URI.open(url))

	# Open up the CSV file in append mode so repeated runs accumulate rows
	CSV.open('results.csv', 'a+') do |csv|
	  # Find all the rows in the doc
	  doc.css('.row').each do |row|
	    # A row may have no category link; look it up once and guard against nil.
	    category = row.at_css('a.gc')

	    csv << [
	      row.attr('data-pid'),                                       # pid
	      city,                                                       # city
	      node_text(row.at_css('.price')).to_s.tr('$', ''),           # price
	      node_text(row.at_css('.pl a')),                             # title
	      category && category.attr('data-cat'),                      # category key
	      node_text(category),                                        # category name
	      node_text(row.at_css('.pnr small')).to_s.tr('()', '').strip # location - we remove ( ) and surrounding spaces
	    ]
	  end
	end