Skip to content

Instantly share code, notes, and snippets.

@danielharan
Created March 26, 2009 20:58
Show Gist options
  • Save danielharan/86333 to your computer and use it in GitHub Desktop.
Save danielharan/86333 to your computer and use it in GitHub Desktop.
require 'hpricot'
namespace :import do
# Looks up +path+ underneath the fixed container
# '/html/body/div/div[4]/div' of the currently loaded @doc and returns
# the inner HTML of the first node found there.
#
# path - String appended verbatim to the container path.
#
# Returns a String; the empty string when no node is found.
def x(path)
  full_path = '/html/body/div/div[4]/div' + path
  node = (@doc / full_path).first
  return '' unless node

  node.inner_html
end
# Finds a labelled value in the second table of the currently loaded @doc.
#
# The first-column cells (td[1]) are scanned in order; for the first one
# whose inner HTML matches +exp+, the inner HTML of the first td[2] under
# that cell's parent is returned. Note that, despite the method name, the
# match is made on td[1] and the value is read from td[2].
#
# exp - pattern tested against each cell's inner HTML with =~.
#
# Returns the value's inner HTML, or nil when no cell matches.
def __left_value_for_right_matching(exp)
  label_cells = @doc / "/html/body/div/div[4]/div/table[2]/tbody/tr/td[1]"
  label = label_cells.detect { |cell| cell.inner_html =~ exp }
  return nil unless label

  value_cells = label.parent / 'td[2]'
  value_cells.first.inner_html
end
# Alcohol content of the currently loaded page as a Float.
#
# Reads the value of the table row whose label matches /alcool/. Decimal
# commas are converted to points before parsing, because String#to_f stops
# at the first comma ("13,5 %" would otherwise yield 13.0).
# NOTE(review): the price parser in this file also strips a comma, which
# suggests the pages use comma decimals — confirm against a sample page.
#
# Returns a Float, or nil when no matching row exists.
def alcohol
  v = __left_value_for_right_matching(/alcool/)
  v ? v.tr(',', '.').to_f : nil
end
# Producer of the currently loaded page: the value of the table row whose
# label matches /Fournisseur/, or nil when there is no such row.
def producer
  supplier_label = /Fournisseur/
  __left_value_for_right_matching(supplier_label)
end
# UPC of the currently loaded page.
#
# Takes the inner HTML of the second <strong> in the product header cell,
# removes spaces, and returns the first run of digits found.
#
# Returns the digits as a String, or nil when the cell is missing/empty or
# holds no digits at all (previously the digit-free case raised
# NoMethodError on nil).
def upc
  raw = x("/table/tr/td[2]/p/strong[2]")
  return nil if raw.empty?

  # String#[] with a regexp yields nil instead of raising when nothing matches.
  raw.delete(' ')[/\d+/]
end
desc 'scrape the downloaded saq product pages'
# Walks every file under RAILS_ROOT/saq/pages, parses it with Hpricot and
# creates a Wine record for each page whose SAQ code is not stored yet.
#
# Pages without a usable SAQ code are reported and skipped. Any other
# failure while processing a page is reported and the loop moves on to the
# next file. Only StandardError is rescued, so Ctrl-C (Interrupt) and
# SystemExit still stop the task.
task :saq => :environment do
  Dir.glob("#{RAILS_ROOT}/saq/pages/*").each do |f|
    begin
      # Block form closes the file handle as soon as parsing is done.
      @doc = File.open(f) { |io| Hpricot(io) }

      # First run of digits in the first <strong>; nil when the node is
      # missing, empty, or contains no digits.
      saq = x("/table/tr/td[2]/p/strong[1]")[/\d+/]
      unless saq
        puts "NO SAQ CODE on: #{f}"
        next
      end

      # Skip wines that were already imported.
      next if Wine.find_by_saq_code(saq)

      Wine.create :upc => upc,
        :title => (@doc/"h2").first.inner_html, :saq_code => saq,
        :colour => x("/table/tr/td[2]/table/tbody/tr[2]/td[2]"), :volume => x('/table/tr/td[2]/table/tbody/tr[4]/td[2]'),
        # The comma is dropped and the leading integer divided by 100.
        # NOTE(review): presumably the price text looks like "12,95 $" — confirm.
        :price => x("/table/tr/td[3]/p/span").gsub(",","").to_i / 100.0,
        :country => x("/table[2]/tbody/tr/td[2]"), :alcohol => alcohol, :producer => producer
    rescue StandardError
      puts "unable to process: #{f}" #exception doesn't help much
    end
  end
end
end
# Script written a year ago to get SAQ wines...
#
# first, get all search pages
# Builds the SAQ catalogue search URL for one page of results.
#
# i - value interpolated into the beginIndex query parameter.
#
# Returns the full URL as a String; every other query parameter is fixed
# (pageSize is 100).
def url(i)
  endpoint = "http://www.saq.com/webapp/wcs/stores/servlet/CatalogSearchResultView"
  params = [
    "storeId=10001",
    "langId=-2",
    "catalogId=10001",
    "searchTerm=",
    "resultCatEntryType=2",
    "beginIndex=#{i}",
    "tri=RechercheUCIProdDescAttributeInfo",
    "sensTri=AscOperator",
    "searchType=400",
    "codeReseau=",
    "categoryId=11748",
    "viewTaskName=SAQCatalogSearchResultView",
    "catalogVenteId=",
    "origineId=",
    "codePrix=",
    "pageSize=100"
  ]
  endpoint + "?" + params.join("&")
end
# Downloads every search-result page, then collects the product links
# found on those pages.
#
# Both passes share one directory and one page range. Previously the
# download wrote pages 0..71 to saq/<offset> while the parser read pages
# 0..72 from saq/search/<offset>, so the parser could not find its input.
require 'fileutils'
require 'hpricot'

search_dir = 'saq/search'
# NOTE(review): the original loops disagreed on the last page (71 vs 72);
# the larger bound is used for both — confirm against the catalogue size.
page_numbers = (0..72)

FileUtils.mkdir_p(search_dir)
page_numbers.each do |i|
  offset = i * 100
  # Argument-list form of system bypasses the shell, so the '&' characters
  # in the URL need no quoting.
  system('curl', '-o', "#{search_dir}/#{offset}", url(offset))
  sleep(30) # pause between requests (presumably to go easy on the server)
  puts i.to_s
end

# get all product urls
# NOTE(review): nothing here writes wine_urls to a file, and each page
# contributes its own nested array of hrefs — confirm whether a flat list
# and a save step were intended.
wine_urls = []
page_numbers.each do |i|
  # Block form closes the file handle once the page is parsed.
  page = File.open("#{search_dir}/#{i * 100}") { |io| Hpricot(io) }
  hrefs = page.search("a").collect { |e| e['href'] }.uniq
  wine_urls << hrefs.select { |href| href =~ /^ProductDisplay/ }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment