Last active
August 29, 2015 14:00
-
-
Save albulescu/11232996 to your computer and use it in GitHub Desktop.
Products ripper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'net/http'
require 'open-uri'
require 'pp'
require 'uri'

require 'nokogiri' # You need to install this gem
=begin | |
@author Albulescu Cosmin <[email protected]>
All XPaths were copied from the Chrome browser: inspect the element, right-click it and choose "Copy XPath".
=end | |
# Name of the CSV file that receives one row per scraped product.
products_file_name = 'products.csv'

# File used to write non 200 http responses (opened in append mode so earlier
# runs are kept).
fail_file = File.open('fail_list', 'a+')

# Pages range: the listing is paginated by a single character, 'A'..'Z' and
# '1'..'9'. Every entry is a String because the value is spliced into the
# listing URL and into the fail-list lines.
pages = ('A'..'Z').to_a + ('1'..'9').to_a

# Delete products csv file left over from a previous run.
File.delete products_file_name if File.exist? products_file_name

# Products csv; global because read_product appends to it.
$products = CSV.open(products_file_name, "wb")
$products << ['URL', 'TITLE', 'BRAND/COMPANY', 'QUANTITY', 'ART.NR.',
              'PZN', 'EAN', 'PRICE 1', 'PRICE 2', 'DELIVERY TIME',
              'DELIVERY INFO', 'IMAGE']

# Counter to know where we are; read_product prints and increments it.
$write_count = 1
# Returns +text+ with leading and trailing whitespace removed.
#
# text - String to clean up, or nil.
#
# nil is handed back unchanged so callers can pass missing fields straight
# through.
def strip(text)
  return text if text.nil?

  text.strip
end
# Extracts the amount from a price label.
#
# text - label text, or nil.
#
# Returns the second whitespace-separated token of +text+. Returns nil when
# +text+ is nil or holds fewer than two tokens.
def price(text)
  return text if text.nil?

  # Splitting on ' ' discards leading/trailing whitespace and collapses runs
  # of whitespace, so the token needs no further trimming.
  text.to_s.split(' ')[1]
end
# Drops the "Label:" prefix from a "Label: value" string.
#
# text - labelled text, or nil.
#
# Returns everything after the first colon with surrounding whitespace
# removed. Returns nil when +text+ is nil or contains no colon.
def no_label(text)
  return text if text.nil?

  # Limit the split to two parts so a value that itself contains ':' is kept
  # whole instead of being cut at its first colon.
  value = text.to_s.split(':', 2)[1]
  value.nil? ? value : value.strip
end
# URL that check_image treats as "no image": it is reported as 'N/A'.
PLACEHOLDER_IMAGE_URL = 'http://www.volksversand.de/images/product_images/popup_images/04.jpg'.freeze

# Replaces the placeholder image URL with 'N/A'.
#
# image - image URL (compared via #to_s).
#
# Returns 'N/A' when +image+ is the placeholder URL, otherwise +image+
# itself, unchanged.
def check_image(image)
  image.to_s == PLACEHOLDER_IMAGE_URL ? 'N/A' : image
end
#
# Function to read product data from url dom document
#
# doc - parsed product page; the caller passes a Nokogiri::HTML document.
# url - product URL as a String; stored in the first CSV column.
#
# Builds one row in the column order of the header written to $products,
# appends it, flushes the CSV so progress survives a crash, prints a progress
# line and increments $write_count.
#
# Returns the incremented $write_count.
#
def read_product(doc, url)
  # Common prefix of every field inside the product info box.
  info = '//*[@id="productinfowrap"]/div[1]/div[1]/div[2]/div'
  text_at = ->(xpath) { doc.xpath(xpath).text }

  puts "\##{$write_count} - #{url}...\n"

  # A page without an image link yields 'N/A' instead of raising on nil.
  image_link = doc.xpath('//*[@id="imageTarget"]/a').first
  image_href = image_link && image_link['href']
  image = image_href ? check_image("http://www.volksversand.de/#{image_href}") : 'N/A'

  line = [
    url,                                                # product url
    text_at.call('//*[@id="cart_quantity"]/div/*/h1'),  # title (xpath changed manually)
    text_at.call('//*[@id="cart_quantity"]/div/*/p'),   # brand/company (xpath changed manually)
    text_at.call("#{info}/div/p[1]"),                   # quantity
    no_label(text_at.call("#{info}/div/p[2]")),         # Art.Nr.
    no_label(text_at.call("#{info}/div/p[3]")),         # PZN
    no_label(text_at.call("#{info}/div/p[4]")),         # EAN
    price(text_at.call("#{info}/p[1]")),                # price 1
    price(text_at.call("#{info}/p[2]")),                # price 2
    strip(text_at.call("#{info}/p[5]")),                # shipping time
    strip(text_at.call("#{info}/p[6]")),                # tax and shipping info
    image
  ]

  # Write the line to csv and flush the file right away.
  $products << line
  $products.flush

  $write_count += 1
end
# Loop through pages
#
# For every listing page: fetch it, collect the product links, fetch each
# product page and hand it to read_product. Every request that does not end
# in a 200 response is recorded in the fail list as "<status>@<target>".

# Appends one "<status>@<target>" line to the fail list and flushes it.
log_failure = lambda do |status, target|
  fail_file.write "#{status}@#{target}\n"
  fail_file.flush
end

# Fetches +url+ and returns the parsed HTML document on a 200 response.
# Any other outcome is logged under +label+ and yields nil. open-uri raises
# OpenURI::HTTPError for error statuses, so those are rescued here instead of
# aborting the whole run.
fetch_document = lambda do |url, label|
  begin
    URI.parse(url).open do |response|
      status = response.status[0]
      if status == '200'
        Nokogiri::HTML(response.read)
      else
        log_failure.call(status, label)
        nil
      end
    end
  rescue OpenURI::HTTPError => e
    log_failure.call(e.io.status[0], label)
    nil
  end
end

begin
  pages.each do |alpha|
    # Interpolation keeps this working whether the page key is a String or an Integer.
    listing = fetch_document.call("http://www.volksversand.de/shop_content.php?coID=116&alpha=#{alpha}", alpha)
    next if listing.nil?

    # Read all products links from current page
    listing.xpath('//*[@id="content"]/div/div[*]/a').each do |link|
      # get url from product link; anchors without an href are skipped
      product_url = link['href']
      next if product_url.nil?

      # read product page and save
      product = fetch_document.call(product_url, product_url)
      read_product(product, product_url) unless product.nil?

      # be polite to the server: one product request per second
      sleep 1
    end
  end
ensure
  # close products file pointer
  $products.close
  # close fail file pointer
  fail_file.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment