Last active
October 14, 2022 04:28
-
-
Save mimosz/3899790 to your computer and use it in GitHub Desktop.
京东商品价格解析
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.DS_Store |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rvm use jruby |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'mini_magick' | |
require 'rtesseract' | |
require 'crawler' | |
# Crawler for 360buy.com keyword search results.
#
# Every result page is exported to "<path>/<page>.csv"; when the last page has
# been handled the per-page files are merged through merge_csv_files. Prices
# are fetched as PNG images (see #price_url) and read back with OCR.
class Buy360 < Crawler
  # Promotion flag => label written to the CSV.
  PROMOTION_LABELS = { 1 => '直降', 2 => '赠品', 3 => '返券', 4 => '送积分' }.freeze

  # band_name - search keyword, also used as the output directory name.
  # page      - first page to fetch.
  # pages     - total number of pages; values below 1 mean "detect it".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    # Set here because this initializer does not call super.
    @max_attempts = 3
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/360buy"
    mkdirs(@path)
    @request = Nestful::Request.new('http://search.360buy.com/search')
  end

  # Merges the per-page CSV files into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, then moves on to the next page or
  # finishes. A page that could not be loaded is treated as an empty page.
  def process
    self.params = { keyword: @band_name, qrst: 'UNEXPAND', enc: 'utf-8', page: @page }
    page_dom = get_page_dom('GBK')
    page_count = export_items(get_items_dom(page_dom))
    @pages = pages_count(parse_total(page_dom), page_count) if @pages < 1
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the following page, or finishes when the last one is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv".
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:proms] ? item[:proms].join(' ') : '',
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Turns the <li> nodes of a result page into an items hash keyed by SKU,
  # attaches promotions and exports the result. Entries without a product
  # link or without a recognisable SKU are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      link_dom = item.at('div.p-name>a')
      next unless link_dom
      sku_id = item[:sku] || sku_from_href(link_dom[:href])
      next unless sku_id

      img_dom = item.at('div.p-img>a>img')
      pic_url = img_dom ? (img_dom['data-lazyload'] || img_dom[:src]).to_s : ''
      extra_dom = item.at('div.extra')
      rates_dom = extra_dom && extra_dom.at('a')
      # Keep only the non-Han part of the review link text.
      rates_count = rates_dom ? rates_dom.text.gsub(/\p{Han}/, '') : ''

      items[sku_id.to_i] = {
        pic_url: pic_url.strip,
        title: link_dom.text.strip,
        price: parse_price(sku_id),
        rates: { count: rates_count }
      }
    end
    items = set_proms(items) unless items.empty?
    export(items)
  end

  # Extracts the SKU from a ".../product/<sku>.html" link; nil when absent.
  def sku_from_href(href)
    matched = href.to_s.match(/product\/(.*)\.html$/)
    matched && matched[1]
  end

  # Adds the :proms entry to every item the promotion lookup knows about.
  def set_proms(node)
    get_proms(node.keys).each do |item_id, prom|
      node[item_id][:proms] = prom if node.has_key?(item_id)
    end
    node
  end

  # Maps promotion flags to their labels; unknown flags are dropped so they
  # do not end up as blank entries in the CSV.
  def parse_proms(proms)
    Array(proms).map { |flag| PROMOTION_LABELS[flag.to_i] }.compact
  end

  # Returns the product <li> nodes, or [] when the page or list is missing.
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#plist>ul.list-h')
    list ? list.css('li') : []
  end

  # Reads the total hit count from the filter bar; 0 when it cannot be found.
  def parse_total(page_dom)
    filter_dom = page_dom && page_dom.at('div#filter')
    return 0 if filter_dom.nil?
    container = filter_dom.at('div.total')
    unless container
      extra_dom = filter_dom.at('ul.extra')
      container = extra_dom && extra_dom.at('li')
    end
    strong = container && container.at('strong')
    strong ? strong.text.to_i : 0
  end

  # Downloads the price image of a SKU and reads the price with OCR.
  # The download is retried up to @max_attempts times; 0 is returned when it
  # keeps failing. If OCR finds no price, the alternative image is tried.
  def parse_price(sku_id, type=3)
    attempts = 0
    img_url = price_url(sku_id, type)
    begin
      img = MiniMagick::Image.open(img_url)
      img.colorspace("GRAY") # grayscale
      img.monochrome         # remove colour
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts += 1
      puts "错误: #{error}"
      puts img_url
      retry if attempts < @max_attempts
      return 0
    end
    ocr_price(img) || try_parse_price(sku_id)
  end

  # Second attempt with another image type, enlarged before OCR.
  # Returns 0.0 when no price can be read or the image cannot be loaded.
  def try_parse_price(sku_id, type=2)
    img_url = price_url(sku_id, type)
    img = MiniMagick::Image.open(img_url)
    img.resize '200x100' # enlarge
    ocr_price(img) || 0.0
  rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
    puts "错误: #{error}"
    puts img_url
    0.0
  end

  # Runs OCR on the image and returns the first decimal number as a Float,
  # or nil when none is recognised. The image file is always removed.
  def ocr_price(img)
    text = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip
    matched = text.match(/\d+\.\d+/)
    matched && matched[0].to_f
  ensure
    File.unlink(img.path) if img.path && File.exist?(img.path)
  end

  def price_url(sku_id, type=3)
    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-#{type}.png"
  end

  def item_url(sku_id)
    "http://www.360buy.com/product/#{sku_id}.html"
  end

  # Looks up promotion flags for the given SKUs.
  # Returns { sku_id => [labels] }; {} on server errors or when no valid
  # "({...})" payload is received after @max_attempts retries.
  def get_proms(sku_ids, try_count=0)
    begin
      prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
      html = Nestful.get prom_url, params: { pid: sku_ids.join(',') }
    rescue Nestful::ServerError
      return {}
    end
    html = html.force_encoding("GBK").encode("UTF-8").strip
    # Anything not wrapped in "({ ... })" is not the promotion payload
    # (the original comment notes the site bounces to its home page on errors).
    unless html.start_with?('({') && html.end_with?('})')
      return try_count < @max_attempts ? get_proms(sku_ids, try_count + 1) : {}
    end
    json = ActiveSupport::JSON.decode(html[1..-2])
    item_proms = {}
    Array(json['data']).each do |prom|
      # NOTE(review): Pid is coerced to Integer to match the Integer keys
      # built in export_items — confirm against the real payload.
      item_proms[prom['Pid'].to_i] = parse_proms(prom['PF'])
    end
    item_proms
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'crawler' | |
# Crawler for amazon.cn keyword search results.
#
# Every result page is exported to "<path>/<page>.csv" and the files are
# merged through merge_csv_files once the last page has been handled.
class Amazon < Crawler
  # band_name - search keyword, also used as the output directory name.
  # page      - first page to fetch.
  # pages     - total number of pages; values below 1 mean "detect it".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/amazon"
    mkdirs(@path)
    @request = Nestful::Request.new('http://www.amazon.cn/s')
    @request.headers = { 'Host' => 'www.amazon.cn' }
  end

  # Merges the per-page CSV files into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, then moves on to the next page or
  # finishes. A page that failed to load, or that lacks the expected
  # containers, is treated as an empty page instead of raising.
  def process
    self.params = { ie: 'UTF8', keywords: @band_name, page: @page, rh: "i:aps,k:#{@band_name}" }
    page_dom = get_page_dom('UTF-8')
    template_dom = page_dom && page_dom.at('div#main>div#searchTemplate')
    extra_dom = template_dom && template_dom.at('div#centerBelow>div#btfResults')
    center_dom = template_dom && template_dom.at('div#center')
    export_items(get_items_dom(center_dom, extra_dom))
    @pages = parse_total(center_dom) if @pages < 1
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the following page, or finishes when the last one is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv".
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Builds the items hash (keyed by the node's "name" attribute) from the
  # product nodes and exports it. Nodes without that attribute or without a
  # product data section are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      sku_id = item[:name]
      data_dom = item.at('div.productData')
      next unless sku_id && data_dom

      img_dom = item.at('div.productImage>a>img')
      pic_url = img_dom ? img_dom[:src].to_s : ''
      title_dom = data_dom.at('div.productTitle')
      name = title_dom ? title_dom.text : ''

      price = nil
      tag_price = nil
      price_dom = data_dom.at('div.newPrice')
      if price_dom
        price_span = price_dom.at('span')
        price = parse_price(price_span.text) if price_span
        strike = price_dom.at('strike')
        tag_price = parse_price(strike.text) if strike
      end

      stars_dom = data_dom.at('div.starsAndPrime')
      rates_link = stars_dom && stars_dom.css('a').last
      rates_count = rates_link ? rates_link.text.gsub(',', '').to_i : 0

      items[sku_id] = {
        pic_url: pic_url.strip,
        title: name.strip,
        price: price,
        rates: { count: rates_count },
        tag_price: tag_price
      }
    end
    export(items)
  end

  # Collects product nodes from the main results container and, when given,
  # from the extra container. Always returns an Array.
  def get_items_dom(page_dom, extra_dom=nil)
    items_dom = []
    main_dom = page_dom && page_dom.at('div#atfResults')
    items_dom += main_dom.css('div.product').to_a if main_dom
    items_dom += extra_dom.css('div.product').to_a if extra_dom
    items_dom
  end

  # Reads the number captured from the result-count text; 0 when the node is
  # missing or its text does not match the expected pattern.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('div#resultCount')
    matched = total_dom && total_dom.text.match(/\-(.*)条,/)
    matched ? matched[1].to_i : 0
  end

  # Strips the currency prefix and thousands separators, returns a Float.
  def parse_price(str)
    str.gsub('￥ ', '').gsub(',', '').to_f
  end

  def item_url(sku_id)
    "http://www.amazon.cn/dp/#{sku_id}"
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'fileutils' | |
require 'nokogiri' | |
require 'nestful' | |
require 'csv' | |
# Base class shared by the site-specific crawlers: request plumbing, page
# download, output directory handling and CSV merging.
class Crawler
  # Upper bound on consecutive redirects followed by #get_page_dom.
  MAX_REDIRECTS = 5

  def initialize
    @max_attempts = 3
  end

  # The request object; subclasses assign @request in their initializer.
  def request
    @request
  end

  def url=(value)
    request.url = value
  end

  def headers=(value)
    request.headers = value
  end

  def params=(value)
    request.params = value
  end

  # Downloads the current request and parses it as HTML.
  #
  # charset - encoding of the response body; it is transcoded to UTF-8.
  #
  # Redirects are followed (passing along the cookie, when present, and the
  # referer). Returns nil when the redirect target contains 'no_results' or
  # carries no Location header. Re-raises the redirection error after
  # MAX_REDIRECTS hops so a redirect loop cannot spin forever.
  def get_page_dom(charset)
    html = nil
    redirects = 0
    begin
      response = request.connection.get(request.query_path)
      html = response.body.force_encoding(charset).encode("UTF-8")
    rescue Nestful::Redirection => error
      location = error.response['Location']
      return nil if location.nil? || location.include?('no_results')
      redirects += 1
      raise error if redirects > MAX_REDIRECTS
      cookie = error.response['Set-Cookie']
      redirect_headers = { 'Referer' => request.url }
      redirect_headers['Cookie'] = cookie if cookie
      self.headers = redirect_headers
      self.url = location
      retry
    end
    Nokogiri::HTML(html) unless html.nil?
  end

  # Creates the directory, including missing parents.
  def mkdirs(path)
    FileUtils.mkdir_p(path)
  end

  # Merges "<path>/*.csv" into "<path>.csv" (header written once), then
  # removes the <path> directory.
  #
  # Files are merged in numeric order of their base name so that page 2 comes
  # before page 10. A notice is printed when the number of files differs from
  # files_count.
  def merge_csv_files(path, files_count)
    csv_files = Dir["#{path}/*.csv"].sort_by { |file| File.basename(file, '.csv').to_i }
    puts "文件不够,呵呵~" if csv_files.count != files_count
    CSV.open("#{path}.csv", "w:binary", col_sep: ',') do |csv|
      has_header = false
      csv_files.each do |csv_file|
        # CSV.open accepts a positional mode on every csv version, whereas
        # current CSV.read only takes keyword options.
        data = CSV.open(csv_file, 'r:binary', headers: true, col_sep: ',') { |input| input.read }
        unless has_header
          csv << data.headers
          has_header = true
        end
        data.each { |line| csv << line }
      end
    end
    FileUtils.rm_r(path) # remove the temporary directory
  end

  # Number of pages needed to show `total` items at `size` items per page.
  # Returns 0 when either value is zero or nil.
  def pages_count(total, size)
    total = total.to_i
    size = size.to_i
    return 0 if total == 0 || size == 0
    (total.to_f / size).ceil
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'crawler' | |
# Crawler for jumei.com keyword search results.
#
# Every result page is exported to "<path>/<page>.csv" and the files are
# merged through merge_csv_files once the last page has been handled.
class Jumei < Crawler
  # Items-per-page value used to derive the page count from the total.
  PAGE_SIZE = 40

  # band_name - search keyword, also used as the output directory name.
  # page      - first page to fetch.
  # pages     - total number of pages; values below 1 mean "detect it".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/jumei"
    mkdirs(@path)
    @request = Nestful::Request.new('http://search.jumei.com')
  end

  # Merges the per-page CSV files into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, then moves on to the next page or
  # finishes. Prints a "no result" notice and stops when the page could not
  # be loaded or does not contain the result wrapper.
  def process
    self.params = { filter: "0-0-0-0-31-#{@page}", search: @band_name }
    page_dom = get_page_dom('UTF-8')
    result_dom = page_dom && page_dom.at('div#search_result_wrap')
    unless result_dom
      puts "聚美优品中搜索 #{@band_name},无结果"
      return
    end
    export_items(get_items_dom(result_dom))
    @pages = pages_count(parse_total(result_dom), PAGE_SIZE) if @pages < 1
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the following page, or finishes when the last one is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv".
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '价格', '购买数', '折扣']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:price],
            item[:rates][:count],
            item[:proms].join(' '),
          ]
        end
      end
    end
    items.count
  end

  # Builds the items hash (keyed by the integer "pid" attribute) from the
  # product nodes and exports it. Nodes without a pid or without the name
  # block are skipped; previously a missing pid silently became key 0.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      pid = item['pid']
      name_dom = item.at('div>div.num_warp_list_name')
      next if pid.nil? || name_dom.nil?

      proms = []
      badge_dom = name_dom.at('span')
      proms << badge_dom.text.gsub('/', '') if badge_dom
      link_dom = name_dom.at('a')
      name = link_dom ? link_dom.text : ''

      pic_box = item.at('div>div.num_warp_list_pic_top')
      img_dom = pic_box && pic_box.at('img')
      pic_url = img_dom ? img_dom[:src].to_s : ''

      # Drop the countdown block first so the selector below does not pick it.
      countdown_dom = item.at('div>div.num_warp_list_warp_word.time_countdown')
      countdown_dom.remove if countdown_dom
      word_dom = item.at('div>div.num_warp_list_warp_word')
      count_span = word_dom && word_dom.css('span').last
      rates_count = count_span ? count_span.text.gsub(/\p{Han}/, '').to_i : 0

      price = nil
      price_dom = item.at('div>div.num_warp_list_view_bg') || item.at('div>div.num_warp_list_name_mall')
      if price_dom
        price_span = price_dom.css('span').last
        price = price_span.text.gsub('¥', '').to_f if price_span
        # Text in parentheses inside the price block is kept as a promotion.
        note = price_dom.text.match(/\((.*)\)/)
        proms << note[1] if note
      end

      items[pid.to_i] = {
        pic_url: pic_url.strip,
        title: name.strip,
        price: price,
        rates: { count: rates_count },
        proms: proms
      }
    end
    export(items)
  end

  # Returns the product <li> nodes, or [] when the list is missing.
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#search_list_wrap>div.products>ul')
    list ? list.css('li.item') : []
  end

  # Reads the total from the second red label of the search info block;
  # 0 when the block is missing or does not hold exactly two such labels.
  def parse_total(page_dom)
    info_dom = page_dom && page_dom.at('div.search_info>div>div.content')
    return 0 unless info_dom
    labels = info_dom.css('label.red')
    labels.count == 2 ? labels[1].text.to_i : 0
  end

  def item_url(sku_id)
    "http://mall.jumei.com/product_#{sku_id}.html"
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'mini_magick' | |
require 'rtesseract' | |
require 'crawler' | |
# Crawler for lefeng.com keyword search results.
#
# Every result page is exported to "<path>/<page>.csv" and the files are
# merged through merge_csv_files once the last page has been handled. Prices
# are fetched as images and read back with OCR after being composited onto a
# canvas image.
class Lefeng < Crawler
  # band_name - search keyword, also used as the output directory name.
  # page      - first page to fetch.
  # pages     - total number of pages; values below 1 mean "detect it".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    # Must be set here: this initializer does not call super, and
    # #parse_price compares against it when retrying.
    @max_attempts = 3
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/lefeng"
    mkdirs(@path)
    @canvas = MiniMagick::Image.open('./assets/canvas.jpg')
    @request = Nestful::Request.new('http://search.lefeng.com/search/search')
  end

  # Merges the per-page CSV files and removes the canvas working file.
  def finishing
    merge_csv_files(@path, @pages)
    FileUtils.rm(@canvas.path) # remove temporary file
  end

  # Fetches and exports the current page, then moves on to the next page or
  # finishes. A page that could not be loaded is treated as an empty page.
  def process
    self.params = { key: @band_name, pageNo: @page }
    page_dom = get_page_dom('UTF-8')
    page_count = export_items(get_items_dom(page_dom))
    @pages = pages_count(parse_total(page_dom), page_count) if @pages < 1
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the following page, or finishes when the last one is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv".
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:proms],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Builds the items hash (keyed by the SKU taken from the product link) from
  # the product nodes and exports it. Nodes without a product link or without
  # a recognisable SKU are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      link_dom = item.at('dt>a')
      next unless link_dom
      sku_match = link_dom[:href].to_s.match(/product\/(.*)\.html$/)
      next unless sku_match

      img_dom = link_dom.at('img')
      pic_url = img_dom ? img_dom['src2'].to_s : ''
      name = link_dom[:title].to_s
      proms_dom = item.at('dd.nam>a>i')
      proms = proms_dom ? proms_dom.text : nil
      rates_dom = item.at('dd.mess>a')
      rates_match = rates_dom && rates_dom.text.match(/\d+/)
      rates_count = rates_match && rates_match[0]
      price_img = item.at('dd.pri>img')
      price = price_img ? parse_price(price_img[:src]) : nil

      items[sku_match[1].to_i] = {
        pic_url: pic_url.strip,
        title: name.strip,
        price: price,
        rates: { count: rates_count },
        proms: proms
      }
    end
    export(items)
  end

  # Returns the product nodes, or [] when the page or list is missing.
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div.list>div.smPruArea>div.makeup')
    list ? list.css('div.makeupdl') : []
  end

  # Reads the total hit count; 0 when the page or counter is missing.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('b#searchernum')
    total_dom ? total_dom.text.to_i : 0
  end

  # Downloads the price image and reads the price with OCR.
  #
  # Returns the price as a Float. Falls back to returning price_url itself
  # when OCR finds no price or when the image cannot be prepared after
  # @max_attempts tries, so the CSV still carries a usable reference.
  def parse_price(price_url)
    attempts = 0
    begin
      img_price = MiniMagick::Image.open(price_url)
      img_price.resize '125%' # enlarge
      img = @canvas.composite(img_price) do |c|
        c.gravity 'center'
      end
      img.colorspace("GRAY") # grayscale
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts += 1
      puts "错误: #{error}"
      retry if attempts < @max_attempts
      return price_url
    end
    str = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip # recognise
    FileUtils.rm([img_price.path, img.path]) # remove temporary files
    price = str.match(/\d+\.\d+/)
    price ? price[0].to_f : price_url
  end

  def item_url(sku_id)
    "http://product.lefeng.com/product/#{sku_id}.html"
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
$:.unshift File.expand_path('./lib')

namespace :cai do
  # Required parameter check: every crawl task needs ENV['band_name'].
  task :required do
    if ENV['band_name'].nil?
      puts '缺少参数,请参照以下命令:'
      puts 'rake cai band_name=品牌名称'
      puts 'rake cai:amazon band_name=品牌名称'
      puts 'rake cai:buy360 band_name=品牌名称'
      puts 'rake cai:lefeng band_name=品牌名称'
      puts 'rake cai:jumei band_name=品牌名称'
      puts 'rake cai:yihaodian band_name=品牌名称'
      # Non-zero status so callers can tell the run did not happen.
      exit 1
    end
  end

  # [task name, label used in the description, file to require, class name]
  crawlers = [
    [:buy360,    '京东',   '360buy',    :Buy360],
    [:lefeng,    '乐峰',   'lefeng',    :Lefeng],
    [:jumei,     '聚美',   'jumei',     :Jumei],
    [:yihaodian, '一号店', 'yihaodian', :Yihaodian],
    [:amazon,    '亚马逊', 'amazon',    :Amazon]
  ]

  # One task per crawler: rake cai:<name>[page,pages] band_name=...
  crawlers.each do |task_name, label, lib, class_name|
    desc "采集 #{label} 数据"
    task task_name, [:page, :pages] => :required do |_t, args|
      args.with_defaults(page: 1, pages: 0)
      # Loaded lazily so a task only pulls in the crawler it runs.
      require lib
      crawler = Object.const_get(class_name).new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
      crawler.process
    end
  end
end

desc "采集 全平台 数据"
task :cai => ['cai:jumei', 'cai:lefeng', 'cai:yihaodian', 'cai:amazon', 'cai:buy360' ] do
  puts "全平台目前仅支持:京东、乐峰、聚美、亚马逊、一号店。"
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'crawler' | |
# Crawler for yihaodian.com keyword search results.
#
# The search endpoint answers with JSON whose 'value' entry holds the HTML of
# the result page. Every page is exported to "<path>/<page>.csv" and the
# files are merged through merge_csv_files once the last page is handled.
class Yihaodian < Crawler
  # How many times a forbidden response is retried.
  MAX_RETRIES = 3

  # band_name - search keyword, also used as the output directory name.
  # page      - first page to fetch.
  # pages     - total number of pages; values below 1 mean "detect it".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @pages = pages
    @page = page
    @path = "#{@band_name}/#{Date.today}/yihaodian"
    mkdirs(@path)
    @request = Nestful::Request.new('')
  end

  # Downloads the current request, decodes the JSON envelope and parses its
  # 'value' entry as HTML.
  #
  # Retries up to MAX_RETRIES times on Nestful::ForbiddenAccess and returns
  # nil when every attempt is refused.
  def get_json_dom(charset, try_count=0)
    response = request.connection.get(request.query_path)
    html = response.body.force_encoding(charset).encode("UTF-8")
    html = ActiveSupport::JSON.decode(html)['value'] if html
    Nokogiri::HTML(html)
  rescue Nestful::ForbiddenAccess
    if try_count < MAX_RETRIES
      puts "========================开始重试========================"
      get_json_dom(charset, try_count + 1)
    else
      puts "========================很扯,三次都没搞定========================"
      nil
    end
  end

  # Merges the per-page CSV files into a single file.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Search URL for the current page. The keyword is escaped with the default
  # URI parser, which is what URI.encode delegated to before it was removed.
  def search_url
    "http://search.yihaodian.com/searchPage/c0-0/b/a-s1-v0-p#{@page}-price-d0-f0-m1-rt0-pid-k#{URI::DEFAULT_PARSER.escape(@band_name)}"
  end

  # Fetches and exports the current page, then moves on to the next page or
  # finishes. A page that could not be loaded is treated as an empty page.
  def process
    self.url = search_url
    page_dom = get_json_dom('UTF-8')
    export_items(get_items_dom(page_dom))
    @pages = parse_total(page_dom) if @pages < 1
    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the following page, or finishes when the last one is done.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行:#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv".
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(item[:item_id])}\",\"#{sprintf("%010d", sku_id)}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Builds the items hash (keyed by the buy button's product code) from the
  # product nodes and exports it. Nodes lacking a numeric id, the product
  # link or the buy button are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      id_match = item[:id].to_s.match(/\d+/)
      next unless id_match
      product_id = id_match[0].to_i
      link_dom = item.at("a#pdlink1_#{product_id}")
      buy_button = item.at("div.buyInfo>button#buyButton_#{product_id}")
      next unless link_dom && buy_button

      item_id = link_dom['pmid'].to_i
      sku_id = buy_button['productcode'].to_i
      img_dom = link_dom.at('img')
      pic_url = img_dom ? (img_dom['original'] || img_dom[:src]).to_s : ''
      name = img_dom ? img_dom[:title].to_s : ''

      price = nil
      tag_price = nil
      rates_count = 0
      price_dom = item.at('p.price')
      if price_dom
        rates_dom = price_dom.at('a')
        rates_match = rates_dom && rates_dom.text.match(/\d+/)
        rates_count = rates_match[0].to_i if rates_match
        strong = price_dom.at('strong')
        price = parse_price(strong.text) if strong
        del = price_dom.at('del')
        tag_price = parse_price(del.text) if del
      end

      items[sku_id] = {
        item_id: item_id,
        pic_url: pic_url.strip,
        title: name.strip,
        price: price,
        rates: { count: rates_count },
        tag_price: tag_price
      }
    end
    export(items)
  end

  # Returns the product <li> nodes, or [] when the page or list is missing.
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#plist>div#search_table>div.itemSearchResult.clearfix>ul.itemSearchList')
    list ? list.css('li') : []
  end

  # Reads the number after the slash in the pager text; 0 when the page or
  # pager is missing or the text has no slash.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('ul.page.clearfix>li.pageNum')
    matched = total_dom && total_dom.text.match(/\/(.*)$/)
    matched ? matched[1].to_i : 0
  end

  # Strips the currency sign and returns the price as a Float.
  def parse_price(str)
    str.gsub('￥', '').to_f
  end

  def item_url(item_id)
    "http://www.yihaodian.com/item/#{item_id}_1"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment