-
-
Save zernel/3939624 to your computer and use it in GitHub Desktop.
京东商品价格解析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
require 'csv'
require 'date'
require 'pp'

require 'mini_magick'
require 'nestful'
require 'nokogiri'
require 'rtesseract'
# Scrapes product listings from 360buy.com search and brand pages, reads each
# product's price by running OCR on the site's price image, looks up promotion
# flags, and exports the collected items to a CSV file.
#
# Items accumulate in @items (keyed by integer SKU id) across successive calls
# to #search and #brand on the same instance.
class Buy360
  # Promotion flag codes mapped to the labels written to the export.
  PROMOTION_LABELS = {
    1 => '直降',
    2 => '赠品',
    3 => '返券',
    4 => '送积分'
  }.freeze

  # Number of retries attempted after the first failed request / price read.
  MAX_RETRIES = 3

  # The underlying Nestful::Request that is reused for every page fetch.
  attr_reader :request

  # url - initial request URL; it is overwritten by #search and #brand.
  def initialize(url = '')
    @items = {}
    @request = Nestful::Request.new(url)
  end

  def url=(value)
    request.url = value
  end

  def params=(value)
    request.params = value
  end

  # Runs a keyword search and collects every result page.
  # Returns the accumulated items hash, or nil when the first page could not
  # be fetched.
  def search(keyword)
    self.url = search_url
    self.params = { keyword: keyword, qrst: 'UNEXPAND', enc: 'utf-8' }
    dom = get_dom
    parse_item(dom) if dom
  end

  # Lists every product of a brand and collects every result page.
  # Returns the accumulated items hash, or nil when the first page could not
  # be fetched.
  def brand(brand_id)
    self.url = brand_url
    self.params = { 'BrandId' => brand_id }
    dom = get_dom
    parse_item(dom) if dom
  end

  # Writes the collected items to "<today>.csv" in the current directory,
  # encoded as GB18030. Does nothing (returns nil) when no items were collected.
  def export
    return if @items.empty?

    header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数', '好评率']
    CSV.open("#{Date.today}.csv", "wb:GB18030", col_sep: ',') do |csv|
      csv << header_row
      @items.each do |sku_id, item|
        rates = item[:rates] || {}
        csv << [
          # Spreadsheet formula so the SKU cell links to the product page.
          "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
          item[:title],
          item[:pic_url],
          Array(item[:proms]).join(' '),
          item[:price],
          rates[:count],
          rates[:rank]
        ]
      end
    end
  end

  # Fetches the current request and returns it as a parsed Nokogiri document.
  #
  # try_count - how many retries have already happened (callers omit it).
  #
  # Returns nil when no URL is set, when the response has no body, or when the
  # server keeps answering with Nestful::ForbiddenAccess after MAX_RETRIES
  # retries.
  def get_dom(try_count = 0)
    if request.url.to_s.empty?
      puts "亲,干嘛?"
      return nil
    end

    debug
    response = request.connection.get(request.query_path)
    body = response.body
    return nil if body.nil?

    # The site serves GBK; replace undecodable bytes instead of aborting the
    # whole crawl on a single bad character.
    html = body.dup.force_encoding("GBK").encode("UTF-8", invalid: :replace, undef: :replace)
    parse_html(html)
  rescue Nestful::ForbiddenAccess
    if try_count < MAX_RETRIES
      puts "========================开始重试========================"
      get_dom(try_count + 1)
    else
      puts "========================很扯,三次都没搞定========================"
      nil
    end
  end

  private

  # Prints the URL and params of the request about to be sent.
  def debug
    puts request.url
    puts request.params
  end

  # Points the shared request at the given result page.
  def next_page(page)
    self.params = request.params.merge(page: page)
  end

  # Collects the items of the first page (dom) and of every following page.
  # Always returns the accumulated @items hash.
  def parse_item(dom)
    total = parse_total(dom)
    # A result set with exactly one product is still worth collecting.
    return @items unless total > 0

    first_page = items_dom(dom)
    return @items if first_page.empty?

    # The size of the first page is used as the page size for the pager.
    pages = pages_count(total, first_page.count)
    set_items(first_page)

    2.upto(pages) do |page|
      next_page(page)
      puts "第#{page}/#{pages}页"
      page_dom = get_dom
      set_items(items_dom(page_dom)) if page_dom
    end
    @items
  end

  # Extracts every product from a list of <li> nodes, attaches promotions and
  # merges the result into @items.
  def set_items(dom)
    return if dom.nil? || dom.empty?

    items = {}
    dom.each do |node|
      sku_id, attrs = build_item(node)
      items[sku_id] = attrs if sku_id
    end
    return if items.empty?

    @items.merge!(set_proms(items))
  end

  # Builds [integer_sku_id, attributes] for one product node, or nil when the
  # node has no name link or no SKU can be determined.
  def build_item(node)
    name_box = node.at('div.p-name')
    link_dom = name_box && name_box.at('a')
    return nil unless link_dom

    # Prefer the sku attribute; otherwise fall back to the id in the link.
    sku_id = node[:sku] || link_dom[:href].to_s[%r{product/(.*)\.html$}, 1]
    return nil if sku_id.to_s.empty?

    img_box = node.at('div.p-img')
    img_dom = img_box && img_box.at('img')
    pic_url = img_dom && (img_dom['data-lazyload'] || img_dom[:src])

    extra = node.at('div.extra')
    count_dom = extra && extra.at('a')
    rank_dom = extra && extra.at('span.reputation')
    count_text = count_dom ? count_dom.text : ''
    rank_text = rank_dom ? rank_dom.text : ''

    attrs = {
      pic_url: pic_url.to_s.strip,
      title: link_dom.text.strip,
      price: parse_price(sku_id),
      rates: {
        # Drop the Han characters around the review count.
        count: count_text.gsub(/\p{Han}/, ''),
        # Keep only the first run of digits; fall back to the raw text.
        rank: rank_text[/\d+/] || rank_text
      }
    }
    [sku_id.to_i, attrs]
  end

  # Adds a :proms entry to every item in node that has promotion flags.
  # Returns node.
  def set_proms(node)
    get_proms(node.keys).each do |item_id, prom|
      node[item_id][:proms] = prom if node.has_key?(item_id)
    end
    node
  end

  def parse_html(html)
    Nokogiri::HTML(html)
  end

  # Translates promotion flag codes into labels. Unknown codes are skipped so
  # they do not leave blank entries in the exported cell.
  def parse_proms(proms)
    Array(proms).map { |flag| PROMOTION_LABELS[flag.to_i] }.compact
  end

  # Returns the product <li> nodes of a listing page, or an empty array when
  # the page has no product list.
  def items_dom(dom)
    list = dom.at('div#plist')
    rows = list && list.at('ul.list-h')
    rows ? rows.css('li') : []
  end

  # Reads the total number of results from the filter bar; 0 when the page
  # does not contain one.
  def parse_total(dom)
    filter_dom = dom.at('div#filter')
    return 0 unless filter_dom

    total_dom = filter_dom.at('div.total')
    strong =
      if total_dom
        total_dom.at('strong')
      else
        extra = filter_dom.at('ul.extra')
        first = extra && extra.at('li')
        first && first.at('strong')
      end
    strong ? strong.text.to_i : 0
  end

  # Number of pages needed for total items at size items per page.
  # Returns 0 when either value is not positive.
  def pages_count(total, size = 24)
    total = total.to_i
    size = size.to_i
    return 0 if total <= 0 || size <= 0

    (total + size - 1) / size
  end

  # Downloads the price image of a SKU and reads the price with OCR.
  #
  # try_count - how many retries have already happened; also the number of
  #             seconds slept before this attempt.
  #
  # Returns the price as a Float, or nil when no price above 1 could be read
  # or MiniMagick kept failing after MAX_RETRIES retries.
  def parse_price(sku_id, try_count = 0)
    sleep try_count
    img = MiniMagick::Image.open(price_url(sku_id))
    begin
      img.colorspace("GRAY") # grayscale
      img.monochrome         # black and white
      str = RTesseract.new(img.path).to_s.strip # OCR
    ensure
      # Remove the temporary image even when processing or OCR fails.
      File.unlink(img.path) if img.path && File.exist?(img.path)
    end

    matched = str[/\d+\.\d+/]
    price = matched && matched.to_f
    puts "#{sku_id}:#{str}:#{price}"
    price if price && price > 1
  rescue MiniMagick::Error
    if try_count < MAX_RETRIES
      puts "========================开始重试:#{sku_id}========================"
      parse_price(sku_id, try_count + 1)
    else
      puts "========================很扯,三次都没搞定:#{price_url(sku_id)}========================"
      nil
    end
  end

  def price_url(sku_id)
    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-3.png"
  end

  def brand_url
    'http://www.360buy.com/brandlist.aspx'
  end

  def search_url
    'http://search.360buy.com/search'
  end

  def item_url(sku_id)
    "http://www.360buy.com/product/#{sku_id}.html"
  end

  def store_url(store_id, page = 1)
    "http://mall.360buy.com/shopWare-#{store_id}----#{page}.html"
  end

  # Fetches promotion flags for the given SKU ids.
  #
  # Returns a hash of integer SKU id => array of promotion labels; empty when
  # no ids are given or the response carries no usable payload.
  def get_proms(sku_ids)
    return {} if sku_ids.nil? || sku_ids.empty?

    prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
    body = Nestful.get prom_url, params: { pid: sku_ids.join(',') }
    body = body.to_s.dup.force_encoding("GBK").encode("UTF-8", invalid: :replace, undef: :replace)

    # The payload is wrapped in a callback: take what is between the parens.
    json = body[/\((.*)\)/m, 1]
    return {} if json.nil? || json.empty?

    data = ActiveSupport::JSON.decode(json)
    proms = data.is_a?(Hash) ? data['data'] : nil
    return {} unless proms.is_a?(Array)

    proms.each_with_object({}) do |prom, item_proms|
      # Items are keyed by integer SKU id, so normalise Pid the same way.
      item_proms[prom['Pid'].to_i] = parse_proms(prom['PF'])
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment