mimosz · October 14, 2022 04:28
diff --git a/.gitignore b/.gitignore
 .DS_Store
diff --git a/.rvmrc b/.rvmrc
 rvm use jruby
diff --git a/360buy.rb b/360buy.rb
 # -*- encoding: utf-8 -*-
 require 'mini_magick'
 require 'rtesseract'
 require 'crawler'

 # Crawler for 360buy (JD) search results. Walks the result pages for a brand
 # keyword, writes one CSV per page and merges them once the last page is done.
 class Buy360 < Crawler
  # band_name - search keyword; also used as the top-level output directory.
  # page      - page number to start from.
  # pages     - total number of pages; a value below 1 means "work it out
  #             from the first fetched page".
  def initialize(band_name, page=1, pages=0)
    @band_name = band_name
    @max_attempts = 3
    @pages = pages
    @page  = page

    @path = "#{@band_name}/#{Date.today}/360buy"
    mkdirs(@path)

    @request = Nestful::Request.new('http://search.360buy.com/search')
  end

  # Merges the per-page CSV files under @path into one report.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, determines the page total on the
  # first run, then either continues with the next page or finishes.
  def process
    self.params = { keyword: @band_name, qrst: 'UNEXPAND', enc: 'utf-8', page: @page }
    page_dom    = get_page_dom('GBK')
    items_dom   = get_items_dom(page_dom)

    page_count  = export_items(items_dom)

    if @pages < 1
      total  = parse_total(page_dom)
      @pages = pages_count(total, page_count)
    end

    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the next page and processes it, or finishes after the last one.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行：#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            Array(item[:proms]).join(' '),
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Extracts SKU, title, picture, price and rating count from every result
  # node, attaches promotion labels and exports the page.
  # Nodes without a product link or a recognisable SKU are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      link_dom = item.at('div.p-name>a')
      next unless link_dom

      sku_id = item[:sku] || link_dom[:href].to_s[/product\/(.*)\.html$/, 1]
      next if sku_id.nil?

      img_dom = item.at('div.p-img>a>img')
      pic_url = img_dom ? (img_dom['data-lazyload'] || img_dom[:src]) : nil

      extra_dom   = item.at('div.extra')
      rates_dom   = extra_dom && extra_dom.at('a')
      # Drop the Chinese label text, keeping only the number.
      rates_count = rates_dom ? rates_dom.text.gsub(/\p{Han}/, '') : ''

      items[sku_id.to_i] = { pic_url: pic_url.to_s.strip, title: link_dom.text.strip, price: parse_price(sku_id), rates: { count: rates_count } }
    end
    items = set_proms(items) unless items.empty?
    export(items)
  end

  # Adds a :proms entry to every item for which the promotion service
  # returned flags. Returns the (mutated) node hash.
  def set_proms(node)
    get_proms(node.keys).each do |item_id, prom|
      node[item_id][:proms] = prom if node.has_key?(item_id)
    end
    node
  end

  # Translates promotion flag codes into labels; unknown codes are dropped
  # instead of leaving nil entries in the result.
  def parse_proms(proms)
    labels = { 1 => '直降', 2 => '赠品', 3 => '返券', 4 => '送积分' }
    Array(proms).map { |flag| labels[flag.to_i] }.compact
  end

  # Returns the product <li> nodes of a result page ([] when there are none
  # or when no page could be fetched).
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#plist>ul.list-h')
    list ? list.css('li') : []
  end

  # Reads the total result count from the filter bar; 0 when it is missing.
  def parse_total(page_dom)
    filter_dom = page_dom && page_dom.at('div#filter')
    return 0 if filter_dom.nil?

    total_dom = filter_dom.at('div.total')
    if total_dom
      strong = total_dom.at('strong')
    else
      extra_dom = filter_dom.at('ul.extra')
      li_dom    = extra_dom && extra_dom.at('li')
      strong    = li_dom && li_dom.at('strong')
    end
    strong ? strong.text.to_i : 0
  end

  # Downloads the price image of a SKU and reads the price via OCR.
  # The download is retried up to @max_attempts times; 0 is returned when it
  # keeps failing. Falls back to try_parse_price when OCR finds no price.
  def parse_price(sku_id, type=3)
    attempts = 0
    img_url  = price_url(sku_id, type)
    begin
      img = MiniMagick::Image.open(img_url)
      img.colorspace("GRAY") # convert to greyscale
      img.monochrome         # reduce to black and white
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts += 1
      puts "错误: #{error}"
      puts img_url
      retry if attempts < @max_attempts
      return 0
    end

    price = ocr_price(img)
    price ? price : try_parse_price(sku_id)
  end

  # Second OCR attempt on the alternative (enlarged) price image.
  # Returns 0.0 when no price can be read or the image cannot be fetched.
  def try_parse_price(sku_id, type=2)
    img = MiniMagick::Image.open(price_url(sku_id, type))
    img.resize '200x100'   # enlarge
    ocr_price(img).to_f
  rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
    puts "错误: #{error}"
    0.0
  end

  # Runs OCR on the image and always removes its temporary file.
  # Returns the first decimal number found as a Float, or nil.
  def ocr_price(img)
    str   = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip
    price = str.match(/\d+\.\d+/)
    price ? price[0].to_f : nil
  ensure
    File.unlink(img.path) if File.exist?(img.path)
  end

  def price_url(sku_id, type=3)
    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-#{type}.png"
  end

  def item_url(sku_id)
    "http://www.360buy.com/product/#{sku_id}.html"
  end

  # Fetches promotion flags for the given SKUs.
  # Returns { sku_id (Integer) => [label, ...] }; {} on server errors or when
  # the service keeps answering with something other than the "({...})" payload.
  def get_proms(sku_ids, try_count=0)
    item_proms = {}

    begin
      prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
      html     = Nestful.get prom_url, params: { pid: sku_ids.join(',') }
    rescue Nestful::ServerError
      return item_proms
    end
    html = html.force_encoding("GBK").encode("UTF-8", invalid: :replace, undef: :replace).strip

    # On a site error the service redirects to the home page, so anything
    # that is not wrapped in "({" ... "})" is retried a few times.
    unless html.start_with?('({') && html.end_with?('})')
      return try_count < @max_attempts ? get_proms(sku_ids, try_count + 1) : item_proms
    end

    json  = ActiveSupport::JSON.decode(html[1..-2]) # strip the outer parentheses
    proms = json['data'] || []
    proms.each do |prom|
      # Keys are normalised to Integer to match the item hash built in export_items.
      item_proms[prom['Pid'].to_i] = parse_proms(prom['PF'])
    end
    item_proms
  end
 end
diff --git a/amazon.rb b/amazon.rb
 # -*- encoding: utf-8 -*-
 require 'crawler'

 # Crawler for amazon.cn search results. Writes one CSV per result page and
 # merges them once the last page has been processed.
 class Amazon < Crawler
  # band_name - search keyword; also used as the top-level output directory.
  # page      - page number to start from.
  # pages     - total number of pages; a value below 1 means "detect it".
  def initialize(band_name, page=1, pages=0)
    super() # sets the shared retry budget (@max_attempts)
    @band_name = band_name

    @pages = pages
    @page  = page

    @path = "#{@band_name}/#{Date.today}/amazon"
    mkdirs(@path)

    @request = Nestful::Request.new('http://www.amazon.cn/s')
    @request.headers = { 'Host' => 'www.amazon.cn' }
  end

  # Merges the per-page CSV files under @path into one report.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, then continues or finishes.
  # A page that cannot be fetched or lacks the expected containers is
  # treated as a page without results.
  def process
    self.params = { ie: 'UTF8', keywords: @band_name, page: @page, rh: "i:aps,k:#{@band_name}" }
    page_dom   = get_page_dom('UTF-8')
    page_dom   = page_dom.at('div#main>div#searchTemplate') if page_dom

    extra_dom  = nil
    if page_dom
      extra_dom = page_dom.at('div#centerBelow>div#btfResults')
      page_dom  = page_dom.at('div#center')
    end

    export_items(get_items_dom(page_dom, extra_dom))

    @pages = parse_total(page_dom) if @pages < 1

    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the next page and processes it, or finishes after the last one.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行：#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Extracts SKU, title, picture, prices and rating count from every product
  # node and exports the page. Nodes without a SKU or title are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |node|
      sku_id    = node[:name]
      data_dom  = node.at('div.productData')
      title_dom = data_dom && data_dom.at('div.productTitle')
      next unless sku_id && title_dom

      img_dom = node.at('div.productImage>a>img')
      pic_url = img_dom ? img_dom[:src].to_s : ''

      price     = nil
      tag_price = nil
      price_dom = data_dom.at('div.newPrice')
      if price_dom
        price_node = price_dom.at('span')
        price      = parse_price(price_node.text) if price_node

        tag_node   = price_dom.at('strike')
        tag_price  = parse_price(tag_node.text) if tag_node
      end

      rates_dom   = data_dom.at('div.starsAndPrime')
      rates_link  = rates_dom && rates_dom.css('a').last
      rates_count = rates_link ? rates_link.text.gsub(',', '').to_i : 0

      items[sku_id] = { pic_url: pic_url.strip, title: title_dom.text.strip, price: price, rates: { count: rates_count }, tag_price: tag_price }
    end
    export(items)
  end

  # Collects the product nodes from the main result list and, when given,
  # from the extra list below it. Always returns an Array.
  def get_items_dom(page_dom, extra_dom=nil)
    items_dom = []
    main_dom  = page_dom && page_dom.at('div#atfResults')
    items_dom += main_dom.css('div.product').to_a if main_dom
    items_dom += extra_dom.css('div.product').to_a if extra_dom
    items_dom
  end

  # Reads the number between "-" and "条，" in the result-count banner and
  # returns it as the page total (0 when the banner is missing or unparsable).
  # NOTE(review): that number looks like the upper bound of the displayed
  # range rather than a page count — confirm against a live page.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('div#resultCount')
    return 0 unless total_dom
    total_dom.text[/\-(.*)条，/, 1].to_i
  end

  # Converts a price label such as "￥ 1,234.50" to a Float. Works with or
  # without the space after the currency sign; 0.0 when no number is present.
  def parse_price(str)
    str.to_s.gsub(',', '')[/\d+(?:\.\d+)?/].to_f
  end

  def item_url(sku_id)
    "http://www.amazon.cn/dp/#{sku_id}"
  end
 end
diff --git a/canvas.jpg b/canvas.jpg
diff --git a/crawler.rb b/crawler.rb
 # -*- encoding: utf-8 -*-
 require 'fileutils'
 require 'nokogiri'
 require 'nestful'
 require 'csv'

 # Base class shared by the site crawlers: wraps a Nestful request, fetches and
 # parses result pages, and merges the per-page CSV files into one report.
 class Crawler

  def initialize
    @max_attempts = 3
  end

  # The Nestful::Request that subclasses assign to @request.
  def request
    @request
  end

  def url=(value)
    request.url = value
  end

  def headers=(value)
    request.headers = value
  end

  def params=(value)
    request.params = value
  end

  # Fetches the current request and returns it parsed as a Nokogiri document.
  # Redirects are followed manually (carrying over cookie and referer).
  # Returns nil when the redirect target is a "no_results" page or carries no
  # Location header. Re-raises the redirection error after too many hops
  # instead of looping forever.
  def get_page_dom(charset)
    max_redirects = 10
    redirects     = 0

    begin
      response = request.connection.get(request.query_path)
      # Replace undecodable bytes rather than aborting the whole crawl.
      html     = response.body.force_encoding(charset).encode("UTF-8", invalid: :replace, undef: :replace)
    rescue Nestful::Redirection => error
      location = error.response['Location']
      cookie   = error.response['Set-Cookie']
      return nil if location.nil? || location.include?('no_results')

      redirects += 1
      raise if redirects > max_redirects

      self.headers = { 'Cookie' => cookie, 'Referer' => request.url }
      self.url     = location
      retry
    end

    Nokogiri::HTML(html)
  end

  def mkdirs(path)
    FileUtils.mkdir_p(path)
  end

  # Concatenates "<path>/*.csv" (in numeric page order) into "<path>.csv",
  # keeping the header row of the first file only, then removes the
  # temporary directory. Prints a warning when the number of page files
  # differs from files_count.
  def merge_csv_files(path, files_count)
    # Sort by page number; plain glob order would put 10.csv before 2.csv.
    csv_files = Dir["#{path}/*.csv"].sort_by { |file| File.basename(file, '.csv').to_i }
    puts "文件不够，呵呵~" if csv_files.count != files_count
    CSV.open("#{path}.csv", "w:binary", col_sep: ',') do |csv|
      has_header = false
      csv_files.each do |csv_file|
        # CSV.open with an explicit mode works on old and current csv versions.
        data = CSV.open(csv_file, 'r:binary', headers: true, col_sep: ',') { |file| file.read }
        unless has_header
          csv << data.headers
          has_header = true
        end
        data.each do |line|
          csv << line
        end
      end
    end
    FileUtils.rm_r(path) # remove the temporary directory
  end

  # Number of pages needed to show total items at size items per page.
  # Returns 0 when either value is nil, zero or negative.
  def pages_count(total, size)
    total = total.to_i
    size  = size.to_i
    return 0 if total <= 0 || size <= 0
    (total.to_f / size).ceil
  end
 end
diff --git a/jumei.rb b/jumei.rb
 # -*- encoding: utf-8 -*-
 require 'crawler'

 # Crawler for jumei.com search results. Writes one CSV per result page and
 # merges them once the last page has been processed.
 class Jumei < Crawler
  # band_name - search keyword; also used as the top-level output directory.
  # page      - page number to start from.
  # pages     - total number of pages; a value below 1 means "detect it".
  def initialize(band_name, page=1, pages=0)
    super() # sets the shared retry budget (@max_attempts)
    @band_name = band_name

    @pages = pages
    @page  = page

    @path = "#{@band_name}/#{Date.today}/jumei"
    mkdirs(@path)

    @request = Nestful::Request.new('http://search.jumei.com')
  end

  # Merges the per-page CSV files under @path into one report.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Fetches and exports the current page, then continues or finishes.
  # Prints a "no results" message when the page cannot be fetched or does
  # not contain the result wrapper.
  def process
    self.params = { filter: "0-0-0-0-31-#{@page}", search: @band_name }
    page_dom    = get_page_dom('UTF-8')
    page_dom    = page_dom.at('div#search_result_wrap') if page_dom
    if page_dom
      items_dom = get_items_dom(page_dom)

      export_items(items_dom)

      if @pages < 1
        total  = parse_total(page_dom)
        @pages = pages_count(total, 40) # 40 is the page size hard-coded here
      end

      if @pages > 1
        next_page
      else
        finishing
      end
    else
      puts "聚美优品中搜索 #{@band_name}，无结果"
    end
  end

  # Advances to the next page and processes it, or finishes after the last one.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行：#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '价格', '购买数', '折扣']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:price],
            item[:rates][:count],
            Array(item[:proms]).join(' '),
          ]
        end
      end
    end
    items.count
  end

  # Extracts id, title, picture, price, purchase count and promotion labels
  # from every result node and exports the page. Nodes without a "pid"
  # attribute or a title link are skipped (previously a missing pid was
  # silently stored under key 0).
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      pid      = item['pid']
      name_dom = item.at('div>div.num_warp_list_name')
      link_dom = name_dom && name_dom.at('a')
      next if pid.nil? || link_dom.nil?

      proms     = []
      proms_dom = name_dom.at('span')
      proms << proms_dom.text.gsub('/', '') if proms_dom

      pic_dom = item.at('div>div.num_warp_list_pic_top')
      img_dom = pic_dom && pic_dom.at('img')
      pic_url = img_dom ? img_dom[:src].to_s : ''

      # The countdown block uses the same class as the purchase-count block,
      # so it is removed before the count is looked up.
      countdown_dom = item.at('div>div.num_warp_list_warp_word.time_countdown')
      countdown_dom.remove if countdown_dom
      word_dom    = item.at('div>div.num_warp_list_warp_word')
      count_dom   = word_dom && word_dom.css('span').last
      rates_count = count_dom ? count_dom.text.gsub(/\p{Han}/, '').to_i : 0

      price     = 0.0
      price_dom = item.at('div>div.num_warp_list_view_bg') || item.at('div>div.num_warp_list_name_mall')
      if price_dom
        price_span = price_dom.css('span').last
        price      = price_span.text.gsub('¥', '').to_f if price_span

        # Text in parentheses next to the price is kept as a promotion label.
        prom_match = price_dom.text.match(/\((.*)\)/)
        proms << prom_match[1] if prom_match
      end

      items[pid.to_i] = { pic_url: pic_url.strip, title: link_dom.text.strip, price: price, rates: { count: rates_count }, proms: proms }
    end
    export(items)
  end

  # Returns the product <li> nodes of a result page ([] when there are none).
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#search_list_wrap>div.products>ul')
    list ? list.css('li.item') : []
  end

  # Reads the total result count from the second red label of the search
  # info bar; 0 when the bar is missing or has a different shape.
  def parse_total(page_dom)
    content_dom = page_dom && page_dom.at('div.search_info>div>div.content')
    return 0 unless content_dom

    total_dom = content_dom.css('label.red')
    total_dom.count == 2 ? total_dom[1].text.to_i : 0
  end

  def item_url(sku_id)
    "http://mall.jumei.com/product_#{sku_id}.html"
  end
 end
diff --git a/lefeng.rb b/lefeng.rb
 # -*- encoding: utf-8 -*-
 require 'mini_magick'
 require 'rtesseract'
 require 'crawler'

 # Crawler for lefeng.com search results. Prices are published as images, so
 # they are composited onto a blank canvas and read via OCR.
 class Lefeng < Crawler
  # band_name - search keyword; also used as the top-level output directory.
  # page      - page number to start from.
  # pages     - total number of pages; a value below 1 means "detect it".
  def initialize(band_name, page=1, pages=0)
    # Sets @max_attempts, which parse_price compares against; without it the
    # first download error raised a comparison-with-nil error.
    super()
    @band_name = band_name

    @pages = pages
    @page  = page

    @path = "#{@band_name}/#{Date.today}/lefeng"
    mkdirs(@path)

    @canvas  = MiniMagick::Image.open('./assets/canvas.jpg')

    @request = Nestful::Request.new('http://search.lefeng.com/search/search')
  end

  # Merges the per-page CSV files and removes the canvas working copy.
  def finishing
    merge_csv_files(@path, @pages)
    FileUtils.rm(@canvas.path)  # remove the temporary file
  end

  # Fetches and exports the current page, determines the page total on the
  # first run, then either continues with the next page or finishes.
  def process
    self.params = { key: @band_name, pageNo: @page }
    page_dom    = get_page_dom('UTF-8')
    items_dom   = get_items_dom(page_dom)

    page_count  = export_items(items_dom)

    if @pages < 1
      total  = parse_total(page_dom)
      @pages = pages_count(total, page_count)
    end

    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the next page and processes it, or finishes after the last one.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行：#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
            item[:title],
            item[:pic_url],
            item[:proms],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Extracts SKU, title, picture, promotion text, price and rating count from
  # every product node and exports the page. Nodes without a product link or
  # a recognisable SKU in its href are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      link_dom = item.at('dt>a')
      next unless link_dom

      sku_id = link_dom[:href].to_s[/product\/(.*)\.html$/, 1]
      next unless sku_id

      img_dom = link_dom.at('img')
      pic_url = img_dom ? img_dom['src2'].to_s : ''
      name    = link_dom[:title].to_s

      proms   = item.at('dd.nam>a>i')
      proms   = proms.text if proms

      mess_dom    = item.at('dd.mess>a')
      rates_count = mess_dom ? mess_dom.text[/\d+/] : nil

      price_img = item.at('dd.pri>img')
      price     = price_img ? parse_price(price_img[:src]) : nil

      items[sku_id.to_i] = { pic_url: pic_url.strip, title: name.strip, price: price, rates: { count: rates_count }, proms: proms }
    end
    export(items)
  end

  # Returns the product nodes of a result page ([] when there are none or
  # when no page could be fetched).
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div.list>div.smPruArea>div.makeup')
    list ? list.css('div.makeupdl') : []
  end

  # Reads the total result count; 0 when the counter is missing.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('b#searchernum')
    total_dom ? total_dom.text.to_i : 0
  end

  # Downloads the price image, enlarges it, centres it on the canvas and
  # reads the price via OCR. The download is retried up to @max_attempts
  # times. Returns the price as a Float, or the image URL itself when the
  # image cannot be fetched or no price can be recognised.
  def parse_price(price_url)
    attempts  = 0
    img_price = nil
    img       = nil

    begin
      img_price = MiniMagick::Image.open(price_url)
      img_price.resize '125%' # enlarge

      img       = @canvas.composite(img_price) do |c|
        c.gravity 'center'
      end
      img.colorspace("GRAY") # convert to greyscale
    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
      attempts += 1
      puts "错误: #{error}"
      retry if attempts < @max_attempts
      return price_url
    end

    str = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip # OCR
    FileUtils.rm_f([img_price.path, img.path])  # remove the temporary files
    price = str.match(/\d+\.\d+/)

    price ? price[0].to_f : price_url
  end

  def item_url(sku_id)
    "http://product.lefeng.com/product/#{sku_id}.html"
  end
 end
diff --git a/Rakefile b/Rakefile
 # -*- encoding: utf-8 -*-
 $:.unshift File.expand_path('./lib')

 namespace :cai do
  # Guard task: every crawl task needs the band_name environment variable.
  # Prints the usage and exits with a non-zero status when it is missing or blank.
  task :required do
    if ENV['band_name'].to_s.strip.empty?
      puts '缺少参数，请参照以下命令：'
      puts 'rake cai band_name=品牌名称'
      puts 'rake cai:amazon band_name=品牌名称'
      puts 'rake cai:buy360 band_name=品牌名称'
      puts 'rake cai:lefeng band_name=品牌名称'
      puts 'rake cai:jumei band_name=品牌名称'
      puts 'rake cai:yihaodian band_name=品牌名称'
      exit 1
    end
  end

  # One row per site: task name, file to require, crawler class, description.
  crawlers = [
    [:buy360,    '360buy',    'Buy360',    "采集 京东 数据"],
    [:lefeng,    'lefeng',    'Lefeng',    "采集 乐峰 数据"],
    [:jumei,     'jumei',     'Jumei',     "采集 聚美 数据"],
    [:yihaodian, 'yihaodian', 'Yihaodian', "采集 一号店 数据"],
    [:amazon,    'amazon',    'Amazon',    "采集 亚马逊 数据"],
  ]

  # Each task accepts optional [page, pages] arguments (defaults 1 and 0),
  # loads its crawler lazily and runs it for ENV['band_name'].
  crawlers.each do |task_name, lib, class_name, description|
    desc description
    task task_name, [:page, :pages] => :required do |t, args|
      args.with_defaults(page: 1, pages: 0)
      require lib

      crawler = Object.const_get(class_name).new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
      crawler.process
    end
  end
 end

 # Aggregate task: runs every site crawler, then prints the supported sites.
 desc "采集 全平台 数据"
 task cai: %w[cai:jumei cai:lefeng cai:yihaodian cai:amazon cai:buy360] do
  puts "全平台目前仅支持：京东、乐峰、聚美、亚马逊、一号店。"
 end
diff --git a/yihaodian.rb b/yihaodian.rb
 # -*- encoding: utf-8 -*-
 require 'crawler'

 # Crawler for yihaodian.com search results. The search endpoint answers with
 # JSON whose "value" field holds the HTML of the result list.
 class Yihaodian < Crawler
  # band_name - search keyword; also used as the top-level output directory.
  # page      - page number to start from.
  # pages     - total number of pages; a value below 1 means "detect it".
  def initialize(band_name, page=1, pages=0)
    super() # sets the shared retry budget (@max_attempts)
    @band_name = band_name

    @pages = pages
    @page  = page

    @path = "#{@band_name}/#{Date.today}/yihaodian"
    mkdirs(@path)

    @request = Nestful::Request.new('')
  end

  # Fetches the current request, decodes the JSON envelope and returns its
  # "value" HTML parsed as a Nokogiri document. A forbidden response is
  # retried up to three times; nil is returned when every attempt fails.
  def get_json_dom(charset, try_count=0)
    response = request.connection.get(request.query_path)
    html     = response.body.force_encoding(charset).encode("UTF-8", invalid: :replace, undef: :replace)
    html     = ActiveSupport::JSON.decode(html)['value']
    Nokogiri::HTML(html.to_s)
  rescue Nestful::ForbiddenAccess
    if try_count < 3 # retry three times
      puts "========================开始重试========================"
      # Retry this method; the previous code called get_page_dom with two
      # arguments, which raised an ArgumentError instead of retrying.
      get_json_dom(charset, try_count + 1)
    else
      puts "========================很扯，三次都没搞定========================"
      nil
    end
  end

  # Merges the per-page CSV files under @path into one report.
  def finishing
    merge_csv_files(@path, @pages)
  end

  # Search URL for the current page. The keyword is percent-encoded as a
  # path segment (URI::encode no longer exists in current Ruby versions).
  def search_url
    keyword = URI.encode_www_form_component(@band_name).gsub('+', '%20')
    "http://search.yihaodian.com/searchPage/c0-0/b/a-s1-v0-p#{@page}-price-d0-f0-m1-rt0-pid-k#{keyword}"
  end

  # Fetches and exports the current page, then continues or finishes.
  # A page that could not be fetched is treated as a page without results.
  def process
    self.url   = search_url
    page_dom   = get_json_dom('UTF-8')
    items_dom  = get_items_dom(page_dom)

    export_items(items_dom)

    @pages = parse_total(page_dom) if @pages < 1

    if @pages > 1
      next_page
    else
      finishing
    end
  end

  # Advances to the next page and processes it, or finishes after the last one.
  def next_page
    if @page < @pages
      @page += 1
      puts "开始执行：#{@page}/#{@pages}"
      process
    else
      finishing
    end
  end

  private

  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
  # Returns the number of items.
  def export(items)
    unless items.empty?
      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
        csv << header_row
        items.each do |sku_id, item|
          csv << [
            "=HYPERLINK(\"#{item_url(item[:item_id])}\",\"#{sprintf("%010d", sku_id)}\")",
            item[:title],
            item[:pic_url],
            item[:tag_price],
            item[:price],
            item[:rates][:count],
          ]
        end
      end
    end
    items.count
  end

  # Extracts ids, title, picture, prices and rating count from every product
  # node and exports the page. Nodes without a numeric id, a product link or
  # a buy button (which carries the product code) are skipped.
  def export_items(items_dom)
    items = {}
    items_dom.each do |item|
      product_id = item[:id].to_s[/\d+/]
      next unless product_id
      product_id = product_id.to_i

      link_dom = item.at("a#pdlink1_#{product_id}")
      buy_dom  = item.at("div.buyInfo>button#buyButton_#{product_id}")
      next unless link_dom && buy_dom

      item_id = link_dom['pmid'].to_i
      sku_id  = buy_dom['productcode'].to_i

      img_dom = link_dom.at('img')
      pic_url = img_dom ? (img_dom['original'] || img_dom[:src]).to_s : ''
      name    = img_dom ? img_dom[:title].to_s : ''

      price       = 0.0
      tag_price   = nil
      rates_count = 0
      price_dom   = item.at('p.price')
      if price_dom
        rates_dom   = price_dom.at('a')
        rates_count = rates_dom.text[/\d+/].to_i if rates_dom

        strong_dom  = price_dom.at('strong')
        price       = parse_price(strong_dom.text) if strong_dom

        del_dom     = price_dom.at('del')
        tag_price   = parse_price(del_dom.text) if del_dom
      end

      items[sku_id] = { item_id: item_id, pic_url: pic_url.strip, title: name.strip, price: price, rates: { count: rates_count }, tag_price: tag_price }
    end
    export(items)
  end

  # Returns the product <li> nodes of a result page ([] when there are none
  # or when no page could be fetched).
  def get_items_dom(page_dom)
    list = page_dom && page_dom.at('div#plist>div#search_table>div.itemSearchResult.clearfix>ul.itemSearchList')
    list ? list.css('li') : []
  end

  # Reads the page total from the part after "/" in the pager text; 0 when
  # the pager is missing or has a different shape.
  def parse_total(page_dom)
    total_dom = page_dom && page_dom.at('ul.page.clearfix>li.pageNum')
    return 0 unless total_dom
    total_dom.text[/\/(.*)$/, 1].to_i
  end

  # Converts a price label such as "¥12.50" to a Float.
  def parse_price(str)
    str.to_s.gsub('¥', '').to_f
  end

  def item_url(item_id)
    "http://www.yihaodian.com/item/#{item_id}_1"
  end
 end
	# -*- encoding: utf-8 -*-
	require 'mini_magick'
	require 'rtesseract'
	require 'crawler'

	# Crawler for 360buy (JD) search results. Walks the result pages for a brand
	# keyword, writes one CSV per page and merges them once the last page is done.
	class Buy360 < Crawler
	  # band_name - search keyword; also used as the top-level output directory.
	  # page      - page number to start from.
	  # pages     - total number of pages; a value below 1 means "work it out
	  #             from the first fetched page".
	  def initialize(band_name, page=1, pages=0)
	    @band_name = band_name
	    @max_attempts = 3
	    @pages = pages
	    @page  = page

	    @path = "#{@band_name}/#{Date.today}/360buy"
	    mkdirs(@path)

	    @request = Nestful::Request.new('http://search.360buy.com/search')
	  end

	  # Merges the per-page CSV files under @path into one report.
	  def finishing
	    merge_csv_files(@path, @pages)
	  end

	  # Fetches and exports the current page, determines the page total on the
	  # first run, then either continues with the next page or finishes.
	  def process
	    self.params = { keyword: @band_name, qrst: 'UNEXPAND', enc: 'utf-8', page: @page }
	    page_dom    = get_page_dom('GBK')
	    items_dom   = get_items_dom(page_dom)

	    page_count  = export_items(items_dom)

	    if @pages < 1
	      total  = parse_total(page_dom)
	      @pages = pages_count(total, page_count)
	    end

	    if @pages > 1
	      next_page
	    else
	      finishing
	    end
	  end

	  # Advances to the next page and processes it, or finishes after the last one.
	  def next_page
	    if @page < @pages
	      @page += 1
	      puts "开始执行：#{@page}/#{@pages}"
	      process
	    else
	      finishing
	    end
	  end

	  private

	  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
	  # Returns the number of items.
	  def export(items)
	    unless items.empty?
	      header_row = ['SKU', '名称', '图片', '活动', '价格', '评价数']
	      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
	        csv << header_row
	        items.each do |sku_id, item|
	          csv << [
	            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
	            item[:title],
	            item[:pic_url],
	            Array(item[:proms]).join(' '),
	            item[:price],
	            item[:rates][:count],
	          ]
	        end
	      end
	    end
	    items.count
	  end

	  # Extracts SKU, title, picture, price and rating count from every result
	  # node, attaches promotion labels and exports the page.
	  # Nodes without a product link or a recognisable SKU are skipped.
	  def export_items(items_dom)
	    items = {}
	    items_dom.each do |item|
	      link_dom = item.at('div.p-name>a')
	      next unless link_dom

	      sku_id = item[:sku] || link_dom[:href].to_s[/product\/(.*)\.html$/, 1]
	      next if sku_id.nil?

	      img_dom = item.at('div.p-img>a>img')
	      pic_url = img_dom ? (img_dom['data-lazyload'] || img_dom[:src]) : nil

	      extra_dom   = item.at('div.extra')
	      rates_dom   = extra_dom && extra_dom.at('a')
	      # Drop the Chinese label text, keeping only the number.
	      rates_count = rates_dom ? rates_dom.text.gsub(/\p{Han}/, '') : ''

	      items[sku_id.to_i] = { pic_url: pic_url.to_s.strip, title: link_dom.text.strip, price: parse_price(sku_id), rates: { count: rates_count } }
	    end
	    items = set_proms(items) unless items.empty?
	    export(items)
	  end

	  # Adds a :proms entry to every item for which the promotion service
	  # returned flags. Returns the (mutated) node hash.
	  def set_proms(node)
	    get_proms(node.keys).each do |item_id, prom|
	      node[item_id][:proms] = prom if node.has_key?(item_id)
	    end
	    node
	  end

	  # Translates promotion flag codes into labels; unknown codes are dropped
	  # instead of leaving nil entries in the result.
	  def parse_proms(proms)
	    labels = { 1 => '直降', 2 => '赠品', 3 => '返券', 4 => '送积分' }
	    Array(proms).map { |flag| labels[flag.to_i] }.compact
	  end

	  # Returns the product <li> nodes of a result page ([] when there are none
	  # or when no page could be fetched).
	  def get_items_dom(page_dom)
	    list = page_dom && page_dom.at('div#plist>ul.list-h')
	    list ? list.css('li') : []
	  end

	  # Reads the total result count from the filter bar; 0 when it is missing.
	  def parse_total(page_dom)
	    filter_dom = page_dom && page_dom.at('div#filter')
	    return 0 if filter_dom.nil?

	    total_dom = filter_dom.at('div.total')
	    if total_dom
	      strong = total_dom.at('strong')
	    else
	      extra_dom = filter_dom.at('ul.extra')
	      li_dom    = extra_dom && extra_dom.at('li')
	      strong    = li_dom && li_dom.at('strong')
	    end
	    strong ? strong.text.to_i : 0
	  end

	  # Downloads the price image of a SKU and reads the price via OCR.
	  # The download is retried up to @max_attempts times; 0 is returned when it
	  # keeps failing. Falls back to try_parse_price when OCR finds no price.
	  def parse_price(sku_id, type=3)
	    attempts = 0
	    img_url  = price_url(sku_id, type)
	    begin
	      img = MiniMagick::Image.open(img_url)
	      img.colorspace("GRAY") # convert to greyscale
	      img.monochrome         # reduce to black and white
	    rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
	      attempts += 1
	      puts "错误: #{error}"
	      puts img_url
	      retry if attempts < @max_attempts
	      return 0
	    end

	    price = ocr_price(img)
	    price ? price : try_parse_price(sku_id)
	  end

	  # Second OCR attempt on the alternative (enlarged) price image.
	  # Returns 0.0 when no price can be read or the image cannot be fetched.
	  def try_parse_price(sku_id, type=2)
	    img = MiniMagick::Image.open(price_url(sku_id, type))
	    img.resize '200x100'   # enlarge
	    ocr_price(img).to_f
	  rescue MiniMagick::Error, OpenURI::HTTPError, Timeout::Error => error
	    puts "错误: #{error}"
	    0.0
	  end

	  # Runs OCR on the image and always removes its temporary file.
	  # Returns the first decimal number found as a Float, or nil.
	  def ocr_price(img)
	    str   = RTesseract.new(img.path, processor: 'mini_magick').to_s.strip
	    price = str.match(/\d+\.\d+/)
	    price ? price[0].to_f : nil
	  ensure
	    File.unlink(img.path) if File.exist?(img.path)
	  end

	  def price_url(sku_id, type=3)
	    "http://jprice.360buyimg.com/price/gp#{sku_id}-1-1-#{type}.png"
	  end

	  def item_url(sku_id)
	    "http://www.360buy.com/product/#{sku_id}.html"
	  end

	  # Fetches promotion flags for the given SKUs.
	  # Returns { sku_id (Integer) => [label, ...] }; {} on server errors or when
	  # the service keeps answering with something other than the "({...})" payload.
	  def get_proms(sku_ids, try_count=0)
	    item_proms = {}

	    begin
	      prom_url = 'http://price.360buy.com/PromotionFlag.aspx'
	      html     = Nestful.get prom_url, params: { pid: sku_ids.join(',') }
	    rescue Nestful::ServerError
	      return item_proms
	    end
	    html = html.force_encoding("GBK").encode("UTF-8", invalid: :replace, undef: :replace).strip

	    # On a site error the service redirects to the home page, so anything
	    # that is not wrapped in "({" ... "})" is retried a few times.
	    unless html.start_with?('({') && html.end_with?('})')
	      return try_count < @max_attempts ? get_proms(sku_ids, try_count + 1) : item_proms
	    end

	    json  = ActiveSupport::JSON.decode(html[1..-2]) # strip the outer parentheses
	    proms = json['data'] || []
	    proms.each do |prom|
	      # Keys are normalised to Integer to match the item hash built in export_items.
	      item_proms[prom['Pid'].to_i] = parse_proms(prom['PF'])
	    end
	    item_proms
	  end
	end
	# -*- encoding: utf-8 -*-
	require 'crawler'

	# Crawler for amazon.cn search results. Writes one CSV per result page and
	# merges them once the last page has been processed.
	class Amazon < Crawler
	  # band_name - search keyword; also used as the top-level output directory.
	  # page      - page number to start from.
	  # pages     - total number of pages; a value below 1 means "detect it".
	  def initialize(band_name, page=1, pages=0)
	    super() # sets the shared retry budget (@max_attempts)
	    @band_name = band_name

	    @pages = pages
	    @page  = page

	    @path = "#{@band_name}/#{Date.today}/amazon"
	    mkdirs(@path)

	    @request = Nestful::Request.new('http://www.amazon.cn/s')
	    @request.headers = { 'Host' => 'www.amazon.cn' }
	  end

	  # Merges the per-page CSV files under @path into one report.
	  def finishing
	    merge_csv_files(@path, @pages)
	  end

	  # Fetches and exports the current page, then continues or finishes.
	  # A page that cannot be fetched or lacks the expected containers is
	  # treated as a page without results.
	  def process
	    self.params = { ie: 'UTF8', keywords: @band_name, page: @page, rh: "i:aps,k:#{@band_name}" }
	    page_dom   = get_page_dom('UTF-8')
	    page_dom   = page_dom.at('div#main>div#searchTemplate') if page_dom

	    extra_dom  = nil
	    if page_dom
	      extra_dom = page_dom.at('div#centerBelow>div#btfResults')
	      page_dom  = page_dom.at('div#center')
	    end

	    export_items(get_items_dom(page_dom, extra_dom))

	    @pages = parse_total(page_dom) if @pages < 1

	    if @pages > 1
	      next_page
	    else
	      finishing
	    end
	  end

	  # Advances to the next page and processes it, or finishes after the last one.
	  def next_page
	    if @page < @pages
	      @page += 1
	      puts "开始执行：#{@page}/#{@pages}"
	      process
	    else
	      finishing
	    end
	  end

	  private

	  # Writes the items of the current page to "<path>/<page>.csv" (GB18030).
	  # Returns the number of items.
	  def export(items)
	    unless items.empty?
	      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
	      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
	        csv << header_row
	        items.each do |sku_id, item|
	          csv << [
	            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
	            item[:title],
	            item[:pic_url],
	            item[:tag_price],
	            item[:price],
	            item[:rates][:count],
	          ]
	        end
	      end
	    end
	    items.count
	  end

	  # Extracts SKU, title, picture, prices and rating count from every product
	  # node and exports the page. Nodes without a SKU or title are skipped.
	  def export_items(items_dom)
	    items = {}
	    items_dom.each do |node|
	      sku_id    = node[:name]
	      data_dom  = node.at('div.productData')
	      title_dom = data_dom && data_dom.at('div.productTitle')
	      next unless sku_id && title_dom

	      img_dom = node.at('div.productImage>a>img')
	      pic_url = img_dom ? img_dom[:src].to_s : ''

	      price     = nil
	      tag_price = nil
	      price_dom = data_dom.at('div.newPrice')
	      if price_dom
	        price_node = price_dom.at('span')
	        price      = parse_price(price_node.text) if price_node

	        tag_node   = price_dom.at('strike')
	        tag_price  = parse_price(tag_node.text) if tag_node
	      end

	      rates_dom   = data_dom.at('div.starsAndPrime')
	      rates_link  = rates_dom && rates_dom.css('a').last
	      rates_count = rates_link ? rates_link.text.gsub(',', '').to_i : 0

	      items[sku_id] = { pic_url: pic_url.strip, title: title_dom.text.strip, price: price, rates: { count: rates_count }, tag_price: tag_price }
	    end
	    export(items)
	  end

	  # Collects the product nodes from the main result list and, when given,
	  # from the extra list below it. Always returns an Array.
	  def get_items_dom(page_dom, extra_dom=nil)
	    items_dom = []
	    main_dom  = page_dom && page_dom.at('div#atfResults')
	    items_dom += main_dom.css('div.product').to_a if main_dom
	    items_dom += extra_dom.css('div.product').to_a if extra_dom
	    items_dom
	  end

	  # Reads the number between "-" and "条，" in the result-count banner and
	  # returns it as the page total (0 when the banner is missing or unparsable).
	  # NOTE(review): that number looks like the upper bound of the displayed
	  # range rather than a page count — confirm against a live page.
	  def parse_total(page_dom)
	    total_dom = page_dom && page_dom.at('div#resultCount')
	    return 0 unless total_dom
	    total_dom.text[/\-(.*)条，/, 1].to_i
	  end

	  # Converts a price label such as "￥ 1,234.50" to a Float. Works with or
	  # without the space after the currency sign; 0.0 when no number is present.
	  def parse_price(str)
	    str.to_s.gsub(',', '')[/\d+(?:\.\d+)?/].to_f
	  end

	  def item_url(sku_id)
	    "http://www.amazon.cn/dp/#{sku_id}"
	  end
	end
	# -*- encoding: utf-8 -*-
	require 'fileutils'
	require 'nokogiri'
	require 'nestful'
	require 'csv'

	# Base class shared by the site crawlers: wraps a Nestful request, fetches and
	# parses result pages, and merges the per-page CSV files into one report.
	class Crawler

	  def initialize
	    @max_attempts = 3
	  end

	  # The Nestful::Request that subclasses assign to @request.
	  def request
	    @request
	  end

	  def url=(value)
	    request.url = value
	  end

	  def headers=(value)
	    request.headers = value
	  end

	  def params=(value)
	    request.params = value
	  end

	  # Fetches the current request and returns it parsed as a Nokogiri document.
	  # Redirects are followed manually (carrying over cookie and referer).
	  # Returns nil when the redirect target is a "no_results" page or carries no
	  # Location header. Re-raises the redirection error after too many hops
	  # instead of looping forever.
	  def get_page_dom(charset)
	    max_redirects = 10
	    redirects     = 0

	    begin
	      response = request.connection.get(request.query_path)
	      # Replace undecodable bytes rather than aborting the whole crawl.
	      html     = response.body.force_encoding(charset).encode("UTF-8", invalid: :replace, undef: :replace)
	    rescue Nestful::Redirection => error
	      location = error.response['Location']
	      cookie   = error.response['Set-Cookie']
	      return nil if location.nil? || location.include?('no_results')

	      redirects += 1
	      raise if redirects > max_redirects

	      self.headers = { 'Cookie' => cookie, 'Referer' => request.url }
	      self.url     = location
	      retry
	    end

	    Nokogiri::HTML(html)
	  end

	  def mkdirs(path)
	    FileUtils.mkdir_p(path)
	  end

	  # Concatenates "<path>/*.csv" (in numeric page order) into "<path>.csv",
	  # keeping the header row of the first file only, then removes the
	  # temporary directory. Prints a warning when the number of page files
	  # differs from files_count.
	  def merge_csv_files(path, files_count)
	    # Sort by page number; plain glob order would put 10.csv before 2.csv.
	    csv_files = Dir["#{path}/*.csv"].sort_by { |file| File.basename(file, '.csv').to_i }
	    puts "文件不够，呵呵~" if csv_files.count != files_count
	    CSV.open("#{path}.csv", "w:binary", col_sep: ',') do |csv|
	      has_header = false
	      csv_files.each do |csv_file|
	        # CSV.open with an explicit mode works on old and current csv versions.
	        data = CSV.open(csv_file, 'r:binary', headers: true, col_sep: ',') { |file| file.read }
	        unless has_header
	          csv << data.headers
	          has_header = true
	        end
	        data.each do |line|
	          csv << line
	        end
	      end
	    end
	    FileUtils.rm_r(path) # remove the temporary directory
	  end

	  # Number of pages needed to show total items at size items per page.
	  # Returns 0 when either value is nil, zero or negative.
	  def pages_count(total, size)
	    total = total.to_i
	    size  = size.to_i
	    return 0 if total <= 0 || size <= 0
	    (total.to_f / size).ceil
	  end
	end
	# -*- encoding: utf-8 -*-
	require 'crawler'

	# Crawler for the search results of search.jumei.com.
	#
	# Each result page is exported to "<band_name>/<date>/jumei/<page>.csv";
	# the page files are merged once the last page has been processed.
	class Jumei < Crawler
	  # Per-page size handed to #pages_count when deriving the number of pages.
	  PAGE_SIZE = 40

	  # @param band_name [String] search keyword, also used as output directory
	  # @param page [Integer] page to start with
	  # @param pages [Integer] last page to fetch; values below 1 mean "derive
	  #   it from the total shown on the first fetched page"
	  def initialize(band_name, page=1, pages=0)
	    @band_name = band_name

	    @pages = pages
	    @page  = page

	    @path = "#{@band_name}/#{Date.today}/jumei"
	    mkdirs(@path)

	    @request = Nestful::Request.new('http://search.jumei.com')
	  end

	  # Merges the per-page CSV files into a single file.
	  def finishing
	    merge_csv_files(@path, @pages)
	  end

	  # Fetches and exports the current page, then continues with the next page
	  # or finishes. Prints a "no results" message and stops when the page or
	  # its result wrapper is missing.
	  def process
	    self.params = { filter: "0-0-0-0-31-#{@page}", search: @band_name }
	    page_dom = get_page_dom('UTF-8')
	    # The wrapper can be absent even though a page came back; treat that
	    # like an empty result instead of failing on nil further down.
	    page_dom &&= page_dom.at('div#search_result_wrap')

	    unless page_dom
	      puts "聚美优品中搜索 #{@band_name}，无结果"
	      return
	    end

	    export_items(get_items_dom(page_dom))

	    @pages = pages_count(parse_total(page_dom), PAGE_SIZE) if @pages < 1

	    if @pages > 1
	      next_page
	    else
	      finishing
	    end
	  end

	  # Advances to the following page, or finishes after the last one.
	  def next_page
	    if @page < @pages
	      @page += 1
	      puts "开始执行：#{@page}/#{@pages}"
	      process
	    else
	      finishing
	    end
	  end

	  private

	  # Writes the collected items of the current page to its CSV file.
	  #
	  # @param items [Hash] SKU id => item attributes
	  # @return [Integer] number of items
	  def export(items)
	    unless items.empty?
	      header_row = ['SKU', '名称', '图片', '价格', '购买数', '折扣']
	      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
	        csv << header_row
	        items.each do |sku_id, item|
	          csv << [
	            "=HYPERLINK(\"#{item_url(sku_id)}\",\"#{sku_id}\")",
	            item[:title],
	            item[:pic_url],
	            item[:price],
	            item[:rates][:count],
	            item[:proms].join(' '),
	          ]
	        end
	      end
	    end
	    items.count
	  end

	  # Extracts the attributes of every product node and exports them.
	  #
	  # @param items_dom [Enumerable] product nodes of the current page
	  # @return [Integer] number of exported items
	  def export_items(items_dom)
	    items = {}
	    items_dom.each do |item|
	      # The presence test must look at the raw attribute: to_i never yields
	      # nil, so rows without a pid would all collapse onto SKU 0.
	      pid = item['pid']
	      next unless pid

	      proms = []

	      name_dom  = item.at('div>div.num_warp_list_name')
	      badge_dom = name_dom.at('span')
	      proms << badge_dom.text.gsub('/', '') if badge_dom
	      name    = name_dom.at('a').text
	      pic_url = item.at('div>div.num_warp_list_pic_top').at('img')[:src]

	      # The countdown block also carries the class queried next; remove it
	      # first so it cannot be picked up by that lookup.
	      countdown_dom = item.at('div>div.num_warp_list_warp_word.time_countdown')
	      countdown_dom.remove if countdown_dom
	      rates_count = item.at('div>div.num_warp_list_warp_word').css('span').last.text.gsub(/\p{Han}/, '').to_i

	      price_dom = item.at('div>div.num_warp_list_view_bg') || item.at('div>div.num_warp_list_name_mall')
	      price = price_dom.css('span').last.text.gsub('¥', '').to_f

	      # Text in parentheses inside the price block is kept as a promotion.
	      discount = price_dom.text.match(/\((.*)\)/)
	      proms << discount[1] if discount

	      items[pid.to_i] = { pic_url: pic_url.strip, title: name.strip, price: price, rates: { count: rates_count }, proms: proms }
	    end
	    export(items)
	  end

	  # @param page_dom [#at] result wrapper of the current page
	  # @return the 'li.item' nodes of the product list, or [] without a list
	  def get_items_dom(page_dom)
	    list = page_dom.at('div#search_list_wrap>div.products>ul')
	    list ? list.css('li.item') : []
	  end

	  # Reads the total from the second 'label.red' of the search info block.
	  #
	  # @param page_dom [#at] result wrapper of the current page
	  # @return [Integer] the total; 0 when the info block is missing or does
	  #   not hold exactly two such labels
	  def parse_total(page_dom)
	    content = page_dom.at('div.search_info>div>div.content')
	    return 0 unless content

	    labels = content.css('label.red')
	    labels.count == 2 ? labels[1].text.to_i : 0
	  end

	  # Builds the product page URL for a SKU.
	  def item_url(sku_id)
	    "http://mall.jumei.com/product_#{sku_id}.html"
	  end
	end
	# -*- encoding: utf-8 -*-
	# Put ./lib (resolved against the current directory) first on the load path
	# so the crawler files can be required by bare name.
	$LOAD_PATH.unshift(File.expand_path('./lib'))

	# Crawl tasks, one per shop. Each task takes optional [page, pages]
	# arguments (defaults 1 and 0) and reads the keyword from ENV['band_name'].
	namespace :cai do
	  # Required-parameter check: prints the usage and exits when band_name is
	  # not set. Every crawl task lists it as a prerequisite.
	  task :required do
	    if ENV['band_name'].nil?
	      puts '缺少参数，请参照以下命令：'
	      puts 'rake cai band_name=品牌名称'
	      puts 'rake cai:amazon band_name=品牌名称'
	      puts 'rake cai:buy360 band_name=品牌名称'
	      puts 'rake cai:lefeng band_name=品牌名称'
	      puts 'rake cai:jumei band_name=品牌名称'
	      puts 'rake cai:yihaodian band_name=品牌名称'
	      exit
	    end
	  end

	  desc "采集京东数据"
	  task :buy360, [:page, :pages] => :required do |_t, args|
	    args.with_defaults(page: 1, pages: 0)
	    # Loaded inside the task so only the requested crawler is required.
	    require '360buy'

	    buy360 = Buy360.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
	    buy360.process
	  end

	  desc "采集乐峰数据"
	  task :lefeng, [:page, :pages] => :required do |_t, args|
	    args.with_defaults(page: 1, pages: 0)
	    require 'lefeng'

	    lefeng = Lefeng.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
	    lefeng.process
	  end

	  desc "采集聚美数据"
	  task :jumei, [:page, :pages] => :required do |_t, args|
	    args.with_defaults(page: 1, pages: 0)
	    require 'jumei'

	    jumei = Jumei.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
	    jumei.process
	  end

	  desc "采集一号店数据"
	  task :yihaodian, [:page, :pages] => :required do |_t, args|
	    args.with_defaults(page: 1, pages: 0)
	    require 'yihaodian'

	    yihaodian = Yihaodian.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
	    yihaodian.process
	  end

	  desc "采集亚马逊数据"
	  task :amazon, [:page, :pages] => :required do |_t, args|
	    args.with_defaults(page: 1, pages: 0)
	    require 'amazon'

	    amazon = Amazon.new(ENV['band_name'], args[:page].to_i, args[:pages].to_i)
	    amazon.process
	  end
	end

	# Aggregate task: runs every shop task as a prerequisite, then prints the
	# list of supported shops.
	desc "采集全平台数据"
	task cai: ['cai:jumei', 'cai:lefeng', 'cai:yihaodian', 'cai:amazon', 'cai:buy360'] do
	  puts "全平台目前仅支持：京东、乐峰、聚美、亚马逊、一号店。"
	end
	# -*- encoding: utf-8 -*-
	require 'crawler'

	# Crawler for the search results of search.yihaodian.com.
	#
	# The search endpoint answers with JSON whose 'value' entry holds the HTML
	# of the result page. Each page is exported to
	# "<band_name>/<date>/yihaodian/<page>.csv" and merged at the end.
	class Yihaodian < Crawler
	  # How many times a forbidden response is retried before giving up.
	  MAX_RETRIES = 3

	  # @param band_name [String] search keyword, also used as output directory
	  # @param page [Integer] page to start with
	  # @param pages [Integer] last page to fetch; values below 1 mean "read it
	  #   from the pager of the first fetched page"
	  def initialize(band_name, page=1, pages=0)
	    @band_name = band_name

	    @pages = pages
	    @page  = page

	    @path = "#{@band_name}/#{Date.today}/yihaodian"
	    mkdirs(@path)

	    @request = Nestful::Request.new('')
	  end

	  # Fetches the current request, decodes the JSON body and parses its
	  # 'value' entry as HTML.
	  #
	  # @param charset [String] encoding the body is tagged with before it is
	  #   converted to UTF-8
	  # @param try_count [Integer] retries already spent (internal)
	  # @return the parsed document, or nil when the request was still
	  #   forbidden after MAX_RETRIES retries
	  def get_json_dom(charset, try_count=0)
	    response = request.connection.get(request.query_path)
	    html = response.body.force_encoding(charset).encode("UTF-8")
	    html = ActiveSupport::JSON.decode(html)['value']
	    Nokogiri::HTML(html)
	  rescue Nestful::ForbiddenAccess
	    if try_count < MAX_RETRIES # retry up to 3 times
	      puts "========================开始重试========================"
	      # Retry this method; Crawler#get_page_dom takes a single argument
	      # and does not decode JSON.
	      get_json_dom(charset, try_count + 1)
	    else
	      puts "========================很扯，三次都没搞定========================"
	      nil
	    end
	  end

	  # Merges the per-page CSV files into a single file.
	  def finishing
	    merge_csv_files(@path, @pages)
	  end

	  # URL of the current result page, with the brand name percent-encoded.
	  def search_url
	    # URI.encode no longer exists in current Ruby versions; the form encoder
	    # is used instead, with '+' turned into '%20' because the keyword sits
	    # in the path rather than in a query string.
	    keyword = URI.encode_www_form_component(@band_name).gsub('+', '%20')
	    "http://search.yihaodian.com/searchPage/c0-0/b/a-s1-v0-p#{@page}-price-d0-f0-m1-rt0-pid-k#{keyword}"
	  end

	  # Fetches and exports the current page, then continues with the next page
	  # or finishes. Stops without merging when the page could not be fetched,
	  # leaving the page files written so far in place.
	  def process
	    self.url = search_url
	    page_dom = get_json_dom('UTF-8')
	    # get_json_dom has already reported the failure on stdout.
	    return unless page_dom

	    export_items(get_items_dom(page_dom))

	    @pages = parse_total(page_dom) if @pages < 1

	    if @pages > 1
	      next_page
	    else
	      finishing
	    end
	  end

	  # Advances to the following page, or finishes after the last one.
	  def next_page
	    if @page < @pages
	      @page += 1
	      puts "开始执行：#{@page}/#{@pages}"
	      process
	    else
	      finishing
	    end
	  end

	  private

	  # Writes the collected items of the current page to its CSV file.
	  #
	  # @param items [Hash] SKU id => item attributes
	  # @return [Integer] number of items
	  def export(items)
	    unless items.empty?
	      header_row = ['SKU', '名称', '图片', '吊牌价', '价格', '评价数']
	      CSV.open("#{@path}/#{@page}.csv", "wb:GB18030", col_sep: ',') do |csv|
	        csv << header_row
	        items.each do |sku_id, item|
	          csv << [
	            "=HYPERLINK(\"#{item_url(item[:item_id])}\",\"#{sprintf("%010d", sku_id)}\")",
	            item[:title],
	            item[:pic_url],
	            item[:tag_price],
	            item[:price],
	            item[:rates][:count],
	          ]
	        end
	      end
	    end
	    items.count
	  end

	  # Extracts the attributes of every product node and exports them.
	  # Entries without a numeric id or without a buy button are skipped.
	  #
	  # @param items_dom [Enumerable] list entries of the current page
	  # @return [Integer] number of exported items
	  def export_items(items_dom)
	    items = {}
	    items_dom.each do |item|
	      id_match = item[:id].to_s.match(/\d+/)
	      next unless id_match

	      product_id = id_match[0].to_i

	      # The SKU comes from the buy button; without one the entry cannot be
	      # stored, so skip it before touching the remaining nodes.
	      buy_button = item.at("div.buyInfo>button#buyButton_#{product_id}")
	      next unless buy_button

	      sku_id = buy_button['productcode'].to_i

	      link_dom = item.at("a#pdlink1_#{product_id}")
	      item_id  = link_dom['pmid'].to_i
	      img_dom  = link_dom.at('img')
	      pic_url  = img_dom['original'] || img_dom[:src]
	      name     = img_dom[:title]

	      price_dom = item.at('p.price')

	      rates_dom   = price_dom.at('a')
	      # nil.to_i is 0, so a link without digits counts as zero ratings.
	      rates_count = rates_dom ? rates_dom.text[/\d+/].to_i : 0

	      price     = price_dom.at('strong').text
	      tag_price = price_dom.at('del')
	      tag_price = parse_price(tag_price.text) if tag_price

	      items[sku_id] = { item_id: item_id, pic_url: pic_url.strip, title: name.strip, price: parse_price(price), rates: { count: rates_count }, tag_price: tag_price }
	    end
	    export(items)
	  end

	  # @param page_dom [#at] parsed result page
	  # @return the 'li' nodes of the result list, or [] without a list
	  def get_items_dom(page_dom)
	    list = page_dom.at('div#plist>div#search_table>div.itemSearchResult.clearfix>ul.itemSearchList')
	    list ? list.css('li') : []
	  end

	  # Reads the page count from the text after the '/' in the pager.
	  #
	  # @param page_dom [#at] parsed result page
	  # @return [Integer] the page count; 0 when the pager is missing or its
	  #   text contains no '/'
	  def parse_total(page_dom)
	    total_dom = page_dom.at('ul.page.clearfix>li.pageNum')
	    return 0 unless total_dom

	    total_dom.text[/\/(.*)$/, 1].to_i
	  end

	  # Converts a displayed price into a Float after dropping the '¥' sign.
	  def parse_price(str)
	    str.gsub('¥', '').to_f
	  end

	  # Builds the product page URL for an item id.
	  def item_url(item_id)
	    "http://www.yihaodian.com/item/#{item_id}_1"
	  end
	end