Skip to content

Instantly share code, notes, and snippets.

@twtw
Created October 12, 2010 15:31
Show Gist options
  • Save twtw/622374 to your computer and use it in GitHub Desktop.
Save twtw/622374 to your computer and use it in GitHub Desktop.
# encoding: utf-8
require 'ostruct'
require 'hpricot'
require 'hpricot_scrub'
require 'open-uri'
require 'curb'
require 'uri'
# Fetches a resource and parses it with Hpricot.
class Page
  # The parsed document returned by Hpricot for the fetched resource.
  attr_reader :doc

  # Opens +url+ and hands the resulting stream to Hpricot.
  #
  # url - String location of the page to load.
  #
  # NOTE(review): strings that do not look like URLs fall through to
  # Kernel#open (local paths, and "|command" pipes), exactly as before —
  # do not pass untrusted input here.
  def initialize(url)
    # Block form so the handle (or open-uri's Tempfile) is closed as soon as
    # parsing is done instead of lingering until garbage collection.
    # Assumes Hpricot reads the whole stream before returning — TODO confirm.
    @doc = open_resource(url) { |io| Hpricot(io) }
  end

  private

  # Opens +url+ with whichever open-uri entry point this Ruby provides.
  def open_resource(url, &block)
    if URI.respond_to?(:open)
      # Ruby 2.5+: URI.open is the supported API; Kernel#open stopped
      # accepting URLs in Ruby 3.0.
      URI.open(url, &block)
    else
      # Older Rubies: open-uri patches Kernel#open to understand URLs.
      open(url, &block)
    end
  end
end
# One product page from books.com.tw, with the fields scraped out of it
# exposed through readers.
class BookItem
  attr_reader :bookid, :url, :doc, :image, :title, :price, :author, :publisher, :pubdate, :isbn, :intro

  # Loads the product page for +bookid+ and fills in every reader.
  # Readers whose pattern does not match the page stay nil, except +price+,
  # which falls back to '不詳', and +image+, which falls back to the whole
  # cover block.
  def initialize(bookid)
    @bookid = bookid
    @url = "http://www.books.com.tw/exep/prod/booksfile.php?item=#{bookid}"
    @doc = Page.new(url).doc

    # Cover image: read first, before get_author and the strip calls below
    # touch the document.
    cover_html = b5tou8(doc.at('div.pri002_rec').to_s)
    @image = first_group(cover_html.to_s, /.*(<img src=.*\/>)[<\/a>]*<\/div>.*/) || cover_html

    details_html = get_author(doc)

    # strip comes from hpricot_scrub; presumably it removes the tags from the
    # document in place — TODO confirm. Order is kept exactly as it was.
    (doc/:h1).strip
    # NOTE(review): `:div.class` evaluates to the Symbol class, not a CSS
    # selector — confirm what was meant to be stripped here.
    (doc/:div.class).strip
    (doc/:span).strip

    # Title: the prd001 block with its wrapper and <dfn> tags removed.
    title_html = b5tou8(doc.at('div.prd001').to_s.gsub(/\n/, '')).to_s
    @title = ['<div class="prd001">', /<\/?dfn>/, '</div>'].inject(title_html) do |text, junk|
      text.gsub(junk, '')
    end

    # List price, or '不詳' when the page does not show one.
    price_html = b5tou8(doc.search('div.cnt').to_s.gsub(/\n/, '')).to_s
    @price = first_group(price_html, /.*<li>定價:<dfn><u>(\d+)<\/u>元<\/dfn>.*/) || '不詳'

    # Author, publisher, publication year and ISBN all come from the details
    # list returned by get_author.
    details = b5tou8(details_html).to_s
    [
      ['@author',    /.*<li>作者:(\S+)<\/li> .*/],
      ['@publisher', /.*<li>出版社:(\S+)<\/li>.*/],
      ['@pubdate',   /.*<li>出版日期:<dfn>(\d+)年\d+月\d+日<\/dfn>.*/],
      ['@isbn',      /.*ISBN:<dfn>(\d+)<\/dfn>.*/]
    ].each do |ivar, pattern|
      found = first_group(details, pattern)
      # Only assign on a match so a missing field leaves the reader nil.
      instance_variable_set(ivar, found) if found
    end

    # Grab the synopsis, adding line breaks after <br /> and </p>.
    synopsis_html = doc.at('table.spr01_free').to_s
    @intro = b5tou8(synopsis_html.gsub('<br />', "<br />\n").gsub('</p>', "</p>\n"))
  end

  private

  # Returns the first capture group of +pattern+ within +text+, or nil when
  # the pattern does not match.
  def first_group(text, pattern)
    found = pattern.match(text)
    found && found[1]
  end
end
# Returns the markup of the page's ul.prd002 list as a single-line String.
#
# doc - the parsed page; it must respond to `/` and `search`.
#
# Side effect: calls strip (provided by hpricot_scrub, which the file already
# requires at the top) on every <a> and <span> found in +doc+ before the list
# is read. Presumably this removes those tags from +doc+ in place, so the
# caller's document is changed — TODO confirm.
def get_author(doc)
  (doc/:a).strip
  (doc/:span).strip
  # Newlines are dropped so callers can match the markup with single-line regexes.
  doc.search('ul.prd002').to_s.delete("\n")
end
# Converts Big5-encoded text to UTF-8, silently dropping anything that cannot
# be converted.
#
# big5 - String whose bytes are Big5-encoded (its encoding tag is ignored).
#
# Returns the converted String.
def b5tou8(big5)
  if big5.respond_to?(:encode)
    # Iconv left the standard library in Ruby 2.0, so prefer the built-in
    # transcoder. Replacing bad input with '' mirrors Iconv's //IGNORE.
    big5.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '')
  else
    # Ruby 1.8 has no String#encode; load Iconv lazily only on that path.
    require 'iconv'
    Iconv.iconv('UTF-8//IGNORE', 'BIG5//IGNORE', big5).first
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment