mogya · March 15, 2013 04:05
diff --git a/scraper.rb b/scraper.rb
 # -*- encoding: utf-8 -*-
 require "singleton"
 require "uri"
 require 'lib/debugLog.rb'

 #WEBページを取得するための基底クラス
 # Root of the scraper error hierarchy. The other scraper exceptions in this
 # code derive from it, so rescuing this class catches them as well.
 class ScraperException < StandardError
 end

 # Specialised scraper error. The name suggests "page not found"; nothing in
 # the visible code raises it, so confirm the intended use with callers.
 class ScraperPageNotFoundException < ScraperException
 end

 # Base class for page fetchers. It only stores the logger; #get is an empty
 # placeholder that concrete scrapers replace with a real implementation.
 class Scraper
   include DebugLog

   # params - accepted but not used by this base class.
   # log    - logger object kept in @log (nil when omitted).
   def initialize(params, log = nil)
     @log = log
   end

   # Placeholder fetch method: ignores both arguments and returns nil.
   def get(url, param = nil)
     nil
   end
 end

 =begin
 =end
diff --git a/wgetScraper.rb b/wgetScraper.rb
 # -*- encoding: utf-8 -*-
 require 'lib/scraper.rb'
 require "uri"
 require "kconv"  

 # Error type raised by WgetScraper (for example when the wget command exits
 # with a non-zero status). It is a ScraperException, so generic handlers
 # for that class catch it too.
 class WgetScraperException < ScraperException
 end

 #WEBページを取得する際、キャッシュやアクセス時間の間隔などを配慮するためのクラス
 # Scraper that downloads pages with the external wget command, stores them
 # under CACHE_DIR, serves later requests from that cache while it is still
 # fresh, and waits between real network accesses.
 #
 # Keys read from the +params+ hash given to #initialize (all optional):
 #   'interval'                seconds between network accesses (default 5)
 #   'keep_cache_seconds'      how long a cached file stays valid (default 3 weeks)
 #   'userAgent'               value passed to wget --user-agent
 #   'forceFileName'           make wget write to the path computed by
 #                             #get_cache_path (-O) instead of choosing the name itself
 #   'clearlyFileAndDirectory' append "_" to cache names that do not end in "/"
 #                             so a file can never collide with a directory
 class WgetScraper < Scraper
   include DebugLog
   CACHE_DIR = '/tmp/WgetScraper/'
   COOKIE_FILE = '/tmp/WgetScraper/cookie.txt'
   WGET = '/usr/bin/wget'
   CHMOD = '/bin/chmod'
   MKDIR = '/bin/mkdir'

   # params - option hash described on the class, or nil for all defaults.
   # log    - logger handed to debugLog_init.
   #
   # Creates CACHE_DIR when it is missing and raises RuntimeError when
   # `wget -V` produces no output.
   def initialize(params=nil, log=nil)
     debugLog_init(log)
     # params defaults to nil; without this the option lookups below raised
     # NoMethodError for WgetScraper.new with no arguments.
     params ||= {}
     # Base interval (seconds) between wget accesses.
     @interval = params['interval'] || 5
     # 3 weeks: how long a cached file keeps being returned.
     @keep_cache_seconds = params['keep_cache_seconds'] || 60*60*24*7*3
     @userAgent = params['userAgent'] || "mogya scraper. contact me at [email protected] if any problem."
     @lastAccess = Time.at(0)
     # Decide the file name here instead of leaving it to wget.
     @forceFileName = params['forceFileName'] || false
     # Append "_" to file names to tell them apart from directories.
     @clearlyFileAndDirectory = params['clearlyFileAndDirectory'] || false

     # Make sure the cache directory exists.
     Dir.mkdir(CACHE_DIR) unless File.exist?(CACHE_DIR)

     # Make sure wget is usable.
     test = `#{WGET} -V`
     raise "#{WGET} not found. need to set WGET constance?" if (test.length <= 0)
   end

   # Returns the page body for +uri+ as UTF-8, downloading it first when no
   # valid cache entry exists. +param+ is accepted but not used.
   def get(uri, param=nil)
     filename = get_cache_path(uri)
     wget_get(uri) unless cache_valid?(filename)
     read_cache(uri)
   end

   # Guarantees the access interval: sleeps for the remaining time when the
   # previous access was less than the interval ago, then records the access.
   # TODO: ideally track this per domain so a different domain is not delayed.
   def sleep_between_access(uri)
     now = Time.now
     # From 06:00 onward the interval is ten times the base value.
     interval = (now.hour < 6) ? @interval : @interval * 10
     left_interval = interval - (now - @lastAccess)
     if left_interval > 0
       # The message used to interpolate `sleep left_interval`, which slept
       # once for the log line and a second time below.
       debug("wgetScraper: sleep #{left_interval}s..")
       sleep left_interval
     end
     @lastAccess = Time.now
   end

   # Downloads +uri+ into the cache with wget, after honouring the access
   # interval. Raises WgetScraperException when wget cannot be run or exits
   # with a non-zero status (the message carries the status and wget output).
   def wget_get(uri)
     sleep_between_access(uri)
     debug('doing accual network access to '+uri)

     shared_args = [
       uri,
       "--user-agent=#{@userAgent}",
       '--load-cookies', COOKIE_FILE,
       '--save-cookies', COOKIE_FILE,
       '--keep-session-cookies'
     ]
     if @forceFileName
       cachepath = get_cache_path(uri)
       # wget -O needs the target directory to exist already.
       system(MKDIR, '-p', File.dirname(cachepath))
       argv = [WGET, '-x', '-O', cachepath] + shared_args
     else
       argv = [WGET, '-x', '-N', '-P', CACHE_DIR] + shared_args
     end

     # Run wget from an argument vector (no shell), so characters in the URI,
     # cache path or user agent can neither break the command line nor be
     # executed. stderr is merged into the captured output, as 2>&1 did.
     begin
       status = IO.popen(argv + [{ :err => [:child, :out] }]) { |io| io.read }
     rescue SystemCallError => e
       raise WgetScraperException.new(e.message)
     end
     if $?.exitstatus != 0
       raise WgetScraperException.new("#{$?.exitstatus}. #{status}")
     end

     # Files written by a plain wget run end up with mode 775, which stops the
     # same program from writing the cache when it runs as a CGI under Apache.
     # The original author chose (with some hesitation) to open everything up.
     `#{CHMOD} -R 777 #{CACHE_DIR} > /dev/null 2>&1`
   end

   # Maps +uri+ to the path of its cache file below CACHE_DIR.
   # NOTE(review): the URI path is used as-is, so ".." segments are not
   # neutralised - confirm that only trusted URIs reach this method.
   def get_cache_path(uri)
     uri_obj = URI.parse(uri)

     filename = "#{uri_obj.host}#{uri_obj.path}"
     if uri_obj.query
       filename = filename + "?" + encode_query(uri_obj.query)
     end
     if @clearlyFileAndDirectory
       filename.sub!(/([^\/]$)/, '\1_')
     end
     # wget completes directory URLs with index.html, so do the same here.
     filename.sub!(/\/$/, '/index.html')
     filename.sub!(/\/\?/, '/index.html?')
     "#{CACHE_DIR}#{filename}"
   end

   # True when the cache file exists, is non-empty and is younger than
   # @keep_cache_seconds.
   # NOTE(review): age is taken from ctime, and the recursive chmod in
   # #wget_get presumably refreshes ctime of every cached file - confirm
   # whether entries can ever expire while fetches keep happening.
   def cache_valid?(filename)
     return false unless File.readable?(filename)
     # An empty file is treated as a broken cache entry and fetched again.
     return false if File.size(filename) == 0
     Time.now - File.ctime(filename) < @keep_cache_seconds
   end

   # Reads the cached file for +uri+ and returns its content converted to UTF-8.
   def read_cache(uri)
     File.read(get_cache_path(uri)).toutf8
   end

   # Deletes the cached file for +uri+; returns what File.delete returns.
   def destruct_cache(uri)
     file_path = get_cache_path(uri)
     debug("wgetScraper: destruct #{file_path}")
     File.delete(file_path)
   end

   # Re-encodes a query string the way wget names files (it appears to escape
   # everything except ':'). Uses URI.encode/URI.decode where they exist and
   # the RFC 2396 parser on Rubies that removed them.
   def encode_query(query)
     unsafe = /[^-_.!~*'()a-zA-Z\d;?:@&=+$,\[\]]/
     if URI.respond_to?(:encode) && URI.respond_to?(:decode)
       URI.encode(URI.decode(query), unsafe)
     else
       parser = URI::RFC2396_Parser.new
       parser.escape(parser.unescape(query), unsafe)
     end
   end
   private :encode_query

 end

 =begin
 irb -Ku
 load 'lib/wgetScraper.rb'
 userAgent = 'test scraper. [email protected]'
 scraper = WgetScraper.new( {'userAgent'=>userAgent}, $log )
 url = 'http://oasis.mogya.com/test.txt?a=b&c=d'
 scraper.get_cache_path(url)
 scraper.wget_get(url)
 scraper.read_cache(url)
 scraper.get(url)


 require 'lib/wgetScraper.rb'
 userAgent = 'test scraper. [email protected]'
 scraper = WgetScraper.new( {'userAgent'=>userAgent}, Logger.new(STDERR) )
 scraper.get('http://oasis.mogya.com/test.txt?a=b&c=d')
 scraper.get('http://oasis.mogya.com/test.txt?a=b&c=d')
 =end

 =begin
 forceFileName と clearlyFileAndDirectory について

 　いずれも、wordpressで作成されたサイトのスクレイピングで引っかかる問題を回避するための実装。
 　とはいえ、問題自体は一般的に起こりうるものなので普通のオプションとして実装した。

 　URLエンコードされた日本語ファイル名
 　　たとえば、
 　　　'http://dengen-cafe.com/archives/tag/%e3%82%a2%e3%83%b3%e3%83%86%e3%82%a3%e3%83%bb%e3%82%a2%e3%83%b3%e3%82%ba'
 　　というURLをwgetで普通にスクレイプすると、一部の制御文字を無視するもんだから、なんだかわからないファイル名になってしまう。
 　　この現象をWgetScraperで再現するのは難しいので、キャッシュを取得できない問題になる。
 　　forceFileNameをつけると、キャッシュファイル名をwgetに任せずにWgetScraperで決めたファイル名を強制する。
 　　これにより上記現象を回避できるけど、リダイレクトされたら意図しないファイル名になってしまったりする可能性が残るので
 　　不必要には使わないほうがいいと思う。
 　ディレクトリと区別のつかないファイル名
 　　たとえば
 　　http://dengen-cafe.com/archives/category/tokyo/station/
 　　にアクセスすると、/のつかない station にリダイレクトされる。
 　　 ところが、
 　　http://dengen-cafe.com/archives/category/tokyo/station/nikotamaeki
 　　というページも存在する。
 　　
 　　スクリプトでURIをパースしていて実ファイルと結びつかないから、こういう実装をしても気にならないらしい。
 　　キャッシュする方は、ディレクトリなのかファイル名なのかわからないから泣きそうなんだけど。

 　　仕方ないので、 clearlyFileAndDirectory オプションを指定すると、/ で終わらないURIの最後に_をつけて保存する。
 　　もし本当にそんな名前のURIがあったら困るけど、まあ普通無いだろう。
 　　wgetの挙動と異なってしまうので、forceFileNameとセットで指定することが前提。

 =end
	# -- encoding: utf-8 --
	require "singleton"
	require "uri"
	require 'lib/debugLog.rb'

	#WEBページを取得するための基底クラス
	# Root of the scraper error hierarchy. The other scraper exceptions in this
	# code derive from it, so rescuing this class catches them as well.
	class ScraperException < StandardError
	end

	# Specialised scraper error. The name suggests "page not found"; nothing in
	# the visible code raises it, so confirm the intended use with callers.
	class ScraperPageNotFoundException < ScraperException
	end

	# Base class for page fetchers. It only stores the logger; #get is an empty
	# placeholder that concrete scrapers replace with a real implementation.
	class Scraper
	  include DebugLog

	  # params - accepted but not used by this base class.
	  # log    - logger object kept in @log (nil when omitted).
	  def initialize(params, log = nil)
	    @log = log
	  end

	  # Placeholder fetch method: ignores both arguments and returns nil.
	  def get(url, param = nil)
	    nil
	  end
	end

	=begin
	=end
	# -- encoding: utf-8 --
	require 'lib/scraper.rb'
	require "uri"
	require "kconv"

	# Error type raised by WgetScraper (for example when the wget command exits
	# with a non-zero status). It is a ScraperException, so generic handlers
	# for that class catch it too.
	class WgetScraperException < ScraperException
	end

	#WEBページを取得する際、キャッシュやアクセス時間の間隔などを配慮するためのクラス
	# Scraper that downloads pages with the external wget command, stores them
	# under CACHE_DIR, serves later requests from that cache while it is still
	# fresh, and waits between real network accesses.
	#
	# Keys read from the +params+ hash given to #initialize (all optional):
	#   'interval'                seconds between network accesses (default 5)
	#   'keep_cache_seconds'      how long a cached file stays valid (default 3 weeks)
	#   'userAgent'               value passed to wget --user-agent
	#   'forceFileName'           make wget write to the path computed by
	#                             #get_cache_path (-O) instead of choosing the name itself
	#   'clearlyFileAndDirectory' append "_" to cache names that do not end in "/"
	#                             so a file can never collide with a directory
	class WgetScraper < Scraper
	  include DebugLog
	  CACHE_DIR = '/tmp/WgetScraper/'
	  COOKIE_FILE = '/tmp/WgetScraper/cookie.txt'
	  WGET = '/usr/bin/wget'
	  CHMOD = '/bin/chmod'
	  MKDIR = '/bin/mkdir'

	  # params - option hash described on the class, or nil for all defaults.
	  # log    - logger handed to debugLog_init.
	  #
	  # Creates CACHE_DIR when it is missing and raises RuntimeError when
	  # `wget -V` produces no output.
	  def initialize(params=nil, log=nil)
	    debugLog_init(log)
	    # params defaults to nil; without this the option lookups below raised
	    # NoMethodError for WgetScraper.new with no arguments.
	    params ||= {}
	    # Base interval (seconds) between wget accesses.
	    @interval = params['interval'] || 5
	    # 3 weeks: how long a cached file keeps being returned. (The pasted text
	    # had lost the multiplication signs and read 60602473.)
	    @keep_cache_seconds = params['keep_cache_seconds'] || 60*60*24*7*3
	    @userAgent = params['userAgent'] || "mogya scraper. contact me at [email protected] if any problem."
	    @lastAccess = Time.at(0)
	    # Decide the file name here instead of leaving it to wget.
	    @forceFileName = params['forceFileName'] || false
	    # Append "_" to file names to tell them apart from directories.
	    @clearlyFileAndDirectory = params['clearlyFileAndDirectory'] || false

	    # Make sure the cache directory exists.
	    Dir.mkdir(CACHE_DIR) unless File.exist?(CACHE_DIR)

	    # Make sure wget is usable.
	    test = `#{WGET} -V`
	    raise "#{WGET} not found. need to set WGET constance?" if (test.length <= 0)
	  end

	  # Returns the page body for +uri+ as UTF-8, downloading it first when no
	  # valid cache entry exists. +param+ is accepted but not used.
	  def get(uri, param=nil)
	    filename = get_cache_path(uri)
	    wget_get(uri) unless cache_valid?(filename)
	    read_cache(uri)
	  end

	  # Guarantees the access interval: sleeps for the remaining time when the
	  # previous access was less than the interval ago, then records the access.
	  # TODO: ideally track this per domain so a different domain is not delayed.
	  def sleep_between_access(uri)
	    now = Time.now
	    # From 06:00 onward the interval is ten times the base value.
	    interval = (now.hour < 6) ? @interval : @interval * 10
	    left_interval = interval - (now - @lastAccess)
	    if left_interval > 0
	      # The message used to interpolate `sleep left_interval`, which slept
	      # once for the log line and a second time below.
	      debug("wgetScraper: sleep #{left_interval}s..")
	      sleep left_interval
	    end
	    @lastAccess = Time.now
	  end

	  # Downloads +uri+ into the cache with wget, after honouring the access
	  # interval. Raises WgetScraperException when wget cannot be run or exits
	  # with a non-zero status (the message carries the status and wget output).
	  def wget_get(uri)
	    sleep_between_access(uri)
	    debug('doing accual network access to '+uri)

	    shared_args = [
	      uri,
	      "--user-agent=#{@userAgent}",
	      '--load-cookies', COOKIE_FILE,
	      '--save-cookies', COOKIE_FILE,
	      '--keep-session-cookies'
	    ]
	    if @forceFileName
	      cachepath = get_cache_path(uri)
	      # wget -O needs the target directory to exist already.
	      system(MKDIR, '-p', File.dirname(cachepath))
	      argv = [WGET, '-x', '-O', cachepath] + shared_args
	    else
	      argv = [WGET, '-x', '-N', '-P', CACHE_DIR] + shared_args
	    end

	    # Run wget from an argument vector (no shell), so characters in the URI,
	    # cache path or user agent can neither break the command line nor be
	    # executed. stderr is merged into the captured output, as 2>&1 did.
	    begin
	      status = IO.popen(argv + [{ :err => [:child, :out] }]) { |io| io.read }
	    rescue SystemCallError => e
	      raise WgetScraperException.new(e.message)
	    end
	    if $?.exitstatus != 0
	      raise WgetScraperException.new("#{$?.exitstatus}. #{status}")
	    end

	    # Files written by a plain wget run end up with mode 775, which stops the
	    # same program from writing the cache when it runs as a CGI under Apache.
	    # The original author chose (with some hesitation) to open everything up.
	    `#{CHMOD} -R 777 #{CACHE_DIR} > /dev/null 2>&1`
	  end

	  # Maps +uri+ to the path of its cache file below CACHE_DIR.
	  # NOTE(review): the URI path is used as-is, so ".." segments are not
	  # neutralised - confirm that only trusted URIs reach this method.
	  def get_cache_path(uri)
	    uri_obj = URI.parse(uri)

	    filename = "#{uri_obj.host}#{uri_obj.path}"
	    if uri_obj.query
	      filename = filename + "?" + encode_query(uri_obj.query)
	    end
	    if @clearlyFileAndDirectory
	      filename.sub!(/([^\/]$)/, '\1_')
	    end
	    # wget completes directory URLs with index.html, so do the same here.
	    filename.sub!(/\/$/, '/index.html')
	    filename.sub!(/\/\?/, '/index.html?')
	    "#{CACHE_DIR}#{filename}"
	  end

	  # True when the cache file exists, is non-empty and is younger than
	  # @keep_cache_seconds.
	  # NOTE(review): age is taken from ctime, and the recursive chmod in
	  # #wget_get presumably refreshes ctime of every cached file - confirm
	  # whether entries can ever expire while fetches keep happening.
	  def cache_valid?(filename)
	    return false unless File.readable?(filename)
	    # An empty file is treated as a broken cache entry and fetched again.
	    return false if File.size(filename) == 0
	    Time.now - File.ctime(filename) < @keep_cache_seconds
	  end

	  # Reads the cached file for +uri+ and returns its content converted to UTF-8.
	  def read_cache(uri)
	    File.read(get_cache_path(uri)).toutf8
	  end

	  # Deletes the cached file for +uri+; returns what File.delete returns.
	  def destruct_cache(uri)
	    file_path = get_cache_path(uri)
	    debug("wgetScraper: destruct #{file_path}")
	    File.delete(file_path)
	  end

	  # Re-encodes a query string the way wget names files (it appears to escape
	  # everything except ':'). Uses URI.encode/URI.decode where they exist and
	  # the RFC 2396 parser on Rubies that removed them.
	  def encode_query(query)
	    unsafe = /[^-_.!~*'()a-zA-Z\d;?:@&=+$,\[\]]/
	    if URI.respond_to?(:encode) && URI.respond_to?(:decode)
	      URI.encode(URI.decode(query), unsafe)
	    else
	      parser = URI::RFC2396_Parser.new
	      parser.escape(parser.unescape(query), unsafe)
	    end
	  end
	  private :encode_query

	end

	=begin
	irb -Ku
	load 'lib/wgetScraper.rb'
	userAgent = 'test scraper. [email protected]'
	scraper = WgetScraper.new( {'userAgent'=>userAgent}, $log )
	url = 'http://oasis.mogya.com/test.txt?a=b&c=d'
	scraper.get_cache_path(url)
	scraper.wget_get(url)
	scraper.read_cache(url)
	scraper.get(url)


	require 'lib/wgetScraper.rb'
	userAgent = 'test scraper. [email protected]'
	scraper = WgetScraper.new( {'userAgent'=>userAgent}, Logger.new(STDERR) )
	scraper.get('http://oasis.mogya.com/test.txt?a=b&c=d')
	scraper.get('http://oasis.mogya.com/test.txt?a=b&c=d')
	=end

	=begin
	forceFileName と clearlyFileAndDirectory について

	いずれも、wordpressで作成されたサイトのスクレイピングで引っかかる問題を回避するための実装。
	とはいえ、問題自体は一般的に起こりうるものなので普通のオプションとして実装した。

	URLエンコードされた日本語ファイル名
	たとえば、
	'http://dengen-cafe.com/archives/tag/%e3%82%a2%e3%83%b3%e3%83%86%e3%82%a3%e3%83%bb%e3%82%a2%e3%83%b3%e3%82%ba'
	というURLをwgetで普通にスクレイプすると、一部の制御文字を無視するもんだから、なんだかわからないファイル名になってしまう。
	この現象をWgetScraperで再現するのは難しいので、キャッシュを取得できない問題になる。
	forceFileNameをつけると、キャッシュファイル名をwgetに任せずにWgetScraperで決めたファイル名を強制する。
	これにより上記現象を回避できるけど、リダイレクトされたら意図しないファイル名になってしまったりする可能性が残るので
	不必要には使わないほうがいいと思う。
	ディレクトリと区別のつかないファイル名
	たとえば
	http://dengen-cafe.com/archives/category/tokyo/station/
	にアクセスすると、/のつかない station にリダイレクトされる。
	ところが、
	http://dengen-cafe.com/archives/category/tokyo/station/nikotamaeki
	というページも存在する。

	スクリプトでURIをパースしていて実ファイルと結びつかないから、こういう実装をしても気にならないらしい。
	キャッシュする方は、ディレクトリなのかファイル名なのかわからないから泣きそうなんだけど。

	仕方ないので、 clearlyFileAndDirectory オプションを指定すると、/ で終わらないURIの最後に_をつけて保存する。
	もし本当にそんな名前のURIがあったら困るけど、まあ普通無いだろう。
	wgetの挙動と異なってしまうので、forceFileNameとセットで指定することが前提。

	=end