Skip to content

Instantly share code, notes, and snippets.

@kzk
Created March 19, 2011 01:42
Show Gist options
  • Save kzk/877137 to your computer and use it in GitHub Desktop.
Save kzk/877137 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
require 'rubygems'
require 'json'
require 'uri'
require 'net/http'
require 'active_record'
require 'active_support'
require 'mechanize'
require 'nokogiri'
require 'kconv'
require 'nkf'
# Only on interpreters older than 1.9.0: set the global $KCODE to 'u' (UTF-8).
# On 1.9.0 and later the assignment is skipped.
$KCODE = 'u' if RUBY_VERSION < '1.9.0'
# Builds a hook (a lambda taking a single +params+ hash) that converts a
# non-UTF-8 response to UTF-8 and rewrites every place that names the old
# charset, so later consumers see a consistent UTF-8 document.
#
# The lambda reads params[:response]["Content-Type"] and
# params[:response_body]. When the header declares a charset other than UTF-8
# it:
#   1. replaces the charset in the Content-Type header with "UTF-8",
#   2. transcodes the body with NKF ("-w" = UTF-8 output, "-m0" = no MIME
#      decoding, plus any extra options supplied by the caller),
#   3. rewrites the encoding named in an XML declaration and in a <meta ...>
#      charset attribute inside the body to "UTF-8",
#   4. stores the converted body back into params[:response_body].
# Responses with no Content-Type, no charset, or a UTF-8 charset (compared
# case-insensitively) are left untouched.
#
# more_nkf_options - String of additional NKF flags appended after "-w -m0",
#                    e.g. "-S" to force Shift_JIS input (default: "").
#
# Returns the lambda. The commented-out line in html_get_page_title suggests
# it is meant for Mechanize's post_connect_hooks -- confirm that the hook
# signature of the Mechanize version in use still passes a params hash.
def fix_charset_to_utf8(more_nkf_options = "")
  lambda do |params|
    content_type = params[:response]["Content-Type"]
    next unless content_type && content_type =~ /charset/i
    # Case-insensitive so that "charset=UTF-8" is recognised as already UTF-8.
    next if content_type =~ /utf-?8/i

    # Trace of the original (pre-rewrite) header, kept from the original code.
    puts content_type
    # Non-mutating sub: the header value may be shared or frozen.
    params[:response]["Content-Type"] =
      content_type.sub(/charset\s*=\s*([^;\s]+)/i, "charset=UTF-8")

    response_body = NKF.nkf("-w -m0 #{more_nkf_options}", params[:response_body])

    # <?xml ... encoding="..."?> : replace just the captured encoding name.
    if (m = response_body.match(/<\?xml[^>]+encoding\s*=\s*["']([^>\s]+)["'][^>]*\?>/i))
      response_body[m.begin(1)...m.end(1)] = "UTF-8"
    end

    # <meta ... charset=...> : covers both the http-equiv/content form and the
    # short <meta charset="..."> form (hence the optional opening quote).
    if (m = response_body.match(/<meta[^>]+charset\s*=\s*["']?([^>"'\/\s]+)[^>]*>/i))
      response_body[m.begin(1)...m.end(1)] = "UTF-8"
    end

    params[:response_body] = response_body
  end
end
# Fetches +url+ with a fresh Mechanize agent and prints, after a separator
# line, the page title three ways for comparison: as returned by Mechanize,
# converted with String#toutf8 (kconv), and converted by NKF with "-w -S"
# (UTF-8 output, input forced to Shift_JIS).
#
# The fetch and the printing are limited to 10 seconds in total. A timeout or
# any other StandardError is printed with +p+ instead of being raised, so one
# failing URL does not stop a batch.
#
# url - String URL of the page to fetch.
#
# Returns nil on success, or the rescued exception object on failure.
def html_get_page_title(url)
  # Timeout.timeout instead of the bare Kernel#timeout helper, which is
  # deprecated and missing on newer Rubies; there the resulting NoMethodError
  # was swallowed by the rescue below and nothing was ever fetched.
  Timeout.timeout(10) do
    agent = Mechanize.new
    #agent.post_connect_hooks << fix_charset_to_utf8
    page = agent.get(url)
    puts "--------------"
    puts page.title
    # NOTE(review): presumably page.title can be nil (or missing for non-HTML
    # responses); the resulting NoMethodError is reported by the rescue.
    puts page.title.toutf8
    puts NKF.nkf('-w -S', page.title)
  end
rescue Timeout::Error, StandardError => e
  # Timeout::Error is listed explicitly because on old Rubies it is not a
  # StandardError.
  p e
end
# Sample pages to run through html_get_page_title, one after another.
sample_urls = %w[
  http://r10.to/hBvz55
  http://www.yomiuri.co.jp/feature/eq2011/refugee/?from=tw
  http://twitpic.com/4ak47w
]

sample_urls.each do |sample_url|
  html_get_page_title(sample_url)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment