Created
May 3, 2010 12:20
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# -*- coding: utf-8 -*- | |
require 'rubygems' | |
require 'MeCab' | |
require 'net/http' | |
require 'uri' | |
require 'extractcontent.rb' | |
$KCODE='u' | |
Net::HTTP.version_1_2 | |
class TF | |
@tagger = nil | |
@extract_content = nil | |
def initialize | |
@tagger = MeCab::Tagger.new('-O wakati') | |
@extract_content = ExtractContent::Extractor.new({:decay_factor=>0.75}) | |
end | |
def fetch(uri_str, limit = 10) | |
uri = URI.parse(URI.encode(uri_str)) | |
raise ArgumentError, 'http redirect too deep' if limit == 0 | |
response = nil | |
Net::HTTP.new(uri.host).start do |http| | |
response = http.get(uri.request_uri, {'user-agent' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.4) Gecko/20100413 Firefox/3.6.4'}) | |
end | |
case response | |
when Net::HTTPSuccess then response | |
when Net::HTTPRedirection then fetch(response['Location'], limit - 1) | |
else | |
response.error! | |
end | |
end | |
protected :fetch | |
def mecab_node(context) | |
@tagger.parseToNode(context) | |
end | |
protected :mecab_node | |
def tf(html) | |
tf = Hash::new | |
n = mecab_node(@extract_content.analyse(fetch(html).body).first) | |
while n do | |
if /^名詞/ =~ n.feature | |
if tf.key?(n.surface) | |
tf[n.surface] = tf[n.surface] + 1 | |
else | |
tf[n.surface] = 1 | |
end | |
end | |
n = n.next | |
end | |
tf | |
end | |
end | |
# Print every noun found on the page together with its occurrence count.
noun_counts = TF.new.tf('http://ja.wikipedia.org/wiki/沢城みゆき')
noun_counts.each { |word, count| puts "#{word} : #{count}" }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment