Last active
December 15, 2015 12:58
-
-
Save dchentech/5263559 to your computer and use it in GitHub Desktop.
Ruby-China相似性文章测试
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.log
*.lock
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require 'logger' | |
require 'open-uri' | |
require 'yaml' | |
require 'gsl' | |
require 'narray' | |
require 'active_record' | |
require 'tf-idf-similarity' | |
# Shared logger: every progress and result message of this script goes to similar.log.
SimilarLogger = Logger.new("similar.log")

# Set up the database and the model.
# MYSQL_CONFIG_PATH must name a YAML file with a top-level `production` entry
# that holds the connection settings. ENV.fetch raises a KeyError naming the
# variable when it is not set.
database_config = YAML.load_file(ENV.fetch('MYSQL_CONFIG_PATH'))
production_opts = database_config['production'] || database_config[:production]
raise ArgumentError, "no 'production' entry in #{ENV['MYSQL_CONFIG_PATH']}" unless production_opts

# First connect without selecting a database, so the target database can be created.
database_opts = production_opts.reject { |key, _value| key == 'database' }
ActiveRecord::Base.establish_connection database_opts
begin
  ActiveRecord::Base.connection.create_database('ruby_china_similar')
rescue StandardError => e
  # Still best effort (presumably this fails on reruns because the database
  # already exists), but the reason is logged instead of silently discarded.
  SimilarLogger.warn "create_database('ruby_china_similar') skipped: #{e.class}: #{e.message}"
end

# Reconnect, this time bound to the script's own database.
database_opts = database_opts.merge('database' => 'ruby_china_similar')
ActiveRecord::Base.establish_connection database_opts
# ActiveRecord model for one crawled topic (table: ruby_china_topics).
class RubyChinaTopic < ActiveRecord::Base
end

# Create the table and its index only when the table is not there yet.
# The schema statements are sent straight to ActiveRecord::Migration.
unless RubyChinaTopic.table_exists?
  ActiveRecord::Migration.create_table(:ruby_china_topics, :options => 'ENGINE=Innodb DEFAULT CHARSET=utf8') do |table|
    table.integer :topic_id, :default => 0 # id of the topic on the remote site
    table.string :title
    table.text :content                    # raw page body as downloaded by the crawler
    table.text :similar_results            # JSON written by the similarity pass
    table.timestamps
  end
  ActiveRecord::Migration.add_index(:ruby_china_topics, :topic_id)
end
# Fetch the data.
# Resume from the highest topic_id already stored. On an empty table (the
# state right after the table is created) there is no row to read, so start
# from topic 1 instead of calling topic_id on nil.
newest_topic = RubyChinaTopic.order("topic_id DESC").first
last_insert_id = newest_topic ? newest_topic.topic_id : 1
# The id of the newest topic is taken from the first /topics/<id> link on the page.
last_topic_id = open('http://ruby-china.org/topics/last').read.scan(/\/topics\/([0-9]+)/).flatten[0].to_i
# @start tracks the last id processed; ids below it are skipped.
@start = 0
last_insert_id.upto(last_topic_id) do |i|
  next if i < @start
  begin
    content = open("http://ruby-china.org/topics/#{i}").read
    # Newlines are stripped so the title can be matched on a single line. The
    # capture is non-greedy so it stops at the first closing </h1>.
    title_match = content.gsub(/\r|\n/, '').match(/entry-title\">(.*?)<\/h1>/)
    if title_match
      title = title_match[1].to_s
      SimilarLogger.info "#{i} => #{title}"
      topic = RubyChinaTopic.find_or_create_by_topic_id i
      topic.update_attributes! :title => title, :content => content
    else
      # A page without the expected title markup is skipped instead of
      # aborting the whole crawl with a NoMethodError on nil.
      SimilarLogger.warn "#{i} => *no title found*"
    end
  rescue OpenURI::HTTPError
    SimilarLogger.info "#{i} => *404*"
  end
  @start = i
end; 0 # trailing 0: presumably there to keep an interactive session from echoing the loop result
# Load every stored topic into a TF-IDF collection, remembering each title by
# topic_id so results can be reported with readable names.
@corpus = TfIdfSimilarity::Collection.new
@topic_id_to_titles = {}
RubyChinaTopic.find_each(:batch_size => 1000) do |record|
  SimilarLogger.info "#{record.topic_id} #{record.title}"
  # Title and stored content together form the text of one document.
  document_text = "#{record.title} #{record.content}"
  @corpus << TfIdfSimilarity::Document.new(document_text, :id => record.topic_id)
  @topic_id_to_titles[record.topic_id] = record.title
end

SimilarLogger.info "计算文档相似性"
SimilarLogger.info "开始 #{Time.now.strftime('%d-%H%M%S')}"
# The similarity matrix is converted to plain nested arrays right away; the
# matrix object itself is not kept in a variable.
@matrix_array = @corpus.similarity_matrix.to_a; 0
# Map each matrix row/column index to the real item id of the document at
# that position in the corpus.
@matrix_idx_to_item_id_hash =
  @corpus.documents.each_with_index.each_with_object({}) do |(doc, position), index_map|
    index_map[position] = doc.id
  end; 0
# For each item in the matrix, take the other items ordered by similarity
# (highest first), log the ten best matches and save them on the topic row.
@corpus.documents.each_with_index do |document, row_idx|
  score_by_item_id = {}
  @matrix_array[row_idx].each_with_index do |score, col_idx|
    # NaN scores are replaced by 0.0 so the ordering below stays well defined.
    score_by_item_id[@matrix_idx_to_item_id_hash[col_idx]] = (score.nan? ? 0.0 : score)
  end
  # A document is not a candidate match for itself.
  score_by_item_id.delete document.id

  SimilarLogger.info "对比文档:"
  SimilarLogger.info "#{document.id} # #{@topic_id_to_titles[document.id]}"
  SimilarLogger.info "相关文档:"

  # [item_id, score] pairs, best score first, limited to the top ten.
  top_matches = score_by_item_id.sort_by { |_item_id, score| -score }.first(10)
  top_matches.each do |item_id, score|
    SimilarLogger.info "#{score.round(3)} # #{@topic_id_to_titles[item_id]}"
  end

  # Each pair must be destructured here: with a single block parameter the
  # whole [item_id, score] pair was used as the lookup key, so every title
  # came back nil and the pair itself was stored in place of the id.
  similar_titles = top_matches.map { |item_id, _score| [item_id, @topic_id_to_titles[item_id]] }
  stored_results = { :default => similar_titles }
  RubyChinaTopic.find_by_topic_id(document.id).update_attributes! :similar_results => stored_results.to_json

  # Blank separator line; a bare `info` call would log the text "nil".
  SimilarLogger.info ""
end; 0
SimilarLogger.info "结束 #{Time.now.strftime('%d-%H%M%S')}"
__END__ | |
20130330 内存占用太大跑不出来 (2013-03-30: memory usage was too high, the run could not finish)
Real Mem: 2.77GB | |
Swap Used: 10.06GB | |
Page ins: 14.53GB | |
Page outs: 5.68GB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment