Skip to content

Instantly share code, notes, and snippets.

@dchentech
Last active December 15, 2015 12:58
Show Gist options
  • Save dchentech/5263559 to your computer and use it in GitHub Desktop.
Save dchentech/5263559 to your computer and use it in GitHub Desktop.
Ruby-China相似性文章测试
*.log
*.lock
# encoding: UTF-8
require 'logger'
require 'open-uri'
require 'yaml'
require 'gsl'
require 'narray'
require 'active_record'
require 'tf-idf-similarity'
# Shared logger used by every stage of this script; writes to similar.log.
SimilarLogger = Logger.new("similar.log")

# Set up the database connection.
# The YAML file named by MYSQL_CONFIG_PATH is loaded, its top-level keys are
# symbolized, and only the :production entry is used.
config_path = File.join(ENV['MYSQL_CONFIG_PATH'])
settings_by_env = {}
YAML.load_file(config_path).each do |env_name, settings|
  settings_by_env[env_name.to_sym] = settings
end
db_settings = settings_by_env[:production]

# Connect without a database selected so the target database can be created.
# Any StandardError raised here is deliberately ignored (presumably the usual
# case is that the database already exists from an earlier run).
db_settings.delete('database')
ActiveRecord::Base.establish_connection(db_settings)
begin
  ActiveRecord::Base.connection.create_database('ruby_china_similar')
rescue StandardError
  nil
end

# Reconnect, this time against the dedicated database.
db_settings['database'] = 'ruby_china_similar'
ActiveRecord::Base.establish_connection(db_settings)
# ActiveRecord model for one crawled topic. The script reads and writes its
# topic_id, title, content and similar_results attributes; the backing
# ruby_china_topics table is created by this script when it does not exist.
class RubyChinaTopic < ActiveRecord::Base
end
# Create the ruby_china_topics table, plus an index on topic_id, but only when
# the table is not there yet. The schema statements are evaluated with
# ActiveRecord::Migration as self, exactly as if written in its class body.
unless RubyChinaTopic.table_exists?
  ActiveRecord::Migration.class_eval do
    create_table :ruby_china_topics, :options => 'ENGINE=Innodb DEFAULT CHARSET=utf8' do |t|
      t.integer :topic_id, :default => 0
      t.string :title
      t.text :content
      t.text :similar_results
      t.timestamps
    end
    add_index :ruby_china_topics, :topic_id
  end
end
# Crawl topic pages from ruby-china.org and store them.
#
# Crawling resumes from the highest topic_id already stored. On the very first
# run the table is empty and `first` returns nil, so start from topic 1 instead
# of calling #topic_id on nil.
newest_stored = RubyChinaTopic.order("topic_id DESC").first
last_insert_id = newest_stored ? newest_stored.topic_id : 1

# Upper bound: the first /topics/<id> link found on the "last" page. When no
# link is found this becomes 0 (nil.to_i) and the loop below does nothing.
last_topic_id = open('http://ruby-china.org/topics/last').read.scan(/\/topics\/([0-9]+)/).flatten[0].to_i

# @start records the last id processed. It is reset to 0 right before the loop,
# so the `next` guard never fires in a straight run.
# NOTE(review): presumably a leftover from resuming the crawl interactively.
@start = 0
last_insert_id.upto(last_topic_id) do |i|
  next if i < @start
  begin
    content = open("http://ruby-china.org/topics/#{i}").read
    # Newlines are stripped first, so the page is matched as one long line; the
    # capture is non-greedy so it stops at the first </h1> after the title
    # marker rather than at the last </h1> on the page.
    title_match = content.gsub(/\r|\n/, '').match(/entry-title\">(.*?)<\/h1>/)
    if title_match
      title = title_match[1].to_s
      SimilarLogger.info "#{i} => #{title}"
      topic = RubyChinaTopic.find_or_create_by_topic_id i
      topic.update_attributes! :title => title, :content => content
    else
      # A page without a title heading is logged and skipped instead of
      # aborting the whole crawl with NoMethodError on nil.
      SimilarLogger.info "#{i} => *no title*"
    end
  rescue OpenURI::HTTPError
    # Every HTTP error status is logged under the same "*404*" marker.
    SimilarLogger.info "#{i} => *404*"
  end
  @start = i
end
# Build the TF-IDF corpus: one document per stored topic, made of its title
# followed by its content and tagged with the topic id. Titles are also kept in
# a topic_id => title lookup for later stages. Rows are loaded 1000 at a time.
@corpus = TfIdfSimilarity::Collection.new
@topic_id_to_titles = {}
RubyChinaTopic.find_each(:batch_size => 1000) do |record|
  record_id = record.topic_id
  heading = record.title
  SimilarLogger.info "#{record_id} #{heading}"
  @corpus << TfIdfSimilarity::Document.new("#{heading} #{record.content}", :id => record_id)
  @topic_id_to_titles[record_id] = heading
end
# Compute pairwise document similarity, then log and persist the ten most
# similar topics for every topic.
SimilarLogger.info "计算文档相似性"
SimilarLogger.info "开始 #{Time.now.strftime('%d-%H%M%S')}"
@matrix_array = @corpus.similarity_matrix.to_a

# Map each matrix index to the id of the document at the same position in
# @corpus.documents (assumes matrix rows/columns follow that order — TODO confirm).
@matrix_idx_to_item_id_hash = {}
@corpus.documents.each_with_index do |document, idx|
  @matrix_idx_to_item_id_hash[idx] = document.id
end

# For each document, rank all other documents by descending score and save the top ten.
@corpus.documents.each_with_index do |document, row_idx|
  scores_by_item_id = {}
  @matrix_array[row_idx].each_with_index do |num, col_idx|
    # NaN scores are stored as 0.0 so they can be sorted with the rest.
    scores_by_item_id[@matrix_idx_to_item_id_hash[col_idx]] = (num.nan? ? 0.0 : num)
  end
  # A document must not appear in its own list of similar documents.
  scores_by_item_id.delete document.id

  SimilarLogger.info "对比文档:"
  SimilarLogger.info "#{document.id} # #{@topic_id_to_titles[document.id]}"
  SimilarLogger.info "相关文档:"
  # Sorting a hash yields [item_id, score] pairs; keep the ten best.
  top_matches = scores_by_item_id.sort { |a, b| b[1] <=> a[1] }[0..9]
  top_matches.each do |item_id, score|
    SimilarLogger.info "#{score.round(3)} # #{@topic_id_to_titles[item_id]}"
  end

  # Destructure each pair so the title is looked up by the item id alone. With
  # a single block parameter the whole [item_id, score] pair was used as the
  # key, so every stored title came out nil.
  similar_topics = top_matches.map { |item_id, _score| [item_id, @topic_id_to_titles[item_id]] }
  payload = { :default => similar_topics }
  RubyChinaTopic.find_by_topic_id(document.id).update_attributes! :similar_results => payload.to_json
  # Blank separator between documents; a bare `info` call logs the text "nil".
  SimilarLogger.info ""
end
SimilarLogger.info "结束 #{Time.now.strftime('%d-%H%M%S')}"
__END__
20130330 内存占用太大跑不出来
Real Mem: 2.77GB
Swap Used: 10.06GB
Page ins: 14.53GB
Page outs: 5.68GB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment