Last active
March 24, 2017 07:23
-
-
Save seancdavis/fa07542fa8dab0310b9c to your computer and use it in GitHub Desktop.
Related Content (without metadata) in Rails using tf-idf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add a text column that stores each post's comma-separated word list.
$ bundle exec rails g migration add_words_to_posts words:text
$ bundle exec rake db:migrate
# Install the gems listed in the Gemfile (htmlentities, nokogiri).
$ bundle install
# Add the column that caches the ids of a post's related posts
# (type made explicit; the generator defaults to string when omitted).
$ bundle exec rails g migration add_related_posts_to_posts related_posts:string
$ bundle exec rake db:migrate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Decodes HTML entities while extracting words from a post body.
gem 'htmlentities'
# Parses the post HTML so <pre>/<code> blocks can be dropped before indexing.
gem 'nokogiri'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# config/initializers/array.rb | |
class Array
  # Multiset (bag) intersection of the receiver with +arr+.
  #
  # Each element appears in the result as many times as it occurs in BOTH
  # arrays (the minimum of the two counts). Elements are grouped and ordered
  # by their first appearance in the receiver.
  #
  #   [1, 1, 2, 3].multiset([1, 1, 1, 3, 4]) # => [1, 1, 3]
  #
  # @param arr [Array] the other collection
  # @return [Array] the shared elements, with multiplicity
  def multiset(arr)
    theirs = Hash.new(0)
    arr.each { |item| theirs[item] += 1 }

    mine = Hash.new(0)
    each { |item| mine[item] += 1 }

    mine.each_with_object([]) do |(item, count), shared|
      # concat (rather than << followed by flatten) keeps elements that are
      # themselves arrays intact.
      shared.concat([item] * [count, theirs[item]].min)
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# app/models/post.rb | |
# Top-level include: mixes the sanitize helpers into Object so that
# Post#update_words! can call +sanitize+.
include ActionView::Helpers::SanitizeHelper

# A blog post that caches a normalised word list (+words+) and the ids of its
# most similar posts (+related_posts+), both stored as comma-separated strings.
class Post < ActiveRecord::Base
  # How many related post ids are cached per post.
  RELATED_LIMIT = 3

  after_save :update_words!

  # Rebuilds the +words+ column from +body+.
  #
  # The body is parsed as HTML, <pre> and <code> elements are dropped, and the
  # remaining text is lower-cased, split into words, stripped of every
  # character other than a-z, sorted and joined with commas. The column is
  # written with +update_columns+, so callbacks are not triggered again.
  def update_words!
    # Loaded lazily, as before; under Bundler both gems are usually loaded already.
    require 'htmlentities'
    require 'nokogiri'

    doc = Nokogiri::HTML.parse(body.to_s)
    # One union query: chaining .xpath off the removed <pre> set skipped
    # <code> elements whenever the document contained no <pre>.
    doc.xpath('//pre | //code').remove

    # Collapse whitespace to a single space (deleting newlines glued the last
    # word of one line to the first word of the next).
    text = doc.text.gsub(/[[:space:]]+/, ' ').downcase
    text = HTMLEntities.new.decode(sanitize(text, :tags => []))

    tokens = text.split(%r{[[:space:].,!?/]+}).map { |token| token.gsub(/[^a-z]/, '') }
    # The cleaned list is what gets persisted (previously it was computed and discarded).
    update_columns(:words => tokens.reject(&:blank?).sort.join(','))
  end

  # Scores every other published post against this one and caches the ids of
  # the RELATED_LIMIT highest-scoring posts in +related_posts+.
  #
  # A post's score is the sum of the inverse document frequency of each word
  # it shares with this post (shared words counted with multiplicity).
  def update_related!
    posts = Post.all
    idf = inverse_document_frequency(posts)
    own_words = words.to_s.split(',')

    scores = {}
    (posts.select(&:published?) - [self]).each do |post|
      shared = own_words.multiset(post.words.to_s.split(','))
      scores[post.id] = shared.inject(0) { |sum, word| sum + idf.fetch(word, 0.0) }
    end

    ranked = scores.sort_by { |_id, score| -score }
    top_ids = ranked.first(RELATED_LIMIT).map { |id, _score| id }
    update_columns(:related_posts => top_ids.join(','))
  end

  # Published posts whose ids are cached in +related_posts+.
  # Returns an empty relation when nothing has been cached yet.
  # NOTE(review): the query does not preserve the cached ranking order.
  def related
    Post.published.where(:id => related_posts.to_s.split(','))
  end

  private

  # Builds a { word => idf } hash, where idf = ln(number of posts / number of
  # posts containing the word). Posts whose word list is still blank are
  # indexed first.
  def inverse_document_frequency(posts)
    document_counts = Hash.new(0)
    posts.each do |post|
      # Was RelatedPost.process_words(post.body), an undefined constant.
      post.update_words! if post.words.blank?
      post.words.to_s.split(',').uniq.each { |word| document_counts[word] += 1 }
    end

    # Float division: integer division truncated the ratio before the log.
    total = posts.size.to_f
    document_counts.each_with_object({}) do |(word, count), idf|
      idf[word] = Math.log(total / count)
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# app/controllers/posts_controller.rb | |
# CRUD actions for posts; creating or updating a post also refreshes the
# cached related-post lists.
class PostsController < ApplicationController
  # Post.find raises ActiveRecord::RecordNotFound for an unknown id instead of
  # leaving @post nil (find_by_id), which crashed later with NoMethodError.
  before_action :set_post, :only => [:show, :edit, :update]

  def show
    @related = @post.related.first(3)
  end

  def new
    @post = Post.new
  end

  def create
    @post = Post.new(post_params)
    if @post.save
      refresh_related(@post)
      redirect_to @post, :notice => "Post was created successfully."
    else
      render 'new'
    end
  end

  def edit
  end

  def update
    if @post.update(post_params)
      refresh_related(@post)
      redirect_to @post, :notice => "Post was updated successfully."
    else
      render 'edit'
    end
  end

  private

  # Loads the post named by params[:id].
  def set_post
    @post = Post.find(params[:id])
  end

  # Recomputes the related list of +post+, then of each post it now points to.
  def refresh_related(post)
    post.update_related!
    post.related.each(&:update_related!)
  end

  # Attributes that may be mass-assigned from the form.
  def post_params
    params.require(:post).permit(:title, :body, :published)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am getting the following error:
uninitialized constant Post::RelatedPost