Skip to content

Instantly share code, notes, and snippets.

@s-mage
Last active October 11, 2015 22:58
Show Gist options
  • Save s-mage/3932753 to your computer and use it in GitHub Desktop.
Save s-mage/3932753 to your computer and use it in GitHub Desktop.
Scrzy classifier. It looks bad and works badly; I need to rewrite it.
#encoding: utf-8
require 'sequel'
# Creates the SQLite schema used by the classifier: an authors table, a words
# table and the authors_words join table that stores how often an author used
# a word.
#
# @param path [String] SQLite database file in which the tables are created
# @return [Object] the database handle returned by Sequel.sqlite
def create_model(path = 'aw.db')
  # A local handle is used on purpose: assigning a constant (DB = ...) inside
  # a method body is rejected by Ruby at parse time ("dynamic constant
  # assignment"), which made the whole file unloadable.
  db = Sequel.sqlite(path)

  # Table of authors.
  db.create_table(:authors) do
    primary_key :id, type: Integer, index: true
    String :name, unique: true
  end

  # Table of words.
  db.create_table(:words) do
    primary_key :id, type: Integer, index: true
    String :word, unique: true
  end

  # Many-to-many relation between authors and words, with a usage counter.
  db.create_table(:authors_words) do
    foreign_key :author_id, :authors, null: false
    foreign_key :word_id, :words, null: false
    primary_key [:author_id, :word_id]
    index [:author_id, :word_id]
    Integer :freq
  end

  # Note kept from the original author: indexes are numbered from 1.
  db
end
#!/usr/bin/env ruby
#encoding: utf-8
require 'rubygems'
require 'sequel'
require 'csv'
require 'lingua/stemmer'
require 'unicode_utils/downcase'
require 'matrix'
# Shared database handle used by every method below. The location can be
# overridden with the AW_DB environment variable; without it the original
# hard-coded path is used.
DB = Sequel.sqlite(ENV.fetch('AW_DB', '/home/s/aw.db'))
# Extends String with the tokenizer/stemmer used by the classifier.
class String
  # Splits the string on whitespace, lowercases every token, removes all
  # characters other than a-z, а-я and ё, and passes what is left to
  # Lingua.stemmer with the Russian language option.
  #
  # Tokens that contain no letters at all (bare punctuation, digits) are
  # dropped; previously they were stemmed and returned as an empty "word".
  #
  # @return [Array] one Lingua.stemmer result per remaining token, in order
  def stem
    split
      .map { |token| UnicodeUtils.downcase(token).gsub(/[^a-zа-яё]/, '') }
      .reject(&:empty?)
      .map { |word| Lingua.stemmer(word, language: 'ru') }
  end
end
# Adds a cosine-similarity helper to Vector.
class Vector
  # Similarity between this vector and +a+: their inner product divided by
  # the product of both magnitudes.
  #
  # Returns the Integer 0 when the inner product is zero (which also avoids
  # dividing by a zero magnitude); otherwise returns a Float.
  def sim(a)
    dot = a.inner_product(self)
    return 0 if dot == 0

    dot / (a.r * r).to_f
  end
end
# Imports a CSV chat log into the database, counting how often every author
# used every stemmed word.
#
# The CSV is read with a header row; the `contact_name` column names the
# author and the `message` column holds the text to stem.
#
# @param filename [String] path of the CSV file to import
# @return the result of CSV.foreach
def insert_data(filename = 'Downloads/red eyes/ruby-cjr.csv')
  CSV.foreach(filename, headers: true) do |row|
    author_id = find_or_create_id(:authors, :name, row['contact_name'])

    # to_s: a row without a message yields nil, which has no #stem.
    row['message'].to_s.stem.each do |word|
      word_id = find_or_create_id(:words, :word, word)
      pair = DB[:authors_words].where(author_id: author_id, word_id: word_id)
      existing = pair.first

      if existing
        # The (author, word) pair is already known: only bump its counter.
        pair.update(freq: existing[:freq] + 1)
      else
        DB[:authors_words].insert(author_id: author_id, word_id: word_id, freq: 1)
      end
    end
  end
end

# Returns the id of the row in +table+ whose +column+ equals +value+,
# inserting that row first when it does not exist yet.
def find_or_create_id(table, column, value)
  dataset = DB[table]
  dataset.insert(column => value) unless dataset.where(column => value).first
  dataset.where(column => value).first[:id]
end
# Scores every author in the database against a phrase.
#
# The phrase is stemmed and turned into word counts. For each author those
# counts are compared, via Vector#sim, with the frequencies stored for the
# same words (0 for words the author never used).
#
# @param ph [String] phrase to classify
# @return [Hash] author name => similarity score
def chances(ph)
  # Phrase as {stem => count}. It does not depend on the author, so it is
  # built once instead of once per author.
  phrase = ph.stem.each_with_object({}) do |word, counts|
    counts[word] = counts.fetch(word, 0) + 1
  end
  stems = phrase.keys
  phrase_vector = Vector[*phrase.values_at(*stems)]

  DB[:authors].select(:id, :name).order(:id).each_with_object({}) do |author, scores|
    # Use the author's real id. Deriving it from the position in the result
    # list is only right while ids start at 1 and have no gaps.
    rows = DB[:words].join(DB[:authors_words], :word_id => :id)
                     .select(:word, :freq)
                     .where(author_id: author[:id])

    # Rows are read positionally as (word, freq): the original code had to
    # address these columns through back-quoted keys (:"`word`"), so the key
    # spelling returned for this join is not relied upon.
    vocabulary = Hash[rows.map(&:values)]

    # Both vectors follow the order of +stems+ so that components line up.
    author_vector = Vector[*stems.map { |word| vocabulary.fetch(word, 0) }]
    scores[author[:name]] = author_vector.sim(phrase_vector)
  end
end
# Interactive entry point: read one phrase from standard input and print each
# author with its similarity score, lowest score first.
print "Enter phrase\n> "
# gets returns nil at end of input; to_s turns that into an empty phrase
# instead of raising NoMethodError on chomp.
ph = STDIN.gets.to_s.chomp
ch = chances(ph).sort_by { |_, score| score }
ch.each { |pair| p pair }
@dustalov
Copy link

You can also obtain Gajim logs using my gist.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment