Scrzy classifier. Looks bad, works badly, I need to remake it.
#encoding: utf-8
require 'sequel'

# database connection used by create_model
DB = Sequel.sqlite('aw.db')

def create_model
  # table of authors
  DB.create_table(:authors) do
    primary_key :id, type: Integer, index: true
    String :name, unique: true
  end
  # table of word stems
  DB.create_table(:words) do
    primary_key :id, type: Integer, index: true
    String :word, unique: true
  end
  # many-to-many relation between authors and words with a frequency counter
  DB.create_table(:authors_words) do
    foreign_key :author_id, :authors, null: false
    foreign_key :word_id, :words, null: false
    primary_key [:author_id, :word_id]
    index [:author_id, :word_id]
    Integer :freq
  end
  # autoincremented ids are numbered from 1
end
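
Nothing in the gist ever calls create_model, so the schema has to be built once by hand before the classifier is run. A minimal bootstrap sketch, assuming the file above is saved as create_model.rb next to aw.db (the file name is an assumption, not part of the gist):

# one-off bootstrap; the file name create_model.rb is an assumption
require_relative 'create_model'

create_model    # creates aw.db with the three tables above
p DB.tables     # should list :authors, :words and :authors_words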
#!/usr/bin/env ruby
#encoding: utf-8
require 'rubygems'
require 'sequel'
require 'csv'
require 'lingua/stemmer'
require 'unicode_utils/downcase'
require 'matrix'

DB = Sequel.sqlite('/home/s/aw.db')

class String
  # Split the string into words, downcase them, strip everything except
  # Latin and Cyrillic letters and return the Russian stems.
  def stem
    split.map { |word| Lingua.stemmer(UnicodeUtils.downcase(word).gsub(/[^a-zа-яё]/, ''), language: 'ru') }.reject(&:empty?)
  end
end
class Vector
  # Cosine similarity between self and another vector.
  def sim(a)
    numerator = a.inner_product(self)
    return 0 if numerator.zero?
    numerator / (a.r * r).to_f
  end
end
# Read the chat log CSV and fill the authors, words and authors_words tables.
def insert_data
  filename = 'Downloads/red eyes/ruby-cjr.csv'
  CSV.foreach(filename, headers: true) do |row|
    DB[:authors].insert(name: row['contact_name']) unless DB[:authors].where(name: row['contact_name']).first
    author_id = DB[:authors].where(name: row['contact_name']).first[:id]
    row['message'].stem.each do |stem|
      DB[:words].insert(word: stem) unless DB[:words].where(word: stem).first
      word_id = DB[:words].where(word: stem).first[:id]
      pair = DB[:authors_words].where(author_id: author_id, word_id: word_id)
      if (existing = pair.first)
        # the (author, word) pair is already known, bump its frequency
        pair.update(freq: existing[:freq] + 1)
      else
        DB[:authors_words].insert(author_id: author_id, word_id: word_id, freq: 1)
      end
    end
  end
end
# Cosine similarity between the phrase and every author's vocabulary.
# Returns a hash {author_name => similarity}.
def chances(ph)
  # the phrase as a bag of stems: {stem => frequency}
  phrase = ph.stem.inject(Hash.new(0)) { |bag, stem| bag[stem] += 1; bag }
  DB[:authors].all.inject({}) do |chances, author|
    # the author's vocabulary as {word => frequency}
    words_freqs = DB[:words].join(:authors_words, word_id: :id).select(:word, :freq).where(author_id: author[:id])
    vocabulary = words_freqs.inject({}) { |voc, row| voc.merge(row[:word] => row[:freq]) }
    # keep only the words that occur in the phrase, padding missing ones with zero
    vocabulary = vocabulary.select { |k, _| phrase.key?(k) }
    phrase.each_key { |k| vocabulary[k] = 0 unless vocabulary.key?(k) }
    # align both bags by key and compare them as vectors
    phrase_vector = Vector[*Hash[phrase.sort].values]
    vocabulary_vector = Vector[*Hash[vocabulary.sort].values]
    chances[author[:name]] = vocabulary_vector.sim(phrase_vector)
    chances
  end
end
print "Enter phrase\n> " | |
ph = STDIN.gets.chomp | |
ch = chances(ph).sort_by{|_, v| v} | |
ch.each{|x| p x} |
You can also obtain Gajim logs using my gist.
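
For reference, insert_data only relies on the contact_name and message columns of the CSV, so any export with those headers works. A hypothetical smoke-test input written with Ruby's CSV library (the path and rows are made up; only the header names come from the code above):

#encoding: utf-8
require 'csv'

# hypothetical sample input for insert_data; only the header names are real
CSV.open('sample.csv', 'w') do |csv|
  csv << ['contact_name', 'message']
  csv << ['alice@example.org', 'Привет, как дела?']
  csv << ['bob@example.org', 'Пишу кривой классификатор на руби']
end

Point the filename inside insert_data at such a file (or at a real Gajim export) before running the classifier.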