Skip to content

Instantly share code, notes, and snippets.

@s-mage
Last active December 19, 2015 08:29
Show Gist options
  • Save s-mage/5925679 to your computer and use it in GitHub Desktop.
Save s-mage/5925679 to your computer and use it in GitHub Desktop.
mraztem -- bad copy of mystem with ruby and sqlite.
#!/usr/bin/env ruby
# encoding: utf-8
require 'sequel'
require 'msgpack'
require 'benchmark'
require 'ruby-prof'
require 'logger'
# Looks up the tag sets stored for +word+.
#
# The last SUFFIX_SIZE characters of the word (or the whole word when it is
# shorter) are expanded into every possible ending. Among the endings that
# exist in the suffixes table the longest one is used:
#   * if a stems row pairs that ending with exactly the rest of the word,
#     only the tags of those rows are returned (the word is known);
#   * otherwise the tags of every stems row sharing the ending are returned
#     (a guess based on the ending alone).
#
# @param word [String] the word form to analyze
# @return [Array] one MessagePack-decoded tags value per selected stems row;
#   empty when no ending of the word is present in the suffixes table
def analyze(word)
  max_suffix = word[-SUFFIX_SIZE..-1] || word
  # Pick the longest matching ending explicitly; relying on the order in
  # which the database happens to return rows would make the result arbitrary.
  word_suffix = DB[:suffixes]
    .where(suffix: possible_suffixes(max_suffix))
    .all
    .max_by { |row| row[:suffix].size }
  return [] unless word_suffix

  # Everything in front of the matched ending.
  stem = word[0..-(word_suffix[:suffix].size + 1)]
  stems = DB[:stems].where(suffix_id: word_suffix[:id]).all
  existing_word = stems.select { |x| x[:stem] == stem }
  selected = existing_word.empty? ? stems : existing_word
  selected.map { |x| MessagePack.unpack(x[:tags]) }
end
# Lists every ending of +word+, shortest first: the empty string, the last
# character, the last two characters, ... up to the whole word.
#
# @param word [String]
# @return [Array<String>] word.size + 1 endings
def possible_suffixes(word)
  [''] + (1..word.size).map { |length| word[-length..-1] }
end
# Maximum number of trailing characters of a word that analyze treats as a
# candidate ending.
SUFFIX_SIZE = 5
# Handle to the SQLite file queried by analyze.
DB = Sequel.sqlite 'data.db'
# The word to analyze is the first command-line argument. Without it analyze
# would receive nil and fail with NoMethodError, so stop with a usage line.
abort "usage: #{$PROGRAM_NAME} WORD" if ARGV.empty?
p analyze(ARGV.first)
#!/usr/bin/env ruby
# encoding: utf-8
require 'sequel'
require 'nokogiri'
require 'msgpack'
# Imports lemmas from the file at +path+ into the database.
#
# The file is read through IO.foreach so it is consumed line by line instead
# of being loaded whole. Lines are handled in batches of TRANSACTION_SIZE,
# each batch inside one database transaction. Only lines matching
# /<lemma id.*/ are parsed; each such line is parsed as an XML document on
# its own, so a lemma is assumed to occupy a single line.
#
# For every child node of the parsed root, its "t" attribute is collected as
# a form and the "v" attributes of its own children as that form's tag set;
# both lists are handed to put_into_db.
#
# @param path [String] path of the dictionary file
def find_lemmas(path)
  IO.foreach(path).each_slice(TRANSACTION_SIZE) do |batch|
    DB.transaction do
      batch.each do |line|
        next unless line =~ /<lemma id.*/
        nodes = Nokogiri::XML(line).children.first.children
        forms = nodes.map { |node| node[:t] }
        tagsets = nodes.map { |node| node.children.map { |x| x[:v] } }
        put_into_db forms, tagsets
      end
    end
  end
end
# Stores every word form of one lemma in the stems/suffixes tables.
#
# The first entry of +forms+ (the lemma itself) is dropped and the first
# entry of +tags+ is taken as the tag list common to all remaining forms;
# nothing is stored when +tags+ is empty. Each remaining form is split into
# a suffix (its last SUFFIX_SIZE characters, or the whole word when shorter)
# and a stem (everything before that). The suffix row is reused when it
# already exists and inserted otherwise. The stem row stores the common tags
# plus the form's own tags, serialized with MessagePack, as a blob.
#
# Both arguments are modified in place: entries are shifted off as they are
# consumed.
#
# @param forms [Array<String>] lemma followed by its word forms
# @param tags [Array<Array>] tag list of the lemma followed by one tag list
#   per word form, in the same order as +forms+
def put_into_db(forms, tags)
  forms.shift
  common_tags = tags.shift
  return unless common_tags

  suffixes = DB[:suffixes]
  forms.each do |word|
    stem = word[0...-SUFFIX_SIZE]
    suffix = word[-SUFFIX_SIZE..-1] || word
    suffix_id = suffixes.where(suffix: suffix).get(:id) ||
                suffixes.insert(suffix: suffix)
    packed_tags = (common_tags + tags.shift).to_msgpack
    DB[:stems].insert(stem: stem,
                      tags: Sequel.blob(packed_tags),
                      suffix_id: suffix_id)
  end
end
# Number of trailing characters split off each word form as its suffix.
SUFFIX_SIZE = 5
# Number of input lines processed per database transaction.
TRANSACTION_SIZE = 200
# Handle to the SQLite file that receives the imported data.
DB = Sequel.sqlite('data.db')
# Run the import over the dictionary export.
find_lemmas('corpus/files/export/dict/dict.opcorpora.xml')
#!/usr/bin/env ruby
# encoding: utf-8
# I'm sax-y and I know it.
#
require 'sax-machine'
# SAX mapping used for each <grammeme> element (see Grammeme): exposes the
# element's "parent" attribute as +parent+ and its value as +text+.
class GrammemeAttribute
  include SAXMachine

  attribute :parent
  value :text
end
# SAX mapping that collects every <grammeme> element into +grammemes+, each
# parsed as a GrammemeAttribute.
class Grammeme
  include SAXMachine

  elements :grammeme, as: :grammemes, class: GrammemeAttribute
end
# SAX mapping for a single tag element (used for <g> in WordForm): exposes
# the element's "v" attribute as +v+ and its value as +text+.
class Tags
  include SAXMachine

  attribute :v
  value :text
end
# SAX mapping for one word form: the "t" attribute is exposed as +word+, the
# <g> children are collected into +tags+ (each parsed as Tags), and the
# element's value is exposed as +text+.
class WordForm
  include SAXMachine

  attribute :t, as: :word
  elements :g, as: :tags, class: Tags
  value :text
end
# SAX mapping for one lemma: the <l> element becomes +initial_form+ and all
# <f> elements are collected into +forms+; both are parsed as WordForm.
class Lemma
  include SAXMachine

  element :l, as: :initial_form, class: WordForm
  elements :f, as: :forms, class: WordForm
end
# SAX mapping that collects every <lemma> element, each parsed as Lemma.
# No :as option is given, so the accessor name is left to sax-machine's
# default for the :lemma element.
class Lemmata
  include SAXMachine

  elements :lemma, class: Lemma
end
# Top-level SAX mapping: the <grammemes> element is parsed as Grammeme and
# the <lemmata> element as Lemmata.
class Dictionary
  include SAXMachine

  element :grammemes, class: Grammeme
  element :lemmata, class: Lemmata
end
#!/usr/bin/env ruby
# encoding: utf-8
require 'sequel'
# Creates the two tables in data.db unless they already exist.
DB = Sequel.sqlite('data.db')

# suffixes is created first because stems declares a foreign key to it.
DB.create_table?(:suffixes) do
  primary_key :id, autoincrement: true
  String :suffix
  # Rows are looked up by suffix text for every imported word form and for
  # every analyzed word; without an index each lookup scans the whole table.
  index :suffix
end

DB.create_table?(:stems) do
  primary_key :id, autoincrement: true
  String :stem
  # MessagePack-serialized tag list, stored as a blob.
  File :tags
  foreign_key :suffix_id, :suffixes
  # Stems are fetched by suffix_id when a word is analyzed.
  index :suffix_id
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment