Skip to content

Instantly share code, notes, and snippets.

@shouya
Created June 28, 2015 22:37
Show Gist options
  • Save shouya/99e60c0ce5800d5fd056 to your computer and use it in GitHub Desktop.
Save shouya/99e60c0ce5800d5fd056 to your computer and use it in GitHub Desktop.
an improved bigram word segmentating algorithm
def clean(txt)
mark = true
txt.each_char.map { |x|
if x.ascii_only? and mark
mark = false
' '
elsif x.ascii_only?
nil
else
mark = true
x
end
}.compact.map { |x|
x.nil? ? ' ' : x
}.join
end
def bigram(str)
str.split.map {|x| x.each_char.each_cons(2).to_a }.flatten(1)
end
def stat(bigram)
tbl = Hash.new
bigram.to_a.each do |a,b|
tbl[a] ||= Hash.new(0)
tbl[a][b] += 1
end
tbl
end
def maxfreq(tbl)
max = 0
tbl.each do |k1,v1|
v1.each do |k2,v2|
max = v2 if max < v2
end
end
max
end
def normalize(len, maxfreq, tbl)
new_tbl = Hash.new
rec = Math.log(len + 1)
norm = Math.log(maxfreq)/rec
tbl.each do |k1,v1|
v1.each do |k2,v2|
new_tbl[k1] ||= Hash.new(0)
reced = Math.log(v2.to_f)/rec
new_tbl[k1][k2] = reced/norm
end
end
new_tbl
end
def entropy(tbl, bigram)
ent_bef = {}
ent_aft = {}
bigram.each do |a,b|
ent_bef[a] = tbl[a].map {|k,v| v }.inject(&:+) if ent_bef[a].nil?
ent_aft[b] = tbl.map {|k,v| v[b] }.inject(&:+) if ent_bef[a].nil?
end
[ent_bef, ent_aft]
end
def segm(tbl, ent, txt, prec)
segs = []
buf = []
ent_bef, ent_aft = *ent
txt.each_char.each_cons(2) do |c1, c2|
buf << c1 unless c1 == ' '
if tbl[c1].nil?
segs << buf.join
buf = []
next
end
c1_freq = ent_bef[c1] || 0.000000000000001
c2_freq = ent_aft[c2] || 0.000000000000001
comp = tbl[c1][c2]
if comp == 0 or comp/c1_freq < prec or comp/c2_freq < prec
segs << buf.join
buf = []
end
end
segs.reject {|x| x.length == 0 }
end
require_relative 'ngram'
require 'json'
txt = clean(File.read('tweets.txt'))
cons = bigram(txt)
tbl = stat(cons)
max = maxfreq(tbl)
tbl = normalize(txt.length, max, tbl)
ent = entropy(tbl, cons)
segs = segm(tbl, ent, txt, 0.012)
p segs
File.write('./out.json', segs.to_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment