Skip to content

Instantly share code, notes, and snippets.

@makimoto
Created June 22, 2010 21:12
Show Gist options
  • Save makimoto/449097 to your computer and use it in GitHub Desktop.
Save makimoto/449097 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# PLSI implement in pure Ruby
# 'foolishly honest' translation from python impl. in http://satomacoto.blogspot.com/2009/10/pythonplsa.html
class PLSI
def initialize(n, nz=2)
# n: Document-Word matrix, nz: # latent variables
@n = n
@nz = nz
@nd = n.size
@nw = n[0].size
@pdwz = generate_array( @nd, @nw, @nz ) # P(z|d,w)
@pzw = generate_array( @nz, @nw ) # P(w|z)
@pzd = generate_array( @nz, @nd ) # P(d|z)
@pz = generate_array( @nz ) # P(z)
@pdw = generate_array( @nd, @nw ) # P(d,w)
end
def status_print
puts 'n:'
p @n
puts 'nz:'
p @nz
puts 'nd:'
p @nd
puts 'nw:'
p @nw
puts 'pdwz:'
p @pdwz
puts 'pzw:'
p @pzw
puts 'pzd:'
p @pzd
puts 'pz:'
p @pz
puts 'pdw:'
p @pdw
end
def train(k = 1000)
# iterate until converged
tmp = 0
k.times do |i|
e_step
m_step
l = likelihood
if (l - tmp).abs < 1.0e-10
break
else
tmp = l
end
end
end
def normalize(list)
# normalize sum of the array to 1.0
total = list.inject(0.0){|i,j| i+j }
return list.map{|i| i/total}
end
def generate_array(*n)
result = []
head = n.shift
if n.size > 0
head.times{|i| result.push generate_array(*n) }
return result
else
head.times{|i| result.push rand }
return normalize result
end
end
def likelihood
# P(d,w)
@nd.times do |d|
@nw.times do |w|
@pdw[d][w] = @nz.times.map{|z| @pz[z] * @pzd[z][d] * @pzw[z][w] }.inject(0.0){|i,j| i + j }
end
end
# Σ n(d,w) log P(d,w)
return @nw.times.inject(0.0){|t0,w|
t0 + @nd.times.inject(0.0){|t1,d|
# avoid Numerical argument out of domain in Math.log when arg is 0
log_pdw = @pdw[d][w] == 0.0 ? 0.0 : Math.log @pdw[d][w]
t1 + @n[d][w] * log_pdw
}
}
end
def e_step
# P(z|d,w)
@nd.times.each do |d|
@nw.times.each do |w|
@nz.times.each do |z|
@pdwz[d][w][z] = @pz[z] * @pzd[z][d] * @pzw[z][w]
end
@pdwz[d][w] = normalize(@pdwz[d][w])
end
end
end
def m_step
# P(w|z)
@nz.times do |z|
@nw.times do |w|
@pzw[z][w] = @nd.times.inject(0.0){|t,d| t + @n[d][w] * @pdwz[d][w][z]}
end
@pzw[z] = normalize(@pzw[z])
end
# P(d|z)
@nz.times do |z|
@nd.times do |d|
@pzd[z][d] = @nw.times.inject(0.0){|t,w| t + @n[d][w] * @pdwz[d][w][z] }
end
@pzd[z] = normalize(@pzd[z])
end
# P(z)
@nz.times do |z|
@pz[z] = @nw.times.inject(0.0){|t0,w|
t0 + @nd.times.inject(0.0){|t1,d|
t1 + @n[d][w] * @pdwz[d][w][z]
}
}
end
@pz = normalize(@pz)
end
end
n = [
[1,1,0,0],
[0,0,1,1],
[0,0,0,1]
]
plsi = PLSI.new n
plsi.train
plsi.status_print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment