Created
June 22, 2010 21:12
-
-
Save makimoto/449097 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# PLSI implement in pure Ruby | |
# 'foolishly honest' translation from python impl. in http://satomacoto.blogspot.com/2009/10/pythonplsa.html | |
class PLSI | |
def initialize(n, nz=2) | |
# n: Document-Word matrix, nz: # latent variables | |
@n = n | |
@nz = nz | |
@nd = n.size | |
@nw = n[0].size | |
@pdwz = generate_array( @nd, @nw, @nz ) # P(z|d,w) | |
@pzw = generate_array( @nz, @nw ) # P(w|z) | |
@pzd = generate_array( @nz, @nd ) # P(d|z) | |
@pz = generate_array( @nz ) # P(z) | |
@pdw = generate_array( @nd, @nw ) # P(d,w) | |
end | |
def status_print | |
puts 'n:' | |
p @n | |
puts 'nz:' | |
p @nz | |
puts 'nd:' | |
p @nd | |
puts 'nw:' | |
p @nw | |
puts 'pdwz:' | |
p @pdwz | |
puts 'pzw:' | |
p @pzw | |
puts 'pzd:' | |
p @pzd | |
puts 'pz:' | |
p @pz | |
puts 'pdw:' | |
p @pdw | |
end | |
def train(k = 1000) | |
# iterate until converged | |
tmp = 0 | |
k.times do |i| | |
e_step | |
m_step | |
l = likelihood | |
if (l - tmp).abs < 1.0e-10 | |
break | |
else | |
tmp = l | |
end | |
end | |
end | |
def normalize(list) | |
# normalize sum of the array to 1.0 | |
total = list.inject(0.0){|i,j| i+j } | |
return list.map{|i| i/total} | |
end | |
def generate_array(*n) | |
result = [] | |
head = n.shift | |
if n.size > 0 | |
head.times{|i| result.push generate_array(*n) } | |
return result | |
else | |
head.times{|i| result.push rand } | |
return normalize result | |
end | |
end | |
def likelihood | |
# P(d,w) | |
@nd.times do |d| | |
@nw.times do |w| | |
@pdw[d][w] = @nz.times.map{|z| @pz[z] * @pzd[z][d] * @pzw[z][w] }.inject(0.0){|i,j| i + j } | |
end | |
end | |
# Σ n(d,w) log P(d,w) | |
return @nw.times.inject(0.0){|t0,w| | |
t0 + @nd.times.inject(0.0){|t1,d| | |
# avoid Numerical argument out of domain in Math.log when arg is 0 | |
log_pdw = @pdw[d][w] == 0.0 ? 0.0 : Math.log @pdw[d][w] | |
t1 + @n[d][w] * log_pdw | |
} | |
} | |
end | |
def e_step | |
# P(z|d,w) | |
@nd.times.each do |d| | |
@nw.times.each do |w| | |
@nz.times.each do |z| | |
@pdwz[d][w][z] = @pz[z] * @pzd[z][d] * @pzw[z][w] | |
end | |
@pdwz[d][w] = normalize(@pdwz[d][w]) | |
end | |
end | |
end | |
def m_step | |
# P(w|z) | |
@nz.times do |z| | |
@nw.times do |w| | |
@pzw[z][w] = @nd.times.inject(0.0){|t,d| t + @n[d][w] * @pdwz[d][w][z]} | |
end | |
@pzw[z] = normalize(@pzw[z]) | |
end | |
# P(d|z) | |
@nz.times do |z| | |
@nd.times do |d| | |
@pzd[z][d] = @nw.times.inject(0.0){|t,w| t + @n[d][w] * @pdwz[d][w][z] } | |
end | |
@pzd[z] = normalize(@pzd[z]) | |
end | |
# P(z) | |
@nz.times do |z| | |
@pz[z] = @nw.times.inject(0.0){|t0,w| | |
t0 + @nd.times.inject(0.0){|t1,d| | |
t1 + @n[d][w] * @pdwz[d][w][z] | |
} | |
} | |
end | |
@pz = normalize(@pz) | |
end | |
end | |
n = [ | |
[1,1,0,0], | |
[0,0,1,1], | |
[0,0,0,1] | |
] | |
plsi = PLSI.new n | |
plsi.train | |
plsi.status_print |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment