Skip to content

Instantly share code, notes, and snippets.

@siddMahen
Created November 24, 2014 15:49
Show Gist options
  • Save siddMahen/05bce51c9efcabf21418 to your computer and use it in GitHub Desktop.
Save siddMahen/05bce51c9efcabf21418 to your computer and use it in GitHub Desktop.
# Extend Base I/O functions so they can be mapped over arrays of
# filenames / streams via the Julia 0.3-era @vectorize_1arg macro.
import Base.open
import Base.close
import Base.readlines
# NOTE(review): these vectorized methods (e.g. open(::Array{String})) are not
# used by the visible code below -- confirm they are needed before removing.
@vectorize_1arg String open
@vectorize_1arg IOStream close
@vectorize_1arg IOStream readlines
# map a dict into an array
# Apply f(key, value) to every entry of `d` and collect the results.
# FIX: the original pre-sized the result via `typeof(f(first(d)...))`, which
# (a) crashed on an empty Dict and (b) invoked `f` twice on the first entry
# (a problem if `f` has side effects). A comprehension avoids both while
# producing the same array for non-empty input.
function dict_map(f::Function, d::Dict)
    [f(k, v) for (k, v) in d]
end
# map a dict into a value
# Map f over (key, value) pairs of `d` and fold the results with `op`.
# BUG FIX: the original body referenced an undefined global `dict` instead of
# the argument `d`, so every call raised UndefVarError.
function dict_mapreduce(f::Function, op::Function, d::Dict)
    reduce(op, [f(k, v) for (k, v) in d])
end
# Root of the classifier type hierarchy (Julia 0.3 `abstract` syntax).
abstract NLPClassifier
# A document represented as a bag of words: word => occurrence count.
# FIX: `immutable type` is not valid Julia 0.3 syntax; the keyword is just
# `immutable`.
immutable NLPDocument
    bow::Dict{String,Uint}
end
# Re-wrap an existing document's bag of words.
# NOTE(review): this shares (does not copy) the underlying Dict -- confirm
# aliasing is intended.
NLPDocument(doc::NLPDocument) = NLPDocument(doc.bow)
# Merge two documents by summing per-word counts.
# BUG FIXES:
#  * the local was named `keys`, shadowing the `keys` function used on its
#    own right-hand side (UndefVarError at runtime);
#  * `get(a, key, 0)` / `get(b, key, 0)` called get() on the documents
#    themselves instead of their `.bow` dicts.
function NLPDocument(a::NLPDocument, b::NLPDocument)
    bow = Dict{String,Uint}()
    allkeys = union(collect(keys(a.bow)), collect(keys(b.bow)))
    for key in allkeys
        bow[key] = get(a.bow, key, 0) + get(b.bow, key, 0)
    end
    NLPDocument(bow)
end
# Fold an array of documents into a single merged document.
# (Throws on an empty array, exactly like the original reduce.)
NLPDocument(docs::Array{NLPDocument}) = NLPDocument(reduce(merge, docs))
# Build a bag-of-words document from raw text, tokenizing on whitespace.
# PERF FIX: the original ran count() over the whole token list once per
# unique word (O(n^2)); a single counting pass produces the same dict in
# O(n). It also handles the empty string (empty bag) for free.
function NLPDocument(doc::String)
    tokens = split(doc)
    bow = Dict{String,Uint}()
    sizehint(bow, length(tokens))
    for word in tokens
        bow[word] = get(bow, word, 0) + 1
    end
    NLPDocument(bow)
end
# Build a document from an array of text chunks (e.g. lines from readlines,
# which keep their trailing newlines, so plain concatenation is safe).
# FIX: reduce(*, docs) was O(n^2) in total text length and threw on an empty
# array; join is O(n) and yields "" (hence an empty bag) for empty input.
function NLPDocument{T <: String}(docs::Array{T})
    NLPDocument(join(docs, ""))
end
# Build a document from an open stream.
# NOTE(review): readline() consumes only the FIRST line of the stream; if the
# whole file is intended this should probably read everything -- confirm
# against callers before changing.
function NLPDocument(handle::IOStream)
str = readline(handle)
NLPDocument(str)
end
# Combine two documents; dispatches to the count-summing two-arg constructor.
# (Defines a new generic `merge` in this module; Base.merge is not imported.)
merge(a::NLPDocument, b::NLPDocument) = NLPDocument(a, b)
# Total number of tokens in the document: the sum of per-word counts.
# FIX: dropped the invalid `const` on a local binding (const applies to
# globals, not locals). Shadows Base.size for this type only.
function size(d::NLPDocument)
    dict_mapreduce(+, d.bow) do k, v
        v
    end
end
# All distinct words in the document.
# FIX: collect(keys(...)) yields the same array (same iteration order) as
# the original dict_map over (k,v)->k, but also works for an empty bag
# (dict_map derived its eltype from first(d), crashing when empty), and
# drops the invalid local `const`.
function words(d::NLPDocument)
    collect(keys(d.bow))
end
# A training class: its name, document count, and the merged mega-document.
# FIX: `immutable type` is not valid syntax; the keyword is just `immutable`.
immutable NLPClass
    name::String           # human readable class name
    size::Uint             # number of docs in the class
    composite::NLPDocument # mega-doc of all documents in the class
end
# Build a class named `n` from an array of already-parsed documents.
function NLPClass(n::String, docs::Array{NLPDocument})
    NLPClass(n, length(docs), reduce(merge, docs))
end
# Build a class from raw text, with an explicitly supplied document count.
NLPClass(n::String, size::Uint, doc::String) = NLPClass(n, size, NLPDocument(doc))
# Build a class named `n` from an array of raw-text documents (one per entry).
function NLPClass{T<:String}(n::String, docs::Array{T})
    composite = NLPDocument(docs)
    NLPClass(n, length(docs), composite)
end
# Naive Bayes classifier: the trained classes plus smoothing weight `alpha`.
# FIX: `immutable type` is not valid syntax; the keyword is just `immutable`.
immutable NLPNaiveBayesClassifier <: NLPClassifier
    classes::Array{NLPClass}
    alpha::Float64 # additive (Laplace-style) smoothing weight, default 1.0
    NLPNaiveBayesClassifier(c::Array{NLPClass}, a::Float64=1.0) = new(c,a)
end
# Convenience constructor: accept classes as individual arguments.
# BUG FIX: `vcat(class, classes)` passed the varargs *tuple* as a single
# element, so the result was not an Array{NLPClass} and inner-constructor
# dispatch failed; splatting builds the flat array the constructor expects.
function NLPNaiveBayesClassifier(class::NLPClass, classes::NLPClass...)
    NLPNaiveBayesClassifier(vcat(class, classes...))
end
# Build a classifier from a Dict of class name => text-file path; each file
# contributes one class (one document per line).
# FIX (resource leak): the original only closed the handle on the success
# path -- an exception in readlines/sanitize!/NLPClass left it open. The
# close now runs in `finally`.
function NLPNaiveBayesClassifier{T <: String}(files::Dict{T,T}, a::Float64=1.0)
    classes = dict_map(files) do name, file
        handle = open(file)
        try
            data = sanitize!(readlines(handle))
            # TODO: Need room for stemming? For large data sets, doesn't really
            # improve performance.
            NLPClass(name, data)
        finally
            close(handle)
        end
    end
    NLPNaiveBayesClassifier(classes, a)
end
# Tiny floor value used for words missing from a likelihood/bag lookup, so
# log() never sees 0. NOTE(review): 1e-9 appears to be an arbitrary choice.
const unknown = 0.000000001
# Log-prior of each class: log(P(class)) = log(class.size) - log(total docs).
# Returns a Dict{NLPClass,Float64}.
function priors(c::NLPNaiveBayesClassifier)
    total = 0
    for class in c.classes
        total += class.size
    end
    logtotal = log(total)
    p = Dict{NLPClass,Float64}()
    sizehint(p, length(c.classes))
    for class in c.classes
        p[class] = log(class.size) - logtotal
    end
    p
end
# The combined vocabulary: union of words across every class's composite
# document. Returns the tuple (word array, vocabulary size).
function vocab(c::NLPNaiveBayesClassifier)
    wordlists = [words(class.composite) for class in c.classes]
    v = reduce(union, wordlists)
    (v, length(v))
end
# Per-class log-likelihood of every vocabulary word:
#   log(count(word in class) + alpha) - log(class.size - alpha*|V|)
# Returns Dict{NLPClass, Dict{String,Float64}}.
# BUG FIXES:
#  * `sizehint(l, vlen)` hinted the OUTER dict; the hint belongs on the
#    freshly created inner per-word dict;
#  * the guard was `prelsize < 0`, letting prelsize == 0 reach log(0) = -Inf.
# NOTE(review): subtracting alpha*vlen in the denominator is unusual for
# additive smoothing (normally it is added) -- preserved as-is; confirm.
function likelihoods(c::NLPNaiveBayesClassifier)
    (v, vlen) = vocab(c)
    l = Dict{NLPClass,Dict{String,Float64}}()
    sizehint(l, length(c.classes))
    for class in c.classes
        l[class] = Dict{String,Float64}()
        sizehint(l[class], vlen)
        prelsize = class.size - c.alpha*vlen
        if prelsize <= 0
            # This could happen on very small data sets, where the number
            # of words in the combined classes exceeds the number of
            # documents in the class. In practice, this does not happen;
            # if you have more words than documents, the algorithm isn't
            # doing much learning
            prelsize = 1
        end
        lsize = log(prelsize)
        bow = class.composite.bow
        for word in v
            l[class][word] = log(get(bow,word,unknown) + c.alpha) - lsize
        end
    end
    l
end
# probably need to rename
# One trained class record: (class, log-prior, word => log-likelihood map).
# (Julia 0.3 tuple-type syntax.)
typealias NLPTrainingData (NLPClass, Float64, Dict{String,Float64})
# Train the classifier: pair each class with its log-prior and its per-word
# log-likelihood table. Returns an array of NLPTrainingData tuples.
function train(c::NLPNaiveBayesClassifier)
    prior = priors(c)
    lik = likelihoods(c)
    [(class, prior[class], lik[class]) for class in c.classes]
end
# Returns a closure that maps an NLPDocument to its most probable class.
# BUG FIXES:
#  * the running maximum started at 0, but scores are sums of LOG
#    probabilities and therefore negative -- no class could ever win and the
#    closure always returned the integer sentinel 0 (crashing callers that
#    access .name). It now starts at -Inf;
#  * removed a leftover debug println of the likelihood table;
#  * empty training data now fails loudly instead of returning 0.
function classify(data::Array{NLPTrainingData})
    isempty(data) && error("classify: no training data")
    let data = data
        return function(d::NLPDocument)
            best = -Inf
            bestclass = data[1][1]
            for (class, prior, likelihood) in data
                # TODO(review): each word contributes once regardless of its
                # count v; standard naive Bayes would weight by v. Preserved.
                p = prior + sum(dict_map(d.bow) do k, v
                    get(likelihood, k, unknown)
                end)
                if p > best
                    best = p
                    bestclass = class
                end
            end
            bestclass
        end
    end
end
# Lowercase every string and strip punctuation characters.
# NOTE(review): despite the `!`, this does NOT mutate `data` in place -- it
# returns a new array (map rebinds a local). Name kept for compatibility.
# FIX: the original listed "'" twice ("'" and "\'" are the same string), so
# one redundant full pass over the data has been removed; the remaining
# transformations are applied per-string instead of re-mapping the whole
# array once per punctuation mark.
function sanitize!{T <: String}(data::Array{T})
    marks = ["'", ".", "!", "?", ",", ":", ";", "\"", "(", ")", "[", "]"]
    map(data) do str
        s = lowercase(str)
        for m in marks
            s = replace(s, m, "")
        end
        s
    end
end
# Demo script: expects safari.txt and ocean.txt (one document per line) in
# the working directory. (Julia 0.3 Dict-literal syntax.)
files = [
"safari" => "safari.txt",
"ocean" => "ocean.txt"
]
# Train a two-class naive Bayes model, then classify a test sentence and
# print the name of the winning class.
classifier = NLPNaiveBayesClassifier(files)
data = train(classifier)
h = classify(data)
doc = NLPDocument("lion and jaguars are both large predatory cats.")
println(h(doc).name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment