Created
November 24, 2014 15:49
-
-
Save siddMahen/05bce51c9efcabf21418 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bring the scalar file operations into this module so the (0.3-era)
# @vectorize_1arg macro can attach Array-accepting methods to them.
import Base.open
import Base.close
import Base.readlines
# Vectorized forms: open(::Array{String}), close/readlines(::Array{IOStream}).
# NOTE(review): @vectorize_1arg was removed in later Julia versions in
# favor of dot-broadcasting (open.(files)).
@vectorize_1arg String open
@vectorize_1arg IOStream close
@vectorize_1arg IOStream readlines
# map a dict into an array
# Apply `f(key, value)` to every entry of `d`, collecting the results in
# iteration order. Unlike the original `first(d)`-based eltype probe,
# the comprehension also handles an empty dict (returns an empty array
# instead of throwing on `first`).
function dict_map(f::Function, d::Dict)
    [f(k, v) for (k, v) in d]
end
# map a dict into a value
# Map `f(key, value)` over `d` and fold the results with `op`.
function dict_mapreduce(f::Function, op::Function, d::Dict)
    # BUG FIX: the original reduced over an undefined global `dict`
    # instead of the parameter `d` (UndefVarError at first call).
    reduce(op, dict_map(f, d))
end
# Root of the classifier hierarchy; concrete classifiers subtype this.
abstract NLPClassifier
# A document reduced to a bag-of-words: word => occurrence count.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPDocument
    bow::Dict{String,Uint}
end
# Copy-style constructor: wrap the same bag-of-words in a new document.
function NLPDocument(doc::NLPDocument)
    NLPDocument(doc.bow)
end
# Merge two documents by summing word counts over the union of their
# vocabularies.
function NLPDocument(a::NLPDocument, b::NLPDocument)
    bow = Dict{String,Uint}()
    # BUG FIX: the original assigned to a local named `keys`, which
    # shadowed Base.keys on the same line it was called (UndefVarError).
    allkeys = union(collect(keys(a.bow)), collect(keys(b.bow)))
    for key in allkeys
        # BUG FIX: `get` must look inside the bag-of-words dicts; the
        # original called get(a, key, 0) on the NLPDocument itself.
        bow[key] = get(a.bow, key, 0) + get(b.bow, key, 0)
    end
    NLPDocument(bow)
end
# Fold an array of documents into one by merging word counts pairwise.
function NLPDocument(docs::Array{NLPDocument})
    combined = reduce(merge, docs)
    NLPDocument(combined)
end
# Tokenize a raw string (whitespace-split) into a bag-of-words document.
function NLPDocument(doc::String)
    tokens = split(doc)
    bow = Dict{String,Uint}()
    sizehint(bow, length(tokens))
    # Single-pass tally. The original re-scanned the whole token list
    # with count() for every unique word — accidental O(n^2).
    for tok in tokens
        bow[tok] = get(bow, tok, 0) + 1
    end
    NLPDocument(bow)
end
# Concatenate an array of raw strings into one document.
# NOTE(review): no separator is inserted between elements — callers feed
# readlines output, where each line keeps its trailing newline; confirm
# before reusing with pre-stripped strings.
function NLPDocument{T <: String}(docs::Array{T})
    # join() produces the identical concatenation as reduce(*, docs)
    # but in linear time instead of O(n^2) string copying.
    str = join(docs)
    NLPDocument(str)
end
# Build a document from the next line of an open stream.
# NOTE(review): only ONE line is consumed — if the whole file is
# intended, readall would be needed; confirm against callers.
function NLPDocument(handle::IOStream)
    NLPDocument(readline(handle))
end
# Combine two documents by summing their word counts.
# NOTE(review): this shadows Base.merge rather than extending it (there
# is no `import Base.merge` above) — presumably intentional; confirm.
merge(a::NLPDocument, b::NLPDocument) = NLPDocument(a, b)
# Total number of tokens in the document: the sum of all word counts.
# (Shadows Base.size for this type rather than extending it.)
function size(d::NLPDocument)
    dict_mapreduce((k, v) -> v, +, d.bow)
end
# All distinct words appearing in the document.
function words(d::NLPDocument)
    dict_map((k, v) -> k, d.bow)
end
# A labeled class of training documents.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPClass
    name::String           # human readable class name
    size::Uint             # number of docs in the class
    composite::NLPDocument # mega-doc of all documents in the class
end
# Build a class from pre-tokenized documents; the composite is the
# merge of all of them, and the class size is the document count.
function NLPClass(n::String, docs::Array{NLPDocument})
    composite = reduce(merge, docs)
    NLPClass(n, length(docs), composite)
end
# Build a class from one raw-text blob plus an explicit document count.
function NLPClass(n::String, size::Uint, doc::String)
    NLPClass(n, size, NLPDocument(doc))
end
# Build a class from an array of raw strings (e.g. lines of a file);
# each string counts as one document.
function NLPClass{T<:String}(n::String, docs::Array{T})
    composite = NLPDocument(docs)
    NLPClass(n, length(docs), composite)
end
# Naive-Bayes classifier: a set of classes plus the additive-smoothing
# pseudo-count alpha.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPNaiveBayesClassifier <: NLPClassifier
    classes::Array{NLPClass}
    alpha::Float64 # Laplace/additive smoothing constant (default 1.0)
    NLPNaiveBayesClassifier(c::Array{NLPClass}, a::Float64=1.0) = new(c,a)
end
# Convenience constructor: build a classifier from classes given as
# individual arguments.
function NLPNaiveBayesClassifier(class::NLPClass, classes::NLPClass...)
    # BUG FIX: `classes` is a varargs *tuple*; it must be splatted —
    # vcat(class, classes) has no method for (NLPClass, Tuple).
    NLPNaiveBayesClassifier(vcat(class, classes...))
end
# Build a classifier from a dict of class-name => filename; each file's
# lines (after sanitizing) become the documents of that class.
function NLPNaiveBayesClassifier{T <: String}(files::Dict{T,T}, a::Float64=1.0)
    classes = dict_map(files) do name, path
        io = open(path)
        lines = sanitize!(readlines(io))
        close(io)
        # TODO: Need room for stemming? For large data sets, doesn't really
        # improve performance.
        NLPClass(name, lines)
    end
    NLPNaiveBayesClassifier(classes, a)
end
# Tiny pseudo-frequency assigned to words never seen in a class.
const unknown = 0.000000001
# returns a dict of type NLPClass => Float64
# Log-prior of each class: log(class doc count) - log(total doc count).
function priors(c::NLPNaiveBayesClassifier)
    total = mapreduce(class -> class.size, +, c.classes)
    logtotal = log(total)
    p = Dict{NLPClass,Float64}()
    sizehint(p, length(c.classes))
    for class in c.classes
        p[class] = log(class.size) - logtotal
    end
    p
end
# Joint vocabulary across all classes, returned with its size as
# (words, length).
function vocab(c::NLPNaiveBayesClassifier)
    v = mapreduce(class -> words(class.composite), union, c.classes)
    (v, length(v))
end
# Per-class log-likelihood table: word => log P(word | class), smoothed
# with the classifier's alpha over the joint vocabulary.
function likelihoods(c::NLPNaiveBayesClassifier)
    (v, vlen) = vocab(c)
    l = Dict{NLPClass,Dict{String,Float64}}()
    sizehint(l, length(c.classes))
    for class in c.classes
        l[class] = Dict{String,Float64}()
        # BUG FIX: the hint belongs on the inner per-word table; the
        # original re-hinted the outer dict (one entry per class) to vlen.
        sizehint(l[class], vlen)
        # NOTE(review): a conventional additive-smoothing denominator is
        # class.size + alpha*vlen; the subtraction below is the author's
        # deliberate choice (see comment) — confirm before reuse.
        prelsize = class.size - c.alpha*vlen
        if prelsize < 0
            # This could happen on very small data sets, where the number
            # of words in the combined classes exceeds the number of
            # documents in the class. In practice, this does not happen;
            # if you have more words than documents, the algorithm isn't
            # doing much learning
            prelsize = 1
        end
        lsize = log(prelsize)
        bow = class.composite.bow
        for word in v
            # unseen words fall back to the tiny `unknown` pseudo-frequency
            l[class][word] = log(get(bow,word,unknown) + c.alpha) - lsize
        end
    end
    l
end
# probably need to rename
# One trained class: (class, log-prior, word => log-likelihood table).
typealias NLPTrainingData (NLPClass, Float64, Dict{String,Float64})
# Train the classifier: pair every class with its log-prior and its
# per-word log-likelihood table.
function train(c::NLPNaiveBayesClassifier)
    ps = priors(c)
    ls = likelihoods(c)
    [(class, ps[class], ls[class]) for class in c.classes]
end
# Return a closure mapping an NLPDocument to its most probable class
# under the trained data.
function classify(data::Array{NLPTrainingData})
    let data = data
        return function(d::NLPDocument)
            # BUG FIX: the running maximum must start at -Inf — log
            # posteriors are negative, so the original `max = 0` was
            # never beaten and the Int 0 was always returned (making
            # `h(doc).name` at the call site error).
            best = -Inf
            bestclass = nothing
            for (class, prior, likelihood) in data
                # log posterior ∝ log prior + Σ log P(word | class).
                # NOTE(review): each distinct word is counted once,
                # ignoring its frequency v — confirm this is intended.
                p = prior + sum(dict_map(d.bow) do k, v
                    get(likelihood, k, unknown)
                end)
                if p > best
                    best = p
                    bestclass = class
                end
            end
            bestclass
        end
    end
end
# Lowercase every string and strip punctuation: ' . ! ? , : ; " ( ) [ ]
# NOTE(review): despite the `!`, the input array is NOT mutated — the
# cleaned strings come back in a new array (map allocates).
function sanitize!{T <: String}(data::Array{T})
    # One regex pass per string replaces the original's 14 chained
    # replace calls (which also listed the apostrophe rule twice).
    punctuation = r"['.!?,:;\"()\[\]]"
    map(str -> replace(lowercase(str), punctuation, ""), data)
end
# --- demo script ---
# Train a two-class classifier from text files on disk, then classify a
# fresh sentence. Requires safari.txt and ocean.txt in the working
# directory. (Uses the 0.3-era `[k => v]` Dict-literal syntax.)
files = [
    "safari" => "safari.txt",
    "ocean" => "ocean.txt"
]
classifier = NLPNaiveBayesClassifier(files)
data = train(classifier)
h = classify(data)
doc = NLPDocument("lion and jaguars are both large predatory cats.")
println(h(doc).name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment