Created
November 24, 2014 15:49
-
-
Save siddMahen/05bce51c9efcabf21418 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bring the scalar file operations into this module so the (0.3-era)
# @vectorize_1arg macro can attach Array-accepting methods to them.
import Base.open
import Base.close
import Base.readlines
# Vectorized forms: open(::Array{String}), close/readlines(::Array{IOStream}).
# NOTE(review): @vectorize_1arg was removed in later Julia versions in
# favor of dot-broadcasting (open.(files)).
@vectorize_1arg String open
@vectorize_1arg IOStream close
@vectorize_1arg IOStream readlines
# map a dict into an array
# Apply `f(key, value)` to every entry of `d`, collecting the results in
# iteration order. Unlike the original `first(d)`-based eltype probe,
# the comprehension also handles an empty dict (returns an empty array
# instead of throwing on `first`).
function dict_map(f::Function, d::Dict)
    [f(k, v) for (k, v) in d]
end
# map a dict into a value
# Map `f(key, value)` over `d` and fold the results with `op`.
function dict_mapreduce(f::Function, op::Function, d::Dict)
    # BUG FIX: the original reduced over an undefined global `dict`
    # instead of the parameter `d` (UndefVarError at first call).
    reduce(op, dict_map(f, d))
end
# Root of the classifier hierarchy; concrete classifiers subtype this.
abstract NLPClassifier
# A document reduced to a bag-of-words: word => occurrence count.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPDocument
    bow::Dict{String,Uint}
end
# Copy-style constructor: wrap the same bag-of-words in a new document.
function NLPDocument(doc::NLPDocument)
    NLPDocument(doc.bow)
end
# Merge two documents by summing word counts over the union of their
# vocabularies.
function NLPDocument(a::NLPDocument, b::NLPDocument)
    bow = Dict{String,Uint}()
    # BUG FIX: the original assigned to a local named `keys`, which
    # shadowed Base.keys on the same line it was called (UndefVarError).
    allkeys = union(collect(keys(a.bow)), collect(keys(b.bow)))
    for key in allkeys
        # BUG FIX: `get` must look inside the bag-of-words dicts; the
        # original called get(a, key, 0) on the NLPDocument itself.
        bow[key] = get(a.bow, key, 0) + get(b.bow, key, 0)
    end
    NLPDocument(bow)
end
# Fold an array of documents into one by merging word counts pairwise.
function NLPDocument(docs::Array{NLPDocument})
    combined = reduce(merge, docs)
    NLPDocument(combined)
end
# Tokenize a raw string (whitespace-split) into a bag-of-words document.
function NLPDocument(doc::String)
    tokens = split(doc)
    bow = Dict{String,Uint}()
    sizehint(bow, length(tokens))
    # Single-pass tally. The original re-scanned the whole token list
    # with count() for every unique word — accidental O(n^2).
    for tok in tokens
        bow[tok] = get(bow, tok, 0) + 1
    end
    NLPDocument(bow)
end
# Concatenate an array of raw strings into one document.
# NOTE(review): no separator is inserted between elements — callers feed
# readlines output, where each line keeps its trailing newline; confirm
# before reusing with pre-stripped strings.
function NLPDocument{T <: String}(docs::Array{T})
    # join() produces the identical concatenation as reduce(*, docs)
    # but in linear time instead of O(n^2) string copying.
    str = join(docs)
    NLPDocument(str)
end
# Build a document from the next line of an open stream.
# NOTE(review): only ONE line is consumed — if the whole file is
# intended, readall would be needed; confirm against callers.
function NLPDocument(handle::IOStream)
    NLPDocument(readline(handle))
end
# Combine two documents by summing their word counts.
# NOTE(review): this shadows Base.merge rather than extending it (there
# is no `import Base.merge` above) — presumably intentional; confirm.
merge(a::NLPDocument, b::NLPDocument) = NLPDocument(a, b)
# Total number of tokens in the document: the sum of all word counts.
# (Shadows Base.size for this type rather than extending it.)
function size(d::NLPDocument)
    dict_mapreduce((k, v) -> v, +, d.bow)
end
# All distinct words appearing in the document.
function words(d::NLPDocument)
    dict_map((k, v) -> k, d.bow)
end
# A labeled class of training documents.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPClass
    name::String           # human readable class name
    size::Uint             # number of docs in the class
    composite::NLPDocument # mega-doc of all documents in the class
end
# Build a class from pre-tokenized documents; the composite is the
# merge of all of them, and the class size is the document count.
function NLPClass(n::String, docs::Array{NLPDocument})
    composite = reduce(merge, docs)
    NLPClass(n, length(docs), composite)
end
# Build a class from one raw-text blob plus an explicit document count.
function NLPClass(n::String, size::Uint, doc::String)
    NLPClass(n, size, NLPDocument(doc))
end
# Build a class from an array of raw strings (e.g. lines of a file);
# each string counts as one document.
function NLPClass{T<:String}(n::String, docs::Array{T})
    composite = NLPDocument(docs)
    NLPClass(n, length(docs), composite)
end
# Naive-Bayes classifier: a set of classes plus the additive-smoothing
# pseudo-count alpha.
# BUG FIX: `immutable type` is not valid Julia syntax — 0.3 spells it
# plain `immutable Name`.
immutable NLPNaiveBayesClassifier <: NLPClassifier
    classes::Array{NLPClass}
    alpha::Float64 # Laplace/additive smoothing constant (default 1.0)
    NLPNaiveBayesClassifier(c::Array{NLPClass}, a::Float64=1.0) = new(c,a)
end
# Convenience constructor: build a classifier from classes given as
# individual arguments.
function NLPNaiveBayesClassifier(class::NLPClass, classes::NLPClass...)
    # BUG FIX: `classes` is a varargs *tuple*; it must be splatted —
    # vcat(class, classes) has no method for (NLPClass, Tuple).
    NLPNaiveBayesClassifier(vcat(class, classes...))
end
# Build a classifier from a dict of class-name => filename; each file's
# lines (after sanitizing) become the documents of that class.
function NLPNaiveBayesClassifier{T <: String}(files::Dict{T,T}, a::Float64=1.0)
    classes = dict_map(files) do name, path
        io = open(path)
        lines = sanitize!(readlines(io))
        close(io)
        # TODO: Need room for stemming? For large data sets, doesn't really
        # improve performance.
        NLPClass(name, lines)
    end
    NLPNaiveBayesClassifier(classes, a)
end
# Tiny pseudo-frequency assigned to words never seen in a class.
const unknown = 0.000000001
# returns a dict of type NLPClass => Float64
# Log-prior of each class: log(class doc count) - log(total doc count).
function priors(c::NLPNaiveBayesClassifier)
    total = mapreduce(class -> class.size, +, c.classes)
    logtotal = log(total)
    p = Dict{NLPClass,Float64}()
    sizehint(p, length(c.classes))
    for class in c.classes
        p[class] = log(class.size) - logtotal
    end
    p
end
# Joint vocabulary across all classes, returned with its size as
# (words, length).
function vocab(c::NLPNaiveBayesClassifier)
    v = mapreduce(class -> words(class.composite), union, c.classes)
    (v, length(v))
end
# Per-class log-likelihood table: word => log P(word | class), smoothed
# with the classifier's alpha over the joint vocabulary.
function likelihoods(c::NLPNaiveBayesClassifier)
    (v, vlen) = vocab(c)
    l = Dict{NLPClass,Dict{String,Float64}}()
    sizehint(l, length(c.classes))
    for class in c.classes
        l[class] = Dict{String,Float64}()
        # BUG FIX: the hint belongs on the inner per-word table; the
        # original re-hinted the outer dict (one entry per class) to vlen.
        sizehint(l[class], vlen)
        # NOTE(review): a conventional additive-smoothing denominator is
        # class.size + alpha*vlen; the subtraction below is the author's
        # deliberate choice (see comment) — confirm before reuse.
        prelsize = class.size - c.alpha*vlen
        if prelsize < 0
            # This could happen on very small data sets, where the number
            # of words in the combined classes exceeds the number of
            # documents in the class. In practice, this does not happen;
            # if you have more words than documents, the algorithm isn't
            # doing much learning
            prelsize = 1
        end
        lsize = log(prelsize)
        bow = class.composite.bow
        for word in v
            # unseen words fall back to the tiny `unknown` pseudo-frequency
            l[class][word] = log(get(bow,word,unknown) + c.alpha) - lsize
        end
    end
    l
end
# probably need to rename
# One trained class: (class, log-prior, word => log-likelihood table).
typealias NLPTrainingData (NLPClass, Float64, Dict{String,Float64})
# Train the classifier: pair every class with its log-prior and its
# per-word log-likelihood table.
function train(c::NLPNaiveBayesClassifier)
    ps = priors(c)
    ls = likelihoods(c)
    [(class, ps[class], ls[class]) for class in c.classes]
end
# Return a closure mapping an NLPDocument to its most probable class
# under the trained data.
function classify(data::Array{NLPTrainingData})
    let data = data
        return function(d::NLPDocument)
            # BUG FIX: the running maximum must start at -Inf — log
            # posteriors are negative, so the original `max = 0` was
            # never beaten and the Int 0 was always returned (making
            # `h(doc).name` at the call site error).
            best = -Inf
            bestclass = nothing
            for (class, prior, likelihood) in data
                # log posterior ∝ log prior + Σ log P(word | class).
                # NOTE(review): each distinct word is counted once,
                # ignoring its frequency v — confirm this is intended.
                p = prior + sum(dict_map(d.bow) do k, v
                    get(likelihood, k, unknown)
                end)
                if p > best
                    best = p
                    bestclass = class
                end
            end
            bestclass
        end
    end
end
# Lowercase every string and strip punctuation: ' . ! ? , : ; " ( ) [ ]
# NOTE(review): despite the `!`, the input array is NOT mutated — the
# cleaned strings come back in a new array (map allocates).
function sanitize!{T <: String}(data::Array{T})
    # One regex pass per string replaces the original's 14 chained
    # replace calls (which also listed the apostrophe rule twice).
    punctuation = r"['.!?,:;\"()\[\]]"
    map(str -> replace(lowercase(str), punctuation, ""), data)
end
# --- demo script ---
# Train a two-class classifier from text files on disk, then classify a
# fresh sentence. Requires safari.txt and ocean.txt in the working
# directory. (Uses the 0.3-era `[k => v]` Dict-literal syntax.)
files = [
    "safari" => "safari.txt",
    "ocean" => "ocean.txt"
]
classifier = NLPNaiveBayesClassifier(files)
data = train(classifier)
h = classify(data)
doc = NLPDocument("lion and jaguars are both large predatory cats.")
println(h(doc).name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment