Skip to content

Instantly share code, notes, and snippets.

@tokestermw
Last active August 29, 2015 14:01
Show Gist options
  • Save tokestermw/3df2198d4f8353c2a5df to your computer and use it in GitHub Desktop.
naive naive bayes in Julia
module GetData

# A "ragged matrix": a vector of variable-length vectors.
const RaggedMatrix{T} = Vector{Vector{T}}

"""
    parse(filename)

Read a tab-separated file where each line is `label<TAB>term1<TAB>term2...`.

Return `(x, y)` where `x::RaggedMatrix{String}` holds the terms of each
line (everything after the first field, whitespace-stripped) and
`y::Vector{String}` holds the first field (the label) of each line.
"""
function parse(filename)
    # readlines(filename) opens and closes the file itself (the old
    # readlines(open(filename)) form leaked the file handle).
    lines = readlines(filename)
    x = convert(RaggedMatrix{String},
                [[String(rstrip(term)) for term in split(line, '\t')[2:end]]
                 for line in lines])
    y = [String(split(line, '\t')[1]) for line in lines]
    return (x, y)
end

export parse, RaggedMatrix

end
# data from Yong-Yeol "YY" Ahn
# Flavor network and the principles of food pairing
# http://yongyeol.com/data/scirep-cuisines-detail.zip
using GetData

filename = "data/scirep-cuisines-detail/epic_recipes.txt"
x, y = GetData.parse(filename)

# Map each cuisine label to a dense 1-based integer id, and back.
all_cuisines = unique(y)
cuisines = Dict{String,Int64}(c => i for (i, c) in enumerate(all_cuisines))
rev_cuisines = Dict{Int64,String}(i => c for (c, i) in cuisines)

# Same dense-id mapping for ingredients (flatten all recipes first).
all_ingredients = unique(reduce(vcat, x))
ingredients = Dict{String,Int64}(t => i for (i, t) in enumerate(all_ingredients))
rev_ingredients = Dict{Int64,String}(i => t for (t, i) in ingredients)
"""
    count_vectorizer(x, y, features, classes)

Build a `length(features) x length(classes)` count matrix `X` where
`X[i, j]` is the number of occurrences of feature `i` across all samples
labeled with class `j`.

- `x`: one vector of feature strings per sample (a ragged matrix).
- `y`: class label per sample, aligned with `x`.
- `features` / `classes`: string -> 1-based column/row index maps;
  every feature in `x` and every class in `y` must appear as a key.
"""
function count_vectorizer(x::Vector{Vector{String}}, y::Array{String,1},
                          features::Dict{String,Int64}, classes::Dict{String,Int64})
    X = zeros(length(features), length(classes))
    for (c, j) in classes
        # All samples labeled c, flattened into one stream of features.
        for sample in x[findall(==(c), y)]
            for f in sample
                @inbounds X[features[f], j] += 1.0
            end
        end
    end
    return X
end
# Build the ingredient-by-cuisine count matrix from the training data.
# Timing below was measured on the author's machine; @timed also returns
# timing stats, but only the assignment to X is used afterwards.
# 0.179 seconds (ingredients) x (cuisines)
@timed X = count_vectorizer(x, y, ingredients, cuisines)
"""
    naive_bayes_fit(X, alpha=1.0)

Compute smoothed per-class log-likelihoods from a feature-by-class count
matrix `X`: `log_like[i, j] = log(X[i, j] + alpha) - log(sum(X[:, j]))`.

NOTE(review): the denominator is the raw class total, not
`total + alpha * nfeatures`, so columns are not exact log-probability
distributions; preserved as-is to keep the original model's behavior.
"""
function naive_bayes_fit(X::Array{Float64,2}, alpha::Float64 = 1.0)
    log_like = zeros(size(X))
    # Total feature count per class (sum over the feature axis).
    total = sum(X, dims=1)
    for j in 1:size(X, 2)
        @inbounds log_like[:, j] = log.(X[:, j] .+ alpha) .- log(total[j])
    end
    return log_like
end
# Fit the log-likelihood table from the count matrix (default alpha = 1.0).
# Timing below was measured on the author's machine.
# 0.00039 seconds
@timed log_like = naive_bayes_fit(X)
"""
    naive_bayes_predict(X, log_like, features, classes)

Predict the class label for one sample `X` (a list of feature strings).

Sums the fitted log-likelihoods of the sample's features for every class,
adds a uniform log-prior, and returns the label of the argmax class.
Assumes class ids are the dense range `1:length(classes)` and every
feature in `X` is a key of `features`.
"""
function naive_bayes_predict(X::Array{String,1}, log_like::Array{Float64,2},
                             features::Dict{String,Int64}, classes::Dict{Int64,String})
    results = zeros(length(classes))
    logprior = log(1.0 / length(classes))
    # The row indices of the sample's features are the same for every
    # class, so compute them once instead of inside the loop.
    ind = Int64[features[f] for f in X]
    for j in 1:length(classes)
        @inbounds results[j] = sum(log_like[ind, j]) + logprior
    end
    return classes[argmax(results)]
end
# check results: re-predict every training sample, write each prediction
# next to its true label in results.txt, and append the training accuracy.
f = open("results.txt", "w")
how_many_correct = 0
for i in eachindex(x)
    prediction = naive_bayes_predict(x[i], log_like, ingredients, rev_cuisines)
    println(f, prediction, " | ", y[i], " \n", x[i])
    if prediction == y[i]
        # `global` is required in Julia 1.x to mutate a top-level
        # variable from inside a top-level loop.
        global how_many_correct += 1
    end
end
println(f, how_many_correct / length(y))
close(f)

# Placeholder for a cuisine-by-cuisine confusion matrix (never filled in
# the visible portion of this script).
confusion_matrix = zeros(length(cuisines), length(cuisines))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment