Sample code for a Naive Bayes text classifier. Reference: Machine Learning in Action, Chapter 4.
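The idea in brief: under the naive (conditional independence) assumption, a document d is scored against each class c with p(c|d) ∝ p(c) * Π_i p(w_i|c), and the higher-scoring class wins. The code works in log space, so classifyNB computes log p(c) + Σ_i log p(w_i|c) instead of the raw product.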
#!/usr/bin/env python3

from numpy import ones, log
def createDataSet():
    docs = [
        ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return docs, labels
def createVocabList(docs):
    '''
    build the vocabulary: a deduplicated list of all words appearing in docs
    '''
    vocabList = [word for doc in docs for word in doc]
    return list(set(vocabList))
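# A quick sanity check (hypothetical REPL session; set() ordering is
# arbitrary, so the vocabulary order changes between runs):
#   >>> docs, labels = createDataSet()
#   >>> vocabList = createVocabList(docs)
#   >>> len(vocabList)   # 32 unique words in the sample data
#   32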
def doc2Vec(vocabList, doc, model='set'):
    '''
    map a doc onto the vocabulary as a vector:
    [set-of-words model] 1 if the word occurs in the doc, else 0
    [bag-of-words model] the word's total frequency in the doc
    '''
    docVec = [0] * len(vocabList)
    if model == 'set':
        for word in doc:
            if word in vocabList:
                docVec[vocabList.index(word)] = 1
    else:
        for word in doc:
            if word in vocabList:
                docVec[vocabList.index(word)] += 1
    return docVec
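# Example (hypothetical 3-word vocabulary, purely for illustration; the real
# one built above has 32 entries in arbitrary order):
#   >>> doc2Vec(['dog', 'stupid', 'my'], ['my', 'dog', 'my'])
#   [1, 0, 1]   # set-of-words: presence only
#   >>> doc2Vec(['dog', 'stupid', 'my'], ['my', 'dog', 'my'], model='bag')
#   [1, 0, 2]   # bag-of-words: 'my' appears twice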
def docs2Matrix(vocabList, docs):
    '''
    convert every doc into its word vector
    '''
    docsMatrix = []
    for doc in docs:
        docsMatrix.append(doc2Vec(vocabList, doc))
    return docsMatrix
def trainNB(vocabList, docs, labels):
    '''
    train the naive bayes model
    '''
    docs = docs2Matrix(vocabList, docs)  # convert the data set to a 0/1 matrix
    docsNum = len(docs)
    wordsNum = len(docs[0])
    pPositive = sum(labels) / float(len(labels))  # positive rate
    p1Num = ones(wordsNum)  # freq of each word when positive; start at 1 for smoothing
    p1Denom = 2.0           # freq of all words when positive; start at 2.0 for smoothing
    p0Num = ones(wordsNum)  # freq of each word when negative; start at 1 for smoothing
    p0Denom = 2.0           # freq of all words when negative; start at 2.0 for smoothing
    for i in range(docsNum):
        if labels[i] == 1:
            # label is positive
            p1Num += docs[i]
            p1Denom += sum(docs[i])
        else:
            # label is negative
            p0Num += docs[i]
            p0Denom += sum(docs[i])
    p1Vec = log(p1Num / p1Denom)  # log conditional probabilities log p(w|c=1)
    p0Vec = log(p0Num / p0Denom)  # log conditional probabilities log p(w|c=0)
    return p0Vec, p1Vec, pPositive
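# Why the 1/2.0 initial values and the log (a sketch of the reasoning, not
# part of the original listing):
# - Starting the counts at 1 is Laplace-style smoothing: without it, a word
#   never seen in a class gets p(w|c) = 0, and a single such word zeroes out
#   the whole product p(c) * prod(p(w_i|c)). (The book starts the denominator
#   at 2.0 rather than adding the vocabulary size, a simplified variant.)
# - The log turns the product into a sum and avoids underflow: a product of
#   200 probabilities of 0.01 is 1e-400, which underflows double precision
#   to 0.0, while the equivalent sum of logs is a representable ~ -921.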
def classifyNB(docVec, p0Vec, p1Vec, pPositive):
    '''
    compare log-posterior scores: log p(c) + sum of log p(w|c) over doc words
    '''
    p0 = sum(docVec * p0Vec) + log(1.0 - pPositive)
    p1 = sum(docVec * p1Vec) + log(pPositive)
    return 1 if p1 > p0 else 0
def testNB(vocabList, p0Vec, p1Vec, pPositive, testDoc):
    testVec = doc2Vec(vocabList, testDoc)
    testLabel = classifyNB(testVec, p0Vec, p1Vec, pPositive)
    print(testDoc, 'classified as:', testLabel)
if __name__ == '__main__':
    # prepare the training data set
    docs, labels = createDataSet()
    # generate the full word list
    vocabList = createVocabList(docs)
    # train the naive bayes model
    p0Vec, p1Vec, pPositive = trainNB(vocabList, docs, labels)
    # run tests
    testNB(vocabList, p0Vec, p1Vec, pPositive, ['love', 'my', 'dalmation'])
    testNB(vocabList, p0Vec, p1Vec, pPositive, ['stupid', 'garbage'])
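# Expected output (the two labels follow from the training data above; exact
# spacing may differ):
#   ['love', 'my', 'dalmation'] classified as: 0
#   ['stupid', 'garbage'] classified as: 1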