Sentiment Analysis: Naive Bayes Classifier from scratch in Golang
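The classifier implemented below is a multinomial Naive Bayes model with add-one (Laplace) smoothing. Roughly, for a class c and a review tokenized into words w_1 ... w_n, the probability method computes

	P(c | w_1 ... w_n) ≈ P(c) * Π_i P(w_i | c) / Π_i P(w_i)

where P(c) is the fraction of training sentences labelled c, each P(w_i | c) is estimated with add-one smoothing as (count(w_i, c) + 1) / (N_c + V), with N_c the total word count of class c and V the smoothing term returned by totalDistinctWordCount, and each P(w_i) is smoothed the same way over both classes combined. main simply reports whichever class gets the higher score.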
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"
)
// dataset returns a map of sentences to their classes from a file
func dataset(file string) map[string]string {
	f, err := os.Open(file)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	dataset := make(map[string]string)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		l := scanner.Text()
		data := strings.Split(l, "\t")
		if len(data) != 2 {
			continue
		}
		sentence := data[0]
		if data[1] == "0" {
			dataset[sentence] = negative
		} else if data[1] == "1" {
			dataset[sentence] = positive
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	return dataset
}
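// The input file is expected to contain one review per line in the form
//
//	<sentence>\t<label>
//
// where the label is "0" (negative) or "1" (positive); lines that do not split into exactly
// two tab-separated fields are skipped. An illustrative (made-up) line such as
// "The service was great\t1" would end up as dataset["The service was great"] = positive.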
func main() {
	// Initialize a new classifier
	nb := newClassifier()

	// Get dataset from a text file
	// Dataset can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
	dataset := dataset("./sentiment labelled sentences/yelp_labelled.txt")

	// Train the classifier with dataset
	nb.train(dataset)

	// Prompt for inputs from console
	reader := bufio.NewReader(os.Stdin)
	for {
		fmt.Print("Enter your review: ")
		sentence, _ := reader.ReadString('\n')

		// Classify input sentence
		result := nb.classify(sentence)
		class := ""
		if result[positive] > result[negative] {
			class = positive
		} else {
			class = negative
		}
		fmt.Printf("> Your review is %s\n\n", class)
	}
}
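With the three files of this gist saved into one package directory (the file names are up to you) and the dataset folder from the UCI link above placed next to them, the program can be started with go run . from inside a module, or by passing the files to go run explicitly. Every review typed at the prompt is then classified and echoed back as "> Your review is positive" or "> Your review is negative", exactly as printed by main.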
package main

// The string values of the 2 classes.
// They can be "positive" vs. "negative", as in this example,
// or any other pair such as "ham" vs. "spam".
const (
	positive = "positive"
	negative = "negative"
)
/*
 * Classifier
 */

// wordFrequency stores frequency of words. For example:
//	wordFrequency{
//		word: "excellent"
//		counter: map[string]int{
//			"positive": 15
//			"negative": 0
//		}
//	}
type wordFrequency struct {
	word    string
	counter map[string]int
}
// classifier can be trained and used to categorize objects
// Attributes:
//	dataset: maps each class to a list of sentences from training data
//		map[string][]string{
//			"positive": []string{
//				"The restaurant is excellent",
//				"I really love this restaurant",
//			},
//			"negative": []string{
//				"Their food is awful",
//			},
//		}
//	words: maps each word to its frequency per class
//		map[string]wordFrequency{
//			"restaurant": wordFrequency{
//				word: "restaurant"
//				counter: map[string]int{
//					"positive": 2
//					"negative": 0
//				}
//			}
//		}
type classifier struct {
	dataset map[string][]string
	words   map[string]wordFrequency
}
// newClassifier returns a new classifier with empty dataset and words
func newClassifier() *classifier {
	c := new(classifier)
	c.dataset = map[string][]string{
		positive: []string{},
		negative: []string{},
	}
	c.words = map[string]wordFrequency{}
	return c
}
// train populates a classifier's dataset and words with the input dataset map.
// Sample dataset (class values must match the positive/negative constants):
//	map[string]string{
//		"The restaurant is excellent":   "positive",
//		"I really love this restaurant": "positive",
//		"Their food is awful":           "negative",
//	}
func (c *classifier) train(dataset map[string]string) {
	for sentence, class := range dataset {
		c.addSentence(sentence, class)
		words := tokenize(sentence)
		for _, w := range words {
			c.addWord(w, class)
		}
	}
}
// classify returns the probabilities of a sentence being each class.
// Sample @return:
//	map[string]float64{
//		"positive": 0.7,
//		"negative": 0.1,
//	}
// meaning a 70% chance the input sentence is positive and a 10% chance it is negative.
func (c classifier) classify(sentence string) map[string]float64 {
	words := tokenize(sentence)
	posProb := c.probability(words, positive)
	negProb := c.probability(words, negative)
	return map[string]float64{
		positive: posProb,
		negative: negProb,
	}
}
// addSentence adds a sentence and its class to a classifier's dataset map
func (c *classifier) addSentence(sentence, class string) {
	c.dataset[class] = append(c.dataset[class], sentence)
}
// addWord adds a word to a classifier's words map and updates its frequency
func (c *classifier) addWord(word, class string) {
	wf, ok := c.words[word]
	if !ok {
		wf = wordFrequency{word: word, counter: map[string]int{
			positive: 0,
			negative: 0,
		}}
	}
	wf.counter[class]++
	c.words[word] = wf
}
// priorProb returns the prior probability of the given class.
// This probability is determined purely by the training dataset.
func (c classifier) priorProb(class string) float64 {
	return float64(len(c.dataset[class])) / float64(len(c.dataset[positive])+len(c.dataset[negative]))
}
// totalWordCount returns the word count of a class (duplicates are counted).
// If the class provided is neither positive nor negative, it returns
// the total word count of the dataset.
func (c classifier) totalWordCount(class string) int {
	posCount := 0
	negCount := 0
	for _, wf := range c.words {
		posCount += wf.counter[positive]
		negCount += wf.counter[negative]
	}
	if class == positive {
		return posCount
	} else if class == negative {
		return negCount
	}
	return posCount + negCount
}
// totalDistinctWordCount returns the number of distinct (word, class) pairs seen in the dataset;
// a word that occurs in both classes is counted once per class.
func (c classifier) totalDistinctWordCount() int {
	posCount := 0
	negCount := 0
	for _, wf := range c.words {
		posCount += zeroOneTransform(wf.counter[positive])
		negCount += zeroOneTransform(wf.counter[negative])
	}
	return posCount + negCount
}
// probability returns the probability of a list of words being in a class:
// the class prior times the product of add-one-smoothed per-word likelihoods,
// divided by an estimate of the overall probability of the words (the evidence).
func (c classifier) probability(words []string, class string) float64 {
	prob := c.priorProb(class)
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[class]
		}
		// Add-one (Laplace) smoothing: unseen words get a small, non-zero likelihood.
		prob *= float64(count+1) / float64(c.totalWordCount(class)+c.totalDistinctWordCount())
	}
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count += wf.counter[positive] + wf.counter[negative]
		}
		prob /= float64(count+1) / float64(c.totalWordCount("")+c.totalDistinctWordCount())
	}
	return prob
}
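As a quick sanity check of the API above, the classifier can also be exercised without the CLI. The sketch below is hypothetical (the helper name exampleTrainAndClassify is made up) but only uses newClassifier, train and classify as defined in this gist; dropped into the same package, it trains on the three-sentence dataset from the train comment and classifies one review:

package main

import "fmt"

// exampleTrainAndClassify is a hypothetical helper, not part of the gist.
func exampleTrainAndClassify() {
	nb := newClassifier()
	nb.train(map[string]string{
		"The restaurant is excellent":   positive,
		"I really love this restaurant": positive,
		"Their food is awful":           negative,
	})

	result := nb.classify("The food was awful")
	if result[positive] > result[negative] {
		fmt.Println("positive")
	} else {
		fmt.Println("negative") // this toy dataset scores the review as negative
	}
}

With this toy dataset the review comes out negative, since "food" and "awful" only occur in the negative training sentence.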
package main

import (
	"math"
	"regexp"
	"strings"
)

/*
 * Utilities
 */
// stopwords are words which have very little meaning
var stopwords = map[string]struct{}{
	"i": struct{}{}, "me": struct{}{}, "my": struct{}{}, "myself": struct{}{}, "we": struct{}{}, "our": struct{}{}, "ours": struct{}{},
	"ourselves": struct{}{}, "you": struct{}{}, "your": struct{}{}, "yours": struct{}{}, "yourself": struct{}{}, "yourselves": struct{}{},
	"he": struct{}{}, "him": struct{}{}, "his": struct{}{}, "himself": struct{}{}, "she": struct{}{}, "her": struct{}{}, "hers": struct{}{},
	"herself": struct{}{}, "it": struct{}{}, "its": struct{}{}, "itself": struct{}{}, "they": struct{}{}, "them": struct{}{}, "their": struct{}{},
	"theirs": struct{}{}, "themselves": struct{}{}, "what": struct{}{}, "which": struct{}{}, "who": struct{}{}, "whom": struct{}{}, "this": struct{}{},
	"that": struct{}{}, "these": struct{}{}, "those": struct{}{}, "am": struct{}{}, "is": struct{}{}, "are": struct{}{}, "was": struct{}{},
	"were": struct{}{}, "be": struct{}{}, "been": struct{}{}, "being": struct{}{}, "have": struct{}{}, "has": struct{}{}, "had": struct{}{},
	"having": struct{}{}, "do": struct{}{}, "does": struct{}{}, "did": struct{}{}, "doing": struct{}{}, "a": struct{}{}, "an": struct{}{},
	"the": struct{}{}, "and": struct{}{}, "but": struct{}{}, "if": struct{}{}, "or": struct{}{}, "because": struct{}{}, "as": struct{}{},
	"until": struct{}{}, "while": struct{}{}, "of": struct{}{}, "at": struct{}{}, "by": struct{}{}, "for": struct{}{}, "with": struct{}{},
	"about": struct{}{}, "against": struct{}{}, "between": struct{}{}, "into": struct{}{}, "through": struct{}{}, "during": struct{}{},
	"before": struct{}{}, "after": struct{}{}, "above": struct{}{}, "below": struct{}{}, "to": struct{}{}, "from": struct{}{}, "up": struct{}{},
	"down": struct{}{}, "in": struct{}{}, "out": struct{}{}, "on": struct{}{}, "off": struct{}{}, "over": struct{}{}, "under": struct{}{},
	"again": struct{}{}, "further": struct{}{}, "then": struct{}{}, "once": struct{}{}, "here": struct{}{}, "there": struct{}{}, "when": struct{}{},
	"where": struct{}{}, "why": struct{}{}, "how": struct{}{}, "all": struct{}{}, "any": struct{}{}, "both": struct{}{}, "each": struct{}{},
	"few": struct{}{}, "more": struct{}{}, "most": struct{}{}, "other": struct{}{}, "some": struct{}{}, "such": struct{}{}, "no": struct{}{},
	"nor": struct{}{}, "not": struct{}{}, "only": struct{}{}, "same": struct{}{}, "so": struct{}{}, "than": struct{}{}, "too": struct{}{},
	"very": struct{}{}, "can": struct{}{}, "will": struct{}{}, "just": struct{}{}, "don't": struct{}{}, "should": struct{}{}, "should've": struct{}{},
	"now": struct{}{}, "aren't": struct{}{}, "couldn't": struct{}{}, "didn't": struct{}{}, "doesn't": struct{}{}, "hasn't": struct{}{}, "haven't": struct{}{},
	"isn't": struct{}{}, "shouldn't": struct{}{}, "wasn't": struct{}{}, "weren't": struct{}{}, "won't": struct{}{}, "wouldn't": struct{}{},
}
func isStopword(w string) bool {
	_, ok := stopwords[w]
	return ok
}
// cleanup removes non-alphanumeric characters and lowercases the sentence
func cleanup(sentence string) string {
	re := regexp.MustCompile("[^a-zA-Z 0-9]+")
	return re.ReplaceAllString(strings.ToLower(sentence), "")
}
// tokenize splits a sentence into a slice of words, dropping stop words
func tokenize(sentence string) []string {
	s := cleanup(sentence)
	words := strings.Fields(s)
	var tokens []string
	for _, w := range words {
		if !isStopword(w) {
			tokens = append(tokens, w)
		}
	}
	return tokens
}
// zeroOneTransform returns
//	0 if argument x = 0
//	1 otherwise
func zeroOneTransform(x int) int {
	return int(math.Ceil(float64(x) / (float64(x) + 1.0)))
}
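For reference, a few results worked by hand from the utilities above (illustrative values, not output captured from the gist):

// cleanup("The restaurant is EXCELLENT!")  returns "the restaurant is excellent"
// tokenize("The restaurant is EXCELLENT!") returns []string{"restaurant", "excellent"} ("the" and "is" are stop words)
// zeroOneTransform(0) returns 0; zeroOneTransform(7) returns 1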