Skip to content

Instantly share code, notes, and snippets.

@caongocthai
Last active April 23, 2022 02:43
Show Gist options
  • Save caongocthai/27c62464c4aaa83dba5becbcfa78f134 to your computer and use it in GitHub Desktop.
Save caongocthai/27c62464c4aaa83dba5becbcfa78f134 to your computer and use it in GitHub Desktop.
Sentiment Analysis: Naive Bayes Classifier from scratch in Golang
package main
import (
"bufio"
"fmt"
"log"
"os"
"strings"
)
// dataset reads tab-separated "sentence<TAB>label" lines from file and returns
// a map from each sentence to its class: label "0" maps to negative and label
// "1" maps to positive. Lines that do not split into exactly two tab-separated
// fields, or whose label is neither "0" nor "1", are skipped. Because the
// result is keyed by sentence, a sentence that occurs more than once keeps
// only the class of its last occurrence.
//
// The program is terminated through log.Fatalf, with the file name in the
// message, if the file cannot be opened or read.
func dataset(file string) map[string]string {
	f, err := os.Open(file)
	if err != nil {
		log.Fatalf("opening dataset %q: %v", file, err)
	}
	defer f.Close()

	labelled := make(map[string]string)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Split(scanner.Text(), "\t")
		if len(fields) != 2 {
			continue
		}
		switch fields[1] {
		case "0":
			labelled[fields[0]] = negative
		case "1":
			labelled[fields[0]] = positive
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("reading dataset %q: %v", file, err)
	}
	return labelled
}
// main trains a classifier on the Yelp file of the "Sentiment Labelled
// Sentences" dataset and then classifies reviews typed on standard input, one
// per line, until standard input is closed or a read fails.
func main() {
	// Initialize a new classifier.
	nb := newClassifier()

	// Load the training data from a text file. The dataset can be downloaded
	// from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
	samples := dataset("./sentiment labelled sentences/yelp_labelled.txt")

	// Train the classifier with the dataset.
	nb.train(samples)

	// Prompt for inputs from the console.
	reader := bufio.NewReader(os.Stdin)
	for {
		fmt.Print("Enter your review: ")
		sentence, err := reader.ReadString('\n')

		// ReadString may return data together with an error (a last line
		// without a trailing newline), so classify whatever was read first.
		if sentence != "" {
			result := nb.classify(sentence)
			class := negative
			if result[positive] > result[negative] {
				class = positive
			}
			fmt.Printf("> Your review is %s\n\n", class)
		}

		// A non-nil error (typically io.EOF) means no more input will arrive;
		// stop instead of prompting forever.
		if err != nil {
			return
		}
	}
}
package main
// Labels of the two classes the classifier distinguishes.
// In this example they are "positive" and "negative"; for a different task
// they could be, for example, "ham" and "spam".
const (
	negative = "negative"
	positive = "positive"
)
/*
* Classifier
*/
// wordFrequency records how often a single word occurs in each class.
// For example:
//
//	wordFrequency{
//		word: "excellent",
//		counter: map[string]int{
//			"positive": 15,
//			"negative": 0,
//		},
//	}
type wordFrequency struct {
// word is the token being counted.
word string
// counter maps a class name to the number of occurrences of word in that class.
counter map[string]int
}
// classifier is trained on labelled sentences and then used to score new
// sentences against each class.
//
// dataset maps each class to the training sentences labelled with it:
//
//	map[string][]string{
//		"positive": {
//			"The restaurant is excellent",
//			"I really love this restaurant",
//		},
//		"negative": {
//			"Their food is awful",
//		},
//	}
//
// words maps each word to its per-class frequency:
//
//	map[string]wordFrequency{
//		"restaurant": {
//			word: "restaurant",
//			counter: map[string]int{
//				"positive": 2,
//				"negative": 0,
//			},
//		},
//	}
type classifier struct {
// dataset holds the training sentences, grouped by class.
dataset map[string][]string
// words holds the per-class occurrence counts, keyed by word.
words map[string]wordFrequency
}
// newClassifier builds a classifier that has seen no training data: both
// classes start with an empty sentence list and the word table is empty.
func newClassifier() *classifier {
	return &classifier{
		dataset: map[string][]string{
			positive: {},
			negative: {},
		},
		words: make(map[string]wordFrequency),
	}
}
// train feeds every labelled sentence of dataset into the classifier: the
// sentence is stored under its class and each of its tokens is counted for
// that class.
// Sample dataset:
//
//	map[string]string{
//		"The restaurant is excellent":   "positive",
//		"I really love this restaurant": "positive",
//		"Their food is awful":           "negative",
//	}
func (c *classifier) train(dataset map[string]string) {
	for sentence, class := range dataset {
		c.addSentence(sentence, class)
		for _, token := range tokenize(sentence) {
			c.addWord(token, class)
		}
	}
}
// classify tokenizes sentence and returns, for each class, the score that
// probability computes for those tokens.
// Sample return value:
//
//	map[string]float64{
//		"positive": 0.7,
//		"negative": 0.1,
//	}
//
// The class with the larger score is the more likely one.
func (c classifier) classify(sentence string) map[string]float64 {
	tokens := tokenize(sentence)
	scores := make(map[string]float64, 2)
	for _, class := range []string{positive, negative} {
		scores[class] = c.probability(tokens, class)
	}
	return scores
}
// addSentence appends sentence to the list of training sentences stored for
// class in the classifier's dataset map.
func (c *classifier) addSentence(sentence, class string) {
	sentences := c.dataset[class]
	c.dataset[class] = append(sentences, sentence)
}
// addWord counts one more occurrence of word in class. A word seen for the
// first time gets a fresh entry whose positive and negative counters start
// at zero.
func (c *classifier) addWord(word, class string) {
	entry, seen := c.words[word]
	if !seen {
		entry.word = word
		entry.counter = map[string]int{positive: 0, negative: 0}
	}
	entry.counter[class]++
	c.words[word] = entry
}
// priorProb returns the prior probability of class: the number of training
// sentences stored for class divided by the number of positive and negative
// training sentences combined. It depends only on the training dataset.
//
// When no positive or negative sentence has been added yet it returns 0.
func (c classifier) priorProb(class string) float64 {
	total := len(c.dataset[positive]) + len(c.dataset[negative])
	if total == 0 {
		// Without this guard the division below would be 0/0 and yield NaN,
		// which silently poisons every later comparison.
		return 0
	}
	return float64(len(c.dataset[class])) / float64(total)
}
// totalWordCount returns how many word occurrences (duplicates included) were
// counted for class. For any class other than positive or negative it returns
// the combined positive and negative count.
func (c classifier) totalWordCount(class string) int {
	var pos, neg int
	for _, freq := range c.words {
		pos += freq.counter[positive]
		neg += freq.counter[negative]
	}
	switch class {
	case positive:
		return pos
	case negative:
		return neg
	default:
		return pos + neg
	}
}
// totalDistinctWordCount returns the number of distinct words in the dataset,
// i.e. the vocabulary size used as the smoothing term in probability.
//
// A word is counted once no matter how many classes it occurs in; counting it
// once per class would inflate the vocabulary size for every word shared by
// both classes.
func (c classifier) totalDistinctWordCount() int {
	distinct := 0
	for _, wf := range c.words {
		if wf.counter[positive] > 0 || wf.counter[negative] > 0 {
			distinct++
		}
	}
	return distinct
}
// probability returns the score of words belonging to class: the class prior
// multiplied by the add-one smoothed likelihood of every word within the
// class, divided by the add-one smoothed likelihood of every word across both
// classes. Words that were never seen in training contribute a count of zero.
func (c classifier) probability(words []string, class string) float64 {
	// Both denominators are the same for every word, and each call walks the
	// whole vocabulary, so compute them once rather than once per word.
	vocab := c.totalDistinctWordCount()
	classDenom := float64(c.totalWordCount(class) + vocab)
	totalDenom := float64(c.totalWordCount("") + vocab)

	prob := c.priorProb(class)

	// Likelihood of each word given the class.
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[class]
		}
		prob *= float64(count+1) / classDenom
	}

	// Divide by the likelihood of each word regardless of class.
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[positive] + wf.counter[negative]
		}
		prob /= float64(count+1) / totalDenom
	}
	return prob
}
package main
import (
"math"
"regexp"
"strings"
)
/*
* Utilities
*/
// stopwords are words which have very little meaning
var stopwords = map[string]struct{}{
"i": struct{}{}, "me": struct{}{}, "my": struct{}{}, "myself": struct{}{}, "we": struct{}{}, "our": struct{}{}, "ours": struct{}{},
"ourselves": struct{}{}, "you": struct{}{}, "your": struct{}{}, "yours": struct{}{}, "yourself": struct{}{}, "yourselves": struct{}{},
"he": struct{}{}, "him": struct{}{}, "his": struct{}{}, "himself": struct{}{}, "she": struct{}{}, "her": struct{}{}, "hers": struct{}{},
"herself": struct{}{}, "it": struct{}{}, "its": struct{}{}, "itself": struct{}{}, "they": struct{}{}, "them": struct{}{}, "their": struct{}{},
"theirs": struct{}{}, "themselves": struct{}{}, "what": struct{}{}, "which": struct{}{}, "who": struct{}{}, "whom": struct{}{}, "this": struct{}{},
"that": struct{}{}, "these": struct{}{}, "those": struct{}{}, "am": struct{}{}, "is": struct{}{}, "are": struct{}{}, "was": struct{}{},
"were": struct{}{}, "be": struct{}{}, "been": struct{}{}, "being": struct{}{}, "have": struct{}{}, "has": struct{}{}, "had": struct{}{},
"having": struct{}{}, "do": struct{}{}, "does": struct{}{}, "did": struct{}{}, "doing": struct{}{}, "a": struct{}{}, "an": struct{}{},
"the": struct{}{}, "and": struct{}{}, "but": struct{}{}, "if": struct{}{}, "or": struct{}{}, "because": struct{}{}, "as": struct{}{},
"until": struct{}{}, "while": struct{}{}, "of": struct{}{}, "at": struct{}{}, "by": struct{}{}, "for": struct{}{}, "with": struct{}{},
"about": struct{}{}, "against": struct{}{}, "between": struct{}{}, "into": struct{}{}, "through": struct{}{}, "during": struct{}{},
"before": struct{}{}, "after": struct{}{}, "above": struct{}{}, "below": struct{}{}, "to": struct{}{}, "from": struct{}{}, "up": struct{}{},
"down": struct{}{}, "in": struct{}{}, "out": struct{}{}, "on": struct{}{}, "off": struct{}{}, "over": struct{}{}, "under": struct{}{},
"again": struct{}{}, "further": struct{}{}, "then": struct{}{}, "once": struct{}{}, "here": struct{}{}, "there": struct{}{}, "when": struct{}{},
"where": struct{}{}, "why": struct{}{}, "how": struct{}{}, "all": struct{}{}, "any": struct{}{}, "both": struct{}{}, "each": struct{}{},
"few": struct{}{}, "more": struct{}{}, "most": struct{}{}, "other": struct{}{}, "some": struct{}{}, "such": struct{}{}, "no": struct{}{},
"nor": struct{}{}, "not": struct{}{}, "only": struct{}{}, "same": struct{}{}, "so": struct{}{}, "than": struct{}{}, "too": struct{}{},
"very": struct{}{}, "can": struct{}{}, "will": struct{}{}, "just": struct{}{}, "don't": struct{}{}, "should": struct{}{}, "should've": struct{}{},
"now": struct{}{}, "aren't": struct{}{}, "couldn't": struct{}{}, "didn't": struct{}{}, "doesn't": struct{}{}, "hasn't": struct{}{}, "haven't": struct{}{},
"isn't": struct{}{}, "shouldn't": struct{}{}, "wasn't": struct{}{}, "weren't": struct{}{}, "won't": struct{}{}, "wouldn't": struct{}{},
}
// isStopword reports whether w is a key of the stopwords set. The lookup is
// exact, so w has to be in the same (lower-case) form as the stored keys.
func isStopword(w string) bool {
	if _, found := stopwords[w]; found {
		return true
	}
	return false
}
// nonAlnumSpace matches every run of characters that are neither ASCII
// letters, ASCII digits nor the space character. It is compiled once at
// package level so that cleanup does not recompile it on every call.
var nonAlnumSpace = regexp.MustCompile("[^a-zA-Z 0-9]+")

// cleanup lower-cases sentence and removes every character that is not an
// ASCII letter, an ASCII digit or a space. Removed characters are dropped,
// not replaced, so "don't" becomes "dont"; tabs and newlines are removed too.
func cleanup(sentence string) string {
	return nonAlnumSpace.ReplaceAllString(strings.ToLower(sentence), "")
}
// tokenize turns a sentence into its list of meaningful words: the sentence
// is passed through cleanup, split on whitespace, and every stopword is
// dropped. The result is nil when no word is left.
func tokenize(sentence string) []string {
	var tokens []string
	for _, candidate := range strings.Fields(cleanup(sentence)) {
		if isStopword(candidate) {
			continue
		}
		tokens = append(tokens, candidate)
	}
	return tokens
}
// zeroOneTransform returns
// 0 if argument x = 0
// 1 otherwise
//
// It clamps the magnitude of x to at most 1. Working on the magnitude keeps
// the "1 otherwise" promise for negative arguments as well: a ratio-based
// formula such as ceil(x/(x+1)) yields 2 for x = -2 and divides by zero for
// x = -1.
func zeroOneTransform(x int) int {
	return int(math.Min(1, math.Abs(float64(x))))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment