Sentiment Analysis: Naive Bayes Classifier from scratch in Golang
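The classifier implemented below is a multinomial Naive Bayes model with add-one (Laplace) smoothing. Roughly, for a class c and a review tokenized into words w_1 ... w_n, the probability method computes

	P(c | w_1 ... w_n) ≈ P(c) * Π_i P(w_i | c) / Π_i P(w_i)

where P(c) is the fraction of training sentences labelled c, each P(w_i | c) is estimated with add-one smoothing as (count(w_i, c) + 1) / (N_c + V), with N_c the total word count of class c and V the smoothing term returned by totalDistinctWordCount, and each P(w_i) is smoothed the same way over both classes combined. main simply reports whichever class gets the higher score.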
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"
)
// dataset returns a map of sentences to their classes from a file
func dataset(file string) map[string]string {
	f, err := os.Open(file)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	dataset := make(map[string]string)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		l := scanner.Text()
		data := strings.Split(l, "\t")
		if len(data) != 2 {
			continue
		}
		sentence := data[0]
		if data[1] == "0" {
			dataset[sentence] = negative
		} else if data[1] == "1" {
			dataset[sentence] = positive
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	return dataset
}
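// The input file is expected to contain one review per line in the form
//
//	<sentence>\t<label>
//
// where the label is "0" (negative) or "1" (positive); lines that do not split into exactly
// two tab-separated fields are skipped. An illustrative (made-up) line such as
// "The service was great\t1" would end up as dataset["The service was great"] = positive.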
func main() {
	// Initialize a new classifier
	nb := newClassifier()

	// Get dataset from a text file
	// Dataset can be downloaded from https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
	dataset := dataset("./sentiment labelled sentences/yelp_labelled.txt")

	// Train the classifier with dataset
	nb.train(dataset)

	// Prompt for inputs from console
	reader := bufio.NewReader(os.Stdin)
	for {
		fmt.Print("Enter your review: ")
		sentence, _ := reader.ReadString('\n')

		// Classify input sentence
		result := nb.classify(sentence)
		class := ""
		if result[positive] > result[negative] {
			class = positive
		} else {
			class = negative
		}
		fmt.Printf("> Your review is %s\n\n", class)
	}
}
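With the three files of this gist saved into one package directory (the file names are up to you) and the dataset folder from the UCI link above placed next to them, the program can be started with go run . from inside a module, or by passing the files to go run explicitly. Every review typed at the prompt is then classified and echoed back as "> Your review is positive" or "> Your review is negative", exactly as printed by main.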
package main

// The string values of the 2 classes.
// They can be "positive" vs. "negative", as in this example,
// or any other pair such as "ham" vs. "spam".
const (
	positive = "positive"
	negative = "negative"
)
/*
 * Classifier
 */

// wordFrequency stores frequency of words. For example:
//	wordFrequency{
//		word: "excellent"
//		counter: map[string]int{
//			"positive": 15
//			"negative": 0
//		}
//	}
type wordFrequency struct {
	word    string
	counter map[string]int
}
// classifier can be trained and used to categorize objects
// Attributes:
//	dataset: maps each class to a list of sentences from training data
//		map[string][]string{
//			"positive": []string{
//				"The restaurant is excellent",
//				"I really love this restaurant",
//			},
//			"negative": []string{
//				"Their food is awful",
//			},
//		}
//	words: maps each word to its frequency per class
//		map[string]wordFrequency{
//			"restaurant": wordFrequency{
//				word: "restaurant"
//				counter: map[string]int{
//					"positive": 2
//					"negative": 0
//				}
//			}
//		}
type classifier struct {
	dataset map[string][]string
	words   map[string]wordFrequency
}
// newClassifier returns a new classifier with empty dataset and words
func newClassifier() *classifier {
	c := new(classifier)
	c.dataset = map[string][]string{
		positive: []string{},
		negative: []string{},
	}
	c.words = map[string]wordFrequency{}
	return c
}
// train populates a classifier's dataset and words with the input dataset map.
// Sample dataset (class values must match the positive/negative constants):
//	map[string]string{
//		"The restaurant is excellent":   "positive",
//		"I really love this restaurant": "positive",
//		"Their food is awful":           "negative",
//	}
func (c *classifier) train(dataset map[string]string) {
	for sentence, class := range dataset {
		c.addSentence(sentence, class)
		words := tokenize(sentence)
		for _, w := range words {
			c.addWord(w, class)
		}
	}
}
// classify returns the probabilities of a sentence being each class.
// Sample @return:
//	map[string]float64{
//		"positive": 0.7,
//		"negative": 0.1,
//	}
// meaning a 70% chance the input sentence is positive and a 10% chance it is negative.
func (c classifier) classify(sentence string) map[string]float64 {
	words := tokenize(sentence)
	posProb := c.probability(words, positive)
	negProb := c.probability(words, negative)
	return map[string]float64{
		positive: posProb,
		negative: negProb,
	}
}
// addSentence adds a sentence and its class to a classifier's dataset map
func (c *classifier) addSentence(sentence, class string) {
	c.dataset[class] = append(c.dataset[class], sentence)
}
// addWord adds a word to a classifier's words map and updates its frequency
func (c *classifier) addWord(word, class string) {
	wf, ok := c.words[word]
	if !ok {
		wf = wordFrequency{word: word, counter: map[string]int{
			positive: 0,
			negative: 0,
		}}
	}
	wf.counter[class]++
	c.words[word] = wf
}
// priorProb returns the prior probability of the given class.
// This probability is determined purely by the training dataset.
func (c classifier) priorProb(class string) float64 {
	return float64(len(c.dataset[class])) / float64(len(c.dataset[positive])+len(c.dataset[negative]))
}
// totalWordCount returns the word count of a class (duplicates are counted).
// If the class provided is neither positive nor negative, it returns
// the total word count of the dataset.
func (c classifier) totalWordCount(class string) int {
	posCount := 0
	negCount := 0
	for _, wf := range c.words {
		posCount += wf.counter[positive]
		negCount += wf.counter[negative]
	}
	if class == positive {
		return posCount
	} else if class == negative {
		return negCount
	}
	return posCount + negCount
}
// totalDistinctWordCount returns the number of distinct (word, class) pairs seen in the dataset;
// a word that occurs in both classes is counted once per class.
func (c classifier) totalDistinctWordCount() int {
	posCount := 0
	negCount := 0
	for _, wf := range c.words {
		posCount += zeroOneTransform(wf.counter[positive])
		negCount += zeroOneTransform(wf.counter[negative])
	}
	return posCount + negCount
}
// probability returns the probability of a list of words being in a class:
// the class prior times the product of add-one-smoothed per-word likelihoods,
// divided by an estimate of the overall probability of the words (the evidence).
func (c classifier) probability(words []string, class string) float64 {
	prob := c.priorProb(class)
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[class]
		}
		// Add-one (Laplace) smoothing: unseen words get a small, non-zero likelihood.
		prob *= float64(count+1) / float64(c.totalWordCount(class)+c.totalDistinctWordCount())
	}
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count += wf.counter[positive] + wf.counter[negative]
		}
		prob /= float64(count+1) / float64(c.totalWordCount("")+c.totalDistinctWordCount())
	}
	return prob
}
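As a quick sanity check of the API above, the classifier can also be exercised without the CLI. The sketch below is hypothetical (the helper name exampleTrainAndClassify is made up) but only uses newClassifier, train and classify as defined in this gist; dropped into the same package, it trains on the three-sentence dataset from the train comment and classifies one review:

package main

import "fmt"

// exampleTrainAndClassify is a hypothetical helper, not part of the gist.
func exampleTrainAndClassify() {
	nb := newClassifier()
	nb.train(map[string]string{
		"The restaurant is excellent":   positive,
		"I really love this restaurant": positive,
		"Their food is awful":           negative,
	})

	result := nb.classify("The food was awful")
	if result[positive] > result[negative] {
		fmt.Println("positive")
	} else {
		fmt.Println("negative") // this toy dataset scores the review as negative
	}
}

With this toy dataset the review comes out negative, since "food" and "awful" only occur in the negative training sentence.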
package main

import (
	"math"
	"regexp"
	"strings"
)

/*
 * Utilities
 */
// stopwords are words which have very little meaning
var stopwords = map[string]struct{}{
	"i": struct{}{}, "me": struct{}{}, "my": struct{}{}, "myself": struct{}{}, "we": struct{}{}, "our": struct{}{}, "ours": struct{}{},
	"ourselves": struct{}{}, "you": struct{}{}, "your": struct{}{}, "yours": struct{}{}, "yourself": struct{}{}, "yourselves": struct{}{},
	"he": struct{}{}, "him": struct{}{}, "his": struct{}{}, "himself": struct{}{}, "she": struct{}{}, "her": struct{}{}, "hers": struct{}{},
	"herself": struct{}{}, "it": struct{}{}, "its": struct{}{}, "itself": struct{}{}, "they": struct{}{}, "them": struct{}{}, "their": struct{}{},
	"theirs": struct{}{}, "themselves": struct{}{}, "what": struct{}{}, "which": struct{}{}, "who": struct{}{}, "whom": struct{}{}, "this": struct{}{},
	"that": struct{}{}, "these": struct{}{}, "those": struct{}{}, "am": struct{}{}, "is": struct{}{}, "are": struct{}{}, "was": struct{}{},
	"were": struct{}{}, "be": struct{}{}, "been": struct{}{}, "being": struct{}{}, "have": struct{}{}, "has": struct{}{}, "had": struct{}{},
	"having": struct{}{}, "do": struct{}{}, "does": struct{}{}, "did": struct{}{}, "doing": struct{}{}, "a": struct{}{}, "an": struct{}{},
	"the": struct{}{}, "and": struct{}{}, "but": struct{}{}, "if": struct{}{}, "or": struct{}{}, "because": struct{}{}, "as": struct{}{},
	"until": struct{}{}, "while": struct{}{}, "of": struct{}{}, "at": struct{}{}, "by": struct{}{}, "for": struct{}{}, "with": struct{}{},
	"about": struct{}{}, "against": struct{}{}, "between": struct{}{}, "into": struct{}{}, "through": struct{}{}, "during": struct{}{},
	"before": struct{}{}, "after": struct{}{}, "above": struct{}{}, "below": struct{}{}, "to": struct{}{}, "from": struct{}{}, "up": struct{}{},
	"down": struct{}{}, "in": struct{}{}, "out": struct{}{}, "on": struct{}{}, "off": struct{}{}, "over": struct{}{}, "under": struct{}{},
	"again": struct{}{}, "further": struct{}{}, "then": struct{}{}, "once": struct{}{}, "here": struct{}{}, "there": struct{}{}, "when": struct{}{},
	"where": struct{}{}, "why": struct{}{}, "how": struct{}{}, "all": struct{}{}, "any": struct{}{}, "both": struct{}{}, "each": struct{}{},
	"few": struct{}{}, "more": struct{}{}, "most": struct{}{}, "other": struct{}{}, "some": struct{}{}, "such": struct{}{}, "no": struct{}{},
	"nor": struct{}{}, "not": struct{}{}, "only": struct{}{}, "same": struct{}{}, "so": struct{}{}, "than": struct{}{}, "too": struct{}{},
	"very": struct{}{}, "can": struct{}{}, "will": struct{}{}, "just": struct{}{}, "don't": struct{}{}, "should": struct{}{}, "should've": struct{}{},
	"now": struct{}{}, "aren't": struct{}{}, "couldn't": struct{}{}, "didn't": struct{}{}, "doesn't": struct{}{}, "hasn't": struct{}{}, "haven't": struct{}{},
	"isn't": struct{}{}, "shouldn't": struct{}{}, "wasn't": struct{}{}, "weren't": struct{}{}, "won't": struct{}{}, "wouldn't": struct{}{},
}
func isStopword(w string) bool {
	_, ok := stopwords[w]
	return ok
}
// cleanup removes non-alphanumeric characters and lowercases the sentence
func cleanup(sentence string) string {
	re := regexp.MustCompile("[^a-zA-Z 0-9]+")
	return re.ReplaceAllString(strings.ToLower(sentence), "")
}
// tokenize splits a sentence into a slice of words, dropping stop words
func tokenize(sentence string) []string {
	s := cleanup(sentence)
	words := strings.Fields(s)
	var tokens []string
	for _, w := range words {
		if !isStopword(w) {
			tokens = append(tokens, w)
		}
	}
	return tokens
}
// zeroOneTransform returns
//	0 if argument x = 0
//	1 otherwise
func zeroOneTransform(x int) int {
	return int(math.Ceil(float64(x) / (float64(x) + 1.0)))
}
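For reference, a few results worked by hand from the utilities above (illustrative values, not output captured from the gist):

// cleanup("The restaurant is EXCELLENT!")  returns "the restaurant is excellent"
// tokenize("The restaurant is EXCELLENT!") returns []string{"restaurant", "excellent"} ("the" and "is" are stop words)
// zeroOneTransform(0) returns 0; zeroOneTransform(7) returns 1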