Created
February 10, 2015 09:52
-
-
Save shkesar/201edf8b08b657732bb4 to your computer and use it in GitHub Desktop.
A version of NaiveBayes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// DON'T PUSH CHANGES MADE TO THIS FILE
// ADD IT TO YOUR .gitignore AFTER DOWNLOADING IT ONCE
import org.apache.spark.rdd.RDD | |
/**
 * A trained emotion classifier over whitespace-separated words.
 *
 * For every emotion the score of a tweet is the emotion's prior plus the sum,
 * over the tweet's words, of the smoothed conditional probability
 * `(count(word in emotion) + lambda) / (vocabEmotionLength + vocabSize)`.
 * The emotion with the highest score is the prediction.
 *
 * NOTE(review): the score adds per-word probabilities, as in the original
 * implementation. Textbook naive Bayes multiplies them (or sums their
 * logarithms); confirm the additive form is intended before changing it.
 *
 * The class is `Serializable` because `predict(RDD)` references the model from
 * inside a Spark closure, which Spark has to serialize to ship to executors.
 *
 * @param lambda             additive smoothing constant used in the numerator
 * @param vocabSize          vocabulary size used in the denominator
 * @param vocabEmotion       for each emotion, every word occurrence seen for it
 * @param vocabEmotionLength for each emotion, the number of word occurrences
 * @param emotions           emotion labels, aligned by index with the arrays above
 * @param priorProbabilities prior of each emotion, aligned by index with `emotions`
 */
class NaiveBayesModel(val lambda: Double,
                      val vocabSize: Long,
                      private val vocabEmotion: Array[Seq[String]],
                      private val vocabEmotionLength: Array[Int],
                      val emotions: Array[String],
                      val priorProbabilities: Array[Double]) extends Serializable {

  // Per-emotion word -> occurrence count, built once so that scoring a word is
  // a map lookup instead of a scan over the emotion's whole word list.
  private val wordCounts: Array[Map[String, Int]] =
    vocabEmotion.map(_.groupBy(identity).map { case (word, occurrences) => (word, occurrences.size) })

  /**
   * Predicts the most prominent emotion of a single tweet.
   *
   * The tweet is split on single spaces and empty tokens are ignored, so an
   * empty tweet is scored by the priors alone. The scores of all emotions are
   * printed to standard output before the result is returned.
   *
   * @param tweet raw tweet text
   * @return the label of the highest-scoring emotion (the first one on ties)
   * @throws UnsupportedOperationException if the model has no emotions
   */
  def predict(tweet: String): String = {
    val words = tweet.split(" ").filter(_.nonEmpty)
    // Pair priors with labels over however many emotions the model has,
    // rather than assuming there are exactly three.
    val emotionScore = priorProbabilities.zip(emotions).zipWithIndex.map {
      case ((priorProb, emotion), index) =>
        val score = words.foldLeft(priorProb) { (sum, word) =>
          sum + wordCondProbability(word, vocabEmotionLength(index), vocabSize, wordCounts(index))
        }
        (score, emotion)
    }.toSeq
    println(emotionScore.mkString(" "))
    emotionScore.max(maxEmotion)._2
  }

  /**
   * Predicts the most prominent emotion of every tweet in an RDD.
   *
   * @param tweetRDD tweets to classify
   * @return an RDD holding one predicted label per input tweet
   */
  def predict(tweetRDD: RDD[String]): RDD[String] =
    tweetRDD.map(tweet => predict(tweet))

  /**
   * Smoothed probability of `word` within one emotion.
   *
   * Returns 0.0 when the denominator is zero (empty emotion and empty
   * vocabulary) so that the score never becomes NaN or infinite.
   */
  private def wordCondProbability(word: String,
                                  vocabEmotionSize: Long,
                                  vocabSize: Long,
                                  counts: Map[String, Int]): Double = {
    val denominator = vocabEmotionSize + vocabSize
    if (denominator == 0L) 0.0
    else (counts.getOrElse(word, 0) + lambda) / denominator
  }

  /** Orders (score, label) pairs by score only. */
  private def maxEmotion: Ordering[(Double, String)] = new Ordering[(Double, String)] {
    override def compare(x: (Double, String), y: (Double, String)): Int =
      java.lang.Double.compare(x._1, y._1)
  }
}
import scalax.io.{LongTraversable, Resource} | |
/** Trains a [[NaiveBayesModel]] from a sentiment dictionary file. */
object NaiveBayes {

  /**
   * Builds a model from a dictionary file.
   *
   * Each line is split on commas: the first field is the emotion label and the
   * second field is a space-separated list of words for that emotion. Any
   * further fields are ignored. A line without a second field contributes to
   * the line counts but adds no words.
   *
   * The prior of an emotion is the number of lines carrying its label divided
   * by the total number of lines in the file.
   *
   * @param dictionaryPath path of the dictionary file
   * @param emotions       labels to train for; matched exactly against the first field
   * @param lambda         smoothing constant handed to the model
   * @return the trained model
   * @throws IllegalArgumentException if the dictionary file contains no lines
   */
  def train(dictionaryPath: String, emotions: Array[String], lambda: Double = 1.0): NaiveBayesModel = {
    // Read the file once; the lazy line traversable would otherwise be
    // re-evaluated for the size and again for every emotion.
    val entries: List[(String, String)] =
      Resource.fromFile(dictionaryPath).lines().toList.map(parseLine)
    val totalLines = entries.size
    require(totalLines > 0, s"dictionary file is empty: $dictionaryPath")

    val priorProb = emotions.map(getEmotionLineCount(entries, _).toDouble / totalLines)
    val vocabEmotion = emotions.map(genEmotionWords(entries, _))
    val vocabEmotionLength = vocabEmotion.map(_.length)
    // Distinct words across ALL emotions: a word shared by several emotions
    // is one vocabulary entry, not one per emotion.
    val vocabSize = vocabEmotion.flatMap(_.toList).distinct.size
    new NaiveBayesModel(lambda, vocabSize, vocabEmotion, vocabEmotionLength, emotions, priorProb)
  }

  // helper functions

  /** Splits a dictionary line into (label, text); missing fields become "". */
  private def parseLine(line: String): (String, String) = {
    val fields = line.split(",")
    val label = if (fields.nonEmpty) fields(0) else ""
    val text = if (fields.length > 1) fields(1) else ""
    (label, text)
  }

  /** All word occurrences listed for `emotion`, skipping empty tokens produced by repeated spaces. */
  private def genEmotionWords(entries: Seq[(String, String)], emotion: String): Seq[String] =
    entries.filter(_._1 == emotion).flatMap(_._2.split(" ").filter(_.nonEmpty).toList)

  /** Number of dictionary lines whose label equals `emotion`. */
  private def getEmotionLineCount(entries: Seq[(String, String)], emotion: String): Int =
    entries.count(_._1 == emotion)
}
// Driver: train on the local sentiment dictionary, then classify sample tweets.
// Each predict call prints the per-emotion scores and yields the winning label.
val dictionaryPath = "/Users/shubham/projects/thealth/data/dict/sentiment_dictionary.txt"
val emotionLabels = Array("positive", "negative", "neutral")
// Smoothing constant 0.0 is passed explicitly instead of relying on the default.
val model = NaiveBayes.train(dictionaryPath, emotionLabels, 0.0)

// Empty input.
model.predict("")
// Short plain sentences.
model.predict("happy is thy man who has food")
model.predict("precious is happy")
// Real-world tweets with mentions, hashtags, punctuation and a URL.
model.predict("Apple and YouTube, with @PTXofficial's help, surprise all at Clive Davis' pre-#GRAMMYs party http://on.mash.to/1EPFsQQ")
model.predict("The Apple Watch may sound cool but here's why it's going to be a flop ")
// A bare list of words.
model.predict("serious ferociously sinister bowdlerize leer inflammatory exuberance outshine easiness")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment