// Selected hand-copied text from https://www.youtube.com/watch?v=FjhRkfAuU7I#t=2035
// The article at https://databricks.gitbooks.io/databricks-spark-reference-applications/content/twitter_classifier/index.html
// references this video, but the code it provides differs from what the video shows.
// The video does a bunch of work to obtain and process the tweets, but
// what "val texts = " ends up as is simply an RDD of strings, so I just
// made a text file with one log entry per line (I'm trying to classify logs)
// and did something like:
val texts = sc.textFile("input.log")
// and then the rest works as shown on-screen!
// "This converts text to feature vector" | |
import org.apache.spark.mllib.clustering.KMeans | |
import org.apache.spark.mllib.linalg.{Vector, Vectors} | |
def featurize(s: String): Vector = { | |
val n = 1000 | |
val result = new Array[Double](n) | |
val bigrams = s.sliding(2).toArray | |
for (h <- bigrams.map(_.hashCode % n)) { | |
result(h) += 1.0 / bigrams.length | |
} | |
Vectors.sparse(n, result.zipWithIndex.filter(_._1 != 0).map(_.swap)) | |
} | |
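// Quick sanity check on featurize (the sample string below is my own,
// not from the video): each bigram contributes 1/(number of bigrams),
// so the sparse vector's entries should sum to ~1.0.
val sample = featurize("ERROR disk full on /dev/sda1")
println(sample)             // SparseVector of size 1000: (1000,[...],[...])
println(sample.toArray.sum) // ~1.0, up to floating-point rounding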
// Featurize the log text
val vectors = texts.map(featurize)
// Cache it: k-means makes repeated passes over the data
vectors.cache()
// The first action is slow (it materializes the cache); later ones are fast
vectors.count()
// Train the model
val model = KMeans.train(vectors, k = 10, maxIterations = 10)
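// Optional, not in the video: gauge the fit with the within-set sum of
// squared errors (WSSSE); comparing it across runs helps pick k.
// KMeansModel.computeCost is available in spark.mllib from Spark 1.4 on.
val wssse = model.computeCost(vectors)
println(s"Within Set Sum of Squared Errors = $wssse")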
// Sample 100 of the original set
val some_tweets = texts.take(100)
// Iterate over the 10 clusters (the k chosen above) and show which of
// the 100 samples fall in each
for (i <- 0 until 10) {
  println(s"\nCLUSTER $i:")
  some_tweets.foreach { t =>
    if (model.predict(featurize(t)) == i) {
      println(t)
    }
  }
}
// Save the model's cluster centers for later reuse
sc.makeRDD(model.clusterCenters).saveAsObjectFile("model")
// Next steps: load the model, then model.predict(logline) and tag or
// otherwise mark each log line by cluster for analysis in Kibana/Elasticsearch
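
// A minimal sketch of those next steps (my addition, not from the video;
// it assumes the "model" object file written above and that featurize is
// defined in the loading job):
import org.apache.spark.mllib.clustering.KMeansModel

val centers = sc.objectFile[Vector]("model").collect()
val loaded = new KMeansModel(centers)
// Tag each log line with its cluster id, ready for indexing into Elasticsearch
val tagged = sc.textFile("input.log").map(line => (loaded.predict(featurize(line)), line))
tagged.take(5).foreach { case (cluster, line) => println(s"[$cluster] $line") }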