// Selected hand-copied text from https://www.youtube.com/watch?v=FjhRkfAuU7I#t=2035
// The article at https://databricks.gitbooks.io/databricks-spark-reference-applications/content/twitter_classifier/index.html
// references this video, but the code it provides differs from what the video shows.
// The video does a bunch of work to obtain and process the tweets, but
// what "val texts = " ends up as is simply an RDD of strings, so I just
// made a text file with one log entry per line (I'm trying to classify logs)
// and did something like:
val texts = sc.textFile("input.log")
// and then the rest works as shown on-screen!
// "This converts text to feature vector" | |
import org.apache.spark.mllib.clustering.KMeans | |
import org.apache.spark.mllib.linalg.{Vector, Vectors} | |
def featurize(s: String): Vector = { | |
val n = 1000 | |
val result = new Array[Double](n) | |
val bigrams = s.sliding(2).toArray | |
for (h <- bigrams.map(_.hashCode % n)) { | |
result(h) += 1.0 / bigrams.length | |
} | |
Vectors.sparse(n, result.zipWithIndex.filter(_._1 != 0).map(_.swap)) | |
} | |
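// Quick sanity check on featurize (the sample string below is my own,
// not from the video): each bigram contributes 1/(number of bigrams),
// so the sparse vector's entries should sum to ~1.0.
val sample = featurize("ERROR disk full on /dev/sda1")
println(sample)             // SparseVector of size 1000: (1000,[...],[...])
println(sample.toArray.sum) // ~1.0, up to floating-point rounding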
// Featurize the log text
val vectors = texts.map(featurize)
// Cache it: k-means makes repeated passes over the data
vectors.cache()
// The first action is slow (it materializes the cache); later ones are fast
vectors.count()
// Train the model
val model = KMeans.train(vectors, k = 10, maxIterations = 10)
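// Optional, not in the video: gauge the fit with the within-set sum of
// squared errors (WSSSE); comparing it across runs helps pick k.
// KMeansModel.computeCost is available in spark.mllib from Spark 1.4 on.
val wssse = model.computeCost(vectors)
println(s"Within Set Sum of Squared Errors = $wssse")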
// Sample 100 of the original set
val some_tweets = texts.take(100)
// Iterate over the 10 clusters (the k chosen above) and show which of
// the 100 samples fall in each
for (i <- 0 until 10) {
  println(s"\nCLUSTER $i:")
  some_tweets.foreach { t =>
    if (model.predict(featurize(t)) == i) {
      println(t)
    }
  }
}
// Save the model's cluster centers for later reuse
sc.makeRDD(model.clusterCenters).saveAsObjectFile("model")
// Next steps: load the model, then model.predict(logline) and tag or
// otherwise mark each log line by cluster for analysis in Kibana/Elasticsearch
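
// A minimal sketch of those next steps (my addition, not from the video;
// it assumes the "model" object file written above and that featurize is
// defined in the loading job):
import org.apache.spark.mllib.clustering.KMeansModel

val centers = sc.objectFile[Vector]("model").collect()
val loaded = new KMeansModel(centers)
// Tag each log line with its cluster id, ready for indexing into Elasticsearch
val tagged = sc.textFile("input.log").map(line => (loaded.predict(featurize(line)), line))
tagged.take(5).foreach { case (cluster, line) => println(s"[$cluster] $line") }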