Skip to content

Instantly share code, notes, and snippets.

@feynmanliang
Last active July 8, 2016 13:14
Show Gist options
  • Save feynmanliang/63a2835f6ba2bf35b85a to your computer and use it in GitHub Desktop.
Save feynmanliang/63a2835f6ba2bf35b85a to your computer and use it in GitHub Desktop.
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.{BasicResponseHandler, HttpClientBuilder}
import org.apache.spark.mllib.fpm.PrefixSpan
// Sequence database: one entry per input line, each entry being a sequence of
// itemsets, each itemset an array of integer item codes.
// The data is downloaded over HTTP and parsed from the SPMF text format.
val sequenceDatabase = {
  val url = "http://www.philippe-fournier-viger.com/spmf/datasets/SIGN.txt"
  val client = HttpClientBuilder.create().build()
  val stringData =
    try {
      val response = client.execute(new HttpGet(url))
      // the handler reads the whole response body into a String
      new BasicResponseHandler().handleResponse(response).trim
    } finally {
      // always release the client's connections, even when the request fails
      client.close()
    }
  stringData.split("\n").map { patternStr =>
    // SPMF data uses -1 for delimiter and -2 for EOL,
    // so we split on -1 and drop the last chunk.
    // dropRight(1) (unlike init) does not throw when the split yields nothing.
    patternStr.split("-1").dropRight(1).map { itemsetStr =>
      // split on whitespace runs so repeated spaces cannot produce an empty token for toInt
      itemsetStr.trim.split("\\s+").map(_.toInt)
    }
  }
}
// Mapping from item code to event name, rendered as "<event group> - <event description>".
// The code table is downloaded over HTTP; after two header lines it consists of
// "@"-separated event groups whose first row names the group and whose remaining
// rows are tab-separated entries (first field = item code, last field = description).
val itemToEvent = {
  val url = "http://cs-people.bu.edu/panagpap/Research/Asl_project/ASL_results/codes.txt"
  val client = HttpClientBuilder.create().build()
  val stringData =
    try {
      val response = client.execute(new HttpGet(url))
      // the handler reads the whole response body into a String
      new BasicResponseHandler().handleResponse(response).trim
    } finally {
      // always release the client's connections, even when the request fails
      client.close()
    }
  stringData
    .split("\n").drop(2).mkString("\n") // drop header lines
    .split("@").drop(1) // split by event groups
    .flatMap { eventGroupStr =>
      // tokenise every non-blank row up front so the header and the entries keep
      // concrete types instead of being mixed in one Any-typed array
      val rows = eventGroupStr
        .split("\n")
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(_.split("\t"))
      rows.headOption match {
        case Some(header) =>
          // event group identifier: last field of the group's first row
          val eventGroup = header.last.trim
          // every following row is (event code, ..., event description)
          rows.tail.map { fields =>
            fields.head.toInt -> s"$eventGroup - ${fields.last}"
          }
        case None =>
          // a group with no rows contributes no mappings
          Array.empty[(Int, String)]
      }
    }.toMap
}
// mine sequential patterns with prefix span:
// a pattern must have a minimum support of 0.6 and is at most 10 items long
val rdd = sc.parallelize(sequenceDatabase, 2).cache()
val prefixSpan = new PrefixSpan()
  .setMinSupport(0.6)
  .setMaxPatternLength(10)
val results = prefixSpan.run(rdd)
// convert items to event names and print results, one frequent sequence per line,
// each itemset rendered as "(event, event, ...)"
results.freqSequences.collect().foreach { freqSeq =>
  val renderedItemsets = freqSeq.sequence.map { itemset =>
    itemset
      // fall back to a placeholder instead of throwing when the downloaded
      // code table has no entry for an item
      .map(item => itemToEvent.getOrElse(item, s"<unknown item $item>"))
      .mkString("(", ", ", ")")
  }
  println(renderedItemsets.mkString(", "))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment