Last active
July 8, 2016 13:14
-
-
Save feynmanliang/63a2835f6ba2bf35b85a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.http.client.methods.HttpGet | |
import org.apache.http.impl.client.{BasicResponseHandler, HttpClientBuilder} | |
import org.apache.spark.mllib.fpm.PrefixSpan | |
/**
 * Sequence database built from the SPMF-formatted SIGN dataset: one sequence per
 * input line, each sequence an array of itemsets, each itemset an array of
 * integer item codes.
 *
 * The dataset is downloaded over HTTP when this value is evaluated.
 */
val sequenceDatabase = {
  val url = "http://www.philippe-fournier-viger.com/spmf/datasets/SIGN.txt"
  val client = HttpClientBuilder.create().build()
  val stringData =
    try {
      val response = client.execute(new HttpGet(url))
      new BasicResponseHandler().handleResponse(response).trim
    } finally {
      // Always release the client's connections, even when the download fails.
      client.close()
    }

  stringData.split("\n").map { patternStr =>
    // SPMF data uses -1 as the itemset delimiter and -2 for end of line,
    // so we split on -1 and drop the last piece (the trailing -2 marker).
    patternStr.split("-1").init.map { itemsetStr =>
      // Split on any run of whitespace so repeated spaces or tabs
      // do not produce empty tokens that would fail to parse.
      itemsetStr.trim.split("\\s+").map(_.toInt)
    }
  }
}
/**
 * Mapping from integer item code to a readable event name of the form
 * "<event group> - <event description>".
 *
 * The code table is downloaded over HTTP when this value is evaluated. After the
 * two header lines are dropped, the text is split on '@' into one chunk per event
 * group: the first line of a chunk names the group (last tab-separated field),
 * and every following line describes one event (first field = code,
 * last field = description).
 */
val itemToEvent = {
  val url = "http://cs-people.bu.edu/panagpap/Research/Asl_project/ASL_results/codes.txt"
  val client = HttpClientBuilder.create().build()
  val stringData =
    try {
      val response = client.execute(new HttpGet(url))
      new BasicResponseHandler().handleResponse(response).trim
    } finally {
      // Always release the client's connections, even when the download fails.
      client.close()
    }

  stringData
    .split("\n").drop(2).mkString("\n") // drop header lines
    .split("@").drop(1) // one chunk per event group
    .flatMap { eventGroupStr =>
      // Tab-separated fields of every non-blank line in this group.
      val rows = eventGroupStr
        .split("\n")
        .filter(_.trim.nonEmpty)
        .map(_.trim.split("\t"))

      rows.headOption match {
        case Some(header) =>
          // The first row identifies the group; the remaining rows are its events.
          val eventGroup = header.last.trim
          rows.tail.map { fields =>
            fields.head.toInt -> s"$eventGroup - ${fields.last}"
          }
        case None =>
          // A chunk with no content contributes no events.
          Array.empty[(Int, String)]
      }
    }.toMap
}
// Mine frequent sequential patterns from the sequence database with PrefixSpan:
// the data is distributed over 2 partitions and cached, patterns must reach a
// minimum support of 0.6 and may contain at most 10 items.
val rdd = sc.parallelize(sequenceDatabase, 2).cache()
val prefixSpan = new PrefixSpan().setMinSupport(0.6).setMaxPatternLength(10)
val results = prefixSpan.run(rdd)

// Translate item codes to event names and print one line per frequent sequence:
// each itemset is rendered as "(event, event, ...)", itemsets are joined by ", ".
results.freqSequences.collect().map { freqSeq =>
  val renderedItemsets = freqSeq.sequence.map { itemset =>
    itemset.map(itemToEvent).mkString("(", ", ", ")")
  }
  renderedItemsets.mkString(", ")
}.foreach(println)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment