Created
August 3, 2011 20:19
-
-
Save arjones/1123656 to your computer and use it in GitHub Desktop.
Sample collection handling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample of collection handling in Scala.
// It reads the .xml export of your Delicious account and generates output for plotting
// how many times each tag was used in each month.
//
// If you know a better way to do the same processing, please let me know: @arjones
//
import scala.xml._ | |
import org.joda.time._ | |
/** Reads a Delicious bookmark export and prints a tab-separated table of
  * per-month tag counts, suitable for plotting.
  *
  * Rows are months (oldest first). Columns are the tags that occur more than
  * `MinAppearances` times in at least one month, sorted alphabetically.
  */
object App {

  /** A tag needs more than this many uses within a single month to get a column. */
  private val MinAppearances = 4

  /** Date used for posts that have no `time` attribute. */
  private val FallbackDate = "1970-01-01"

  /** Tag that is removed from every post before counting. */
  private val IgnoredTag = "for:@twitter"

  /** Loads `data/data.xml`, aggregates tag usage per month and prints the table.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // The input file is the saved response of the Delicious "/v1/posts/all"
    // API call (e.g. fetched with curl and redirected to data/data.xml).
    val delicious = XML.loadFile("data/data.xml")

    // Month -> every tag used in that month; duplicates are kept so they can be counted.
    val tagsByMonth = extractDateAndTags(delicious)
      .groupBy(groupByInstruction)
      .map { case (month, posts) => (month, posts.flatMap(post => post._2.toSeq)) }

    // Month -> (tag -> number of uses in that month).
    val dateTagCount: Map[DateTime, Map[String, Int]] = tagsByMonth.map {
      case (month, tags) =>
        val occurrences = tags.groupBy(identity).map { case (tag, uses) => (tag, uses.size) }
        (month, occurrences)
    }

    // Map iteration order is unspecified, so build an explicit chronological index.
    val dates = dateTagCount.keys.toList.sortWith((a, b) => a.isBefore(b))

    // A tag qualifies as soon as any single month exceeds the threshold.
    val relevantTags = dateTagCount.values.flatten
      .collect { case (tag, count) if count > MinAppearances => tag }
      .toList
      .distinct
      .sorted

    printResults(dates, relevantTags, dateTagCount)
  }

  /** Extracts the timestamp and the normalised tags of every `post` element.
    *
    * Tags are lower-cased and split on whitespace. Empty tokens (produced by
    * untagged posts or repeated separators) and the `for:@twitter` pseudo-tag
    * are dropped, so an untagged post yields an empty array.
    *
    * @param doc root element of the export
    * @return one `(time, tags)` pair per `post` element, in document order
    * @throws IllegalArgumentException if a `time` attribute cannot be parsed by `DateTime`
    */
  def extractDateAndTags(doc: Elem): scala.collection.immutable.Seq[(DateTime, Array[String])] =
    (doc \\ "post").map { post =>
      val date = attributeText(post, "time").getOrElse(FallbackDate)
      val tags = attributeText(post, "tag").getOrElse("")
        .toLowerCase
        .split("\\s+")
        .filter(tag => tag.nonEmpty && tag != IgnoredTag)
      (new DateTime(date), tags)
    }

  /** Returns the unescaped text of attribute `name`, or `None` when it is absent. */
  private def attributeText(node: Node, name: String): Option[String] =
    node.attribute(name).map(values => values.map(_.text).mkString)

  /** Grouping key for a post: its timestamp rounded down on the month-of-year field.
    *
    * @param e a `(time, tags)` pair as produced by `extractDateAndTags`
    * @return the month bucket the post belongs to
    */
  def groupByInstruction(e: (DateTime, Array[String])): DateTime =
    e._1.monthOfYear.roundFloorCopy()

  /** Prints the table to standard output.
    *
    * The first line is the header (`date` followed by the tag names); each
    * following line is one month. Every cell, including the last one on a line,
    * is followed by a tab. A tag that was not used in a month gets an empty cell.
    *
    * @param dates        row keys, in the order they should be printed
    * @param relevantTags column headers, in the order they should be printed
    * @param dateTagCount month -> (tag -> count); a date without an entry is
    *                     printed as a row of empty cells
    */
  def printResults(dates: List[DateTime], relevantTags: List[String], dateTagCount: Map[DateTime, Map[String, Int]]): Unit = {
    println(("date" :: relevantTags).mkString("", "\t", "\t"))

    for (date <- dates) {
      val counts = dateTagCount.getOrElse(date, Map.empty[String, Int])
      val cells = relevantTags.map(tag => counts.get(tag).map(_.toString).getOrElse(""))
      println((date.toLocalDate.toString :: cells).mkString("", "\t", "\t"))
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment