Skip to content

Instantly share code, notes, and snippets.

@arjones
Created August 3, 2011 20:19
Show Gist options
  • Save arjones/1123656 to your computer and use it in GitHub Desktop.
Save arjones/1123656 to your computer and use it in GitHub Desktop.
Sample collection handling
// This is a sample of handling collections with Scala
// It receives the .xml from your delicious account and generates an output to plot
// the count of tags on a timeframe
//
// If you know a better way to do the same processing, please let me know: @arjones
//
import scala.xml._
import org.joda.time._
/** Sample of collection handling in Scala.
  *
  * Loads a Delicious bookmarks export from `data/data.xml`, counts how often each tag
  * was used per month, and prints a tab-separated table (one row per month, one column
  * per frequently used tag) that can be fed to a plotting tool.
  */
object App {

  /** A tag gets an output column only if it occurs more than this many times in some month. */
  private val MinAppears = 4

  /** Entry point: reads `data/data.xml` and prints the per-month tag counts to stdout.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // Fetch the input first (URL host restored from an obfuscated scrape -- confirm):
    // curl https://USERNAME:PASSWORD@api.del.icio.us/v1/posts/all > data/data.xml
    val delicious = XML.loadFile("data/data.xml")
    val dateAndTags = extractDateAndTags(delicious)

    // Month -> (Tag -> Count): group the posts by month, pool their tags, count each tag.
    val dateTagCount: Map[DateTime, Map[String, Int]] =
      dateAndTags.groupBy(groupByInstruction).map { case (month, posts) =>
        val tags = posts.flatMap(_._2.toList)
        (month, tags.groupBy(identity).map { case (tag, hits) => (tag, hits.size) })
      }

    // Map iteration order is unspecified, so keep a chronologically sorted index of the keys.
    val dates = dateTagCount.keys.toList.sortBy(_.getMillis)

    // Tags that exceed the threshold in at least one month, de-duplicated and sorted by name.
    val relevantTags = dateTagCount.values.toList
      .flatMap(_.collect { case (tag, count) if count > MinAppears => tag })
      .distinct
      .sorted

    printResults(dates, relevantTags, dateTagCount)
  }

  /** Extracts the timestamp and the lower-cased tag list of every `post` element.
    *
    * Attribute values are read through `.text`, so characters such as `&` are kept
    * verbatim instead of being XML-escaped. A missing `time` attribute falls back to
    * `1970-01-01`; a missing `tag` attribute yields no tags. Empty tags and the
    * `for:@twitter` marker are dropped.
    *
    * @param doc root element of the bookmarks export
    * @return one `(timestamp, tags)` pair per `post` element found anywhere under `doc`
    */
  def extractDateAndTags(doc: Elem): scala.collection.immutable.Seq[(DateTime, Array[String])] =
    (doc \\ "post").map { post =>
      val date = post.attribute("time").map(_.text).getOrElse("1970-01-01")
      val tags = post.attribute("tag").map(_.text).getOrElse("")
        .toLowerCase(java.util.Locale.ROOT) // locale-independent lower-casing
        .split("\\s+")                      // tolerate repeated or odd whitespace between tags
        .filter(tag => tag.nonEmpty && tag != "for:@twitter")
      (new DateTime(date), tags)
    }

  /** Grouping key for a post: its timestamp rounded down on the month-of-year field.
    *
    * @param e a `(timestamp, tags)` pair as produced by [[extractDateAndTags]]
    * @return the rounded timestamp shared by all posts of the same month
    */
  def groupByInstruction(e: (DateTime, Array[String])): DateTime =
    e._1.monthOfYear.roundFloorCopy()

  /** Prints the result table to stdout, tab-separated.
    *
    * The header row is `date` followed by every relevant tag. Each following row holds
    * the local date of a month and, per tag, its count for that month (blank when the
    * tag did not occur). Every cell, including the last, is followed by a tab.
    *
    * @param dates        the months to print, in output order
    * @param relevantTags the tags to print as columns, in output order
    * @param dateTagCount per-month tag counts; a date missing here prints blank cells
    */
  def printResults(dates: List[DateTime], relevantTags: List[String], dateTagCount: Map[DateTime, Map[String, Int]]): Unit = {
    // header
    println("date\t" + relevantTags.map(_ + "\t").mkString)

    // one row per month
    for (date <- dates) {
      val counts = dateTagCount.getOrElse(date, Map.empty[String, Int])
      val cells = relevantTags.map(tag => counts.get(tag).map(_.toString).getOrElse("") + "\t")
      println(date.toLocalDate().toString + "\t" + cells.mkString)
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment