Skip to content

Instantly share code, notes, and snippets.

@eribeiro
Last active December 19, 2015 11:49
Show Gist options
  • Save eribeiro/5950664 to your computer and use it in GitHub Desktop.
Save eribeiro/5950664 to your computer and use it in GitHub Desktop.
/**
 * Reads a space-separated trace file and prints statistics about how many
 * tasks each job has.
 *
 * Each line is split on a single space; the field at index 1 is taken as the
 * parent (job) id and the field at index 2 as the child (task) id. Duplicate
 * (job, task) pairs are counted once. Every line with at least three fields
 * is counted, including a header row if the file has one; shorter lines
 * (e.g. a trailing blank line) are skipped.
 *
 * Printed to standard output, in order:
 *  - one line per distinct task count (largest first) with the number of
 *    jobs having that many tasks,
 *  - the total number of jobs,
 *  - the number and percentage of jobs that have exactly one task,
 *  - the task count of the largest job (0 when the file has no jobs).
 *
 * Exceptions raised while opening or reading the file propagate to the
 * caller; the file is closed in every case.
 *
 * @param path trace file to read; defaults to "google-cluster-data-1.csv"
 */
def scanTrace(path: String = "google-cluster-data-1.csv"): Unit = {
  val source = scala.io.Source.fromFile(path)

  // job id -> distinct task ids belonging to that job
  val tasksByJob: Map[String, Set[String]] =
    try {
      source.getLines()
        .map(_.split(" "))
        // Skip blank/short lines instead of failing on fields(1) / fields(2).
        .filter(_.length >= 3)
        .foldLeft(Map.empty[String, Set[String]]) { (acc, fields) =>
          val parent = fields(1)
          val child = fields(2)
          acc.updated(parent, acc.getOrElse(parent, Set.empty[String]) + child)
        }
    } finally {
      // Close even when reading or parsing throws.
      source.close()
    }

  // number_of_tasks -> how_many_jobs_have_that_number_of_tasks
  val jobsByTaskCount: Map[Int, Int] =
    tasksByJob.values.foldLeft(Map.empty[Int, Int]) { (acc, tasks) =>
      acc.updated(tasks.size, acc.getOrElse(tasks.size, 0) + 1)
    }

  for ((taskCount, jobCount) <- jobsByTaskCount.toList.sortWith(_._1 > _._1))
    println(s"$jobCount job has $taskCount tasks")

  val jobs = tasksByJob.size
  val singletonJobs = tasksByJob.values.count(_.size == 1)
  // Guard against 0/0, which would otherwise print NaN for an empty trace.
  val singletonPercent =
    if (jobs == 0) 0.0 else (singletonJobs.toDouble / jobs.toDouble) * 100
  // max on an empty collection throws, so report 0 for an empty trace.
  val largestJob =
    if (tasksByJob.isEmpty) 0 else tasksByJob.values.map(_.size).max

  println("jobs:" + jobs)
  print("jobs with 1 task only: " + singletonJobs)
  printf(" (%.2f%s", singletonPercent, "%)\n")
  println("largest job:" + largestJob + " tasks ")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment