Skip to content

Instantly share code, notes, and snippets.

@salamanders
Created May 7, 2018 16:42
Show Gist options
  • Save salamanders/2ccb4496f2b9e5bfcef1dde46bc66eae to your computer and use it in GitHub Desktop.
Save salamanders/2ccb4496f2b9e5bfcef1dde46bc66eae to your computer and use it in GitHub Desktop.
/** Input used when no path is given on the command line; resolved against the working directory. */
private const val DEFAULT_NGRAM_FILE = "googlebooks-eng-1M-3gram-20090715-112.csv"

/**
 * Returns the third tab-separated field of [line] (the `match_count` column) as an Int.
 *
 * The line is lower-cased before splitting, exactly as the original benchmark did, so the
 * measured parse cost stays comparable with the timings recorded in the comments below.
 *
 * @throws IndexOutOfBoundsException if [line] has fewer than three tab-separated fields.
 * @throws NumberFormatException if the third field is not a valid Int.
 */
private fun parseMatchCount(line: String): Int {
    // Only the third component is read; `_` placeholders skip the unused ones.
    val (_, _, matchCount) = line.toLowerCase().split("\t")
    return matchCount.toInt()
}

/**
 * Micro-benchmark comparing ways of reading a large tab-separated n-gram file line by line:
 * eager `readLines`, lazy line sequences, lazy sequences plus parsing, and one coroutine per line.
 *
 * Each strategy prints the value it computed (a line count or the sum of the `match_count`
 * column) followed by its elapsed wall-clock time in milliseconds.
 *
 * The timings quoted in the comments are the original author's observations on a file of
 * roughly 50 million lines; they are not reproduced or verified here.
 *
 * @param args optional; `args[0]` overrides the input path, otherwise [DEFAULT_NGRAM_FILE] is used.
 */
fun main(args: Array<String>) {
    val file = File(args.firstOrNull() ?: DEFAULT_NGRAM_FILE)

    val ms = measureTimeMillis {
        // Eager: every line is materialised in a List before counting.
        // Author's run: 50m lines, 61 sec, and it seemed to use multiple cores.
        println(file.readLines().size)
    }
    println("readLines ms: $ms")

    val ms2 = measureTimeMillis {
        // Same strategy again to account for OS file caching.
        // Author's run: 38 sec, still slow, still using all cores.
        println(file.readLines().size)
    }
    println("readLines2 ms: $ms2")

    val ms3 = measureTimeMillis {
        // Lazy: lines are streamed and discarded one at a time. useLines closes the reader,
        // which the bare bufferedReader().lineSequence() chain never did.
        // Author's run: ~4 to 6 sec on 1 core.
        println(file.useLines { lines -> lines.count() })
    }
    println("lineSequence ms: $ms3")

    val ms4 = measureTimeMillis {
        // Accumulate in a Long: an Int sum over tens of millions of rows can wrap silently.
        val total = file.useLines { lines ->
            lines.fold(0L) { sum, line -> sum + parseMatchCount(line) }
        }
        println(total)
    }
    println("lineSequence and parse ms: $ms4") // 18 seconds, 1 core

    val ms5 = measureTimeMillis {
        // toList() forces the lazy map, so every async is started (and every line is read)
        // before the reader is closed. This holds one Deferred per line in memory at once.
        // NOTE(review): `async` is called without a CoroutineScope receiver, which only
        // resolves against the coroutines API this file was written for - confirm the
        // library version before upgrading.
        val pendingCounts = file.useLines { lines ->
            lines.map { line -> async { parseMatchCount(line) } }.toList()
        }
        println("Asyncs started")
        val total = runBlocking {
            pendingCounts.fold(0L) { sum, pending -> sum + pending.await() }
        }
        println(total)
    }
    println("lineSequence and async parse ms: $ms5") //148sec
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment