-
-
Save lancegatlin/923cf28389c363427ad3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.io.Source | |
/** Reads a text file and builds a concordance: each whitespace-separated
  * token mapped to the number of times it occurs in the file.
  *
  * The file is consumed in chunks of `chunkSize` lines so that only one chunk
  * of lines is held in memory at a time; each chunk is counted with Scala's
  * parallel collections and the per-chunk counts are then merged.
  *
  * Empty tokens (produced by `split` for blank lines and for lines that start
  * with whitespace) are not counted. An empty file yields an empty result
  * instead of failing.
  *
  * @param fileName path of the file to read (decoded with the default codec
  *                 used by `Source.fromFile`)
  * @return a map from token to occurrence count; the result type is inferred
  *         from the pipeline (the parallel map built by `.toMap` on a `.par`
  *         collection)
  */
def two(fileName: String) = {
  val chunkSize = 1024
  val source = Source.fromFile(fileName)
  try {
    source.getLines()
      // Read chunkSize lines into memory at once
      // From: http://stackoverflow.com/questions/6751463/iterate-over-lines-in-a-file-in-parallel-scala
      .grouped(chunkSize)
      .map { lines =>
        lines
          // Take advantage of Scala collection parallel processing
          .par
          // Get the words from all lines (split on one or more whitespace)
          .flatMap(line => line.split("\\s+"))
          // "".split and lines with leading whitespace produce an empty token; drop it
          .filter(_.nonEmpty)
          // Group by unique words
          .groupBy(identity)
          // Get the count
          .map { case (word, words) => (word, words.size) }
          .toMap
      }
      // Each chunk is now a map of word -> count, so we need to combine these.
      // reduceOption (rather than reduce) so a file with no lines does not throw.
      .reduceOption { (map1, map2) =>
        // From: http://stackoverflow.com/questions/7076128/best-way-to-merge-two-maps-and-sum-the-values-of-same-key (2nd answer)
        map1 ++ map2.map { case (k, v) => k -> (v + map1.getOrElse(k, 0)) }
      }
      // Same map type as the per-chunk results, so the inferred return type is unchanged
      .getOrElse(Map.empty[String, Int].par)
  } finally {
    // The lazy line iterator has been fully consumed above; release the file handle
    source.close()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment