Last active
September 15, 2019 10:57
-
-
Save girisandeep/f12ab4bf2536dc5f0a8ca673efbac1db to your computer and use it in GitHub Desktop.
An example of broadcast variables in Spark using Scala.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Stop words that should be filtered out of the text. The list contains a
// couple of repeated entries ("at", "or"); they collapse to a single key
// when the lookup table below is built.
val commonWords = Array("a", "an", "the", "of", "at", "is", "am","are","this","that","at", "in", "or", "and", "or", "not", "be", "for", "to", "it")

// Lookup table keyed by stop word. Only key membership is ever checked, so
// every value is the placeholder 1.
val commonWordsMap = collection.mutable.Map[String, Int](commonWords.map(word => word -> 1): _*)
// Broadcast the stop-word table so tasks read it through `commonWordsBC.value`
// instead of capturing the map in each closure.
// NOTE(review): `sc` is presumably the SparkContext provided by spark-shell — confirm.
val commonWordsBC = sc.broadcast(commonWordsMap)

// Input text, loaded as an RDD with one element per line.
val file = sc.textFile("/data/mr/wordcount/input/big.txt")
/**
 * Splits a line on single spaces and keeps the tokens that are not stop words.
 *
 * A token is compared against the broadcast stop-word table after being
 * lower-cased, trimmed and stripped of every character outside `a-z`; the
 * token itself is returned unmodified. Tokens that are empty or contain only
 * whitespace (produced by consecutive spaces or an empty line) are dropped.
 *
 * @param line one line of input text
 * @return the tokens of `line`, in order, that are not stop words
 */
def toWords(line: String): Array[String] = {
  // Dereference the broadcast once per line rather than once per token.
  val stopWords = commonWordsBC.value
  line
    .split(" ")
    .filter { word =>
      val normalized = word.toLowerCase.trim.replaceAll("[^a-z]", "")
      word.trim.nonEmpty && !stopWords.contains(normalized)
    }
}
// Flatten every line into its non-stop-word tokens.
val uncommonWords = file.flatMap(line => toWords(line))

// Fetch the first 100 resulting tokens so they can be inspected in the shell.
uncommonWords.take(100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment