package wordcount

/**
  * Created by itversity on 25/03/17.
  *
  * Submit to Spark with, for example:
  * spark-submit \
  *   --class wordcount.WordCount \
  *   /Users/itversity/IdeaProjects/sands/target/scala-2.10/sands_2.10-1.0.jar \
  *   dev /Users/itversity/Research/data/wordcount.txt /Users/itversity/Research/data/wc
  */
import org.apache.hadoop.fs._
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  def main(args: Array[String]): Unit = {
    // First argument selects the environment block (e.g. dev) read from the Typesafe config
    val executionEnvironment = args(0)
    val props: Config = ConfigFactory.load()
    val conf = new SparkConf().
      setAppName("Word Count").
      setMaster(props.getConfig(executionEnvironment).getString("executionMode"))
    val sc = new SparkContext(conf)
    val fs = FileSystem.get(sc.hadoopConfiguration)

    val inputPath = args(1)
    val outputPath = args(2)
    if (!fs.exists(new Path(inputPath))) {
      println("Input path does not exist")
    } else {
      // Remove any existing output directory so saveAsTextFile does not fail
      if (fs.exists(new Path(outputPath)))
        fs.delete(new Path(outputPath), true)
      sc.textFile(inputPath).
        flatMap(rec => rec.split(" ")).
        map(rec => (rec.replace(",", ""), 1)). // strips commas; use map(rec => (rec, 1)) to keep punctuation in the counts
        reduceByKey(_ + _).
        map(rec => rec.productIterator.mkString("\t")). // alternative: map(rec => rec._1 + "\t" + rec._2)
        saveAsTextFile(outputPath)
    }
  }
}
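For reference, ConfigFactory.load() resolves an application.conf (or application.properties) from the classpath. A minimal sketch of what it could contain for this program; only the dev key and the executionMode property are implied by the code above, while the prod block and the master URLs are illustrative assumptions:

# application.conf (HOCON) -- only "dev" and "executionMode" come from the code above
dev {
  executionMode = "local[*]"      # assumed: run locally during development
}
prod {
  executionMode = "yarn-client"   # assumed: use whatever master the cluster expects
}

The first program argument (dev in the spark-submit example above) picks the block, and its executionMode value is passed to setMaster.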
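The jar referenced in the spark-submit example (sands_2.10-1.0.jar) would come from sbt package. A rough build.sbt sketch, assuming Spark 1.6.x on Scala 2.10 (the exact versions are assumptions, not stated in the gist); spark-core is marked "provided" because spark-submit supplies it at runtime, and the Hadoop FileSystem classes come in transitively through spark-core:

// versions are assumptions; align them with the cluster's Spark and Scala versions
name := "sands"
version := "1.0"
scalaVersion := "2.10.6"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.3" % "provided"
libraryDependencies += "com.typesafe" % "config" % "1.3.1"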