// Use the local path or the HDFS path, depending on where you run
val path = "/Users/itversity/Research/data/wordcount.txt"
// val path = "/public/randomtextwriter/part-m-00000"
sc.textFile(path).
  mapPartitions(lines => {
    // Using Scala APIs to process each partition
    lines.flatMap(_.split(" ")).map((_, 1))
  }).
  reduceByKey((total, agg) => total + agg).
  take(100).
  foreach(println)
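The spark-submit commands below reference a wordcount.WordCount class that is not shown in this gist. A minimal sketch of what that driver might look like, assuming the three positional arguments are an environment label, the input path, and the output path (the argument order is inferred from how the commands are invoked):

package wordcount

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Assumed argument order: environment label, input path, output path
    val Array(env, inputPath, outputPath) = args

    val conf = new SparkConf().setAppName("Word Count - " + env)
    val sc = new SparkContext(conf)

    // Same word count logic as above, but saving results instead of printing
    sc.textFile(inputPath).
      flatMap(_.split(" ")).
      map((_, 1)).
      reduceByKey((total, agg) => total + agg).
      saveAsTextFile(outputPath)

    sc.stop()
  }
}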
# Running locally (uat) with input and output on the local file system
spark-submit --class wordcount.WordCount \
  target/scala-2.10/sparkdemo_2.10-1.0.jar \
  uat \
  /Users/itversity/Research/data/wordcount.txt \
  /Users/itversity/Research/data/wordcount
# Running on the cluster (prod) in YARN mode, with a custom port for the Spark UI
spark-submit --class wordcount.WordCount \
  --master yarn \
  --conf spark.ui.port=54312 \
  sparkdemo_2.10-1.0.jar \
  prod \
  /public/randomtextwriter/part-m-00000 \
  /user/dgadiraju/wordcount
# Running on YARN with explicit resource sizing (number of executors, memory, and cores)
spark-submit --class wordcount.WordCount \
  --num-executors 10 \
  --executor-memory 3584M \
  --executor-cores 4 \
  --master yarn \
  --conf spark.ui.port=54123 \
  sparkdemo_2.10-1.0.jar \
  prod /public/randomtextwriter /user/dgadiraju/wordcount
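With this sizing the job asks YARN for 10 executors with 4 cores and 3584 MB each, i.e. 40 cores and roughly 35 GB of executor memory in total. In Spark 1.x the default YARN memory overhead is max(384 MB, 10% of executor memory), so 3584M plus overhead comes to 3968 MB, which is likely chosen to fit just under a 4 GB container.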
// Make sure the output path directory does not already exist
// hadoop fs -rm -R /user/dgadiraju/cardcountbysuit
val inputPath = "/public/cards/largedeck.txt"
val outputPath = "/user/dgadiraju/cardcountbysuit"
sc.textFile(inputPath).
  map(card => (card.split("\\|")(1), 1)).
  reduceByKey((total, agg) => total + agg).
  saveAsTextFile(outputPath)
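The split above assumes pipe-delimited records with the suit in the second field. A hypothetical record and the extraction it implies (the layout is an assumption, not taken from the actual dataset):

// Hypothetical record layout: color|suit|rank
val sample = "BLACK|SPADE|2"
sample.split("\\|")(1) // "SPADE", the suit used as the grouping key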
// Demonstrating coalesce to control the number of tasks in each stage
// Use the local path or the HDFS path, depending on where you run
val path = "/Users/itversity/Research/data/wordcount.txt"
// val path = "/public/randomtextwriter/part-m-00000"
sc.textFile(path).
  coalesce(5). // without coalesce the first stage would use 9 tasks (one per input split)
  flatMap(_.split(" ")).
  map((_, 1)).
  reduceByKey((total, agg) => total + agg).
  coalesce(2). // second stage will use only 2 tasks
  take(100).
  foreach(println)
// Make sure the output path directory does not already exist
// hadoop fs -rm -R /user/dgadiraju/cardcountbysuit
val inputPath = "/public/cards/largedeck.txt"
val outputPath = "/user/dgadiraju/cardcountbysuit"
sc.textFile(inputPath).
  repartition(12). // increase parallelism in the first stage (repartition shuffles the data)
  map(card => (card.split("\\|")(1), 1)).
  reduceByKey((total, agg) => total + agg, 2). // second stage will use 2 tasks
  saveAsTextFile(outputPath)
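To confirm how many tasks each stage will use, you can inspect the partition counts directly; partitions.length is a standard RDD API and works across Spark versions:

// A quick sanity check on partition counts at each step
val raw = sc.textFile(inputPath)
println(raw.partitions.length)                 // number of input splits
println(raw.repartition(12).partitions.length) // 12 after repartition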
// Make sure the output path directory does not already exist
// hadoop fs -rm -R /user/dgadiraju/cardcountbysuit
val inputPath = "/public/cards/largedeck.txt"
val outputPath = "/user/dgadiraju/cardcountbysuit"
sc.textFile(inputPath).
  map(card => (card.split("\\|")(1), 1)).
  reduceByKey((total, agg) => total + agg, 1). // only 1 task in the second stage, so only 1 output file is created
  saveAsTextFile(outputPath)
val inputPath = "/public/randomtextwriter/part-m-0000*"
val outputPath = "/user/dgadiraju/wordcount"
sc.textFile(inputPath).
flatMap(_.split(" ")).
map((_, 1)).
reduceByKey((total, agg) => total + agg, 10). //Ideal number of tasks could be 4
saveAsTextFile(outputPath)
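Since saveAsTextFile writes one part file per partition, the output directory should contain 10 part files after this job completes; hadoop fs -ls /user/dgadiraju/wordcount is a quick way to verify.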