Word Counter implementations using Apache Spark
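The PySpark version below builds a SparkSession, reads the file into a DataFrame of lines, converts it to an RDD of plain strings, and then runs the classic flatMap/map/reduceByKey pipeline.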
from pyspark.sql import SparkSession

# helper function that adds two counts together
def addThem(x, y):
    return x + y

# spark session object construction - getOrCreate because there is only one session per application
spark = SparkSession.builder.appName("wordCount").getOrCreate()

# read the file contents into a DataFrame with one row per line
sampleFileName = "./sample.txt"
sampleFileData = spark.read.text(sampleFileName)

# convert the DataFrame to an RDD of plain strings and print the number of lines in the file
lines = sampleFileData.rdd.map(lambda r: r[0])
print(lines.count())

# map-reduce to calculate the frequency of every word
wordCountData = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(addThem)

# print them one-by-one; collect() returns a plain Python list, so iterate over it
for wordAndCount in wordCountData.collect():
    print(wordAndCount)
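For contrast, here is a minimal sketch of the same word count written against the DataFrame API instead of the RDD API. The column name "word" is illustrative; spark.read.text always yields a column called "value", and split/explode come from pyspark.sql.functions.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

spark = SparkSession.builder.appName("wordCountDF").getOrCreate()

# split each line on spaces, explode the array into one row per word, then count rows per word
wordCounts = (spark.read.text("./sample.txt")
    .select(explode(split("value", " ")).alias("word"))
    .groupBy("word")
    .count())
wordCounts.show()

The Scala version below does the same word count against the RDD API, assuming an interactive spark-shell session where the SparkContext sc is already defined.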
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
// read the file contents
val sampleFilename = "sample.txt"
val sampleFileData = sc.textFile(sampleFilename)
// cache the file contents so that Spark doesn't re-read them from disk on every action
sampleFileData.cache()
// print the number of lines in the file
println(sampleFileData.count())
// map reduce to calculate the frequency of each word
val wordCountData = sampleFileData.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
// print them one-by-one
wordCountData.collect().foreach(println)
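Either snippet can be pasted into an interactive shell: pyspark for the Python version, spark-shell for the Scala one. spark-shell should also preload a script passed with -i (e.g. spark-shell -i word_count.scala, where the filename is illustrative), since the flag is forwarded to the underlying Scala REPL.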