Word Counter implementations using Apache Spark
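The PySpark version below builds a SparkSession, reads the file into a DataFrame of lines, converts it to an RDD of plain strings, and then runs the classic flatMap/map/reduceByKey pipeline.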
from pyspark.sql import SparkSession

# helper function that adds two counts together
def addThem(x, y):
    return x + y

# spark session object construction - getOrCreate because there is only one session per application
spark = SparkSession.builder.appName("wordCount").getOrCreate()

# read the file contents into a DataFrame with one row per line
sampleFileName = "./sample.txt"
sampleFileData = spark.read.text(sampleFileName)

# convert the DataFrame to an RDD of plain strings and print the number of lines in the file
lines = sampleFileData.rdd.map(lambda r: r[0])
print(lines.count())

# map-reduce to calculate the frequency of every word
wordCountData = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(addThem)

# print them one-by-one; collect() returns a plain Python list, so iterate over it
for wordAndCount in wordCountData.collect():
    print(wordAndCount)
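For contrast, here is a minimal sketch of the same word count written against the DataFrame API instead of the RDD API. The column name "word" is illustrative; spark.read.text always yields a column called "value", and split/explode come from pyspark.sql.functions.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

spark = SparkSession.builder.appName("wordCountDF").getOrCreate()

# split each line on spaces, explode the array into one row per word, then count rows per word
wordCounts = (spark.read.text("./sample.txt")
    .select(explode(split("value", " ")).alias("word"))
    .groupBy("word")
    .count())
wordCounts.show()

The Scala version below does the same word count against the RDD API, assuming an interactive spark-shell session where the SparkContext sc is already defined.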
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
// read the file contents
val sampleFilename = "sample.txt"
val sampleFileData = sc.textFile(sampleFilename)
// cache the file contents so that Spark doesn't re-read them from disk on every action
sampleFileData.cache()
// print the number of lines in the file
println(sampleFileData.count())
// map reduce to calculate the frequency of each word
val wordCountData = sampleFileData.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
// print them one-by-one
wordCountData.collect().foreach(println)
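Either snippet can be pasted into an interactive shell: pyspark for the Python version, spark-shell for the Scala one. spark-shell should also preload a script passed with -i (e.g. spark-shell -i word_count.scala, where the filename is illustrative), since the flag is forwarded to the underlying Scala REPL.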