Skip to content

Instantly share code, notes, and snippets.

@skhatri
Created September 27, 2022 02:59
Show Gist options
  • Save skhatri/eac545839fdb867294b3449dd4791f5d to your computer and use it in GitHub Desktop.
WordCount Apache Spark

With RDD

    // Word count using the low-level RDD API.
    // FIX: "spark.driver.ip" is not a recognised Spark configuration key (Spark
    // silently ignores unknown keys); the key for binding the driver to a local
    // address is "spark.driver.bindAddress".
    val sc = new SparkContext(new SparkConf()
                                .setAppName("word-count-rdd")
                                .setMaster("local[1]")
                                .set("spark.driver.bindAddress", "127.0.0.1")
            )

    sc.textFile(path)
      .flatMap(line => line.toLowerCase.split("\\s+")) // split on runs of whitespace, not a single space
      .filter(_.nonEmpty)                              // leading/repeated separators would otherwise yield "" tokens
      .map(word => (word, 1))
      .reduceByKey(_ + _)                              // sum per-word counts
      .sortBy { case (_, count) => -count }            // most frequent words first
      .foreach(println)                                // printed order is deterministic here only because master is local[1]

    sc.stop()

With Spark Session and DataFrame

// Word count using SparkSession and the DataFrame/Dataset API.
// FIX: "spark.driver.ip" is not a recognised Spark configuration key; the key
// for binding the driver to a local address is "spark.driver.bindAddress".
val spark = SparkSession.builder()
        .appName("wordcount")
        .master("local[1]")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .getOrCreate()

    import spark.implicits._              // Encoder for Dataset[String] produced by flatMap
    import org.apache.spark.sql.functions._ // col(...)

    // Each input row of the "text" source has a single string column named "value".
    val df = spark.read.format("text").load(path)
    df.flatMap(row => row.getAs[String]("value").toLowerCase.split("\\s+"))
      .filter(_.nonEmpty)                 // drop empty tokens from repeated whitespace
      .groupBy("value").count()           // flatMap output column is also named "value"
      .sort(col("count").desc)            // most frequent words first
      .show(5)
    

With Spark SQL

    // Word count expressed in SQL over a temporary view of the raw text lines.
    val lines = spark.read.format("text").load(path)
    lines.createOrReplaceTempView("lines")

    // explode(split(...)) turns each line into one row per word; positional
    // group by 1 / order by 2 refer to the word and count columns respectively.
    val wordCounts = spark.sql("""select explode(split(lower(value), ' ')) as word, count(*) 
    from lines group by 1 order by 2 desc""")
    wordCounts.show(5, false)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment