Experimenting with the Spark shell and S3 read/write
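Before the shell can talk to S3, the Hadoop layer needs AWS credentials. A minimal sketch, assuming the s3n connector used below; the key strings are placeholders:

// supply AWS credentials to Hadoop's S3 connector (placeholder values)
sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "YOUR_ACCESS_KEY")
sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET_KEY")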
// in the Spark shell, load the file from S3
val myFile = sc.textFile("s3://some-s3-bucket/us-constitution.txt")
// classic wordcount: lowercase, strip periods and commas, split on whitespace,
// drop empty tokens, then count each word
val counts = myFile.flatMap(line => line.toLowerCase.replace(".", " ").replace(",", " ").split("\\s+")).filter(_.nonEmpty).map(word => (word, 1L)).reduceByKey(_ + _)
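// counts is an RDD[(String, Long)] mapping each lowercased word to its frequency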
// collect to the driver and sort by descending count
val sorted_counts = counts.collect().sortBy(wc => -wc._2)
// print the top 10 words (sorted_counts is a local Array at this point)
sorted_counts.take(10).foreach(println)
// save the results back out to an S3 bucket
sc.parallelize(sorted_counts).saveAsTextFile("s3n://some-s3-bucket/wordcount-us-constitution")
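// note: the read above used the s3:// scheme while the writes use s3n://; on
// Hadoop these are distinct S3 filesystems, so it is safer to pick one scheme
// (s3n://, or s3a:// on newer Hadoop builds) and use it consistently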
// maybe you want to write it out as CSV
val csvResults = sorted_counts map { case (key, value) => Array(key, value).mkString(",\t") }
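// note: mkString(",\t") separates fields with a comma followed by a tab;
// a plain "," is more conventional if downstream tools expect strict CSV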
// save the CSV out to S3 (parallelize the csvResults defined above)
sc.parallelize(csvResults).saveAsTextFile("s3n://some-s3-bucket/wordcount-csv-constitution")
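To confirm the write landed, the output can be read straight back in the same shell. A quick sanity check, assuming the same bucket path as above:

// read the CSV output back from S3 and print a few lines
val check = sc.textFile("s3n://some-s3-bucket/wordcount-csv-constitution")
check.take(5).foreach(println)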
Typo in the original gist: the CSV step defines val csvResults, but the final save called sc.parallelize(results) on an undefined results; it must reference csvResults, as corrected in the listing above.