A small project for getting hands-on with Spark for the first time. Given TPC-H style customer.tbl and orders.tbl files, it counts, for each order count, how many customers placed that many orders (ignoring orders whose comment mentions special requests), in the spirit of TPC-H query 13.
package main.scala

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._

object Project1 {

  // Drop any characters that are not defined Unicode code points.
  def charsetFix(s: String): String = s.filter(Character.isDefined)

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      println("USAGE: Project1 inputfile newoutputdir")
      sys.exit(1)
    }

    val conf = new SparkConf().setAppName("Project1")
    // .setMaster("yarn-cluster")
    val spark = new SparkContext(conf)

    // Every customer, keyed by custkey, with a placeholder value.
    // Note: split('|') uses the character overload; split("|") would treat
    // "|" as a regex alternation and split between every character.
    val customers: RDD[(Int, Boolean)] = spark.textFile(args(0) + "/customer.tbl")
      .map(_.split('|'))
      .map(lst => (lst.head.toInt, false))

    val pattern = "^.*special.*requests.*$"

    // Number of orders per custkey, excluding orders whose comment
    // (column 9, index 8) matches the pattern above.
    val orders: RDD[(Int, Int)] = spark.textFile(args(0) + "/orders.tbl")
      .map(_.split('|'))
      .filter(lst => !lst(8).matches(pattern))
      .map(lst => (lst(1).toInt, 1))
      .reduceByKey(_ + _)

    // A left outer join keeps customers with no orders; they get a count of 0.
    val joined: RDD[(Int, Int)] = customers.leftOuterJoin(orders).mapValues {
      case (_, Some(count)) => count
      case (_, None)        => 0
    }

    // (order count, number of customers with that many orders)
    val res: RDD[(Int, Int)] = joined
      .map { case (_, count) => (count, 1) }
      .reduceByKey(_ + _)

    val toPrint = res.map { case (count, custdist) => s"$count|$custdist" }
    toPrint.saveAsTextFile(charsetFix(args(1) + "/out"))
    spark.stop()
  }
}
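
To try this out locally (a sketch, not part of the original gist): SparkConf loads spark.* JVM system properties by default, so setting spark.master before calling Project1.main runs the same job on a local master. The local[*] master and the ./data and ./output paths below are assumptions for illustration; ./data should hold small customer.tbl and orders.tbl samples.

package main.scala

// Local smoke test for Project1: a sketch. The master and the paths below
// are illustrative assumptions, not part of the original gist.
object Project1LocalRunner {
  def main(args: Array[String]): Unit = {
    // new SparkConf() (loadDefaults = true) picks up spark.* system
    // properties, so this makes Project1 run with a local master.
    System.setProperty("spark.master", "local[*]")
    // The output directory (./output/out) must not already exist, or
    // saveAsTextFile will fail.
    Project1.main(Array("./data", "./output"))
  }
}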