tecmaverick · December 19, 2022 09:25
diff --git a/RDDSample002.scala b/RDDSample002.scala
 //Input Data
 // studentid,coursename_with_attendance
 // 01,CHEM:12|PHY:33|MATH:22
 // 02,CHEM:34|PHY:3
 // 03,MATH:12|COMP:45|CHEM:12
 // 04,MATH:67|PHY:76
 // 05,HIST:88|MARKT:33|BIOL:55
 // 06,BIOL:88|PHY:77
 // 07,BOTONY:34|ZOOL:77
 // 08,BOTONY:34|COMP:99
 // 09,HIST:36|COMP:66
 // 010,MATH:44|COMP:32|STAT:23
 // 011,COMP:44|STAT:66

 //Output Data
 // +----------+--------------------+----------------+
 // |courseName|attendanceHoursTotal|studentsEnrolled|
 // +----------+--------------------+----------------+
 // |      CHEM|                  58|               3|
 // |      MATH|                 145|               4|
 // |      COMP|                 286|               5|
 // |      HIST|                 124|               2|
 // |      BIOL|                 143|               2|
 // |       PHY|                 189|               4|
 // |     MARKT|                  33|               1|
 // |    BOTONY|                  68|               2|
 // |      ZOOL|                  77|               1|
 // |      STAT|                  89|               2|
 // +----------+--------------------+----------------+


 /** Aggregates per-course attendance from the student CSV and prints the result.
   *
   * Each input line after the header has the form `studentId,COURSE:hours|COURSE:hours|...`.
   * The line is exploded into one `(studentId, courseName, attendanceHours)` row per
   * course entry, then grouped by course to report the summed hours and the number
   * of rows (students) per course.
   *
   * @param args command-line arguments; not used
   */
 def main(args: Array[String]): Unit = {

   val filePath = "file:///Users/abe/Personal/Apache Spark/Scripts/AjpRDDReverseMapping/data/data1.csv"

   val spark = SparkSession.builder()
     .master("local[3]")
     .appName("Data Processor")
     .config("spark.sql.shuffle.partitions", 2)
     .getOrCreate()

   try {
     val lines = spark.sparkContext.textFile(filePath)
     // The first line is the column header; every line equal to it is dropped.
     val header = lines.first()

     // One (studentId, courseName, attendanceHours) tuple per course entry.
     val attendanceRows = lines
       .filter(_ != header)
       .flatMap { row =>
         val fields = row.split(",")
         val studentId = fields(0)
         // "|" is a regex metacharacter, so it must be escaped for split.
         fields(1).split("\\|").map { entry =>
           val courseAttendance = entry.split(":")
           (studentId, courseAttendance(0), courseAttendance(1).toInt)
         }
       }

     // show() returns Unit, so the result is deliberately not bound to a name.
     spark.createDataFrame(attendanceRows)
       .toDF("studentId", "courseName", "attendanceHours")
       .groupBy("courseName")
       .agg(
         sum("attendanceHours").alias("attendanceHoursTotal"),
         count("studentId").alias("studentsEnrolled"))
       .show()
   } finally {
     // Release the Spark session even when the job fails.
     spark.stop()
   }
 }
	//Input Data
	// studentid,coursename_with_attendance
	// 01,CHEM:12|PHY:33|MATH:22
	// 02,CHEM:34|PHY:3
	// 03,MATH:12|COMP:45|CHEM:12
	// 04,MATH:67|PHY:76
	// 05,HIST:88|MARKT:33|BIOL:55
	// 06,BIOL:88|PHY:77
	// 07,BOTONY:34|ZOOL:77
	// 08,BOTONY:34|COMP:99
	// 09,HIST:36|COMP:66
	// 010,MATH:44|COMP:32|STAT:23
	// 011,COMP:44|STAT:66

	//Output Data
	// +----------+--------------------+----------------+
	// |courseName|attendanceHoursTotal|studentsEnrolled|
	// +----------+--------------------+----------------+
	// |      CHEM|                  58|               3|
	// |      MATH|                 145|               4|
	// |      COMP|                 286|               5|
	// |      HIST|                 124|               2|
	// |      BIOL|                 143|               2|
	// |       PHY|                 189|               4|
	// |     MARKT|                  33|               1|
	// |    BOTONY|                  68|               2|
	// |      ZOOL|                  77|               1|
	// |      STAT|                  89|               2|
	// +----------+--------------------+----------------+


	/** Aggregates per-course attendance from the student CSV and prints the result.
	  *
	  * Each input line after the header has the form `studentId,COURSE:hours|COURSE:hours|...`.
	  * The line is exploded into one `(studentId, courseName, attendanceHours)` row per
	  * course entry, then grouped by course to report the summed hours and the number
	  * of rows (students) per course.
	  *
	  * @param args command-line arguments; not used
	  */
	def main(args: Array[String]): Unit = {

	  val filePath = "file:///Users/abe/Personal/Apache Spark/Scripts/AjpRDDReverseMapping/data/data1.csv"

	  val spark = SparkSession.builder()
	    .master("local[3]")
	    .appName("Data Processor")
	    .config("spark.sql.shuffle.partitions", 2)
	    .getOrCreate()

	  try {
	    val lines = spark.sparkContext.textFile(filePath)
	    // The first line is the column header; every line equal to it is dropped.
	    val header = lines.first()

	    // One (studentId, courseName, attendanceHours) tuple per course entry.
	    val attendanceRows = lines
	      .filter(_ != header)
	      .flatMap { row =>
	        val fields = row.split(",")
	        val studentId = fields(0)
	        // Regex for a literal pipe: the previous "\\\|" contained the invalid
	        // escape sequence \| and did not compile.
	        fields(1).split("\\|").map { entry =>
	          val courseAttendance = entry.split(":")
	          (studentId, courseAttendance(0), courseAttendance(1).toInt)
	        }
	      }

	    // show() returns Unit, so the result is deliberately not bound to a name.
	    spark.createDataFrame(attendanceRows)
	      .toDF("studentId", "courseName", "attendanceHours")
	      .groupBy("courseName")
	      .agg(
	        sum("attendanceHours").alias("attendanceHoursTotal"),
	        count("studentId").alias("studentsEnrolled"))
	      .show()
	  } finally {
	    // Release the Spark session even when the job fails.
	    spark.stop()
	  }
	}