InvisibleTech · February 5, 2016 01:34
diff --git a/joiner.scala b/joiner.scala
 // Paste this into the Spark shell with :paste to see it work.
 // It relies on `sc`, the SparkContext that the shell provides.

 // Load up the columns
 val alpha = sc.parallelize(List("a", "b", "c", "d"))
 val nums = sc.parallelize(List(1, 2, 3, 4))

 // Key each column by row index: zipWithIndex yields (value, index), so swap to (index, value)
 val alphaK = alpha.zipWithIndex.map(_.swap)
 val numsK = nums.zipWithIndex.map(_.swap)

 // Join them on the index - which gives you (k, (v1, v2))
 val joined = alphaK.join(numsK)

 // Join the join again - adding a third column of duplicate data - (k, ((v1, v2), v3))
 val dupes = joined.join(alphaK)

 // Flatten the nested tuples into one list per row.
 // join shuffles by key and does not promise any output order, so sort by the row index
 // first; otherwise the rows come back in whatever order the partitions happen to hold them.
 // The pattern match names each column instead of walking the tuple with productIterator;
 // the columns have different types, so each row is a List[Any].
 val flatter = dupes.sortByKey().map {
   case (idx, ((letter, number), letterAgain)) => (idx, List[Any](letter, number, letterAgain))
 }

 // Take out the key - now all you have are three column rows, in original row order
 flatter.map(_._2).collect
diff --git a/sampleoutput.sh b/sampleoutput.sh
 scala> dupes.collect
 res54: Array[(Long, ((String, Int), String))] = Array((0,((a,1),a)), (1,((b,2),b)), (2,((c,3),c)), (3,((d,4),d)))

 scala> flatter.map(_._2).collect
 res55: Array[List[Any]] = Array(List(a, 1, a), List(b, 2, b), List(c, 3, c), List(d, 4, d))

 scala>
	// Paste this into the Spark shell with :paste to see it work.
	// It relies on `sc`, the SparkContext that the shell provides.

	// Load up the columns
	val alpha = sc.parallelize(List("a", "b", "c", "d"))
	val nums = sc.parallelize(List(1, 2, 3, 4))

	// Key each column by row index: zipWithIndex yields (value, index), so swap to (index, value)
	val alphaK = alpha.zipWithIndex.map(_.swap)
	val numsK = nums.zipWithIndex.map(_.swap)

	// Join them on the index - which gives you (k, (v1, v2))
	val joined = alphaK.join(numsK)

	// Join the join again - adding a third column of duplicate data - (k, ((v1, v2), v3))
	val dupes = joined.join(alphaK)

	// Flatten the nested tuples into one list per row.
	// join shuffles by key and does not promise any output order, so sort by the row index
	// first; otherwise the rows come back in whatever order the partitions happen to hold them.
	// The pattern match names each column instead of walking the tuple with productIterator;
	// the columns have different types, so each row is a List[Any].
	val flatter = dupes.sortByKey().map {
	  case (idx, ((letter, number), letterAgain)) => (idx, List[Any](letter, number, letterAgain))
	}

	// Take out the key - now all you have are three column rows, in original row order
	flatter.map(_._2).collect
	scala> dupes.collect
	res54: Array[(Long, ((String, Int), String))] = Array((0,((a,1),a)), (1,((b,2),b)), (2,((c,3),c)), (3,((d,4),d)))

	scala> flatter.map(_._2).collect
	res55: Array[List[Any]] = Array(List(a, 1, a), List(b, 2, b), List(c, 3, c), List(d, 4, d))

	scala>