mvaz · August 29, 2015 14:08
diff --git a/csv_to_rdd.sc b/csv_to_rdd.sc
 A simplistic approach would be to have a way to preserve the header.

 Let's say you have a file.csv like:

 user, topic, hits
 om,  scala, 120
 daniel, spark, 80
 3754978, spark, 1

 We can define a header class that uses a parsed version of the first row:

 /** Resolves CSV fields by column name, using the parsed header row as the lookup table.
   *
   * @param header the column names, in the order they appear in the header row
   */
 class SimpleCSVHeader(header:Array[String]) extends Serializable {
  /** Each column name paired with its zero-based position in the header. */
  val index = Map(header.zipWithIndex: _*)

  /** Looks up the field of `array` that sits at the position of the column named `key`. */
  def apply(array:Array[String], key:String):String = {
   val position = index(key)
   array(position)
  }
 }

 We can then use that header to address the data further down the road:

 val csv = sc.textFile("file.csv")  // original file
 // split every line on commas and trim each field, so every row becomes an Array[String]
 val data = csv.map(line => line.split(",").map(elem => elem.trim))
 val header = new SimpleCSVHeader(data.take(1)(0)) // we build our header with the first line
 // filter the header out: it is the row whose "user" field holds the column name itself
 // (note that a data row whose user is literally "user" would be dropped as well)
 val rows = data.filter(line => header(line,"user") != "user")
 val users = rows.map(row => header(row,"user")) // only the "user" column
 val usersByHits = rows.map(row => header(row,"user") -> header(row,"hits").toInt) // (user, hits) pairs
 ...
diff --git a/Examples b/Examples
 https://altiscale.zendesk.com/hc/en-us/articles/202627136-Spark-Shell-Examples

 http://stackoverflow.com/questions/25362942/how-to-parsing-csv-or-json-file-with-apache-spark

 http://stackoverflow.com/questions/22500701/joining-two-hdfs-files-in-in-spark
diff --git a/gistfile1.txt b/gistfile1.txt
 http://www.researchgate.net/post/What_is_the_formula_to_calculate_the_critical_value_of_correlation
diff --git a/gistfile2.py b/gistfile2.py
 from collections import namedtuple

 # The `verbose` keyword was removed from namedtuple() in Python 3.7 and now
 # raises TypeError; False was its default, so omitting it changes nothing.
 Foo = namedtuple('Foo', ['item1', 'item2'])

 class ExtendedFoo(Foo):
    """Foo variant whose __hash__ returns the product of its two items' hashes."""

    def __hash__(self):
        return hash(self.item1) * hash(self.item2)

 foo = ExtendedFoo(1, 2)

 hash(foo)
	A simplistic approach would be to have a way to preserve the header.

	Let's say you have a file.csv like:

	user, topic, hits
	om, scala, 120
	daniel, spark, 80
	3754978, spark, 1

	We can define a header class that uses a parsed version of the first row:

	/** Resolves CSV fields by column name, using the parsed header row as the lookup table.
	  *
	  * @param header the column names, in the order they appear in the header row
	  */
	class SimpleCSVHeader(header:Array[String]) extends Serializable {
	  /** Each column name paired with its zero-based position in the header. */
	  val index = Map(header.zipWithIndex: _*)

	  /** Looks up the field of `array` that sits at the position of the column named `key`. */
	  def apply(array:Array[String], key:String):String = {
	    val position = index(key)
	    array(position)
	  }
	}

	We can then use that header to address the data further down the road:

	val csv = sc.textFile("file.csv") // original file
	// split every line on commas and trim each field, so every row becomes an Array[String]
	val data = csv.map(line => line.split(",").map(elem => elem.trim))
	val header = new SimpleCSVHeader(data.take(1)(0)) // we build our header with the first line
	// filter the header out: it is the row whose "user" field holds the column name itself
	// (note that a data row whose user is literally "user" would be dropped as well)
	val rows = data.filter(line => header(line,"user") != "user")
	val users = rows.map(row => header(row,"user")) // only the "user" column
	val usersByHits = rows.map(row => header(row,"user") -> header(row,"hits").toInt) // (user, hits) pairs
	...
	https://altiscale.zendesk.com/hc/en-us/articles/202627136-Spark-Shell-Examples

	http://stackoverflow.com/questions/25362942/how-to-parsing-csv-or-json-file-with-apache-spark

	http://stackoverflow.com/questions/22500701/joining-two-hdfs-files-in-in-spark
	from collections import namedtuple

	# The `verbose` keyword was removed from namedtuple() in Python 3.7 and now
	# raises TypeError; False was its default, so omitting it changes nothing.
	Foo = namedtuple('Foo', ['item1', 'item2'])

	class ExtendedFoo(Foo):
	    """Foo variant whose __hash__ returns the product of its two items' hashes."""

	    def __hash__(self):
	        return hash(self.item1) * hash(self.item2)

	foo = ExtendedFoo(1, 2)

	hash(foo)