Last active: December 16, 2015 15:05
spark_dataframe_persistance
// Spark version 1.3.0
import org.apache.spark.storage.StorageLevel

// Given a query
val initial_query = "SELECT * FROM snow_stg.sc_cat_item"

// For all four tries below, the DataFrame does not get persisted and no entries
// appear in the Storage tab of the Spark UI

// Try 1
val cmdb_rel_ci = sqlContext.sql(initial_query)
cmdb_rel_ci.cache()
cmdb_rel_ci.count()

// Try 2
val cmdb_rel_ci = sqlContext.sql(initial_query).cache()
cmdb_rel_ci.count()

// Try 3
val cmdb_rel_ci = sqlContext.sql(initial_query).persist(StorageLevel.MEMORY_ONLY)
cmdb_rel_ci.count()

// Try 4
val cmdb_rel_ci = sqlContext.sql(initial_query).persist(StorageLevel.MEMORY_ONLY_SER)
cmdb_rel_ci.count()

// cmdb_rel_ci is indeed a DataFrame
scala> cmdb_rel_ci
res2: org.apache.spark.sql.DataFrame = [Startpoint_ID: string, Startpoint_Name: string, Parent_ID: string, Parent_Name: string, Endpoint_ID: string, Endpoint_Name: string, Endpoint_Class: string, Level: int, Relation_IDs: string, Breadcrumb: string]
// When the DataFrame is converted to an RDD first, it persists without issues
val cmdb_rel_ci = sqlContext.sql(initial_query).rdd.persist(StorageLevel.MEMORY_ONLY_SER)
cmdb_rel_ci.count()
// THIS WORKS
// MEMORY_ONLY_SER is 120 MB
// MEMORY_ONLY is 13 GB
//
// I would prefer to keep it as a DataFrame, though, and according to the
// documentation I should be able to persist it just fine.
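One alternative worth trying is to cache through the SQL table cache rather than through `DataFrame.persist`: `registerTempTable`, `cacheTable`, and `isCached` are all available on `SQLContext` in Spark 1.3, and a table cached this way is stored as an in-memory columnar table that shows up in the Storage tab. This is only a sketch under the same setup as above; the temp-table name `cmdb_rel_ci_tmp` is a hypothetical name chosen for this example.

```scala
import org.apache.spark.sql.SQLContext

// Assumes the same sqlContext and initial_query as above
val df = sqlContext.sql(initial_query)

// Register the result under a (hypothetical) temp-table name,
// then cache it via the SQLContext table cache
df.registerTempTable("cmdb_rel_ci_tmp")
sqlContext.cacheTable("cmdb_rel_ci_tmp")

// Caching is lazy: an action is needed to materialize the cache
df.count()

// Check whether the table cache reports it as cached
println(sqlContext.isCached("cmdb_rel_ci_tmp"))
```

If this works, it keeps the data as a DataFrame while still populating the cache, and `sqlContext.uncacheTable("cmdb_rel_ci_tmp")` releases it again when no longer needed.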