Spark 1.4 PermGenSize Error (ssimeonov)

Three spark-shell transcripts loading the same gzipped JSON file (88,175 rows) on Spark 1.4.0. Run 1 (Java 1.7.0_79, default PermGen) dies with java.lang.OutOfMemoryError: PermGen space as soon as a HiveContext is created. Run 2 (same JVM) succeeds after raising the driver's PermGen ceiling with -XX:MaxPermSize=256m. Run 3 (Java 1.8.0_45) succeeds with no extra flags, because JDK 8 replaced the permanent generation with Metaspace.
/* Spark Shell Executed (run 1: JDK 7, default PermGen size) */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH

/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:42 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 0.13.1aa
SQL context available as sqlContext.

scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...

scala> df.registerTempTable("training")

scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]

scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext

scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@21165c0e

scala> import ctx.implicits._
import ctx.implicits._

scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
java.lang.OutOfMemoryError: PermGen space
Stopping spark context.
Exception in thread "main"
Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "main"
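
The failure happens the moment the HiveContext is constructed: on JDK 7, loading the Hive client classes fills the JVM's fixed-size permanent generation, and the spark-shell default is too small to hold them on top of the Scala compiler's own classes. Before restarting with a bigger cap, the driver's PermGen headroom can be checked from inside the shell. A minimal diagnostic sketch, assuming a HotSpot JVM whose permanent-generation pool name contains "perm" (hence the substring match); it uses only the standard java.lang.management API:

/* Diagnostic sketch: inspect the PermGen pool from the running shell */
import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._

// Find the permanent-generation pool (e.g. "PS Perm Gen" on HotSpot JDK 7)
// and print how close it is to its ceiling.
ManagementFactory.getMemoryPoolMXBeans.asScala
  .filter(_.getName.toLowerCase.contains("perm"))
  .foreach { pool =>
    val u = pool.getUsage
    println(s"${pool.getName}: used=${u.getUsed / 1024 / 1024} MB, max=${u.getMax / 1024 / 1024} MB")
  }

The next run raises that ceiling on the command line; the same option can instead be set once in conf/spark-defaults.conf as spark.driver.extraJavaOptions -XX:MaxPermSize=256m.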
/* Spark Shell Executed (run 2: JDK 7, driver PermGen raised to 256 MB) */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=256m"

/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.

scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...

scala> df.registerTempTable("training")

scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]

scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext

scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@1f5588c1

scala> import ctx.implicits._
import ctx.implicits._

scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...

scala> df.registerTempTable("training")

scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]

scala> println(dfCount.first.getLong(0))
88175
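
With the larger PermGen, the HiveContext comes up and the same query runs to completion. To confirm the option actually reached the driver JVM (spark-shell launches it via spark-submit, so it is worth verifying the flag was applied), its startup flags can be read back from within the shell. A small verification sketch, again using only java.lang.management:

/* Verification sketch: confirm -XX:MaxPermSize reached the driver JVM */
import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._

// Print the flags the driver JVM was launched with; -XX:MaxPermSize=256m
// should appear if spark.driver.extraJavaOptions took effect.
ManagementFactory.getRuntimeMXBean.getInputArguments.asScala
  .filter(_.contains("MaxPermSize"))
  .foreach(println)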
/* Spark Shell Executed (run 3: JDK 8, no PermGen flags needed) */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH

/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_45)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.

scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...

scala> df.registerTempTable("training")

scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]

scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext

scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@38c408c7

scala> import ctx.implicits._
import ctx.implicits._

scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...

scala> df.registerTempTable("training")

scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]

scala> println(dfCount.first.getLong(0))
88175
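
The third run needs no flag because JDK 8 removed the permanent generation entirely: class metadata now lives in native Metaspace, which grows on demand and is capped, if desired, with -XX:MaxMetaspaceSize rather than -XX:MaxPermSize. Listing the memory pools on the Java 1.8.0_45 shell makes the difference visible; a short sketch:

/* On JDK 8 there is no "Perm Gen" pool; "Metaspace" appears instead */
import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._

// Expect names like: Code Cache, Metaspace, Compressed Class Space, ...
ManagementFactory.getMemoryPoolMXBeans.asScala
  .map(_.getName)
  .foreach(println)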