@dennyglee · Last active October 21, 2015
Spark 1.4 PermGenSize Error (ssimeonov)
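Three spark-shell sessions against the same gzipped JSON file reproduce and then work around the failure. On Java 7 with default JVM settings, the file loads fine through sqlContext, but creating a HiveContext and reading the same file through it exhausts the permanent generation and kills the driver with java.lang.OutOfMemoryError: PermGen space. Raising the driver's PermGen ceiling to 256 MB avoids the failure, and so does running the identical steps on Java 8, where PermGen was removed in favor of Metaspace.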
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:42 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 0.13.1aa
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@21165c0e
scala> import ctx.implicits._
import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
java.lang.OutOfMemoryError: PermGen space
Stopping spark context.
Exception in thread "main"
Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "main"
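Workaround 1: relaunch with the driver's PermGen ceiling raised to 256 MB via spark.driver.extraJavaOptions, so the class metadata loaded for the HiveContext path fits.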
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=256m"
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@1f5588c1
scala> import ctx.implicits._
import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]
scala> println(dfCount.first.getLong(0))
88175
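The same setting can be made persistent in conf/spark-defaults.conf instead of being passed with --conf on every launch. A minimal sketch (the property name is Spark's; the value is the one used above):

/* conf/spark-defaults.conf */
spark.driver.extraJavaOptions  -XX:MaxPermSize=256m

Note that this option must reach the driver before its JVM starts (command line or defaults file); setting it on a SparkConf from an already-running shell is too late to affect the driver.

Workaround 2: run the identical steps on Java 8 (1.8.0_45 below), where the permanent generation no longer exists, with no extra JVM flags.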
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_45)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@38c408c7
scala> import ctx.implicits._
import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]
scala> println(dfCount.first.getLong(0))
88175
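On Java 8 the HiveContext class metadata lands in Metaspace, which grows on demand by default, so the session completes with no PermGen tuning at all. To verify which memory pools the driver JVM actually has and how they are capped, a quick check can be pasted into the shell; this is a generic sketch using only the standard java.lang.management API, not anything Spark-specific:

import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._

// List every JVM memory pool with its configured ceiling.
// MemoryUsage.getMax returns -1 when the pool is unbounded.
for (pool <- ManagementFactory.getMemoryPoolMXBeans.asScala) {
  val max = pool.getUsage.getMax
  val cap = if (max < 0) "unbounded" else (max >> 20) + " MB"
  println(pool.getName + ": max = " + cap)
}

On a Java 7 JVM this should show a permanent-generation pool (named e.g. "PS Perm Gen", varying by collector) capped at the MaxPermSize value; on Java 8 it shows "Metaspace", unbounded unless -XX:MaxMetaspaceSize is set.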