Created
June 3, 2015 08:19
-
-
Save rxin/577f7e15545a1edc6f88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
In [1]: df = sqlContext.read.json("examples/src/main/resources/people.json")
In [2]: df.withColumn('a b', df.age)
Out[2]: DataFrame[age: bigint, name: string, a b: bigint]
In [3]: df.withColumn('a b', df.age).write.parquet('test-parquet.out')
15/06/03 01:14:56 ERROR InsertIntoHadoopFsRelation: Aborting job.
java.lang.RuntimeException: Attribute name "a b" contains invalid character(s) among " ,;{}() =". Please use alias to rename it.
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.parquet.ParquetTypesConverter$$anonfun$checkSpecialCharacters$2.apply(ParquetTypes.scala:414)
at org.apache.spark.sql.parquet.ParquetTypesConverter$$anonfun$checkSpecialCharacters$2.apply(ParquetTypes.scala:412)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.sql.parquet.ParquetTypesConverter$.checkSpecialCharacters(ParquetTypes.scala:412)
at org.apache.spark.sql.parquet.ParquetTypesConverter$.convertToString(ParquetTypes.scala:423)
at org.apache.spark.sql.parquet.RowWriteSupport$.setSchema(ParquetTableSupport.scala:383)
at org.apache.spark.sql.parquet.ParquetRelation2.prepareJobForWrite(newParquet.scala:230)
at org.apache.spark.sql.sources.BaseWriterContainer.driverSideSetup(commands.scala:276)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.insert(commands.scala:121)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.run(commands.scala:104)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:68)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:148)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:87)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:920)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:920)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:338)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:144)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:135)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:281)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:744)
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-3-fd925f614f74> in <module>()
----> 1 df.withColumn('a b', df.age).write.parquet('test-parquet.out')
/scratch/rxin/spark/python/pyspark/sql/readwriter.pyc in parquet(self, path, mode)
    350         >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
    351         """
--> 352         self._jwrite.mode(mode).parquet(path)
    353
    354     @since(1.4)
/Users/rxin/anaconda/lib/python2.7/site-packages/py4j-0.8.1-py2.7.egg/py4j/java_gateway.pyc in __call__(self, *args)
    535         answer = self.gateway_client.send_command(command)
    536         return_value = get_return_value(answer, self.gateway_client,
--> 537                 self.target_id, self.name)
    538
    539         for temp_arg in temp_args:
/Users/rxin/anaconda/lib/python2.7/site-packages/py4j-0.8.1-py2.7.egg/py4j/protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(
Py4JJavaError: An error occurred while calling o35.parquet.
: java.lang.NullPointerException
at org.apache.spark.sql.sources.BaseWriterContainer.abortJob(commands.scala:362)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.insert(commands.scala:127)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.run(commands.scala:104)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:68)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:148)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:87)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:920)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:920)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:338)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:144)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:135)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:281)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:744)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment