@nsivabalan
Created February 21, 2021 17:16
pytest stack trace
docker run test_hudi py.test -s --verbose test_hudi.py
============================= test session starts ==============================
platform linux -- Python 3.7.9, pytest-6.1.1, py-1.10.0, pluggy-0.13.1 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /
collecting ... collected 1 item
test_hudi.py::test_hudi Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.7/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.hudi#hudi-spark-bundle_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-58d523e8-14cb-4100-8ee6-04eb7c7954d0;1.0
confs: [default]
found org.apache.hudi#hudi-spark-bundle_2.12;0.7.0 in central
found org.apache.spark#spark-avro_2.12;3.0.0 in central
found org.spark-project.spark#unused;1.0.0 in central
downloading https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar ...
[SUCCESSFUL ] org.apache.hudi#hudi-spark-bundle_2.12;0.7.0!hudi-spark-bundle_2.12.jar (1037ms)
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.12/3.0.0/spark-avro_2.12-3.0.0.jar ...
[SUCCESSFUL ] org.apache.spark#spark-avro_2.12;3.0.0!spark-avro_2.12.jar (23ms)
downloading https://repo1.maven.org/maven2/org/spark-project/spark/unused/1.0.0/unused-1.0.0.jar ...
[SUCCESSFUL ] org.spark-project.spark#unused;1.0.0!unused.jar (17ms)
:: resolution report :: resolve 2852ms :: artifacts dl 1082ms
:: modules in use:
org.apache.hudi#hudi-spark-bundle_2.12;0.7.0 from central in [default]
org.apache.spark#spark-avro_2.12;3.0.0 from central in [default]
org.spark-project.spark#unused;1.0.0 from central in [default]
---------------------------------------------------------------------
|                  |            modules            ||   artifacts   |
|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
|      default     |   3   |   3   |   3   |   0   ||   3   |   3   |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-58d523e8-14cb-4100-8ee6-04eb7c7954d0
confs: [default]
3 artifacts copied, 0 already retrieved (36431kB/61ms)
21/02/21 17:15:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/02/21 17:15:49 WARN DefaultSource: Loading Base File Only View.
21/02/21 17:15:49 WARN DataSource: All paths were ignored:
file:/tmp/pytest-of-root/pytest-0/test_hudi0/.hoodie/.aux/.bootstrap
FAILED
=================================== FAILURES ===================================
__________________________________ test_hudi ___________________________________
tmp_path = PosixPath('/tmp/pytest-of-root/pytest-0/test_hudi0')
    def test_hudi(tmp_path):
        SparkContext.getOrCreate(
            conf=SparkConf()
            .setAppName("testing")
            .setMaster("local[6]")
            .set(
                "spark.jars.packages",
                "org.apache.hudi:hudi-spark-bundle_2.12:0.7.0,org.apache.spark:spark-avro_2.12:3.0.0,org.apache.spark:spark-sql_2.12:3.0.0",
            )
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.sql.hive.convertMetastoreParquet", "false")
        )
        spark = SparkSession.builder.getOrCreate()
        hudi_options = {
            "hoodie.table.name": "test",
            "hoodie.datasource.write.recordkey.field": "id",
            "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.SimpleKeyGenerator",
            "hoodie.datasource.write.partitionpath.field": "year,month,day",
            "hoodie.datasource.write.table.name": "test",
            "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
            "hoodie.datasource.write.operation": "upsert",
            "hoodie.datasource.write.precombine.field": "ts",
        }
        df = spark.createDataFrame(
            [
                Row(id=1, year=2020, month=7, day=5, ts=1),
            ]
        )
        #import findspark
        #findspark.init()
        df.write.format("hudi").options(**hudi_options).mode("append").save(str(tmp_path))
        #read_df = spark.read.format("parquet").load(str(tmp_path) + "/*/*/*")
        # This works
        #print(read_df.collect())
>       read_df = spark.read.format("hudi").load(str(tmp_path) + "/*/*/*")
test_hudi.py:47:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
usr/local/lib/python3.7/site-packages/pyspark/sql/readwriter.py:178: in load
    return self._df(self._jreader.load(path))
usr/local/lib/python3.7/site-packages/py4j/java_gateway.py:1305: in __call__
    answer, self.gateway_client, self.target_id, self.name)
usr/local/lib/python3.7/site-packages/pyspark/sql/utils.py:137: in deco
    raise_from(converted)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
e = AnalysisException('Unable to infer schema for Parquet. It must be specified manually.;', 'org.apache.spark.sql.Analysi...:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n', None)
>   ???
E   pyspark.sql.utils.AnalysisException: Unable to infer schema for Parquet. It must be specified manually.;
<string>:3: AnalysisException
=============================== warnings summary ===============================
test_hudi.py::test_hudi
  /usr/local/lib/python3.7/site-packages/pyspark/sql/context.py:77: DeprecationWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.
    DeprecationWarning)
-- Docs: https://docs.pytest.org/en/stable/warnings.html
=========================== short test summary info ============================
FAILED test_hudi.py::test_hudi - pyspark.sql.utils.AnalysisException: Unable ...
======================== 1 failed, 1 warning in 49.08s =========================
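
A possible fix, sketched below but not verified against this setup: the Hudi 0.7 quick start reads partitioned copy-on-write tables with one glob level per partition segment plus one more for the data files. The table here is partitioned on year/month/day (three levels), so a snapshot read would need a four-level glob rather than the three-level one in the failing test; the three-level glob appears to also match hidden directories under .hoodie, which Spark then ignores (see the "All paths were ignored" warning above). Minimal standalone sketch, with the base path taken from the log:

# Hedged sketch, assuming the glob depth is the problem; untested here.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("testing")
    .master("local[6]")
    .config(
        "spark.jars.packages",
        "org.apache.hudi:hudi-spark-bundle_2.12:0.7.0,"
        "org.apache.spark:spark-avro_2.12:3.0.0",
    )
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.hive.convertMetastoreParquet", "false")
    .getOrCreate()
)

base_path = "/tmp/pytest-of-root/pytest-0/test_hudi0"

# year/month/day partitions -> three partition levels + one file level.
read_df = spark.read.format("hudi").load(base_path + "/*/*/*/*")
print(read_df.collect())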