Step 1: Build the project from the Hudi repo root.
mvn package -DskipTests
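If you only need the test-suite bundle rebuilt, a module-scoped build is usually faster (a sketch using standard Maven reactor flags; the module path is taken from the docker cp command in Step 3):
mvn package -DskipTests -pl packaging/hudi-integ-test-bundle -am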
Step 2: Set up the docker demo cluster.
cd docker
./setup_demo.sh
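Before moving on, you can confirm the demo containers came up; the adhoc-2 container used in the following steps should be in the list:
docker ps --format '{{.Names}}'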
Step 3: Copy the jar and required files into the adhoc-2 container.
cd ../
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/
docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/
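A quick sanity check that all three files landed in the container:
docker exec adhoc-2 ls -l /opt/
# expect hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar, test.properties, and complex-dag-cow.yaml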
Step 4: Execute the job from inside the adhoc-2 container.
docker exec -it adhoc-2 /bin/bash
spark-submit \
--packages org.apache.spark:spark-avro_2.11:2.4.0 \
--conf spark.task.cpus=1 \
--conf spark.executor.cores=1 \
--conf spark.task.maxFailures=100 \
--conf spark.memory.fraction=0.4 \
--conf spark.rdd.compress=true \
--conf spark.kryoserializer.buffer.max=2000m \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.memory.storageFraction=0.1 \
--conf spark.shuffle.service.enabled=true \
--conf spark.sql.hive.convertMetastoreParquet=false \
--conf spark.driver.maxResultSize=12g \
--conf spark.executor.heartbeatInterval=120s \
--conf spark.network.timeout=600s \
--conf spark.yarn.max.executor.failures=10 \
--conf spark.sql.catalogImplementation=hive \
--conf spark.driver.extraClassPath=/var/demo/jars/* \
--conf spark.executor.extraClassPath=/var/demo/jars/* \
--class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \
/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \
--source-ordering-field test_suite_source_ordering_field \
--use-deltastreamer \
--target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \
--input-base-path /user/hive/warehouse/hudi-integ-test-suite/input \
--target-table table1 \
--props test.properties \
--schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.AvroDFSSource \
--input-file-size 125829120 \
--workload-yaml-path file:/opt/complex-dag-cow.yaml \
--workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type COPY_ON_WRITE \
--compact-scheduling-minshare 1 \
--clean-input \
--clean-output
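When the run finishes, the written Hudi table should be visible at the target base path (a sketch; run from inside adhoc-2, which is assumed to have the HDFS client on its PATH in the demo setup):
hdfs dfs -ls /user/hive/warehouse/hudi-integ-test-suite/output
# a .hoodie/ directory plus partition paths indicate the test suite wrote the table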
Once the job completes, you can rerun it with any of the other yamls, including MOR table types; an example of the swap follows below.
The yamls can be found under hudi_root/docker/demo/config/test-suite/.
The long-running yaml is cow-long-running-multi-partitions.yaml.
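For example, a MOR run needs only two arguments changed in the spark-submit command above (assuming a MOR dag such as complex-dag-mor.yaml exists in the same config directory and is copied to /opt/ the same way):
docker cp docker/demo/config/test-suite/complex-dag-mor.yaml adhoc-2:/opt/
# then in the spark-submit command, swap in:
--workload-yaml-path file:/opt/complex-dag-mor.yaml \
--table-type MERGE_ON_READ \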