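# Launch the Spark SQL CLI with the Hudi 0.10.0 Spark bundle and spark-avro
# (both built for Scala 2.11, so this assumes a Spark 2.4.x installation),
# Kryo serialization, and the Hudi SQL session extension.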
./bin/spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
--conf 'spark.kryoserializer.buffer.max=1024m' \
--conf 'spark.rdd.compress=true' \
--driver-memory 6g
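
-- Create a Hudi merge-on-read (MOR) table on the local filesystem, keyed on
-- VendorID; tpep_pickup_datetime is the precombine field used to pick the
-- winning record when keys collide.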
create table hudi_mor1 (
VendorID int,
tpep_pickup_datetime string,
tpep_dropoff_datetime string,
passenger_count int,
trip_distance double,
RatecodeID int,
store_and_fwd_flag string,
PULocationID int,
DOLocationID int,
payment_type int,
fare_amount double,
extra double,
mta_tax double,
tip_amount double,
tolls_amount double,
improvement_surcharge double,
total_amount double,
congestion_surcharge double,
date_col string
) using hudi
tblproperties (
type = 'mor',
primaryKey = 'VendorID',
preCombineField = 'tpep_pickup_datetime'
)
location 'file:///tmp/hudi/hudi_mor1';
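
-- Route the INSERT below through Hudi's bulk-insert path. Non-strict insert
-- mode skips primary-key uniqueness enforcement, sort mode NONE skips sorting
-- the input, and disabling combine-before-insert skips precombine-based
-- de-duplication, trading safety checks for faster ingestion.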
set hoodie.sql.bulk.insert.enable=true;
set hoodie.sql.insert.mode=non-strict;
set hoodie.bulkinsert.sort.mode=NONE;
set hoodie.combine.before.insert=false;
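
-- Expose the source parquet files (the column list above matches the NYC
-- yellow-taxi trip schema) as an external table, then bulk-load it. SELECT *
-- assumes the parquet column order matches the Hudi table definition.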
create table ny_parquet using parquet location 'file:///dataset_path/*.parquet';
insert into hudi_mor1 select * from ny_parquet;
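
-- Baseline: record count for vendor 4 before any update.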
select count(1) from hudi_mor1 where VendorID = 4;
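
-- Point update: rewrite two columns for vendor 4 only.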
update hudi_mor1 set DOLocationID = 200, store_and_fwd_flag = 'Y' where VendorID = 4;
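
-- Verify: the count should be unchanged and sampled rows should show the
-- new values (DOLocationID = 200, store_and_fwd_flag = 'Y').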
select count(1) from hudi_mor1 where VendorID = 4;
select DOLocationID, store_and_fwd_flag from hudi_mor1 where VendorID = 4 limit 10;
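
-- Full-table update: no WHERE clause, so every record in the table is rewritten.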
update hudi_mor1 set DOLocationID = 200, store_and_fwd_flag = 'Y';
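
-- Optional sanity check (not in the original gist): sample rows to confirm
-- the unconditional update is visible beyond vendor 4.
select VendorID, DOLocationID, store_and_fwd_flag from hudi_mor1 limit 10;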