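# Launch the Spark SQL CLI with the Hudi 0.10.0 Spark bundle and spark-avro
# (both built for Scala 2.11, so this assumes a Spark 2.4.x installation),
# Kryo serialization, and the Hudi SQL session extension.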
./bin/spark-sql --packages org.apache.hudi:hudi-spark-bundle_2.11:0.10.0,org.apache.spark:spark-avro_2.11:2.4.4 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' \
--conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension' \
--conf 'spark.kryoserializer.buffer.max=1024m' \
--conf 'spark.rdd.compress=true' \
--driver-memory 6g
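
-- Create a Hudi merge-on-read (MOR) table on the local filesystem, keyed on
-- VendorID; tpep_pickup_datetime is the precombine field used to pick the
-- winning record when keys collide.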
create table hudi_mor1 (
VendorID int,
tpep_pickup_datetime string,
tpep_dropoff_datetime string,
passenger_count int,
trip_distance double,
RatecodeID int,
store_and_fwd_flag string,
PULocationID int,
DOLocationID int,
payment_type int,
fare_amount double,
extra double,
mta_tax double,
tip_amount double,
tolls_amount double,
improvement_surcharge double,
total_amount double,
congestion_surcharge double,
date_col string
) using hudi
tblproperties (
type = 'mor',
primaryKey = 'VendorID',
preCombineField = 'tpep_pickup_datetime'
)
location 'file:///tmp/hudi/hudi_mor1';
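
-- Route the INSERT below through Hudi's bulk-insert path. Non-strict insert
-- mode skips primary-key uniqueness enforcement, sort mode NONE skips sorting
-- the input, and disabling combine-before-insert skips precombine-based
-- de-duplication, trading safety checks for faster ingestion.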
set hoodie.sql.bulk.insert.enable=true;
set hoodie.sql.insert.mode=non-strict;
set hoodie.bulkinsert.sort.mode=NONE;
set hoodie.combine.before.insert=false;
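
-- Expose the source parquet files (the column list above matches the NYC
-- yellow-taxi trip schema) as an external table, then bulk-load it. SELECT *
-- assumes the parquet column order matches the Hudi table definition.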
create table ny_parquet using parquet location 'file:///dataset_path/*.parquet';
insert into hudi_mor1 select * from ny_parquet;
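
-- Baseline: record count for vendor 4 before any update.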
select count(1) from hudi_mor1 where VendorID = 4;
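
-- Point update: rewrite two columns for vendor 4 only.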
update hudi_mor1 set DOLocationID = 200, store_and_fwd_flag = 'Y' where VendorID = 4;
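
-- Verify: the count should be unchanged and sampled rows should show the
-- new values (DOLocationID = 200, store_and_fwd_flag = 'Y').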
select count(1) from hudi_mor1 where VendorID = 4;
select DOLocationID, store_and_fwd_flag from hudi_mor1 where VendorID = 4 limit 10;
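
-- Full-table update: no WHERE clause, so every record in the table is rewritten.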
update hudi_mor1 set DOLocationID = 200, store_and_fwd_flag = 'Y';
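
-- Optional sanity check (not in the original gist): sample rows to confirm
-- the unconditional update is visible beyond vendor 4.
select VendorID, DOLocationID, store_and_fwd_flag from hudi_mor1 limit 10;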