Skip to content

Instantly share code, notes, and snippets.

@igorbasko01
Last active September 20, 2021 16:30
Show Gist options
  • Select an option

  • Save igorbasko01/05d81fef8f39e305527fd24b946fdb9a to your computer and use it in GitHub Desktop.

Select an option

Save igorbasko01/05d81fef8f39e305527fd24b946fdb9a to your computer and use it in GitHub Desktop.
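// Hudi + AWS Glue sync: write a Hudi table from spark-shell on EMR with Hive sync enabled (Glue Data Catalog as the metastore).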
// EMR Applications: Spark, Hive
// spark-shell --packages org.apache.hudi:hudi-spark-bundle_2.11:0.5.1-incubating,org.apache.spark:spark-avro_2.11:2.4.4 --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf "spark.sql.hive.convertMetastoreParquet=false"
// EMR configuration (JSON array of classifications):
// [
//   {
//     "classification": "hive-site",
//     "properties": {
//       "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
//       "hive.metastore.schema.verification": "false"
//     }
//   },
//   {
//     "classification": "spark-hive-site",
//     "properties": {
//       "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
//     }
//   }
// ]
// spark 2.4.4
import org.apache.hudi.QuickstartUtils._
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY, HIVE_SYNC_ENABLED_OPT_KEY, HIVE_TABLE_OPT_KEY, HIVE_PARTITION_FIELDS_OPT_KEY}
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
import org.apache.spark.sql.{SaveMode, SparkSession}
import spark.implicits._
// Builds a one-row DataFrame from a sample driver event, prints it, and writes
// it out in Hudi format with Hive sync switched on.

// Sample event, expressed as a single-quoted JSON document.
val event1 = "{'uuid': '1', 'utc': 1000, 'event_date': '2020/02/05', 'driver_id': 'aaa', 'lat': 33.3, 'lng': 33.3}"

// Parse the JSON string into a DataFrame and display it.
val df1 = spark.read.json(Seq(event1).toDS)
df1.show()

// Table-specific Hudi settings; applied after the quickstart configs below.
val hudiOptions = Map(
  PRECOMBINE_FIELD_OPT_KEY -> "utc",
  RECORDKEY_FIELD_OPT_KEY -> "uuid",
  PARTITIONPATH_FIELD_OPT_KEY -> "event_date",
  TABLE_NAME -> "drivers",
  HIVE_SYNC_ENABLED_OPT_KEY -> "true",
  HIVE_TABLE_OPT_KEY -> "drivers",
  HIVE_PARTITION_FIELDS_OPT_KEY -> "event_date"
)

// Destination of the Hudi table (bucket name is a placeholder to fill in).
val targetPath = "s3://<some_bucket>/hudi"

// Trailing-dot chaining keeps the expression open across lines, so it can be
// entered line by line in the REPL as well as in :paste mode.
df1.write.
  format("org.apache.hudi").
  options(getQuickstartWriteConfigs).
  options(hudiOptions).
  mode(SaveMode.Overwrite).
  save(targetPath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment