root@adhoc-2:/opt# /var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
> --jdbc-url jdbc:hive2://hiveserver:10000 \
> --user hive \
> --pass hive \
> --partitioned-by ppath \
> --base-path /tmp/hudi_timestamp_tbl2 \
> --database testdb \
> --table timestamp_tbl3 \
> --partition-value-extractor org.apache.hudi.hive.MultiPartKeysValueExtractor \
> --spark-datasource
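For reference, the same Hive sync can also be triggered inline from a Spark datasource write by passing the hive_sync options. The sketch below is illustrative only: the toy DataFrame and the record key / precombine field names ("uuid", "ts") are assumptions, while the database, table, partition field, JDBC URL and base path follow the command above.

// Sketch: write a Hudi table and sync it to Hive inline, mirroring the run_sync_tool.sh call above.
// The sample DataFrame and the record key / precombine field names are assumptions.
import spark.implicits._
val df = Seq(("id1", 1L, "p1"), ("id2", 2L, "p2")).toDF("uuid", "ts", "ppath")
df.write.format("hudi").
option("hoodie.table.name", "timestamp_tbl3").
option("hoodie.datasource.write.recordkey.field", "uuid").
option("hoodie.datasource.write.precombine.field", "ts").
option("hoodie.datasource.write.partitionpath.field", "ppath").
option("hoodie.datasource.hive_sync.enable", "true").
option("hoodie.datasource.hive_sync.mode", "jdbc").
option("hoodie.datasource.hive_sync.jdbcurl", "jdbc:hive2://hiveserver:10000").
option("hoodie.datasource.hive_sync.username", "hive").
option("hoodie.datasource.hive_sync.password", "hive").
option("hoodie.datasource.hive_sync.database", "testdb").
option("hoodie.datasource.hive_sync.table", "timestamp_tbl3").
option("hoodie.datasource.hive_sync.partition_fields", "ppath").
option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor").
mode("append").
save("/tmp/hudi_timestamp_tbl2")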
./bin/spark-shell --packages org.apache.spark:spark-avro_2.11:2.4.4,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 \
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --driver-memory 8g --executor-memory 9g --jars ~/Documents/personal/projects/nov26/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.10.0-SNAPSHOT.jar --conf spark.driver.extraJavaOptions="-Dlog4j.configuration=file:/Users/nsb/Documents/personal/tools/log4j/debug_hudi_log4j.properties" --conf spark.executor.extraJavaOptions="-Dlog4j.configuration=file:/Users/nsb/Documents/personal/tools/log4j/debug_hudi_log4j.properties"
// Define kafka flow (broker address, topic and offsets below are placeholder values)
val dataStreamReader = spark.
readStream.
format("kafka").
option("kafka.bootstrap.servers", "kafkabroker:9092").
option("subscribe", "impressions").
option("startingOffsets", "earliest")
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
val tableName = "hudi_trips_cow"
val basePath = "file:///tmp/hudi_trips_cow"
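Following the standard Hudi quickstart pattern, a batch of generated trip records can then be written to that path. This is a sketch using the 0.10-era quickstart constants imported above; the insert count is arbitrary.

// Generate sample trip records with QuickstartUtils and write them as a COW table.
val dataGen = new DataGenerator
val inserts = convertToStringList(dataGen.generateInserts(10))
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)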
- @ParameterizedTest
- @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE", "MERGE_ON_READ"})
- public void testHoodieClientBasicMultiWriter(HoodieTableType tableType) throws Exception {
+ //@ParameterizedTest
+ //@EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE", "MERGE_ON_READ"})
+ @RepeatedTest(20)
+ public void testHoodieClientBasicMultiWriter() throws Exception {
+ HoodieTableType tableType = HoodieTableType.MERGE_ON_READ;
if (tableType == HoodieTableType.MERGE_ON_READ) {
Run docker setup.
root@adhoc-1:/opt# cp hadoop-2.8.4/etc/hadoop/hive-site.xml spark/conf/
root@adhoc-1:/opt# cp hadoop-2.8.4/etc/hadoop/core-site.xml spark/conf/
root@adhoc-1:/opt# cp hadoop-2.8.4/etc/hadoop/hdfs-site.xml spark/conf/
$SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --conf spark.sql.catalogImplementation=hive --deploy-mode client --driver-memory 1G --executor-memory 3G --num-executors 1 --packages org.apache.spark:spark-avro_2.11:2.4.4
// spark-shell
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.sql.types._
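As an illustration of why org.apache.spark.sql.types._ is imported, a small DataFrame with an explicit schema can be built and written as a Hudi table. This is a sketch only; the field names, table name and base path are assumptions, not taken from the original gist.

// Sketch: build a DataFrame with an explicit schema and write it to a Hudi table.
// Field names, table name and base path are assumed values.
import org.apache.spark.sql.Row
val schema = StructType(Seq(
  StructField("uuid", StringType, nullable = false),
  StructField("ts", LongType, nullable = false),
  StructField("ppath", StringType, nullable = false)))
val rows = Seq(Row("id1", 1L, "p1"), Row("id2", 2L, "p2"))
val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
df.write.format("hudi").
option(PRECOMBINE_FIELD_OPT_KEY, "ts").
option(RECORDKEY_FIELD_OPT_KEY, "uuid").
option(PARTITIONPATH_FIELD_OPT_KEY, "ppath").
option(TABLE_NAME, "schema_demo_tbl").
mode(Overwrite).
save("/tmp/hudi_schema_demo_tbl")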
Step 1: mvn package -DskipTests
Step 2: set up docker.
cd docker;
./setup_demo.sh
Step 3: copy jars and required files to docker.
cd ../
docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0