Created
December 20, 2021 16:49
-
-
Save naren-dremio/91f324484e4d0a7243e996f4d432fdf6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install Hive 3.1.2 (binary tarball) under the Homebrew Cellar prefix.
export HIVE_HOME=/opt/homebrew/Cellar/hive/apache-hive-3.1.2-bin
wget https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
mkdir -p /opt/homebrew/Cellar/hive && tar xvf apache-hive-3.1.2-bin.tar.gz -C /opt/homebrew/Cellar/hive
#hive-site.xml ($HIVE_HOME/conf/hive-site.xml)
<?xml version="1.0"?>
<configuration>
  <property>
    <name>hive.querylog.location</name>
    <!-- NOTE(review): this property expects a directory; a path ending in
         hive.log looks like a file — confirm the intended log location. -->
    <value>/Users/apache-hive-3.1.2-bin/log/hive.log</value>
  </property>
  <property>
    <name>hive.querylog.enable.plan.progress</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.log.explain.output</name>
    <value>false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <!-- Database name must match the one created during MySQL prep
         ("create database metastore;"), otherwise schematool initializes
         a second, empty schema in a different database. -->
    <value>jdbc:mysql://localhost:3306/metastore?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>root</value>
  </property>
  <property>
    <!-- Skip strict metastore schema version checks (dev convenience). -->
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>
  <property>
    <!-- Warehouse root on the single-node HDFS started below. -->
    <name>hive.metastore.warehouse.dir</name>
    <value>hdfs://localhost:9000/user/hive/warehouse</value>
  </property>
</configuration>
#Install Hadoop 3.3.1 (binary tarball) under the Homebrew Cellar prefix.
export HADOOP_HOME=/opt/homebrew/Cellar/hadoop/3.3.1/libexec
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
mkdir -p /opt/homebrew/Cellar/hadoop && tar xvf hadoop-3.3.1.tar.gz -C /opt/homebrew/Cellar/hadoop
#core-site.xml ($HADOOP_HOME/etc/hadoop/core-site.xml):
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- Default filesystem: single-node HDFS NameNode on port 9000. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
#hdfs-site.xml ($HADOOP_HOME/etc/hadoop/hdfs-site.xml):
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- Single-node cluster: one replica per block. -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
#yarn-site.xml ($HADOOP_HOME/etc/hadoop/yarn-site.xml):
<?xml version="1.0"?>
<configuration>
  <!-- Enable the shuffle service required by MapReduce on YARN. -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- Environment variables NodeManager passes through to containers. -->
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>
#mapred-site.xml ($HADOOP_HOME/etc/hadoop/mapred-site.xml):
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- Run MapReduce jobs on YARN rather than the local runner. -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <!-- Classpath for MR applications; $HADOOP_MAPRED_HOME is expanded by
       YARN at container launch (see env-whitelist in yarn-site.xml). -->
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>
</configuration>
#Add Iceberg 0.12.1 runtime jars to the Hive and Spark classpaths.
# NOTE(review): $SPARK_HOME is never exported in this doc — set it to your
# Spark installation before running the second pair of commands.
cd $HIVE_HOME/lib
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive-runtime/0.12.1/iceberg-hive-runtime-0.12.1.jar
cd $SPARK_HOME/jars
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime/0.12.1/iceberg-spark-runtime-0.12.1.jar
# Prep for Hadoop: passwordless SSH to localhost, then format HDFS and
# start all daemons (NameNode, DataNode, ResourceManager, NodeManager).
ssh-keygen
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# enable remote login on macOS (System Settings > Sharing > Remote Login)
$HADOOP_HOME/bin/hdfs namenode -format
$HADOOP_HOME/sbin/start-all.sh
# Prepare the Hive metastore backing database (MySQL) and start the metastore.
brew install mysql
mysql -uroot
mysql> create database metastore;
mysql> ALTER USER 'root'@'localhost' IDENTIFIED BY 'root';
# Initialize the metastore schema with schematool (reads the JDBC settings
# from hive-site.xml). Do NOT also `source` hive-schema-3.1.0.mysql.sql by
# hand — initializing twice fails on already-existing tables.
$HIVE_HOME/bin/schematool -initSchema -dbType mysql
# Run the metastore service in the background (thrift://localhost:9083).
$HIVE_HOME/bin/hive --service metastore &
#notebook
import findspark

findspark.init()
from pyspark.sql.session import SparkSession

# Local Spark session wired to the Hive metastore (thrift://localhost:9083)
# with Iceberg's SQL extensions. Two catalogs are configured:
#   spark_catalog — SparkSessionCatalog backed by Hive (default tables)
#   local         — a dedicated Iceberg SparkCatalog, also Hive-backed
spark = (
    SparkSession.builder.master("local").appName("Iceberg Demo")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hive")
    .config("spark.sql.catalog.local.uri", "thrift://localhost:9083")
    .config("hive.metastore.uris", "thrift://localhost:9083")
    .getOrCreate()
)

# IF EXISTS makes the drop a no-op on the first run instead of raising
# an AnalysisException. SQLContext is deprecated; spark.sql is the
# modern entry point for SQL statements.
spark.sql("DROP TABLE IF EXISTS default.iceberg_sample")
spark.sql("CREATE TABLE default.iceberg_sample (id bigint, data string) USING iceberg")
spark.sql("INSERT INTO default.iceberg_sample VALUES (1, 'a'), (2, 'b'), (3, 'c')")
df = spark.sql("SELECT * FROM default.iceberg_sample")
df.show()

# Bulk-load rows 4..9999. Each INSERT commits a separate Iceberg snapshot,
# so one-row-per-INSERT means ~10k commits and ~10k tiny data files; batching
# multi-row VALUES produces the same table content with far fewer commits.
BATCH = 500
for start in range(4, 10000, BATCH):
    rows = ", ".join(
        "({i}, '{i}a')".format(i=i) for i in range(start, min(start + BATCH, 10000))
    )
    spark.sql("INSERT INTO default.iceberg_sample VALUES " + rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment