@naren-dremio
Created December 20, 2021 16:49
#Install Hive:
export HIVE_HOME=/opt/homebrew/Cellar/hive/apache-hive-3.1.2-bin
wget https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
mkdir -p /opt/homebrew/Cellar/hive && tar xvf apache-hive-3.1.2-bin.tar.gz -C /opt/homebrew/Cellar/hive
#hive-site.xml (in $HIVE_HOME/conf):
<?xml version="1.0"?>
<configuration>
  <property>
    <name>hive.querylog.location</name>
    <value>/Users/apache-hive-3.1.2-bin/log/hive.log</value>
  </property>
  <property>
    <name>hive.querylog.enable.plan.progress</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.log.explain.output</name>
    <value>false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/metastore?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>root</value>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>hdfs://localhost:9000/user/hive/warehouse</value>
  </property>
</configuration>
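# hive-site.xml points the metastore at MySQL through com.mysql.cj.jdbc.Driver,
# which Hive does not bundle. A sketch of fetching it, assuming Connector/J
# 8.0.27 from Maven Central suits your MySQL version:
cd $HIVE_HOME/lib
wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.27/mysql-connector-java-8.0.27.jar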
#Install Hadoop:
export HADOOP_HOME=/opt/homebrew/Cellar/hadoop/3.3.1/libexec
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz
mkdir -p /opt/homebrew/Cellar/hadoop && tar xvf hadoop-3.3.1.tar.gz -C /opt/homebrew/Cellar/hadoop
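# The config files below go under $HADOOP_HOME/etc/hadoop. Hadoop also needs
# JAVA_HOME set in hadoop-env.sh there; a sketch, assuming a Homebrew
# OpenJDK 11 (adjust the path to your JDK):
echo 'export JAVA_HOME=/opt/homebrew/opt/openjdk@11' >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh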
#core-site.xml:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
#hdfs-site.xml:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
#yarn-site.xml:
<?xml version="1.0"?>
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
#mapred-site.xml:
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
#Add Iceberg jars to the Hive and Spark classpaths:
cd $HIVE_HOME/lib
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive-runtime/0.12.1/iceberg-hive-runtime-0.12.1.jar
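# SPARK_HOME is assumed to point at an existing local Spark 3.x install; a
# sketch, assuming a Homebrew apache-spark (adjust the version and path to your setup):
export SPARK_HOME=/opt/homebrew/Cellar/apache-spark/3.2.0/libexec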
cd $SPARK_HOME/jars
# the spark3 runtime jar matches the Spark 3 catalog settings used in the notebook below
wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark3-runtime/0.12.1/iceberg-spark3-runtime-0.12.1.jar
# Prep for Hadoop:
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# enable Remote Login on macOS (System Preferences > Sharing > Remote Login) so Hadoop can ssh to localhost
$HADOOP_HOME/bin/hdfs namenode -format
$HADOOP_HOME/sbin/start-all.sh
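# Sanity check: jps (bundled with the JDK) should list NameNode, DataNode,
# ResourceManager, and NodeManager. Optionally pre-create the warehouse path
# configured in hive-site.xml:
jps
$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hive/warehouse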
# Prepare Hive
brew install mysql
brew services start mysql
mysql -uroot
mysql> create database metastore;
mysql> ALTER USER 'root'@'localhost' IDENTIFIED BY 'root';
mysql> exit
# Initialize the metastore schema. Run either schematool below OR source the
# bundled script from the mysql client (use metastore; then source
# /opt/homebrew/Cellar/hive/apache-hive-3.1.2-bin/scripts/metastore/upgrade/mysql/hive-schema-3.1.0.mysql.sql);
# running both fails because the tables already exist.
$HIVE_HOME/bin/schematool -initSchema -dbType mysql
$HIVE_HOME/bin/hive --service metastore &
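# Optional check that the metastore is listening on its default thrift port:
nc -z localhost 9083 && echo "metastore is up"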
#Notebook (PySpark):
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("Iceberg Demo") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hive") \
    .config("spark.sql.catalog.local.uri", "thrift://localhost:9083") \
    .config("hive.metastore.uris", "thrift://localhost:9083") \
    .getOrCreate()

# create a fresh Iceberg table in the Hive catalog and seed it with a few rows
spark.sql("DROP TABLE IF EXISTS default.iceberg_sample")
spark.sql("CREATE TABLE default.iceberg_sample (id bigint, data string) USING iceberg")
spark.sql("INSERT INTO default.iceberg_sample VALUES (1, 'a'), (2, 'b'), (3, 'c')")
df = spark.sql("SELECT * FROM default.iceberg_sample")
df.show()

# each single-row INSERT commits a new Iceberg snapshot
for i in range(4, 10000):
    spark.sql("INSERT INTO default.iceberg_sample VALUES ({}, '{}a')".format(i, i))