jamesrajendran · May 7, 2017 04:44
diff --git a/Hive Commands b/Hive Commands
 hive> set mapreduce.framework.name=local
 display hive database name: set hive.cli.print.current.db=true;

 DESCRIBE EXTENDED husn_small; --to get statistics
 Analyze table husn_small compute statistics;

 create table snpn(sn String, pn String)
 LOAD DATA  INPATH 'hdfs://127200813master.eap.g4ihos.itcs.hpecorp.net:8020/user/centos7/test_data/snpn' append INTO TABLE snpn


 ----------------------ORC in hive performance tuning--------------------------------------
 TEZ

 CREATE TABLE A_ORC (customerID int, name string, age int, address string ) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
 INSERT INTO TABLE A_ORC SELECT * FROM A;

 set hive.vectorized.execution.enabled = true;
 set hive.vectorized.execution.reduce.enabled = true;
 data should be stored in ORC format

 --enbale CBO---------------
 set hive.cbo.enbale=true;
 set hive.compute.query.using.stats=true;
 set hive.stats.fetch.column.stats=true;
 set hive.stats.fetch.partition.stats=true;

 analyze table tweets compute statistics for columns;

 ----window function------------
 SELECT * FROM
 (SELECT *, RANK() over (partition by sessionID,order by timestamp desc) as rank FROM clicks) ranked_clicks
 WHERE ranked_clicks.rank=1;
 -----avoid global sort------------------------
 SELECT id, name, salary, dept FROM employee
 DISTRIBUTE BY dept --custom partition like
 SORT BY id ASC, name DESC;
 --------------------parallel -------------
   <name>hive.exec.parallel</name>
   
   <name>hive.exec.parallel.thread.number</name>
   <value>8</value>
 ---------------- multi group by single reducer------------------

    <name>hive.multigroupby.singlereducer</name>
    <value>true</value>
    <description>
      Whether to optimize multi group by query to generate single M/R  job plan. If the multi group by query has 
      common group by keys, it will be optimized to generate single M/R job.
 	  
 --------------------bucketing----------------------------
 uses hash partitioning
 useful when cardinality of data is high
 result in fixed number of files as opposed to Partitioning 
 it's similar to clustering.
 recommended not to use	  

 -----------------functions-----------------
 show functions
 show functions  ".*date.*"
 describe function <function name>
 describe function extended <function name>
	hive> set mapreduce.framework.name=local
	display hive database name: set hive.cli.print.current.db=true;

	DESCRIBE EXTENDED husn_small; --to get statistics
	Analyze table husn_small compute statistics;

	create table snpn(sn String, pn String)
	LOAD DATA INPATH 'hdfs://127200813master.eap.g4ihos.itcs.hpecorp.net:8020/user/centos7/test_data/snpn' append INTO TABLE snpn


	----------------------ORC in hive performance tuning--------------------------------------
	TEZ

	CREATE TABLE A_ORC (customerID int, name string, age int, address string ) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
	INSERT INTO TABLE A_ORC SELECT * FROM A;

	set hive.vectorized.execution.enabled = true;
	set hive.vectorized.execution.reduce.enabled = true;
	data should be stored in ORC format

	--enbale CBO---------------
	set hive.cbo.enbale=true;
	set hive.compute.query.using.stats=true;
	set hive.stats.fetch.column.stats=true;
	set hive.stats.fetch.partition.stats=true;

	analyze table tweets compute statistics for columns;

	----window function------------
	SELECT * FROM
	(SELECT *, RANK() over (partition by sessionID,order by timestamp desc) as rank FROM clicks) ranked_clicks
	WHERE ranked_clicks.rank=1;
	-----avoid global sort------------------------
	SELECT id, name, salary, dept FROM employee
	DISTRIBUTE BY dept --custom partition like
	SORT BY id ASC, name DESC;
	--------------------parallel -------------
	<name>hive.exec.parallel</name>

	<name>hive.exec.parallel.thread.number</name>
	<value>8</value>
	---------------- multi group by single reducer------------------

	<name>hive.multigroupby.singlereducer</name>
	<value>true</value>
	<description>
	Whether to optimize multi group by query to generate single M/R job plan. If the multi group by query has
	common group by keys, it will be optimized to generate single M/R job.

	--------------------bucketing----------------------------
	uses hash partitioning
	useful when cardinality of data is high
	result in fixed number of files as opposed to Partitioning
	it's similar to clustering.
	recommended not to use

	-----------------functions-----------------
	show functions
	show functions ".date."
	describe function <function name>
	describe function extended <function name>