PARTITION and CLUSTERED/BUCKETING in HiveQL
hive> show schemas;
OK
default
thanooj
Time taken: 0.251 seconds, Fetched: 2 row(s)
hive> use thanooj;
OK
Time taken: 0.011 seconds
hive> show tables;
OK
bucketed_users
raw_bucketed_users
Time taken: 0.056 seconds, Fetched: 2 row(s)
hive> select * from thanooj.raw_bucketed_users;
OK
1 sriram 12-10-2015
2 seeta 12-11-2015
3 sriram 12-10-2015
4 seeta 12-11-2015
5 sriram 12-10-2015
6 seeta 12-11-2015
7 sriram 12-10-2015
8 seeta 12-11-2015
9 seeta 12-11-2015
10 seeta 12-11-2015
Time taken: 0.784 seconds, Fetched: 10 row(s)
hive> drop table raw_bucketed_users;
OK
Time taken: 0.8 seconds
hive> drop table bucketed_users;
OK
Time taken: 0.387 seconds
hive> SET hive.enforce.bucketing = true;
hive> CREATE TABLE thanooj.raw_bucketed_users (ID INT, name STRING, dt STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
OK
Time taken: 0.256 seconds
hive> LOAD DATA LOCAL INPATH '/home/ubuntu/input/raw_bucketed_users' OVERWRITE INTO TABLE THANOOJ.raw_bucketed_users;
Loading data to table thanooj.raw_bucketed_users
Table thanooj.raw_bucketed_users stats: [numFiles=1, numRows=0, totalSize=195, rawDataSize=0]
OK
Time taken: 0.451 seconds
hive> select * from thanooj.raw_bucketed_users;
OK
1 sriram 12-10-2015
2 seeta 12-11-2015
3 sriram 12-10-2015
4 seeta 12-11-2015
5 sriram 12-10-2015
6 seeta 12-11-2015
7 sriram 12-10-2015
8 seeta 12-11-2015
9 seeta 12-11-2015
10 seeta 12-11-2015
Time taken: 0.074 seconds, Fetched: 10 row(s)
hive> CREATE TABLE thanooj.bucketed_users (ID INT, name STRING) PARTITIONED BY (dt STRING) CLUSTERED BY (ID) SORTED BY (ID ASC) INTO 2 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
OK
Time taken: 0.088 seconds
hive> FROM THANOOJ.raw_bucketed_users INSERT OVERWRITE TABLE THANOOJ.bucketed_users PARTITION(dt) SELECT id, name, dt;
Query ID = ubuntu_20160127154713_e2e3afef-47ca-4392-81f9-5ee652a07ae9
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 2
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1453879583846_0002, Tracking URL = http://ubuntu:8088/proxy/application_1453879583846_0002/
Kill Command = /usr/local/hadoop2/bin/hadoop job -kill job_1453879583846_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 2
2016-01-27 15:47:22,725 Stage-1 map = 0%, reduce = 0%
2016-01-27 15:47:32,432 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2016-01-27 15:47:42,456 Stage-1 map = 100%, reduce = 50%, Cumulative CPU 2.46 sec
2016-01-27 15:47:43,493 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 3.5 sec
MapReduce Total cumulative CPU time: 3 seconds 500 msec
Ended Job = job_1453879583846_0002
Loading data to table thanooj.bucketed_users partition (dt=null)
Time taken for load dynamic partitions : 359
Loading partition {dt=12-11-2015}
Loading partition {dt=12-10-2015}
Time taken for adding to write entity : 1
Partition thanooj.bucketed_users{dt=12-10-2015} stats: [numFiles=2, numRows=4, totalSize=36, rawDataSize=32]
Partition thanooj.bucketed_users{dt=12-11-2015} stats: [numFiles=2, numRows=6, totalSize=49, rawDataSize=43]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 2 Cumulative CPU: 3.5 sec HDFS Read: 10548 HDFS Write: 324 SUCCESS
Total MapReduce CPU Time Spent: 3 seconds 500 msec
OK
Time taken: 32.536 seconds
hive> select * from thanooj.bucketed_users;
OK
1 sriram 12-10-2015
3 sriram 12-10-2015
5 sriram 12-10-2015
7 sriram 12-10-2015
2 seeta 12-11-2015
4 seeta 12-11-2015
6 seeta 12-11-2015
8 seeta 12-11-2015
10 seeta 12-11-2015
9 seeta 12-11-2015
Time taken: 0.156 seconds, Fetched: 10 row(s)
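(Aside, not part of the session above: one benefit of bucketing is efficient sampling. Because bucketed_users is clustered by ID into 2 buckets, a single bucket can be read per partition with TABLESAMPLE, e.g.:)
-- reads only the first of the 2 buckets in each partition
SELECT * FROM thanooj.bucketed_users TABLESAMPLE(BUCKET 1 OUT OF 2 ON ID) b;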
hive> exit;
ubuntu@ubuntu:~$ hadoop fs -ls /
Found 4 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-13 16:13 /home
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:53 /input
drwx-wx-wx - ubuntu supergroup 0 2016-01-13 16:13 /tmp
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:49 /user
ubuntu@ubuntu:~$ hadoop fs -ls /user
Found 1 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:49 /user/hive
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive
Found 1 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:49 /user/hive/warehouse
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive/warehouse
Found 1 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-27 15:46 /user/hive/warehouse/thanooj.db
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive/warehouse/thanooj.db
Found 2 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-27 15:47 /user/hive/warehouse/thanooj.db/bucketed_users
drwxr-xr-x - ubuntu supergroup 0 2016-01-27 15:46 /user/hive/warehouse/thanooj.db/raw_bucketed_users
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive/warehouse/thanooj.db/raw_bucketed_users
Found 1 items
-rwxr-xr-x 1 ubuntu supergroup 195 2016-01-27 15:46 /user/hive/warehouse/thanooj.db/raw_bucketed_users/raw_bucketed_users
ubuntu@ubuntu:~$ hadoop fs -cat /user/hive/warehouse/thanooj.db/raw_bucketed_users/raw_bucketed_users
1,sriram,12-10-2015
2,seeta,12-11-2015
3,sriram,12-10-2015
4,seeta,12-11-2015
5,sriram,12-10-2015
6,seeta,12-11-2015
7,sriram,12-10-2015
8,seeta,12-11-2015
9,seeta,12-11-2015
10,seeta,12-11-2015
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive/warehouse/thanooj.db/bucketed_users
Found 2 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-27 15:47 /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015
drwxr-xr-x - ubuntu supergroup 0 2016-01-27 15:47 /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-11-2015
ubuntu@ubuntu:~$ hadoop fs -ls /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015
Found 2 items
-rwxr-xr-x 1 ubuntu supergroup 0 2016-01-27 15:47 /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015/000000_0
-rwxr-xr-x 1 ubuntu supergroup 36 2016-01-27 15:47 /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015/000001_0
ubuntu@ubuntu:~$ hadoop fs -cat /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015/000000_0
ubuntu@ubuntu:~$ hadoop fs -cat /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-10-2015/000001_0
1,sriram
3,sriram
5,sriram
7,sriram
ubuntu@ubuntu:~$ hadoop fs -cat /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-11-2015/000000_0
2,seeta
4,seeta
6,seeta
8,seeta
10,seeta
ubuntu@ubuntu:~$ hadoop fs -cat /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-11-2015/000001_0
9,seeta
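(The empty 000000_0 file under dt=12-10-2015 follows from how rows are assigned to buckets: with 2 buckets a row goes to bucket hash(ID) mod 2, and, assuming Hive's default hash of an INT is the value itself, the odd IDs 1, 3, 5, 7 all land in bucket 1 (file 000001_0) while even IDs land in bucket 0 (file 000000_0). The assignment can be checked from the staging table:)
-- bucket assignment per row, assuming the default hash for INT columns
SELECT ID, name, dt, pmod(hash(ID), 2) AS bucket FROM thanooj.raw_bucketed_users;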
ubuntu@ubuntu:~$ hive
Logging initialized using configuration in jar:file:/home/ubuntu/hive-1.2.1/lib/hive-common-1.2.1.jar!/hive-log4j.properties
hive> ! hadoop fs -ls /;
Found 4 items
drwxr-xr-x - ubuntu supergroup 0 2016-01-13 16:13 /home
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:53 /input
drwx-wx-wx - ubuntu supergroup 0 2016-01-13 16:13 /tmp
drwxr-xr-x - ubuntu supergroup 0 2016-01-12 18:49 /user
hive> ! hadoop fs -cat /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-11-2015/000000_0;
2,seeta
4,seeta
6,seeta
8,seeta
10,seeta
hive> SET hive.enforce.bucketing = true;
hive> SET hive.exec.dynamic.partition = true;
hive> SET hive.exec.dynamic.partition.mode = nonstrict;
hive> INSERT OVERWRITE TABLE THANOOJ.bucketed_users PARTITION(dt) SELECT id, name, dt FROM THANOOJ.raw_bucketed_users;
Query ID = ubuntu_20160127171533_0a9e2627-a75c-4eed-9c17-4a6397cf3001
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 2
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1453879583846_0003, Tracking URL = http://ubuntu:8088/proxy/application_1453879583846_0003/
Kill Command = /usr/local/hadoop2/bin/hadoop job -kill job_1453879583846_0003
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 2
2016-01-27 17:15:45,375 Stage-1 map = 0%, reduce = 0%
2016-01-27 17:15:55,027 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.92 sec
2016-01-27 17:16:08,999 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.79 sec
MapReduce Total cumulative CPU time: 2 seconds 790 msec
Ended Job = job_1453879583846_0003
Loading data to table thanooj.bucketed_users partition (dt=null)
Time taken for load dynamic partitions : 890
Loading partition {dt=12-11-2015}
Loading partition {dt=12-10-2015}
Time taken for adding to write entity : 6
Partition thanooj.bucketed_users{dt=12-10-2015} stats: [numFiles=2, numRows=4, totalSize=36, rawDataSize=32]
Partition thanooj.bucketed_users{dt=12-11-2015} stats: [numFiles=2, numRows=6, totalSize=49, rawDataSize=43]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 2 Cumulative CPU: 2.79 sec HDFS Read: 10544 HDFS Write: 324 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 790 msec
OK
Time taken: 38.072 seconds
hive> select * from thanooj.bucketed_users;
OK
1 sriram 12-10-2015
3 sriram 12-10-2015
5 sriram 12-10-2015
7 sriram 12-10-2015
2 seeta 12-11-2015
4 seeta 12-11-2015
6 seeta 12-11-2015
8 seeta 12-11-2015
10 seeta 12-11-2015
9 seeta 12-11-2015
Time taken: 0.4 seconds, Fetched: 10 row(s)
hive>
NOTE:
## Static partitioning: we need to specify the partition column value in each and every LOAD statement.
hive> CREATE TABLE thanooj.bucketed_users (ID INT, name STRING) PARTITIONED BY (dt STRING) CLUSTERED BY (ID) SORTED BY (ID ASC) INTO 2 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
Suppose the table bucketed_users (ID INT, name STRING) is PARTITIONED BY (dt STRING); then every LOAD has to supply the dt value explicitly, e.g.:
hive> LOAD DATA LOCAL INPATH '/home/ubuntu/input/raw_bucketed_users' OVERWRITE INTO TABLE THANOOJ.bucketed_users PARTITION(dt="12-10-2015");
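A minimal sketch of loading a second partition statically (the input path below is hypothetical; the point is that each partition value needs its own LOAD):
-- hypothetical input file containing only the dt=12-11-2015 rows
hive> LOAD DATA LOCAL INPATH '/home/ubuntu/input/raw_bucketed_users_12_11_2015' OVERWRITE INTO TABLE THANOOJ.bucketed_users PARTITION(dt="12-11-2015");
Each such LOAD lands under its own directory, e.g. /user/hive/warehouse/thanooj.db/bucketed_users/dt=12-11-2015/. Note that LOAD only moves files into place; it does not hash rows into buckets the way the INSERT ... SELECT above does.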
## Dynamic partitioning allows us to skip specifying the partition column value on every load. The approach is as follows (see the consolidated sketch after this list):
hive> CREATE TABLE thanooj.raw_bucketed_users (ID INT, name STRING, dt STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
hive> LOAD DATA LOCAL INPATH '/home/ubuntu/input/raw_bucketed_users' OVERWRITE INTO TABLE THANOOJ.raw_bucketed_users;
Create a non-partitioned staging table raw_bucketed_users and load the raw data into it.
Now create the table bucketed_users partitioned on the intended column (say dt).
Load data into bucketed_users from raw_bucketed_users as below:
hive> INSERT INTO TABLE bucketed_users PARTITION(dt) SELECT * FROM raw_bucketed_users;
Make sure the partition column is always the last one in the non-partitioned table (as dt is in raw_bucketed_users), so that it comes last in the SELECT list.
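Putting it together, a minimal end-to-end sketch of the dynamic-partition insert, mirroring the session above:
-- required when all partition columns are dynamic
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
-- so the INSERT produces the two buckets declared on bucketed_users
SET hive.enforce.bucketing = true;
INSERT OVERWRITE TABLE thanooj.bucketed_users PARTITION(dt)
SELECT id, name, dt FROM thanooj.raw_bucketed_users;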
Static Partition (SP) columns: in DML/DDL involving multiple partitioning columns, the columns whose values are known at COMPILE TIME (given by the user).
Dynamic Partition (DP) columns: columns whose values are only known at EXECUTION TIME.
In INSERT ... SELECT ... queries, the dynamic partition columns must be specified last among the columns in the SELECT statement and in the same order in which they appear in the PARTITION() clause.
All DP columns: only allowed in nonstrict mode (in strict mode Hive throws an error), e.g.,
INSERT OVERWRITE TABLE T PARTITION (ds, hr)
SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10;
Mixed SP & DP columns, e.g.,
INSERT OVERWRITE TABLE T PARTITION (ds='2010-03-03', hr)
SELECT key, value, /*ds,*/ hr FROM srcpart WHERE ds is not null and hr>10;
SP as a subpartition of a DP: this throws an error, because partition column order determines the directory hierarchy and we cannot change that hierarchy in DML, e.g.,
-- throws an exception
INSERT OVERWRITE TABLE T PARTITION (ds, hr = 11)
SELECT key, value, ds/*, hr*/ FROM srcpart WHERE ds is not null and hr=11;
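A legal way to get the same effect, as a sketch, is to make both partition columns dynamic (which requires nonstrict mode) and keep the filter:
-- both partition columns dynamic; the hr=11 predicate still restricts the data
INSERT OVERWRITE TABLE T PARTITION (ds, hr)
SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr=11;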
Multi-table insert, e.g.,
FROM S
INSERT OVERWRITE TABLE T PARTITION (ds='2010-03-03', hr)
SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10
INSERT OVERWRITE TABLE R PARTITION (ds='2010-03-03', hr=12)
SELECT key, value, ds, hr from srcpart where ds is not null and hr = 12;
CTAS: the syntax is a little different from CTAS on non-partitioned tables, since the schema of the target table is not entirely derived from the select-clause; we need to specify the schema, including the partitioning columns, in the create-clause, e.g.,
CREATE TABLE T (key int, value string) PARTITIONED BY (ds string, hr int) AS
SELECT key, value, ds, hr+1 hr1 FROM srcpart WHERE ds is not null and hr>10;
The above example shows the case of all DP columns in CTAS. If you want to put a constant in some partitioning column, you can specify it in the select-clause, e.g.,
CREATE TABLE T (key int, value string) PARTITIONED BY (ds string, hr int) AS
SELECT key, value, "2010-03-03", hr+1 hr1 FROM srcpart WHERE ds is not null and hr>10;
In queries, CLUSTER BY is a shortcut for DISTRIBUTE BY plus SORT BY on the same columns; CLUSTERED BY in a CREATE TABLE declares the bucketing (and, with SORTED BY, sort) columns of the table.
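For illustration, a minimal equivalence sketch against the raw_bucketed_users table above (not part of the original session):
-- these two queries distribute and sort rows the same way across reducers
SELECT ID, name FROM thanooj.raw_bucketed_users CLUSTER BY ID;
SELECT ID, name FROM thanooj.raw_bucketed_users DISTRIBUTE BY ID SORT BY ID;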
hive> SET hive.exec.dynamic.partition=true;
https://dzone.com/articles/introduction-hives