Template for different Hive SQL statements
--Hive query templates:
--1) DDL
--CSV
CREATE EXTERNAL TABLE IF NOT EXISTS <schemaname>.CSV_TABLE
(
uuid VARCHAR(50),
flag CHAR(1),
bigid BIGINT,
record_dt DATE,
amount DECIMAL(20,2),
record_tmstp TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '~'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 's3://<bucket>/<prefix>/';
--ORC
CREATE EXTERNAL TABLE `<schemaname>.ORC_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'orc.compress'='SNAPPY');
--AVRO
--The schema is read from the Avro schema file referenced by avro.schema.url
CREATE EXTERNAL TABLE `<schemaname>.AVRO_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
PARTITIONED BY (
`partition` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'avro.schema.url'='s3://bucket/prefix2/file.avsc');
--PARQUET
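--A minimal sketch mirroring the ORC example above; the table name, columns,
--and 's3://bucket/prefix' location are illustrative placeholders.
CREATE EXTERNAL TABLE `<schemaname>.PARQUET_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
STORED AS PARQUET
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'parquet.compression'='SNAPPY');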
--JSON
CREATE EXTERNAL TABLE `<schemaname>.JSON_TABLE`(
`uuid` string COMMENT 'from deserializer',
`firstname` string COMMENT 'from deserializer',
`lastname` string COMMENT 'from deserializer')
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
WITH SERDEPROPERTIES (
'paths'='uuid, firstname, lastname')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--CSV TABLE WITH BUCKETS
CREATE TABLE `<schemaname>.CSV_TABLE_WITH_BUCKETS`(
`id` string,
`id2` string,
`primarykey` string,
`noid` string,
`type` string,
`source` string)
COMMENT 'Data from source'
CLUSTERED BY (
primarykey)
INTO 100 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--WITH PARTITION
CREATE TABLE `<schemaname>.CSV_TABLE_WITH_PARTITION`(
`id` string,
`id2` string,
`primarykey` string,
`noid` string,
`type` string,
`source` string)
COMMENT 'Data from source'
CLUSTERED BY (
primarykey)
INTO 100 BUCKETS
PARTITIONED BY (
`partition` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--COMPRESSION TYPES
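--A sketch of common choices; the codec classes and table properties below are
--standard Hive/Hadoop values, tune per workload:
--Text: compress job output via GzipCodec/BZip2Codec/SnappyCodec (see GZIP flags under DML)
--ORC: TBLPROPERTIES ('orc.compress'='ZLIB'/'SNAPPY'/'NONE')
--Parquet: TBLPROPERTIES ('parquet.compression'='SNAPPY'/'GZIP'/'UNCOMPRESSED')
SET hive.exec.compress.intermediate=true;
SET mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;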
--CTAS
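--A sketch: CTAS creates a new managed table populated from a query;
--ORC_TABLE_CTAS is an illustrative name.
CREATE TABLE <schemaname>.ORC_TABLE_CTAS
STORED AS ORC
TBLPROPERTIES ('orc.compress'='SNAPPY')
AS
SELECT * FROM <schemaname>.CSV_TABLE;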
--
--2) DML
--INSERT TO BUCKET TABLE
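--A sketch: hive.enforce.bucketing (pre-2.x; always on in Hive 2+) makes the
--insert hash rows into the declared bucket count; SOURCE_TABLE is a placeholder.
SET hive.enforce.bucketing=true;
INSERT INTO TABLE <schemaname>.CSV_TABLE_WITH_BUCKETS
SELECT id, id2, primarykey, noid, type, source
FROM <schemaname>.SOURCE_TABLE;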
--INSERT TO PARTITION TABLE - DYNAMIC PARTITION FLAGS
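--A sketch: dynamic partitioning needs the two flags below, and the partition
--column must come last in the SELECT; SOURCE_TABLE is a placeholder.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE <schemaname>.CSV_TABLE_WITH_PARTITION PARTITION (`partition`)
SELECT id, id2, primarykey, noid, type, source, `partition`
FROM <schemaname>.SOURCE_TABLE;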
--CSV GZIP - MENTION COMPRESSION FLAGS
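--A sketch: these flags gzip the files written by the next statement; the
--target directory is a placeholder.
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
INSERT OVERWRITE DIRECTORY 's3://bucket/prefix-gz/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM <schemaname>.CSV_TABLE;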
--INSERT OVERWRITE TO TABLE
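--A sketch: INSERT OVERWRITE replaces the table's (or target partition's) data.
INSERT OVERWRITE TABLE <schemaname>.ORC_TABLE
SELECT uuid, flag, bigid, record_dt, amount, record_tmstp
FROM <schemaname>.CSV_TABLE;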
--
--3) EXTERNAL TABLE
--MSCK REPAIR TABLE
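--A sketch: MSCK registers partitions already present on S3 but missing from
--the metastore; the ALTER form adds a single partition explicitly.
MSCK REPAIR TABLE <schemaname>.AVRO_TABLE;
ALTER TABLE <schemaname>.AVRO_TABLE ADD IF NOT EXISTS PARTITION (`partition`=1)
LOCATION 's3://bucket/prefix/partition=1';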
--
--4) STATS collection for partitioned and non-partitioned tables and columns.
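--A sketch: basic and column-level stats; partitioned tables take a PARTITION spec.
ANALYZE TABLE <schemaname>.CSV_TABLE COMPUTE STATISTICS;
ANALYZE TABLE <schemaname>.CSV_TABLE COMPUTE STATISTICS FOR COLUMNS;
ANALYZE TABLE <schemaname>.CSV_TABLE_WITH_PARTITION PARTITION (`partition`) COMPUTE STATISTICS;
ANALYZE TABLE <schemaname>.CSV_TABLE_WITH_PARTITION PARTITION (`partition`) COMPUTE STATISTICS FOR COLUMNS;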