Created
March 22, 2014 00:03
-
-
Save gwenshap/9699072 to your computer and use it in GitHub Desktop.
Using Hive to Efficiently Merge Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- create temporary table for merging | |
CREATE external TABLE staging.ItemFulfillment_merged | |
(ItemFulfillment_b13aFilingOptionFedEx STRING, | |
ItemFulfillment_b13aStatementDataFedEx STRING, | |
ItemFulfillment_backupEmailAddressFedEx STRING, | |
ItemFulfillment_backupEmailAddressUps STRING, | |
ItemFulfillment_internalId STRING, | |
ItemFulfillment_itemList_item_item_internalId STRING, | |
ItemFulfillment_itemList_item_item_name STRING, | |
ItemFulfillment_itemList_item_item_type STRING, | |
ItemFulfillment_itemList_item_orderLine STRING) | |
ROW FORMAT SERDE | |
'org.apache.hadoop.hive.serde2.avro.AvroSerDe' | |
STORED AS INPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' | |
OUTPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' | |
LOCATION 'hdfs://nameservice1/data/netsuite/ItemFulfillment/2014-03-20' | |
TBLPROPERTIES ( | |
'avro.schema.url'='hdfs://nameservice1/etl/NetSuite/metadata/ItemFulfillment.avsc'); | |
-- populate it with content from existing and new data | |
-- we take all the new rows, and only the existing rows that don't exist in the new data set. | |
insert overwrite table staging.ItemFulfillment_merged | |
select * from ( | |
select /*+ MAPJOIN(st) */ gt.ItemFulfillment_b13aFilingOptionFedEx, | |
gt.ItemFulfillment_b13aStatementDataFedEx, | |
gt.ItemFulfillment_backupEmailAddressFedEx, | |
gt.ItemFulfillment_backupEmailAddressUps, | |
gt.ItemFulfillment_internalId, | |
gt.ItemFulfillment_itemList_item_item_internalId, | |
gt.ItemFulfillment_itemList_item_item_name, | |
gt.ItemFulfillment_itemList_item_item_type, | |
gt.ItemFulfillment_itemList_item_orderLine | |
from netsuite.ItemFulfillment gt | |
left outer join staging.ItemFulfillment_csv st | |
on st.ItemFulfillment_itemList_item_orderLine = gt.ItemFulfillment_itemList_item_orderLine and st.itemfulfillment_internalid = gt.itemfulfillment_internalid | |
where st.ItemFulfillment_itemList_item_orderLine is null and st.itemfulfillment_internalid is null | |
union all | |
select ItemFulfillment_b13aFilingOptionFedEx, | |
ItemFulfillment_b13aStatementDataFedEx, | |
ItemFulfillment_backupEmailAddressFedEx, | |
ItemFulfillment_backupEmailAddressUps, | |
ItemFulfillment_internalId, | |
ItemFulfillment_itemList_item_item_internalId, | |
ItemFulfillment_itemList_item_item_name, | |
ItemFulfillment_itemList_item_item_type, | |
ItemFulfillment_itemList_item_orderLine | |
from staging.ItemFulfillment_csv) T; | |
-- then we point the existing "master" table and point it to the data in the new, merged table. | |
-- after this quick, atomic operation that changes only metadata - users will now see updated data | |
-- as a last step, we drop the definition of the temporary, merged table | |
-- the data stays because its an external table | |
use netsuite; | |
alter table ItemFulfillment set location 'hdfs://nameservice1/data/netsuite/ItemFulfillment/2014-03-20'; | |
drop table staging.ItemFulfillment_merged; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment