Skip to content

Instantly share code, notes, and snippets.

@darrenfu
Created August 25, 2016 06:22
Show Gist options
  • Save darrenfu/e625d6d0516a2b100312ba0c0bdeb80f to your computer and use it in GitHub Desktop.
Save darrenfu/e625d6d0516a2b100312ba0c0bdeb80f to your computer and use it in GitHub Desktop.
set mapred.job.queue.name=hddq-commrce-mktmisc;
set hive.auto.convert.sortmerge.join=true;
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join.noconditionaltask=true;
set hive.enforce.bucketing=true;
set hive.enforce.sorting=true;
drop table IF EXISTS pla_bucket_superset;
CREATE EXTERNAL TABLE `pla_bucket_superset`(
`item_id` bigint,
`auct_title` string,
`slr_id` string,
`picture_url` string,
`url` string,
`item_site_id` string,
`item_cntry_id` string,
`leaf_categ_id` string,
`meta_categ_id` string,
`currency_id` string,
`curnt_price` string,
`auct_start_date` string,
`auct_end_date` string,
`auct_durnt_days` string,
`item_loc` string,
`shipping_fee` string,
`gallery_url` string,
`slr_description` string,
`leaf_categ_name` string,
`fb_url` string
)
CLUSTERED BY (
item_id)
SORTED BY (
item_id ASC)
INTO 500 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_superset'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'numFiles'='0',
'numRows'='-1',
'rawDataSize'='-1',
'totalSize'='0',
'transient_lastDdlTime'='1467271200')
;
drop table IF EXISTS pla_bucket_score;
CREATE EXTERNAL TABLE `pla_bucket_score`(
`item_id` bigint,
`score` string
)
CLUSTERED BY (
item_id)
SORTED BY (
item_id ASC)
INTO 100 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_score'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'numFiles'='0',
'numRows'='-1',
'rawDataSize'='-1',
'totalSize'='0',
'transient_lastDdlTime'='1467271200')
;
drop table IF EXISTS pla_bucket_join;
CREATE EXTERNAL TABLE `pla_bucket_join`(
`item_id` bigint,
`score` string,
`auct_title` string,
`slr_id` string,
`picture_url` string,
`url` string,
`item_site_id` string,
`item_cntry_id` string,
`leaf_categ_id` string,
`meta_categ_id` string,
`currency_id` string,
`curnt_price` string,
`auct_start_date` string,
`auct_end_date` string,
`auct_durnt_days` string,
`item_loc` string,
`shipping_fee` string,
`gallery_url` string,
`slr_description` string,
`leaf_categ_name` string,
`fb_url` string
)
CLUSTERED BY (
item_id)
SORTED BY (
item_id ASC)
INTO 500 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_join'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'numFiles'='0',
'numRows'='-1',
'rawDataSize'='-1',
'totalSize'='0',
'transient_lastDdlTime'='1467271200')
;
INSERT INTO TABLE pla_bucket_join
select a.item_id,
b.score,
a.auct_title,
a.slr_id,
a.picture_url,
a.url,
a.item_site_id,
a.item_cntry_id,
a.leaf_categ_id,
a.meta_categ_id,
a.currency_id,
a.curnt_price,
a.auct_start_date,
a.auct_end_date,
a.auct_durnt_days,
a.item_loc,
a.shipping_fee,
a.gallery_url,
a.slr_description,
a.leaf_categ_name,
a.fb_url
from pla_bucket_superset a
join pla_bucket_score b
on a.item_id = b.item_id
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment