darrenfu · August 25, 2016 06:22
diff --git a/hive-bucket-table-join.sql b/hive-bucket-table-join.sql
 set mapred.job.queue.name=hddq-commrce-mktmisc;
 set hive.auto.convert.sortmerge.join=true;
 set hive.optimize.bucketmapjoin=true;
 set hive.optimize.bucketmapjoin.sortedmerge=true;
 set hive.auto.convert.sortmerge.join.noconditionaltask=true;
 set hive.enforce.bucketing=true;
 set hive.enforce.sorting=true;

 drop table IF EXISTS pla_bucket_superset;

 CREATE EXTERNAL TABLE `pla_bucket_superset`(
  `item_id` bigint,
  `auct_title` string,
  `slr_id` string,
  `picture_url` string,
  `url` string,
  `item_site_id` string,
  `item_cntry_id` string,
  `leaf_categ_id` string,
  `meta_categ_id` string,
  `currency_id` string,
  `curnt_price` string,
  `auct_start_date` string,
  `auct_end_date` string,
  `auct_durnt_days` string,
  `item_loc` string,
  `shipping_fee` string,
  `gallery_url` string,
  `slr_description` string,
  `leaf_categ_name` string,
  `fb_url` string
 )
 CLUSTERED BY (
  item_id)
 SORTED BY (
  item_id ASC)
 INTO 500 BUCKETS
 ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
 STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
 OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
 LOCATION
  '/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_superset'
 TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='0',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='0',
  'transient_lastDdlTime'='1467271200')
 ;

 drop table IF EXISTS pla_bucket_score;

 CREATE EXTERNAL TABLE `pla_bucket_score`(
  `item_id` bigint,
  `score` string
 )
 CLUSTERED BY (
  item_id)
 SORTED BY (
  item_id ASC)
 INTO 100 BUCKETS
 ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
 STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
 OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
 LOCATION
  '/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_score'
 TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='0',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='0',
  'transient_lastDdlTime'='1467271200')
 ;

 drop table IF EXISTS pla_bucket_join;

 CREATE EXTERNAL TABLE `pla_bucket_join`(
  `item_id` bigint,
  `score` string,
  `auct_title` string,
  `slr_id` string,
  `picture_url` string,
  `url` string,
  `item_site_id` string,
  `item_cntry_id` string,
  `leaf_categ_id` string,
  `meta_categ_id` string,
  `currency_id` string,
  `curnt_price` string,
  `auct_start_date` string,
  `auct_end_date` string,
  `auct_durnt_days` string,
  `item_loc` string,
  `shipping_fee` string,
  `gallery_url` string,
  `slr_description` string,
  `leaf_categ_name` string,
  `fb_url` string
 )
 CLUSTERED BY (
  item_id)
 SORTED BY (
  item_id ASC)
 INTO 500 BUCKETS
 ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
 STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
 OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
 LOCATION
  '/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_join'
 TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='0',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='0',
  'transient_lastDdlTime'='1467271200')
 ;

 INSERT INTO TABLE pla_bucket_join
 select a.item_id,
  b.score,
  a.auct_title,
  a.slr_id,
  a.picture_url,
  a.url,
  a.item_site_id,
  a.item_cntry_id,
  a.leaf_categ_id,
  a.meta_categ_id,
  a.currency_id,
  a.curnt_price,
  a.auct_start_date,
  a.auct_end_date,
  a.auct_durnt_days,
  a.item_loc,
  a.shipping_fee,
  a.gallery_url,
  a.slr_description,
  a.leaf_categ_name,
  a.fb_url
 from pla_bucket_superset a
 join pla_bucket_score b
 on a.item_id = b.item_id
 ;
	set mapred.job.queue.name=hddq-commrce-mktmisc;
	set hive.auto.convert.sortmerge.join=true;
	set hive.optimize.bucketmapjoin=true;
	set hive.optimize.bucketmapjoin.sortedmerge=true;
	set hive.auto.convert.sortmerge.join.noconditionaltask=true;
	set hive.enforce.bucketing=true;
	set hive.enforce.sorting=true;

	drop table IF EXISTS pla_bucket_superset;

	CREATE EXTERNAL TABLE `pla_bucket_superset`(
	`item_id` bigint,
	`auct_title` string,
	`slr_id` string,
	`picture_url` string,
	`url` string,
	`item_site_id` string,
	`item_cntry_id` string,
	`leaf_categ_id` string,
	`meta_categ_id` string,
	`currency_id` string,
	`curnt_price` string,
	`auct_start_date` string,
	`auct_end_date` string,
	`auct_durnt_days` string,
	`item_loc` string,
	`shipping_fee` string,
	`gallery_url` string,
	`slr_description` string,
	`leaf_categ_name` string,
	`fb_url` string
	)
	CLUSTERED BY (
	item_id)
	SORTED BY (
	item_id ASC)
	INTO 500 BUCKETS
	ROW FORMAT SERDE
	'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
	STORED AS INPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
	OUTPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
	LOCATION
	'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_superset'
	TBLPROPERTIES (
	'COLUMN_STATS_ACCURATE'='false',
	'numFiles'='0',
	'numRows'='-1',
	'rawDataSize'='-1',
	'totalSize'='0',
	'transient_lastDdlTime'='1467271200')
	;

	drop table IF EXISTS pla_bucket_score;

	CREATE EXTERNAL TABLE `pla_bucket_score`(
	`item_id` bigint,
	`score` string
	)
	CLUSTERED BY (
	item_id)
	SORTED BY (
	item_id ASC)
	INTO 100 BUCKETS
	ROW FORMAT SERDE
	'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
	STORED AS INPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
	OUTPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
	LOCATION
	'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_score'
	TBLPROPERTIES (
	'COLUMN_STATS_ACCURATE'='false',
	'numFiles'='0',
	'numRows'='-1',
	'rawDataSize'='-1',
	'totalSize'='0',
	'transient_lastDdlTime'='1467271200')
	;

	drop table IF EXISTS pla_bucket_join;

	CREATE EXTERNAL TABLE `pla_bucket_join`(
	`item_id` bigint,
	`score` string,
	`auct_title` string,
	`slr_id` string,
	`picture_url` string,
	`url` string,
	`item_site_id` string,
	`item_cntry_id` string,
	`leaf_categ_id` string,
	`meta_categ_id` string,
	`currency_id` string,
	`curnt_price` string,
	`auct_start_date` string,
	`auct_end_date` string,
	`auct_durnt_days` string,
	`item_loc` string,
	`shipping_fee` string,
	`gallery_url` string,
	`slr_description` string,
	`leaf_categ_name` string,
	`fb_url` string
	)
	CLUSTERED BY (
	item_id)
	SORTED BY (
	item_id ASC)
	INTO 500 BUCKETS
	ROW FORMAT SERDE
	'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
	STORED AS INPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
	OUTPUTFORMAT
	'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
	LOCATION
	'/apps/hdmi-technology/b_pandaren_kwdm/feed_store/orc/20160823/pla_us_join'
	TBLPROPERTIES (
	'COLUMN_STATS_ACCURATE'='false',
	'numFiles'='0',
	'numRows'='-1',
	'rawDataSize'='-1',
	'totalSize'='0',
	'transient_lastDdlTime'='1467271200')
	;

	INSERT INTO TABLE pla_bucket_join
	select a.item_id,
	b.score,
	a.auct_title,
	a.slr_id,
	a.picture_url,
	a.url,
	a.item_site_id,
	a.item_cntry_id,
	a.leaf_categ_id,
	a.meta_categ_id,
	a.currency_id,
	a.curnt_price,
	a.auct_start_date,
	a.auct_end_date,
	a.auct_durnt_days,
	a.item_loc,
	a.shipping_fee,
	a.gallery_url,
	a.slr_description,
	a.leaf_categ_name,
	a.fb_url
	from pla_bucket_superset a
	join pla_bucket_score b
	on a.item_id = b.item_id
	;