-
-
Save loveshell/4756124 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Hourly rollup-period dimension: one row per (date, hour) bucket,
-- loaded from the CSV files emitted by make_periods.sh.
CREATE TABLE dimension_rollup_periods (
    period_id  STRING,     -- calendar date, e.g. '2012-09-11'
    time_id    STRING,     -- zero-padded hour, '00'..'23'
    begin_time TIMESTAMP,  -- inclusive start of the hour (hh:00:00)
    end_time   TIMESTAMP   -- end of the hour (hh:59:59)
)
STORED AS TEXTFILE;

-- Run make_periods.sh before loading.
-- The generator writes comma-separated fields, so switch the default
-- SerDe field delimiter from ^A to ','.
ALTER TABLE dimension_rollup_periods SET SERDEPROPERTIES ('field.delim' = ',');

LOAD DATA LOCAL INPATH 'periods/2012' OVERWRITE INTO TABLE dimension_rollup_periods;
-- Test dimension data:
--   users.csv  - generated with http://www.generatedata.com/#generator
--   events.csv - hand-made file with one event type per row
CREATE TABLE users (user_id STRING);
CREATE TABLE events (event_type STRING);

LOAD DATA LOCAL INPATH 'users.csv' INTO TABLE users;
LOAD DATA LOCAL INPATH 'events.csv' INTO TABLE events;
-- Random timestamps: one per dimension row per pass; re-run the INSERT
-- to multiply the row count, since this table drives all the test data.
-- NOTE: the result is evenly distributed over time, so sample when
-- pulling from it.
CREATE TABLE random_timestamps AS
SELECT from_unixtime(unix_timestamp(begin_time) + floor(rand() * 3599)) AS ts
FROM dimension_rollup_periods;

-- Doubling pass (fails on Hive versions affected by HIVE-3676).
INSERT INTO TABLE random_timestamps
SELECT from_unixtime(unix_timestamp(begin_time) + floor(rand() * 3599))
FROM dimension_rollup_periods;

-- HIVE-3676 workaround: UNION ALL the table's current rows back in
-- instead of a plain INSERT INTO ... SELECT.
-- (Fixed: the statement previously ended in a stray '"' instead of ';'.)
INSERT INTO TABLE random_timestamps
SELECT *
FROM (
    SELECT from_unixtime(unix_timestamp(begin_time) + floor(rand() * 3599)) AS ts
    FROM dimension_rollup_periods
    UNION ALL
    SELECT ts
    FROM random_timestamps
) unionresult;
-- Fan out fake engagements: every (user_id, event_type) pair gets a
-- random timestamp sampled (~10% bucket) from random_timestamps.
-- The ON-less JOINs are deliberate cross joins (Cartesian products)
-- used to generate test data.
INSERT INTO TABLE fact_event_engagements
SELECT a.user_id, r.ts AS occured_at, a.event_type
FROM random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
JOIN (SELECT * FROM events INNER JOIN users) a;

-- HIVE-3676 workaround version: append by UNION ALL-ing the table's
-- current contents into the insert.
INSERT INTO TABLE fact_event_engagements
SELECT *
FROM (
    SELECT a.user_id, r.ts AS occured_at, a.event_type
    FROM random_timestamps TABLESAMPLE(BUCKET 7 OUT OF 10 ON rand()) r
    JOIN (
        SELECT *
        FROM events
        INNER JOIN users
    ) a
    UNION ALL
    SELECT *
    FROM fact_event_engagements
) unionresults;
-- Partitioned copy of the fact table, one partition per calendar date.
CREATE TABLE fact_event_engagements_partitioned (
    user_id    STRING,
    occured_at TIMESTAMP,
    event_type STRING
)
PARTITIONED BY (occured_at_date STRING)
STORED AS SEQUENCEFILE;

-- Allow fully dynamic partitioning and raise the partition caps;
-- a year of daily partitions can exceed the defaults.
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=2000;
SET hive.exec.max.dynamic.partitions.pernode=10000;

-- Block-compress the SequenceFile output with Snappy.
SET hive.exec.compress.output=true;
SET mapred.output.compression.type=BLOCK;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;

-- Dynamic-partition load; DISTRIBUTE BY year/month routes each month's
-- rows to one reducer, limiting partitions created per node.
FROM fact_event_engagements
INSERT OVERWRITE TABLE fact_event_engagements_partitioned PARTITION (occured_at_date)
SELECT user_id, occured_at, event_type, to_date(occured_at) AS occured_at_date
DISTRIBUTE BY year(occured_at), month(occured_at);
-- Same dynamic-partition load pattern into row_test.
-- NOTE(review): row_test's DDL is not in this file — presumably it has
-- the same four-column, date-partitioned layout; verify before running.
FROM fact_event_engagements_partitioned
INSERT OVERWRITE TABLE row_test PARTITION (occured_at_date)
SELECT user_id, occured_at, event_type, to_date(occured_at) AS occured_at_date
DISTRIBUTE BY year(occured_at), month(occured_at);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# make_periods.sh: emit one CSV row per hour (months 01..12, days 01..28)
# for the year given in $1, into periods/$1/<month>.dat.
# Row layout matches dimension_rollup_periods:
#   period_id, time_id, begin_time, end_time
# Fixed: the script uses bashisms (`function`, zero-padded brace ranges
# like {01..12}), so the shebang must be bash, not sh.
function make_files {
    for m in {01..12}; do
        for d in {01..28}; do
            for h in {00..23}; do
                PERIOD_ID="$1-$m-$d"
                TIME_ID="$h"
                BEGIN_TIME="$1-$m-$d $h:00:00"
                END_TIME="$1-$m-$d $h:59:59"
                # UTC_BEGIN_TIME=$(date --date="$1-$m-$d $h:00:00" +%s);
                # UTC_END_TIME=$(date --date="$1-$m-$d $h:59:59" +%s);
                # Quoted so the embedded spaces in the timestamps are
                # preserved exactly as written.
                echo "$PERIOD_ID, $TIME_ID, $BEGIN_TIME, $END_TIME" >> "periods/$1/$m.dat"
            done
        done
    done
}
# Generate four years of period files, one background job per year.
# -p so re-runs don't abort on the existing directories.
mkdir -p periods
for y in {2009..2012}; do
    mkdir -p "periods/$y"
    make_files "$y" &
done
# Block until every background generator finishes; otherwise the script
# returns while the .dat files are still being written.
wait
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Engaged users per hour on 2012-09-11, hours 03..13, via the
-- rollup-periods dimension. The ON-less JOIN is deliberate: Hive only
-- accepts equi-conditions in ON, so the range (theta) predicates live
-- in WHERE. The MAPJOIN hint keeps the small dimension in memory, and
-- the ~30% TABLESAMPLE is safe because the test data is evenly
-- distributed over time.
-- NOTE(review): time_id is a STRING compared to int literals; this
-- relies on Hive's implicit numeric cast — works for '00'..'23'.
SELECT /*+ MAPJOIN(dimension_rollup_periods) */
    drp1.period_id,
    drp1.time_id,
    COUNT(DISTINCT engagements.user_id) AS engaged_users
FROM dimension_rollup_periods drp1
JOIN fact_event_engagements TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) engagements
WHERE drp1.period_id = '2012-09-11'
    AND engagements.occured_at >= drp1.begin_time
    AND engagements.occured_at < drp1.end_time
    AND drp1.time_id >= 03
    AND drp1.time_id <= 13
GROUP BY drp1.period_id, drp1.time_id;
/* We can accomplish the same by simply grouping by partitioned dates */ | |
-- Same rollup computed straight from the fact table by grouping on
-- date and hour — no dimension join needed.
-- Fixed: dropped an unparseable leftover predicate
-- ("engagments.occured_at between (occured_at-16hours) AND (occured_at+8)")
-- which was invalid HiveQL and referenced a misspelled alias; the
-- corrected duplicate of this query below confirms it was abandoned.
SELECT
    to_date(engagements.occured_at) AS period_id,
    hour(engagements.occured_at) AS time_id,
    COUNT(DISTINCT engagements.user_id) AS engaged_users
FROM fact_event_engagements engagements
WHERE to_date(engagements.occured_at) = '2012-09-11'
    AND hour(engagements.occured_at) >= 03
    AND hour(engagements.occured_at) <= 13
GROUP BY to_date(engagements.occured_at), hour(engagements.occured_at)
ORDER BY period_id, time_id
LIMIT 1000000;
-- Final form of the direct-from-fact rollup: engaged users per hour
-- for 2012-09-11, hours 03..13, ordered for readable output.
-- The large LIMIT only caps pathological output size; ORDER BY makes
-- the result deterministic.
SELECT
    to_date(engagements.occured_at) AS period_id,
    hour(engagements.occured_at) AS time_id,
    COUNT(DISTINCT engagements.user_id) AS engaged_users
FROM fact_event_engagements engagements
WHERE to_date(engagements.occured_at) = '2012-09-11'
    AND hour(engagements.occured_at) >= 03
    AND hour(engagements.occured_at) <= 13
GROUP BY to_date(engagements.occured_at), hour(engagements.occured_at)
ORDER BY period_id, time_id
LIMIT 1000000;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment