Skip to content

Instantly share code, notes, and snippets.

@gwenshap
Last active December 20, 2015 06:45
Show Gist options
  • Save gwenshap/1e6894100f72e3f109f2 to your computer and use it in GitHub Desktop.
Save gwenshap/1e6894100f72e3f109f2 to your computer and use it in GitHub Desktop.
Sessionization in Hive 0.12 (to AVRO table)
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/avro-mapred.jar;
DROP TABLE raw_log;
CREATE EXTERNAL TABLE raw_log(
IP STRING,
timestamp STRING,
URL STRING,
referrer STRING,
user_agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(\\d+.\\d+.\\d+.\\d+).*\\[(.*)\\].*GET (\\S*).*\\d+ \\d+ (\\S+) \"(.*)\""
)
location '/etl/bikeshop/clickstream/raw/year=2014/month=5/day=6'
;
--select * from raw_log limit 5;
CREATE TABLE if not exists apache_log
PARTITIONED BY (year int, month int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED as INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES ('avro.schema.literal' = ' {
"type" : "record",
"name" : "ApacheLog",
"fields" : [
{"name" : "IP", "type" : "string"},
{"name" : "timestamp", "type" : "string"},
{"name" : "URL", "type" : "string"},
{"name" : "referrer", "type" : "string"},
{"name" : "user_agent", "type" : "string"},
{"name" : "ts", "type" : "long" },
{"name" : "session_id", "type" : "int"}
]
}');
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
insert into table apache_log
partition(year,month)
select IP,timestamp,URL,referrer,user_agent,ts,
coalesce(sum(case when new='Y' then 1 end) over (PARTITION BY IP ORDER BY ts),0) session_id
, year(from_unixtime(ts)) year, month(from_unixtime(ts)) month
from
(select IP,timestamp,URL,referrer,user_agent,ts,case when ts-lag(ts,1) OVER (PARTITION BY IP ORDER BY ts) > 30*60 then 'Y' else 'N' end new
from (
select IP,timestamp,URL,referrer,user_agent,unix_timestamp(timestamp,'dd/MMM/yyyy:HH:mm:ss') as ts
from raw_log
) t) s;
select * from apache_log limit 5;
select count(*) from apache_log;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment