Last active
May 27, 2016 15:35
-
-
Save ottomata/7048012 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set start_day=16; | |
set end_day=17; | |
set table_name=varnish2; | |
select M.hostname, | |
M.lower_seq, | |
M.upper_seq, | |
M.total_seqs, | |
(M.expected_seqs - M.total_seqs) as missing_seqs, | |
(M.expected_seqs - M.total_seqs) / M.expected_seqs as average_loss | |
from (select T.hostname, | |
min(T.sequence) as lower_seq, | |
max(T.sequence) as upper_seq, | |
max(T.sequence) - min(T.sequence) + 1 as expected_seqs, | |
count(*) as total_seqs | |
from ${hiveconf:table_name} T | |
where T.YEAR = 2013 | |
and T.DAY between ${hiveconf:start_day} and ${hiveconf:end_day} | |
and T.sequence <> 0 | |
group by T.hostname | |
) M | |
; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set start_day=16; | |
set end_day=17; | |
set table_name=varnish2; | |
/* Find missing sequence runs counts per hostname */ | |
select M.hostname, | |
count(*) - 1 as missing_sequence_runs | |
from ${hiveconf:tablename} M | |
left outer join | |
${hiveconf:tablename} G on G.hostname = M.hostname | |
and G.sequence = M.sequence + 1 | |
and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and G.sequence <> 0 | |
where G.sequence is null | |
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} and M.sequence <> 0 | |
group by M.hostname | |
; | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set start_day=16; | |
set end_day=17; | |
set table_name=varnish2; | |
add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar; | |
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar; | |
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence'; | |
/* Get the missing sequence runs. Sample run: | |
sequence sequence missing | |
======== ======== ======= | |
91965123 91965125 1 | |
103290989 103290991 1 | |
*/ | |
select starting.hostname, | |
starting.sequence as missing_from, | |
ending.sequence as missing_to, | |
ending.sequence - starting.sequence - 1 as missing | |
from (select M.sequence, | |
rowSequence() as id, | |
M.hostname | |
from ${hiveconf:table_name} M | |
left outer join | |
${hiveconf:table_name} G on G.sequence = M.sequence + 1 | |
and G.hostname = M.hostname | |
and G.YEAR = 2013 and G.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and G.sequence <> 0 | |
left outer join | |
(select max(sequence) as max_seq, | |
hostname | |
from ${hiveconf:table_name} | |
where year = 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and sequence <> 0 | |
group by hostname | |
) MaxSeq on MaxSeq.max_seq = M.sequence | |
and MaxSeq.hostname = M.hostname | |
where G.sequence is null | |
and MaxSeq.max_seq is null | |
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and M.sequence <> 0 | |
order by M.hostname, M.sequence | |
limit 1000000000 | |
) starting | |
inner join | |
(select M.sequence, | |
rowSequence() as id, | |
M.hostname | |
from ${hiveconf:table_name} M | |
left outer join | |
${hiveconf:table_name} L on L.sequence = M.sequence - 1 | |
and L.hostname = M.hostname | |
and L.YEAR = 2013 and L.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and L.sequence <> 0 | |
left outer join | |
(select min(sequence) as min_seq, | |
hostname | |
from ${hiveconf:table_name} | |
where year > 2013 and day BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and sequence <> 0 | |
group by hostname | |
) MinSeq on MinSeq.min_seq = M.sequence | |
and MinSeq.hostname = M.hostname | |
where L.sequence is null | |
and MinSeq.min_seq is null | |
and M.YEAR = 2013 and M.DAY BETWEEN ${hiveconf:start_day} and ${hiveconf:end_day} | |
and M.sequence <> 0 | |
order by M.hostname, M.sequence | |
limit 1000000000 | |
) ending on ending.id = starting.id | |
and ending.hostname = starting.hostname | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment