Last active
January 11, 2023 16:43
-
-
Save milimetric/6814636 to your computer and use it in GitHub Desktop.
Two ways to find missing sequence numbers in huge Hive tables. First way - gets the left and right boundaries of each run of missing sequences. Second way - gets the count of boundaries, which if greater than 2 signifies missing sequences. The second way doesn't tell you which sequences are missing or how many are missing, but runs faster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Common setup, two variants follow | |
*/ | |
use test; | |
set tablename=webrequest_esams0; | |
add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar; | |
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar; | |
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence'; | |
/* Get the missing packets. Sample run: | |
sequence sequence missing | |
======== ======== ======= | |
91965123 91965125 1 | |
103290989 103290991 1 | |
*/ | |
select starting.hostname, | |
starting.sequence as missing_from, | |
ending.sequence as missing_to, | |
ending.sequence - starting.sequence - 1 as missing | |
from (select M.sequence, | |
rowSequence() as id, | |
M.hostname | |
from ${hiveconf:tablename} M | |
left outer join | |
${hiveconf:tablename} G on G.sequence = M.sequence + 1 | |
and G.hostname = M.hostname | |
and G.YEAR > 2000 | |
left outer join | |
(select max(sequence) as max_seq, | |
hostname | |
from ${hiveconf:tablename} | |
where year > 2000 | |
group by hostname | |
) MaxSeq on MaxSeq.max_seq = M.sequence | |
and MaxSeq.hostname = M.hostname | |
where G.sequence is null | |
and MaxSeq.max_seq is null | |
and M.YEAR > 2000 | |
order by M.hostname, M.sequence | |
limit 1000000000 | |
) starting | |
inner join | |
(select M.sequence, | |
rowSequence() as id, | |
M.hostname | |
from ${hiveconf:tablename} M | |
left outer join | |
${hiveconf:tablename} L on L.sequence = M.sequence - 1 | |
and L.hostname = M.hostname | |
and L.YEAR > 2000 | |
left outer join | |
(select min(sequence) as min_seq, | |
hostname | |
from ${hiveconf:tablename} | |
where year > 2000 | |
group by hostname | |
) MinSeq on MinSeq.min_seq = M.sequence | |
and MinSeq.hostname = M.hostname | |
where L.sequence is null | |
and MinSeq.min_seq is null | |
and M.YEAR > 2000 | |
order by M.hostname, M.sequence | |
limit 1000000000 | |
) ending on ending.id = starting.id | |
and ending.hostname = starting.hostname | |
; | |
/* Find out whether there *are* missing packets (the query result > 0) */ | |
select M.hostname, | |
count(*) - 1 as missing_sequence_runs | |
from ${hiveconf:tablename} M | |
left outer join | |
${hiveconf:tablename} G on G.hostname = M.hostname | |
and G.sequence = M.sequence + 1 | |
and G.YEAR > 2000 | |
where G.sequence is null | |
and M.YEAR > 2000 | |
group by M.hostname | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment