Skip to content

Instantly share code, notes, and snippets.

@milimetric
Last active January 11, 2023 16:43
Show Gist options
  • Save milimetric/6814636 to your computer and use it in GitHub Desktop.
Save milimetric/6814636 to your computer and use it in GitHub Desktop.
Two ways to find missing sequence numbers in huge Hive tables. First way - gets the left and right boundaries of each run of missing sequences. Second way - gets the count of boundaries, which if greater than 2 signifies missing sequences. The second way doesn't tell you which sequences are missing or how many are missing, but runs faster.
/* Common setup, two variants follow
*/
use test;
set tablename=webrequest_esams0;
add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
/* Get the missing packets. Sample run:
sequence sequence missing
======== ======== =======
91965123 91965125 1
103290989 103290991 1
*/
select starting.hostname,
starting.sequence as missing_from,
ending.sequence as missing_to,
ending.sequence - starting.sequence - 1 as missing
from (select M.sequence,
rowSequence() as id,
M.hostname
from ${hiveconf:tablename} M
left outer join
${hiveconf:tablename} G on G.sequence = M.sequence + 1
and G.hostname = M.hostname
and G.YEAR > 2000
left outer join
(select max(sequence) as max_seq,
hostname
from ${hiveconf:tablename}
where year > 2000
group by hostname
) MaxSeq on MaxSeq.max_seq = M.sequence
and MaxSeq.hostname = M.hostname
where G.sequence is null
and MaxSeq.max_seq is null
and M.YEAR > 2000
order by M.hostname, M.sequence
limit 1000000000
) starting
inner join
(select M.sequence,
rowSequence() as id,
M.hostname
from ${hiveconf:tablename} M
left outer join
${hiveconf:tablename} L on L.sequence = M.sequence - 1
and L.hostname = M.hostname
and L.YEAR > 2000
left outer join
(select min(sequence) as min_seq,
hostname
from ${hiveconf:tablename}
where year > 2000
group by hostname
) MinSeq on MinSeq.min_seq = M.sequence
and MinSeq.hostname = M.hostname
where L.sequence is null
and MinSeq.min_seq is null
and M.YEAR > 2000
order by M.hostname, M.sequence
limit 1000000000
) ending on ending.id = starting.id
and ending.hostname = starting.hostname
;
/* Find out whether there *are* missing packets (the query result > 0) */
select M.hostname,
count(*) - 1 as missing_sequence_runs
from ${hiveconf:tablename} M
left outer join
${hiveconf:tablename} G on G.hostname = M.hostname
and G.sequence = M.sequence + 1
and G.YEAR > 2000
where G.sequence is null
and M.YEAR > 2000
group by M.hostname
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment