milimetric · January 11, 2023 16:43
diff --git a/Missing Sequence Numbers in Hive b/Missing Sequence Numbers in Hive
 /* Common setup, two variants follow
 */
 use test;

    set tablename=webrequest_esams0;

    add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
    add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
 create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';


 /* Get the missing packets. Sample run:

 sequence    sequence    missing
 ========    ========    =======
 91965123    91965125    1
 103290989   103290991   1

 */
 select starting.hostname,
        starting.sequence                       as missing_from,
        ending.sequence                         as missing_to,
        ending.sequence - starting.sequence - 1 as missing

   from (select M.sequence,
                rowSequence() as id,
                M.hostname
           from ${hiveconf:tablename} M
                    left outer join
                ${hiveconf:tablename} G on G.sequence = M.sequence + 1
                                       and G.hostname = M.hostname
                                       and G.YEAR > 2000
                    left outer join
                (select max(sequence) as max_seq,
                        hostname
                   from ${hiveconf:tablename}
                  where year > 2000
                  group by hostname
                ) MaxSeq               on MaxSeq.max_seq = M.sequence
                                      and MaxSeq.hostname = M.hostname
          where G.sequence is null
            and MaxSeq.max_seq is null
            and M.YEAR > 2000
          order by M.hostname, M.sequence
          limit 1000000000
        ) starting
            inner join
        (select M.sequence,
                rowSequence() as id,
                M.hostname
           from ${hiveconf:tablename} M
                    left outer join
                ${hiveconf:tablename} L on L.sequence = M.sequence - 1
                                       and L.hostname = M.hostname
                                       and L.YEAR > 2000
                    left outer join
                (select min(sequence) as min_seq,
                        hostname
                   from ${hiveconf:tablename}
                  where year > 2000
                  group by hostname
                ) MinSeq                on MinSeq.min_seq = M.sequence
                                       and MinSeq.hostname = M.hostname
          where L.sequence is null
            and MinSeq.min_seq is null
            and M.YEAR > 2000
          order by M.hostname, M.sequence
          limit 1000000000
        ) ending        on ending.id = starting.id
                       and ending.hostname = starting.hostname
 ;



 /* Find out whether there *are* missing packets (the query result > 0) */
 select M.hostname,
        count(*) - 1 as missing_sequence_runs
   from ${hiveconf:tablename} M
            left outer join
        ${hiveconf:tablename} G on G.hostname = M.hostname
                               and G.sequence = M.sequence + 1
                               and G.YEAR > 2000
  where G.sequence is null
    and M.YEAR > 2000
  group by M.hostname
 ;
	/* Common setup, two variants follow
	*/
	use test;

	set tablename=webrequest_esams0;

	add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
	add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
	create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';


	/* Get the missing packets. Sample run:

	sequence sequence missing
	======== ======== =======
	91965123 91965125 1
	103290989 103290991 1

	*/
	select starting.hostname,
	starting.sequence as missing_from,
	ending.sequence as missing_to,
	ending.sequence - starting.sequence - 1 as missing

	from (select M.sequence,
	rowSequence() as id,
	M.hostname
	from ${hiveconf:tablename} M
	left outer join
	${hiveconf:tablename} G on G.sequence = M.sequence + 1
	and G.hostname = M.hostname
	and G.YEAR > 2000
	left outer join
	(select max(sequence) as max_seq,
	hostname
	from ${hiveconf:tablename}
	where year > 2000
	group by hostname
	) MaxSeq on MaxSeq.max_seq = M.sequence
	and MaxSeq.hostname = M.hostname
	where G.sequence is null
	and MaxSeq.max_seq is null
	and M.YEAR > 2000
	order by M.hostname, M.sequence
	limit 1000000000
	) starting
	inner join
	(select M.sequence,
	rowSequence() as id,
	M.hostname
	from ${hiveconf:tablename} M
	left outer join
	${hiveconf:tablename} L on L.sequence = M.sequence - 1
	and L.hostname = M.hostname
	and L.YEAR > 2000
	left outer join
	(select min(sequence) as min_seq,
	hostname
	from ${hiveconf:tablename}
	where year > 2000
	group by hostname
	) MinSeq on MinSeq.min_seq = M.sequence
	and MinSeq.hostname = M.hostname
	where L.sequence is null
	and MinSeq.min_seq is null
	and M.YEAR > 2000
	order by M.hostname, M.sequence
	limit 1000000000
	) ending on ending.id = starting.id
	and ending.hostname = starting.hostname
	;



	/* Find out whether there are missing packets (the query result > 0) */
	select M.hostname,
	count(*) - 1 as missing_sequence_runs
	from ${hiveconf:tablename} M
	left outer join
	${hiveconf:tablename} G on G.hostname = M.hostname
	and G.sequence = M.sequence + 1
	and G.YEAR > 2000
	where G.sequence is null
	and M.YEAR > 2000
	group by M.hostname
	;