Forked from milimetric/Missing Sequence Numbers in Hive
Last active
December 24, 2015 14:58
-
-
Save ottomata/6816039 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
webrequest_esams0 | |
38746878 records | |
MapReduce Total cumulative CPU time: 11 seconds 520 msec | |
Ended Job = job_1379030026553_0153 | |
MapReduce Jobs Launched: | |
Job 0: Map: 97 Reduce: 26 Cumulative CPU: 3839.52 sec HDFS Read: 25972004039 HDFS Write: 1336075346 SUCCESS | |
Job 1: Map: 106 Reduce: 28 Cumulative CPU: 4060.18 sec HDFS Read: 27308086098 HDFS Write: 3192 SUCCESS | |
Job 2: Map: 6 Reduce: 1 Cumulative CPU: 11.52 sec HDFS Read: 9886 HDFS Write: 2 SUCCESS | |
Total MapReduce CPU Time Spent: 0 days 2 hours 11 minutes 51 seconds 220 msec | |
OK | |
_c0 | |
6 | |
Job 0: Map: 97 Reduce: 1 Cumulative CPU: 3379.42 sec HDFS Read: 25972004039 HDFS Write: 117 SUCCESS | |
Job 1: Map: 97 Reduce: 26 Cumulative CPU: 4570.52 sec HDFS Read: 25972004039 HDFS Write: 2583 SUCCESS | |
Job 2: Map: 113 Reduce: 1 Cumulative CPU: 4177.4 sec HDFS Read: 30153840760 HDFS Write: 118 SUCCESS | |
Job 3: Map: 97 Reduce: 26 Cumulative CPU: 4672.28 sec HDFS Read: 25972004039 HDFS Write: 2584 SUCCESS | |
Job 4: Map: 8 Reduce: 1 Cumulative CPU: 16.45 sec HDFS Read: 9476 HDFS Write: 140 SUCCESS | |
Job 5: Map: 6 Reduce: 1 Cumulative CPU: 13.92 sec HDFS Read: 9188 HDFS Write: 163 SUCCESS | |
Job 6: Map: 1 Reduce: 1 Cumulative CPU: 3.54 sec HDFS Read: 493 HDFS Write: 140 SUCCESS | |
Job 7: Map: 1 Reduce: 1 Cumulative CPU: 4.19 sec HDFS Read: 516 HDFS Write: 163 SUCCESS | |
Job 8: Map: 2 Reduce: 1 Cumulative CPU: 6.3 sec HDFS Read: 1009 HDFS Write: 40 SUCCESS | |
Total MapReduce CPU Time Spent: 0 days 4 hours 40 minutes 44 seconds 20 msec | |
OK | |
sequence sequence missing | |
14674712 14674714 1 | |
14682058 14682060 1 | |
Time taken: 548.884 seconds |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MapReduce Total cumulative CPU time: 4 seconds 710 msec | |
Ended Job = job_1379030026553_0465 | |
MapReduce Jobs Launched: | |
Job 0: Map: 1592 Reduce: 431 Cumulative CPU: 64713.9 sec HDFS Read: 430472626221 HDFS Write: 41458 SUCCESS | |
Job 1: Map: 1592 Reduce: 431 Cumulative CPU: 64889.64 sec HDFS Read: 430472626221 HDFS Write: 41466 SUCCESS | |
Job 2: Map: 1592 Reduce: 431 Cumulative CPU: 85013.92 sec HDFS Read: 430472626221 HDFS Write: 41800 SUCCESS | |
Job 3: Map: 1592 Reduce: 431 Cumulative CPU: 85648.47 sec HDFS Read: 430472626221 HDFS Write: 42092 SUCCESS | |
Job 4: Map: 16 Reduce: 1 Cumulative CPU: 32.26 sec HDFS Read: 264882 HDFS Write: 396 SUCCESS | |
Job 5: Map: 16 Reduce: 1 Cumulative CPU: 34.47 sec HDFS Read: 265166 HDFS Write: 396 SUCCESS | |
Job 6: Map: 1 Reduce: 1 Cumulative CPU: 3.46 sec HDFS Read: 749 HDFS Write: 396 SUCCESS | |
Job 7: Map: 1 Reduce: 1 Cumulative CPU: 4.15 sec HDFS Read: 749 HDFS Write: 396 SUCCESS | |
Job 8: Map: 2 Reduce: 1 Cumulative CPU: 4.71 sec HDFS Read: 1498 HDFS Write: 284 SUCCESS | |
Total MapReduce CPU Time Spent: 3 days 11 hours 25 minutes 44 seconds 980 msec | |
OK | |
hostname missing_from missing_to missing | |
cp3003.esams.wikimedia.org 19537005 19537007 1 | |
cp3003.esams.wikimedia.org 19537314 19537316 1 | |
cp3003.esams.wikimedia.org 19539168 19539170 1 | |
cp3003.esams.wikimedia.org 19573875 19573877 1 | |
cp3003.esams.wikimedia.org 28677724 28677726 1 | |
cp3003.esams.wikimedia.org 111682679 111682681 1 | |
Time taken: 4803.44 seconds |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* get the missing packets. Sample run: | |
sequence sequence missing | |
======== ======== ======= | |
91965123 91965125 1 | |
103290989 103290991 1 | |
*/ | |
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar | |
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence'; | |
select starting.sequence, | |
ending.sequence, | |
ending.sequence - starting.sequence - 1 as missing | |
from (select M.sequence, | |
rowSequence() as id | |
from webrequest_esams0 M | |
left outer join | |
webrequest_esams0 G on G.sequence = M.sequence + 1 | |
and G.YEAR = 2013 and G.HOUR BETWEEN 2 and 12 | |
left outer join | |
(select max(sequence) as max_seq | |
from webrequest_esams0 | |
where year = 2013 and hour BETWEEN 2 and 12 | |
) MaxSeq on MaxSeq.max_seq = M.sequence | |
where G.sequence is null | |
and MaxSeq.max_seq is null | |
and M.YEAR = 2013 and M.HOUR BETWEEN 2 and 12 | |
order by M.sequence | |
limit 1000000000 | |
) starting | |
inner join | |
(select M.sequence, | |
rowSequence() as id | |
from webrequest_esams0 M | |
left outer join | |
webrequest_esams0 L on L.sequence = M.sequence - 1 | |
and L.YEAR = 2013 and L.HOUR BETWEEN 2 and 12 | |
left outer join | |
(select min(sequence) as min_seq | |
from webrequest_esams0 | |
where year = 2013 and hour BETWEEN 2 and 12 | |
) MinSeq on MinSeq.min_seq = M.sequence | |
where L.sequence is null | |
and MinSeq.min_seq is null | |
and M.YEAR = 2013 and M.HOUR BETWEEN 2 and 12 | |
order by M.sequence | |
limit 1000000000 | |
) ending on ending.id = starting.id | |
; | |
/* find out whether there are missing packets (the query result > 2) Sample run: | |
6 | |
this means there are 2 runs of missing sequence numbers - (n-2)/2 | |
*/ | |
select count(M.sequence) | |
from webrequest_esams0 M | |
left outer join | |
webrequest_esams0 G on G.sequence = M.sequence + 1 | |
and G.YEAR = 2013 and G.HOUR BETWEEN 2 and 12 | |
left outer join | |
webrequest_esams0 L on L.sequence = M.sequence - 1 | |
and L.YEAR = 2013 and L.HOUR BETWEEN 2 and 12 | |
where (G.sequence is null or L.sequence is null) | |
and M.YEAR = 2013 and M.HOUR BETWEEN 2 and 12 | |
; | |
# double join for or: | |
set tablename=varnish0; | |
select M.hostname, | |
count(*) - 1 as missing_sequence_runs | |
from ${hiveconf:tablename} M | |
left outer join | |
${hiveconf:tablename} G1 on G1.hostname = M.hostname | |
and G1.sequence = M.sequence + 1 | |
and G1.DAY = 07 | |
and G1.HOUR BETWEEN 16 and 23 | |
left outer join | |
${hiveconf:tablename} G2 on G2.hostname = M.hostname | |
and G2.sequence = M.sequence + 1 | |
and G2.DAY = 08 | |
and G2.HOUR BETWEEN 0 and 16 | |
where ( M.DAY = 07 | |
and M.HOUR BETWEEN 16 and 23 | |
and G1.sequence is null | |
) or | |
( M.DAY = 08 | |
and M.HOUR BETWEEN 0 and 16 | |
and G2.sequence is null | |
) | |
group by M.hostname | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment