Quick script to scrape the counters from a Hadoop jobdetails.jsp page
#!/usr/bin/env perl
use strict;
use warnings;
use XML::XPath;

@ARGV == 1 or die "Usage: $0 infile.xml\n";
my ($infile) = @ARGV;

my $xp;
my $row_queue;
my %raw_counters;
my %merged_counters;

init_nocache();
parse_table();
show_counters();

sub init_nocache {
    $xp = XML::XPath->new(filename => $infile);
    # The counters live in the second top-level table on the page.
    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
    @tables >= 2 or die "Bad document: need at least 2 tables";
    my @rows = $xp->findnodes('./tr', $tables[1]);
    $row_queue = [@rows];
}

# Unfortunately, this is broken since Storable won't store GLOBs.
#
#use Storable qw(retrieve nstore_fd);
#use Digest::SHA qw(sha1_hex);
#use File::Slurp qw(read_file);
#
#sub init_cache {
#    # Read the input only once, so we can be used on pipes and such.
#    open my($infh), "<:raw", $infile or die "open: $infile: $!\n";
#    my $textbuf;
#    read_file($infh, buf_ref => \$textbuf);
#    # Compute a hash, and see if we can use a cache.
#    my $hash = sha1_hex(do { open my($fh), "<", \$textbuf; $fh });
#    my $cachefile = "$infile.cache.stor";
#    if (-f $cachefile) {
#        my $cache = retrieve($cachefile);
#        if ($cache->{version} == 1 and $cache->{hash} eq $hash) {
#            $xp = $cache->{xpath};
#            $row_queue = $cache->{rows};
#            return;
#        }
#    }
#    # Need to parse from scratch
#    $xp = XML::XPath->new(ioref => do { open my($fh), "<", \$textbuf; $fh });
#    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
#    @tables >= 2 or die "Bad document: need at least 2 tables";
#    my @rows = $xp->findnodes('./tr', $tables[1]);
#    $row_queue = [@rows];
#    # Write out a cache, if we can (i.e. only when the input is a real
#    # file, not a pipe)
#    if (-f $infh) {
#        if (open my($cachefh), ">", $cachefile) {
#            nstore_fd(
#                {version => 1, hash => $hash, xpath => $xp, rows => \@rows},
#                $cachefh);
#            close $cachefh or die "close: $cachefile: $!\n";
#        }
#    }
#}

sub parse_table {
    parse_header_row();
    while (@$row_queue) {
        parse_section();
    }
}

sub parse_header_row {
    (@$row_queue > 0) or die "Bad document: missing header row.\n";
    my @cells = $xp->findnodes('./*', shift @$row_queue);
    my @strs = map { $_->toString } @cells;
    @strs == 5
        and $strs[1] eq "<th>Counter</th>"
        and $strs[2] eq "<th>Map</th>"
        and $strs[3] eq "<th>Reduce</th>"
        and $strs[4] eq "<th>Total</th>"
        or die "Bad document: odd header row.\n";
}

sub parse_section {
    # The first row of a group carries the group name (via rowspan) plus
    # the group's first counter; the remaining rows are one counter each.
    my ($groupSize, $groupName, $firstrowCells)
        = parse_section_header(shift @$row_queue);
    (@$row_queue >= $groupSize - 1)
        or die "Bad document: too few rows for group $groupName\n";
    parse_counter($groupName, $firstrowCells);
    for (1 .. ($groupSize - 1)) {
        my $row = shift @$row_queue;
        my @cells = $xp->findnodes('./td', $row);
        @cells == 4 or die "Bad document: odd row.\n";
        parse_counter($groupName, \@cells);
    }
}

sub parse_section_header {
    my ($row) = @_;
    my @cells = $xp->findnodes('./td', $row);
    @cells == 5 or die "Bad document: odd section header.\n";
    my $rowHeader = shift @cells;
    my $size = $rowHeader->getAttribute("rowspan");
    my $text = $rowHeader->getChildNodes->[0]->getData;
    return ($size, $text, \@cells);
}

sub parse_counter {
    my ($groupname, $cells) = @_;
    my $rawname = $cells->[0]->getChildNodes->[0]->getData;
    my $value = $cells->[1]->getChildNodes->[0]->getData;  # the Map column
    $value =~ s/,//g;  # strip commas from 1,234,567-style numbers
    # Normalize whitespace in the name, and collapse per-user counter
    # names so they aggregate together.
    my $mungedname = $rawname;
    for ($mungedname) {
        s/[\r\n\t]/ /g;
        s/ +/ /g;
        s/(Some message that contained the userid) (\d+)/$1 USERID/g;
    }
    # The "&&=" fires the warn only if this counter was already seen (the
    # slot holds a true value); the outer assignment then stores $value
    # either way, discarding the earlier one.
    ($raw_counters{$groupname}{$rawname}
        &&= warn "Counter $groupname.$rawname duplicated; discarding first.\n")
        = $value;
    $merged_counters{$groupname}{$mungedname} += $value;
}

sub show_counters {
    for my $group (sort keys %merged_counters) {
        for my $counter (sort keys %{ $merged_counters{$group} }) {
            my $value = $merged_counters{$group}{$counter};
            printf "%u\t%s\t%s\n",
                $value, fmtstr($group), fmtstr($counter);
        }
    }
}

sub fmtstr {
    my ($str) = @_;
    # Percent-escape characters that would break the tab-separated output.
    $str =~ s/([%\n\t])/sprintf "%%%02x", ord $1/eg;
    return $str;
}
Input format is the job page, run through

    xmllint --html --xmlout jobdetails.html > jobdetails.xml

Output format is one line per counter: the counter value, group name, and counter name, tab-separated (tabs, newlines, and percent signs inside names are percent-escaped). This gets the map counters; reduce counters are ignored. I ended up with some counters formatted with newlines or user IDs in their names; the newlines are stripped, and the user IDs removed and the relevant counters re-aggregated.
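For example (a sketch: the script filename here is mine, and the counter names and values are illustrative, not from a real run):

    xmllint --html --xmlout jobdetails.html > jobdetails.xml
    ./scrape-counters.pl jobdetails.xml

would print lines along the lines of:

    4010876	FileSystemCounters	HDFS_BYTES_READ
    112	Map-Reduce Framework	Map input records

Because of the percent-escaping in fmtstr, each line always splits cleanly on tab into exactly three fields.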