@jesboat
Created August 19, 2014 03:01
Quick script to scrape the counters from a Hadoop jobdetails.jsp page
#!/usr/bin/env perl
use strict;
use warnings;
use XML::XPath;

@ARGV == 1 or die "Usage: $0 infile.xml\n";
my ($infile) = @ARGV;

my $xp;
my $row_queue;
my %raw_counters;
my %merged_counters;

init_nocache();
parse_table();
show_counters();
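
# Expected input shape (jobdetails.jsp run through xmllint, see the note
# at the bottom of this page): the second top-level <table> holds the
# counters. It has one header row, then one row per counter; the first
# row of each counter group carries the group name in a cell whose
# rowspan covers the whole group.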

sub init_nocache {
    # The counters live in the second top-level table of jobdetails.jsp.
    $xp = XML::XPath->new(filename => $infile);
    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
    @tables >= 2 or die "Bad document: need at least 2 tables";
    my @rows = $xp->findnodes('./tr', $tables[1]);
    $row_queue = [@rows];
}

# Unfortunately, this is broken since Storable won't store GLOBs.
#
#use Storable qw(retrieve nstore_fd);
#use Digest::SHA qw(sha1_hex);
#use File::Slurp qw(read_file);
#
#sub init_cache {
#    # Read the input only once, so we can be used on pipes and such.
#    open my($infh), "<:raw", $infile or die "open: $infile: $!\n";
#    my $textbuf;
#    read_file($infh, buf_ref => \$textbuf);
#    # Compute a hash, and see if we can use a cache.
#    my $hash = sha1_hex(do { open my($fh), "<", \$textbuf; $fh });
#    my $cachefile = "$infile.cache.stor";
#    if (-f $cachefile) {
#        my $cache = retrieve($cachefile);
#        if ($cache->{version} == 1 and $cache->{hash} eq $hash) {
#            $xp = $cache->{xpath};
#            $row_queue = $cache->{rows};
#            return;
#        }
#    }
#    # Need to parse from scratch
#    $xp = XML::XPath->new(ioref => do { open my($fh), "<", \$textbuf; $fh });
#    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
#    @tables >= 2 or die "Bad document: need at least 2 tables";
#    my @rows = $xp->findnodes('./tr', $tables[1]);
#    $row_queue = [@rows];
#    # Write out a cache, if we can
#    if (-f $infh) {
#        if (open my($cachefh), ">", $cachefile) {
#            nstore_fd(
#                {version => 1, hash => $hash, xpath => $xp, rows => \@rows},
#                $cachefh);
#            close $cachefh or die "close: $cachefile: $!\n";
#        }
#    }
#}

sub parse_table {
    # Works on the global $row_queue; called with no arguments.
    parse_header_row();
    while (@$row_queue) {
        parse_section();
    }
}

sub parse_header_row {
    (@$row_queue > 0) or die "Bad document: missing header row.\n";
    my @cells = $xp->findnodes('./*', shift @$row_queue);
    my @strs = map { $_->toString } @cells;
    @strs == 5
        and $strs[1] eq "<th>Counter</th>"
        and $strs[2] eq "<th>Map</th>"
        and $strs[3] eq "<th>Reduce</th>"
        and $strs[4] eq "<th>Total</th>"
        or die "Bad document: odd header row.\n";
}

sub parse_section {
    # A section is one counter group: its first row carries the group
    # name in a rowspan cell, and the rowspan tells us how many rows
    # belong to the group.
    my ($groupSize, $groupName, $firstrowCells)
        = parse_section_header(shift @$row_queue);
    (@$row_queue >= $groupSize - 1)
        or die "Bad document: too few rows for group $groupName\n";
    parse_counter($groupName, $firstrowCells);
    for (1 .. ($groupSize - 1)) {
        my $row = shift @$row_queue;
        my @cells = $xp->findnodes('./td', $row);
        @cells == 4 or die "Bad document: odd row.\n";
        parse_counter($groupName, \@cells);
    }
}

sub parse_section_header {
    my ($row) = @_;
    my @cells = $xp->findnodes('./td', $row);
    @cells == 5 or die "Bad document: odd section header.\n";
    my $rowHeader = shift @cells;
    my $size = $rowHeader->getAttribute("rowspan");
    my $text = $rowHeader->getChildNodes->[0]->getData;
    return ($size, $text, \@cells);
}

sub parse_counter {
    my ($groupname, $cells) = @_;
    # Cell 0 is the counter name, cell 1 the Map value; the Reduce and
    # Total columns are ignored.
    my $rawname = $cells->[0]->getChildNodes->[0]->getData;
    my $value = $cells->[1]->getChildNodes->[0]->getData;
    $value =~ s/,//g; # strip commas from 1,234,567-style numbers
    my $mungedname = $rawname;
    for ($mungedname) {
        s/[\r\n\t]/ /g;
        s/ +/ /g;
        s/(Some message that contained the userid) (\d+)/$1 USERID/g;
    }
    # Record the raw value; warn (and keep the newer value) if the same
    # counter name shows up twice within a group.
    ($raw_counters{$groupname}{$rawname}
        &&= warn "Counter $groupname.$rawname duplicated; discarding first.\n")
        = $value;
    $merged_counters{$groupname}{$mungedname} += $value;
}

sub show_counters {
    for my $group (sort keys %merged_counters) {
        for my $counter (sort keys %{ $merged_counters{$group} }) {
            my $value = $merged_counters{$group}{$counter};
            printf "%u\t%s\t%s\n",
                $value, fmtstr($group), fmtstr($counter);
        }
    }
}

sub fmtstr {
    # Percent-escape characters that would break the tab-separated output.
    my ($str) = @_;
    $str =~ s/([%\n\t])/sprintf "%%%02x", ord $1/eg;
    return $str;
}
jesboat commented Aug 19, 2014

Input format is the job page, run through xmllint --html --xmlout jobdetails.html > jobdetails.xml. Output format is one line per counter: the counter value, the group name, and the counter name, tab-separated (a small sketch of reading it back follows).
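
Here is a minimal sketch of a consumer of that output, assuming it is piped in on stdin; the %counters layout is just an illustration, not part of the script:

#!/usr/bin/env perl
use strict;
use warnings;

# Read the scraper's output: value, group name, counter name, tab-separated.
# Undo the %xx escaping that fmtstr applies to '%', newline, and tab.
my %counters;
while (my $line = <STDIN>) {
    chomp $line;
    my ($value, $group, $counter) = split /\t/, $line, 3;
    s/%([0-9a-fA-F]{2})/chr hex $1/eg for ($group, $counter);
    $counters{$group}{$counter} = $value;
}
print scalar(keys %counters), " counter groups read\n";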

This extracts the Map column; the Reduce and Total columns are ignored. Some of my counters had newlines or user IDs embedded in their names; the newlines are stripped, the user IDs replaced with a placeholder, and the affected counters re-aggregated (see the sketch below).
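
To illustrate the munging, here is the same sequence of substitutions from parse_counter applied to a hypothetical counter name; the real userid pattern is redacted in the script, so the "Records read for user" text below is made up:

#!/usr/bin/env perl
use strict;
use warnings;

# Hypothetical raw counter name containing a newline and a user id.
my $name = "Records read\nfor user 1234";
for ($name) {
    s/[\r\n\t]/ /g;                              # newlines/tabs become spaces
    s/ +/ /g;                                    # collapse runs of spaces
    s/(Records read for user) (\d+)/$1 USERID/g; # drop the numeric user id
}
print "$name\n";   # prints "Records read for user USERID"

Because two raw names that differ only in the user id munge to the same string, their values are summed into a single entry in %merged_counters, while %raw_counters keeps them separate.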
