@jesboat
Created August 19, 2014 03:01
Quick script to scrape the counters from a Hadoop jobdetails.jsp page
#!/usr/bin/env perl
use strict;
use warnings;
use XML::XPath;

@ARGV == 1 or die "Usage: $0 infile.xml\n";
my ($infile) = @ARGV;

my $xp;
my $row_queue;
my %raw_counters;
my %merged_counters;

init_nocache();
parse_table();
show_counters();
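
# Expected input shape (jobdetails.jsp run through xmllint, see the note
# at the bottom of this page): the second top-level <table> holds the
# counters. It has one header row, then one row per counter; the first
# row of each counter group carries the group name in a cell whose
# rowspan covers the whole group.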

sub init_nocache {
    # The counters live in the second top-level table of jobdetails.jsp.
    $xp = XML::XPath->new(filename => $infile);
    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
    @tables >= 2 or die "Bad document: need at least 2 tables";
    my @rows = $xp->findnodes('./tr', $tables[1]);
    $row_queue = [@rows];
}

# Unfortunately, this is broken since Storable won't store GLOBs.
#
#use Storable qw(retrieve nstore_fd);
#use Digest::SHA qw(sha1_hex);
#use File::Slurp qw(read_file);
#
#sub init_cache {
#    # Read the input only once, so we can be used on pipes and such.
#    open my($infh), "<:raw", $infile or die "open: $infile: $!\n";
#    my $textbuf;
#    read_file($infh, buf_ref => \$textbuf);
#    # Compute a hash, and see if we can use a cache.
#    my $hash = sha1_hex(do { open my($fh), "<", \$textbuf; $fh });
#    my $cachefile = "$infile.cache.stor";
#    if (-f $cachefile) {
#        my $cache = retrieve($cachefile);
#        if ($cache->{version} == 1 and $cache->{hash} eq $hash) {
#            $xp = $cache->{xpath};
#            $row_queue = $cache->{rows};
#            return;
#        }
#    }
#    # Need to parse from scratch
#    $xp = XML::XPath->new(ioref => do { open my($fh), "<", \$textbuf; $fh });
#    my @tables = $xp->findnodes('//table[not(ancestor::table)]');
#    @tables >= 2 or die "Bad document: need at least 2 tables";
#    my @rows = $xp->findnodes('./tr', $tables[1]);
#    $row_queue = [@rows];
#    # Write out a cache, if we can
#    if (-f $infh) {
#        if (open my($cachefh), ">", $cachefile) {
#            nstore_fd(
#                {version => 1, hash => $hash, xpath => $xp, rows => \@rows},
#                $cachefh);
#            close $cachefh or die "close: $cachefile: $!\n";
#        }
#    }
#}

sub parse_table {
    # Works on the global $row_queue; called with no arguments.
    parse_header_row();
    while (@$row_queue) {
        parse_section();
    }
}

sub parse_header_row {
    (@$row_queue > 0) or die "Bad document: missing header row.\n";
    my @cells = $xp->findnodes('./*', shift @$row_queue);
    my @strs = map { $_->toString } @cells;
    @strs == 5
        and $strs[1] eq "<th>Counter</th>"
        and $strs[2] eq "<th>Map</th>"
        and $strs[3] eq "<th>Reduce</th>"
        and $strs[4] eq "<th>Total</th>"
        or die "Bad document: odd header row.\n";
}

sub parse_section {
    # A section is one counter group: its first row carries the group
    # name in a rowspan cell, and the rowspan tells us how many rows
    # belong to the group.
    my ($groupSize, $groupName, $firstrowCells)
        = parse_section_header(shift @$row_queue);
    (@$row_queue >= $groupSize - 1)
        or die "Bad document: too few rows for group $groupName\n";
    parse_counter($groupName, $firstrowCells);
    for (1 .. ($groupSize - 1)) {
        my $row = shift @$row_queue;
        my @cells = $xp->findnodes('./td', $row);
        @cells == 4 or die "Bad document: odd row.\n";
        parse_counter($groupName, \@cells);
    }
}

sub parse_section_header {
    my ($row) = @_;
    my @cells = $xp->findnodes('./td', $row);
    @cells == 5 or die "Bad document: odd section header.\n";
    my $rowHeader = shift @cells;
    my $size = $rowHeader->getAttribute("rowspan");
    my $text = $rowHeader->getChildNodes->[0]->getData;
    return ($size, $text, \@cells);
}

sub parse_counter {
    my ($groupname, $cells) = @_;
    # Cell 0 is the counter name, cell 1 the Map value; the Reduce and
    # Total columns are ignored.
    my $rawname = $cells->[0]->getChildNodes->[0]->getData;
    my $value = $cells->[1]->getChildNodes->[0]->getData;
    $value =~ s/,//g; # strip commas from 1,234,567-style numbers
    my $mungedname = $rawname;
    for ($mungedname) {
        s/[\r\n\t]/ /g;
        s/ +/ /g;
        s/(Some message that contained the userid) (\d+)/$1 USERID/g;
    }
    # Record the raw value; warn (and keep the newer value) if the same
    # counter name shows up twice within a group.
    ($raw_counters{$groupname}{$rawname}
        &&= warn "Counter $groupname.$rawname duplicated; discarding first.\n")
        = $value;
    $merged_counters{$groupname}{$mungedname} += $value;
}

sub show_counters {
    for my $group (sort keys %merged_counters) {
        for my $counter (sort keys %{ $merged_counters{$group} }) {
            my $value = $merged_counters{$group}{$counter};
            printf "%u\t%s\t%s\n",
                $value, fmtstr($group), fmtstr($counter);
        }
    }
}

sub fmtstr {
    # Percent-escape characters that would break the tab-separated output.
    my ($str) = @_;
    $str =~ s/([%\n\t])/sprintf "%%%02x", ord $1/eg;
    return $str;
}
jesboat commented Aug 19, 2014

Input format is the job page, run through xmllint --html --xmlout jobdetails.html > jobdetails.xml. Output format is one line per counter: the counter value, the group name, and the counter name, tab-separated (a small sketch of reading it back follows).
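
Here is a minimal sketch of a consumer of that output, assuming it is piped in on stdin; the %counters layout is just an illustration, not part of the script:

#!/usr/bin/env perl
use strict;
use warnings;

# Read the scraper's output: value, group name, counter name, tab-separated.
# Undo the %xx escaping that fmtstr applies to '%', newline, and tab.
my %counters;
while (my $line = <STDIN>) {
    chomp $line;
    my ($value, $group, $counter) = split /\t/, $line, 3;
    s/%([0-9a-fA-F]{2})/chr hex $1/eg for ($group, $counter);
    $counters{$group}{$counter} = $value;
}
print scalar(keys %counters), " counter groups read\n";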

This extracts the Map column; the Reduce and Total columns are ignored. Some of my counters had newlines or user IDs embedded in their names; the newlines are stripped, the user IDs replaced with a placeholder, and the affected counters re-aggregated (see the sketch below).
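
To illustrate the munging, here is the same sequence of substitutions from parse_counter applied to a hypothetical counter name; the real userid pattern is redacted in the script, so the "Records read for user" text below is made up:

#!/usr/bin/env perl
use strict;
use warnings;

# Hypothetical raw counter name containing a newline and a user id.
my $name = "Records read\nfor user 1234";
for ($name) {
    s/[\r\n\t]/ /g;                              # newlines/tabs become spaces
    s/ +/ /g;                                    # collapse runs of spaces
    s/(Records read for user) (\d+)/$1 USERID/g; # drop the numeric user id
}
print "$name\n";   # prints "Records read for user USERID"

Because two raw names that differ only in the user id munge to the same string, their values are summed into a single entry in %merged_counters, while %raw_counters keeps them separate.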
