|
#!/usr/bin/env perl

# json2sereal.pl — scan a directory of .json benchmark files, encode each one
# with several Sereal compression settings, and tabulate the resulting sizes
# against the best known VelocyPack sizes.

use strict;
use warnings;

use JSON::XS;
use Path::Tiny;
use Sereal::Encoder qw( SRL_SNAPPY SRL_ZLIB SRL_UNCOMPRESSED );
use Text::Table;

die "Usage: json2sereal.pl <dir>\n\n Scans <dir> for .json files, converts to sereal and compares sizes\n" unless @ARGV;

# One encoder per compression scheme, each with string deduplication enabled,
# plus one with the module defaults as a reference point.
my $enc_snappy = Sereal::Encoder->new({ compress => SRL_SNAPPY, dedupe_strings => 1 });
my $enc_zlib   = Sereal::Encoder->new({ compress => SRL_ZLIB, dedupe_strings => 1 });
my $enc_none   = Sereal::Encoder->new({ compress => SRL_UNCOMPRESSED, dedupe_strings => 1 });
my $enc_def    = Sereal::Encoder->new();
|
|
|
# Best known VelocyPack sizes, in bytes, per benchmark file.  Files without an
# entry here are skipped by the scan loop.  (Presumably these come from the
# VelocyPack performance benchmarks — confirm against Performance.md.)
my %best_vpack = (
    'api-docs.json'       => 994160,
    'commits.json'        => 20789,
    'countries.json'      => 956786,
    'directory-tree.json' => 244716,
    'doubles.json'        => 899982,
    'doubles-small.json'  => 89998,
    'file-list.json'      => 133536,
    'object.json'         => 118630,
    'pass1.json'          => 804,
    'pass2.json'          => 51,
    'pass3.json'          => 108,
    'random1.json'        => 6836,
    'random2.json'        => 5815,
    'random3.json'        => 51515,
    'sample.json'         => 153187,
    'small.json'          => 30,
);
|
|
|
my $it = path(@ARGV)->iterator;

my (@rows, %totals);

# Walk the directory; for every parseable .json file with a known VPack size,
# encode it four ways and record the sizes.
FILE: while (my $file = $it->()) {
    my $name = $file->basename;
    next FILE unless $file->is_file and $name =~ m/[.]json$/;

    # eval-guard the parse: decode_json dies on invalid input.
    my $data = eval { decode_json($file->slurp_raw) };
    unless (defined $data) {
        debug("Skip file '$name', could not JSON-parse it: $@");
        next FILE;
    }

    my $vpack = $best_vpack{$name};
    unless ($vpack) {
        debug("Skip file '$name', no VPack comparison");
        next FILE;
    }

    my $json_size = $file->stat->size;
    my $def       = length $enc_def->encode($data);
    my $none      = length $enc_none->encode($data);
    my $snap      = length $enc_snappy->encode($data);
    my $zlib      = length $enc_zlib->encode($data);

    # Accumulate per-column sums for the final "-- Total --" row.
    $totals{json}  += $json_size;
    $totals{vpack} += $vpack;
    $totals{def}   += $def;
    $totals{none}  += $none;
    $totals{snap}  += $snap;
    $totals{zlib}  += $zlib;

    push @rows, table_row($name, $json_size, $vpack, $def, $none, $snap, $zlib);
}
|
|
|
# Append the grand-total row, then render everything as an aligned text table.
push @rows,
    table_row('-- Total --', @totals{qw(json vpack def none snap zlib)});

# For each encoder variant: absolute size, plus size relative to the raw JSON
# and to the best VPack encoding.
my $tb = Text::Table->new(
    'File', 'JSON Size', 'VPack best',
    'Defaults', '% JSON', '% VPack',
    'No Compr', '% JSON', '% VPack',
    'Snappy',   '% JSON', '% VPack',
    'ZLib',     '% JSON', '% VPack',
);
$tb->load(@rows);
print $tb;
|
|
|
|
|
# Print a diagnostic line to STDERR, but only when the DEBUG environment
# variable is set; otherwise do nothing.
sub debug {
    return if !$ENV{DEBUG};
    my $msg = "@_";
    print STDERR "[DEBUG] $msg\n";
}
|
|
|
# Build one table row (as an array ref) for a file: name, raw sizes, and each
# Sereal variant's size followed by its percentage of the JSON size and of the
# best VPack size.
#
# Fix: the original divided by $s and $v unguarded.  When the scan loop finds
# no matching files, the "-- Total --" row is built from an empty %totals and
# the script died with "Illegal division by zero".  Undefined or zero
# denominators now yield 'n/a' (and undefined sizes default to 0).
sub table_row {
    my ($b, @sizes) = @_;
    my ($s, $v, $def, $none, $snap, $zlib) =
        map { defined $_ ? $_ : 0 } @sizes;

    return [
        $b,
        $s,
        $v,
        $def,  _pct($def,  $s), _pct($def,  $v),
        $none, _pct($none, $s), _pct($none, $v),
        $snap, _pct($snap, $s), _pct($snap, $v),
        $zlib, _pct($zlib, $s), _pct($zlib, $v),
    ];
}

# Format $part as a percentage of $whole; 'n/a' when $whole is zero/undefined.
sub _pct {
    my ($part, $whole) = @_;
    return 'n/a' unless $whole;
    return sprintf('%.2f%%', $part / $whole * 100);
}
Not that I want to start a "mine is smaller" battle, but for the sake of completeness we have added two more columns to our performance table, in which we take the compact VPack version and run it through "gzip -9" and Snappy compression respectively. This now allows a sensible comparison of compressed Sereal with compressed VelocyPack.
See https://github.com/arangodb/velocypack/blob/master/Performance.md for details.
The reason we have not built compression into the VPack format itself is that, for us, the main advantage of VPack is that one can quickly access subvalues without parsing or deserialization. That is of course no longer possible after compression. On the other hand, if the aim is only compact storage, one can easily apply compression on top of VPack, outside of the format specification.