Skip to content

Instantly share code, notes, and snippets.

@ernix
Last active October 19, 2018 01:15
Show Gist options
  • Save ernix/66f2d49d1354f8ffd032667f720dcc4a to your computer and use it in GitHub Desktop.
Save ernix/66f2d49d1354f8ffd032667f720dcc4a to your computer and use it in GitHub Desktop.
weld.pl - join(1), with less options, but for 3 or more files
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use open qw(:std :utf8);
our $VERSION = '0.02';
my $app = App::Weld->new;
$app->run;
package App::Weld;
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
use Pod::Usage qw(pod2usage);
use List::Util qw(reduce);
sub new {
my $class = shift;
my $self = bless +{ @_ }, $class;
return $self->_init;
}
sub _init {
my $self = shift;
$self->{_opt} = +{ $self->_get_option };
return $self;
}
sub _get_option {
my ($self, %opt) = @_;
$opt{check_order} = 1;
$opt{sep} = "\t";
GetOptions(
'h|help' => sub { pod2usage(0) },
'v|version' => sub {
print "weld.pl v$VERSION\n";
exit 0;
},
'e=s' => \$opt{empty},
'E=s' => \$opt{exist},
't=s' => \$opt{sep},
'z|zero-terminated' => \$opt{z},
'check-order!' => \$opt{check_order},
) or pod2usage(1);
return %opt;
}
sub opt {
my ($self, $name, $default) = @_;
my $opt = $self->{_opt}->{$name};
return $default if !defined $opt;
return $opt;
}
sub run {
my $self = shift;
if (@ARGV < 2) {
die "Feed 2 or more filenames";
}
my @fhs = map {
if ($_ eq '-') {
*STDIN
}
else {
open my $fh, '<', $_
or die "Can't open $_: $!";
$fh;
}
} @ARGV;
my @seek;
my $last_key;
while (map { !eof $_ } @fhs) {
for my $i (0 .. $#fhs) {
$seek[$i] = $self->_parse($fhs[$i]) if !defined $seek[$i];
}
my $first_index = reduce {
($seek[$a]->[0] cmp $seek[$b]->[0]) < 0 ? $a : $b
} grep { defined $seek[$_]->[0] } 0 .. $#seek;
last if !defined $first_index;
my $key = $seek[$first_index]->[0];
if (defined $last_key && ($last_key cmp $key) > 0) {
die "Detect dis-order" if $self->opt('check_order');
}
$last_key = $key;
my @cols;
for my $i (0 .. $#fhs) {
my $empty = $self->opt('empty', q{});
if (!defined $seek[$i]->[0]) {
push @cols, $empty;
}
elsif ($seek[$i]->[0] ne $key) {
push @cols, $empty;
}
else {
my $value = defined $seek[$i]->[1] ? $seek[$i]->[1] : $empty;
push @cols, $value;
$seek[$i] = $self->_parse($fhs[$i]);
}
}
my $sep = $self->opt('sep', "\t");
my $line = join $sep, grep { defined $_ } ($key, @cols);
my $delim = $self->opt('z') ? "\0" : "\n";
print $line . $delim;
}
for my $i (0 .. $#fhs) {
my $filename = $ARGV[$i];
close $fhs[$i]
or die "Can't close $filename: $!";
}
return 1;
}
sub _parse {
my ($self, $fh) = @_;
return if eof $fh;
chomp(my $line = <$fh>);
my $sep = $self->opt('sep', "\t");
my ($key, $value) = split $sep, $line, 2;
if (!defined $value && defined $self->{_opt}->{exist}) {
$value = $self->{_opt}->{exist};
}
return [$key, $value];
}
1;
__END__
=pod
=encoding utf8
=head1 NAME
weld.pl - join(1), with less options, but for 3 or more files
=head1 SYNOPSIS
weld.pl [OPTION]... [FILE]...
=head1 Differences between C<weld> and C<join>
=over
=item C<weld> doesn't have C<-1/-2/-j> options
always use the first field to join
=item C<weld> doesn't have C<-a> option
always print all unpairable lines
=item C<weld> doesn't have C<-o> option
always print the rest of lines
=item C<weld> doesn't have C<--header> option
always treat the first line as is
=item C<weld> has C<-E> option
there is a way to express empty(C<-e>) fields, C<weld> can also express the
existence(C<-E>) of the join field by appending pseudo-field
this is very handy to create a cross-sheet with boolean fields from bunch of
list files
=back
=head1 DESCRIPTION
When FILE is -, read standard input.
=head2 C<-e EMPTY>
replace missing input fields with EMPTY, same as join(1)
=head2 C<-E EXIST> (not in GNU join)
assume lines are followed by EXIST if they have only 1 fields
=head2 C<-t CHAR> (default: TAB)
use CHAR as input and output field separator
=head2 C<--check-order> (default: true)
check that the input is correctly sorted, even if all input lines are pairable
=head2 C<--nocheck-order>
do not check that the input is correctly sorted
=head2 C<-z, --zero-terminated>
line delimiter is NUL, not newline
=head2 C<--help>
display this help and exit
=head2 C<--version>
output version information and exit
=head1 AUTHOR
Shin Kojima C<< <[email protected]> >>
=head1 SEE ALSO
join(1)
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment