Last active
October 19, 2018 01:15
-
-
Save ernix/66f2d49d1354f8ffd032667f720dcc4a to your computer and use it in GitHub Desktop.
weld.pl - join(1), with less options, but for 3 or more files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use utf8; | |
use open qw(:std :utf8); | |
our $VERSION = '0.02'; | |
my $app = App::Weld->new; | |
$app->run; | |
package App::Weld; | |
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat); | |
use Pod::Usage qw(pod2usage); | |
use List::Util qw(reduce); | |
sub new { | |
my $class = shift; | |
my $self = bless +{ @_ }, $class; | |
return $self->_init; | |
} | |
sub _init { | |
my $self = shift; | |
$self->{_opt} = +{ $self->_get_option }; | |
return $self; | |
} | |
sub _get_option { | |
my ($self, %opt) = @_; | |
$opt{check_order} = 1; | |
$opt{sep} = "\t"; | |
GetOptions( | |
'h|help' => sub { pod2usage(0) }, | |
'v|version' => sub { | |
print "weld.pl v$VERSION\n"; | |
exit 0; | |
}, | |
'e=s' => \$opt{empty}, | |
'E=s' => \$opt{exist}, | |
't=s' => \$opt{sep}, | |
'z|zero-terminated' => \$opt{z}, | |
'check-order!' => \$opt{check_order}, | |
) or pod2usage(1); | |
return %opt; | |
} | |
sub opt { | |
my ($self, $name, $default) = @_; | |
my $opt = $self->{_opt}->{$name}; | |
return $default if !defined $opt; | |
return $opt; | |
} | |
sub run { | |
my $self = shift; | |
if (@ARGV < 2) { | |
die "Feed 2 or more filenames"; | |
} | |
my @fhs = map { | |
if ($_ eq '-') { | |
*STDIN | |
} | |
else { | |
open my $fh, '<', $_ | |
or die "Can't open $_: $!"; | |
$fh; | |
} | |
} @ARGV; | |
my @seek; | |
my $last_key; | |
while (map { !eof $_ } @fhs) { | |
for my $i (0 .. $#fhs) { | |
$seek[$i] = $self->_parse($fhs[$i]) if !defined $seek[$i]; | |
} | |
my $first_index = reduce { | |
($seek[$a]->[0] cmp $seek[$b]->[0]) < 0 ? $a : $b | |
} grep { defined $seek[$_]->[0] } 0 .. $#seek; | |
last if !defined $first_index; | |
my $key = $seek[$first_index]->[0]; | |
if (defined $last_key && ($last_key cmp $key) > 0) { | |
die "Detect dis-order" if $self->opt('check_order'); | |
} | |
$last_key = $key; | |
my @cols; | |
for my $i (0 .. $#fhs) { | |
my $empty = $self->opt('empty', q{}); | |
if (!defined $seek[$i]->[0]) { | |
push @cols, $empty; | |
} | |
elsif ($seek[$i]->[0] ne $key) { | |
push @cols, $empty; | |
} | |
else { | |
my $value = defined $seek[$i]->[1] ? $seek[$i]->[1] : $empty; | |
push @cols, $value; | |
$seek[$i] = $self->_parse($fhs[$i]); | |
} | |
} | |
my $sep = $self->opt('sep', "\t"); | |
my $line = join $sep, grep { defined $_ } ($key, @cols); | |
my $delim = $self->opt('z') ? "\0" : "\n"; | |
print $line . $delim; | |
} | |
for my $i (0 .. $#fhs) { | |
my $filename = $ARGV[$i]; | |
close $fhs[$i] | |
or die "Can't close $filename: $!"; | |
} | |
return 1; | |
} | |
sub _parse { | |
my ($self, $fh) = @_; | |
return if eof $fh; | |
chomp(my $line = <$fh>); | |
my $sep = $self->opt('sep', "\t"); | |
my ($key, $value) = split $sep, $line, 2; | |
if (!defined $value && defined $self->{_opt}->{exist}) { | |
$value = $self->{_opt}->{exist}; | |
} | |
return [$key, $value]; | |
} | |
1; | |
__END__ | |
=pod | |
=encoding utf8 | |
=head1 NAME | |
weld.pl - join(1), with less options, but for 3 or more files | |
=head1 SYNOPSIS | |
weld.pl [OPTION]... [FILE]... | |
=head1 Differences between C<weld> and C<join> | |
=over | |
=item C<weld> doesn't have C<-1/-2/-j> options | |
always use the first field to join | |
=item C<weld> doesn't have C<-a> option | |
always print all unpairable lines | |
=item C<weld> doesn't have C<-o> option | |
always print the rest of lines | |
=item C<weld> doesn't have C<--header> option | |
always treat the first line as is | |
=item C<weld> has C<-E> option | |
there is a way to express empty(C<-e>) fields, C<weld> can also express the | |
existence(C<-E>) of the join field by appending pseudo-field | |
this is very handy to create a cross-sheet with boolean fields from bunch of | |
list files | |
=back | |
=head1 DESCRIPTION | |
When FILE is -, read standard input. | |
=head2 C<-e EMPTY> | |
replace missing input fields with EMPTY, same as join(1) | |
=head2 C<-E EXIST> (not in GNU join) | |
assume lines are followed by EXIST if they have only 1 fields | |
=head2 C<-t CHAR> (default: TAB) | |
use CHAR as input and output field separator | |
=head2 C<--check-order> (default: true) | |
check that the input is correctly sorted, even if all input lines are pairable | |
=head2 C<--nocheck-order> | |
do not check that the input is correctly sorted | |
=head2 C<-z, --zero-terminated> | |
line delimiter is NUL, not newline | |
=head2 C<--help> | |
display this help and exit | |
=head2 C<--version> | |
output version information and exit | |
=head1 AUTHOR | |
Shin Kojima C<< <[email protected]> >> | |
=head1 SEE ALSO | |
join(1) | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment