Gro-Tsen · September 5, 2024 08:57
diff --git a/tarfilter.pl b/tarfilter.pl
 #! /usr/local/bin/perl -w

 ## This program takes a GNU tar file and produces on the standard
 ## output another tar file that consists only of the files whose path
 ## (as stored in the archive) matches the specified pattern.  No other
 ## treatment is performed and the output tar just consists of the
 ## matching bits of the input tar.

 ## Options are:
 ##   -v : Print filenames on standard error as they are being treated.
 ##   -p <pattern> : Select pattern for filtering.
 ##     This is a Perl regexp that is used to match the path name.
 ##   -o <filename> : Choose output filename (instead standard output).

 ## Other than these options, the program expects the tar name on the
 ## command line, otherwise it will use standard input.

 ## Be warned that this program does not try to be smart about
 ## anything: it merely filters and copies the matching entries from
 ## the input archive to the output verbatim.  If there are duplicate
 ## entries, they will be copied as such.  If there are hardlinks in
 ## the input archive, the program makes no attempt to check whether
 ## the link target has been included as well (so you can end up with
 ## an archive containing a hardlink to a file not included in the
 ## archive: this may be what you want, but probably not).

 ## Written by David A. Madore on 2024-09-04 -- Public Domain
 ## This version 2024-09-05

 use strict;
 use warnings;
 use Getopt::Std;

 my %opts;
 getopts("vp:o:", \%opts);

 my $verbose = $opts{v};
 my $pattern = $opts{p};
 my $pattern_re = defined($pattern) ? qr/$pattern/ : undef;
 my $outfname = $opts{o};

 my $inf;
 if ( scalar(@ARGV) >= 2 ) {
    die "please specify 0 or 1 tar file names on command line";
 } elsif ( scalar(@ARGV) == 1 ) {
    open $inf, "<", $ARGV[0]
 	or die "opening $ARGV[0]: $!";
 } else {
    $inf = \*STDIN;
 }

 my $outf;
 if ( defined($outfname) ) {
    open $outf, ">", $outfname;
 } else {
    $outf = \*STDOUT;
 }

 my $block_cnt = 0;  # Count of blocks read

 my $longname;
 my $longlink;
 my $preblock = "";
 while (1) {
    my $block;
    my $retval = read($inf, $block, 512);
    # last if $retval == 0;
    die "short read: $!" unless $retval == 512;
    last if $block eq ("\0"x512);
    $block_cnt++;
    die "wrong magic" unless substr($block, 257, 8) eq "ustar  \0";
    my $name = unpack("Z*", substr($block, 0, 100));
    my $size_octal = unpack("Z*", substr($block, 124, 12));
    die "wrong size format" unless $size_octal =~ m/\A[0-7]+\z/;
    my $size = oct($size_octal);
    my $type = substr($block, 156, 1);
    if ( $type eq "L" ) { # Long filename
 	# The next block(s) will contain the filename; the actual
 	# archive entry will follow.
 	die "wrong pseudoname in long filename block" unless $name eq "././\@LongLink";
 	die "multiple long filename blocks" if defined($longname);
 	my $read_size = ($size + 511) & ~511;
 	die "name is absurdly long" if $read_size > 4194304;
 	my $nblock;
 	die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size;
 	$block_cnt += int($read_size/512);
 	$longname = unpack("Z*", $nblock);
 	$preblock = $preblock . $block . $nblock;
 	next;
    }
    if ( $type eq "K" ) { # Long linkname
 	# The next block(s) will contain the linkname; the actual
 	# archive entry will follow.
 	die "wrong pseudoname in long linkname block" unless $name eq "././\@LongLink";
 	die "multiple long linkname blocks" if defined($longlink);
 	my $read_size = ($size + 511) & ~511;
 	die "link is absurdly long" if $read_size > 4194304;
 	my $nblock;
 	die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size;
 	$block_cnt += int($read_size/512);
 	$longlink = unpack("Z*", $nblock);
 	$preblock = $preblock . $block . $nblock;
 	next;
    }
    if ( defined($longname) ) {
 	die "name does not match start of longname"
 	    unless $name eq substr($longname, 0, length($name));
 	$name = $longname;
    }
    $longname = undef;
    $longlink = undef;
    # The filtering is done here.  (Prepare for possible addition of
    # other conditions.)
    my $matches = 1;
    if ( defined($pattern_re) ) {
 	$matches = 0 unless $name =~ $pattern_re;
    }
    # Output header of matching entry.
    if ( $matches ) {
 	print STDERR "$name\n" if $verbose;
 	print $outf ($preblock . $block);
    }
    $preblock = "";
    # Now read through entry data.
    my $read_size = ($size + 511) & ~511;
    while ( $read_size > 0 ) {
 	my $l = $read_size > 4194304 ? 4194304 : $read_size;
 	my $dblock;
 	die "short read: $!" unless read($inf, $dblock, $l) == $l;
 	$block_cnt += int($l/512);
 	$read_size -= $l;
 	print $outf $dblock if $matches;
    }
 }

 # Add at least 2 padding blocks, and pad to a multiple of 20 blocks
 my $padding_cnt = (int(($block_cnt+2+19)/20))*20 - $block_cnt;
 for ( my $i=0 ; $i<$padding_cnt ; $i++ ) {
    print $outf ("\0"x512);
 }
	#! /usr/local/bin/perl -w

	## This program takes a GNU tar file and produces on the standard
	## output another tar file that consists only of the files whose path
	## (as stored in the archive) matches the specified pattern. No other
	## treatment is performed and the output tar just consists of the
	## matching bits of the input tar.

	## Options are:
	## -v : Print filenames on standard error as they are being treated.
	## -p <pattern> : Select pattern for filtering.
	## This is a Perl regexp that is used to match the path name.
	## -o <filename> : Choose output filename (instead standard output).

	## Other than these options, the program expects the tar name on the
	## command line, otherwise it will use standard input.

	## Be warned that this program does not try to be smart about
	## anything: it merely filters and copies the matching entries from
	## the input archive to the output verbatim. If there are duplicate
	## entries, they will be copied as such. If there are hardlinks in
	## the input archive, the program makes no attempt to check whether
	## the link target has been included as well (so you can end up with
	## an archive containing a hardlink to a file not included in the
	## archive: this may be what you want, but probably not).

	## Written by David A. Madore on 2024-09-04 -- Public Domain
	## This version 2024-09-05

	use strict;
	use warnings;
	use Getopt::Std;

	my %opts;
	getopts("vp:o:", \%opts);

	my $verbose = $opts{v};
	my $pattern = $opts{p};
	my $pattern_re = defined($pattern) ? qr/$pattern/ : undef;
	my $outfname = $opts{o};

	my $inf;
	if ( scalar(@ARGV) >= 2 ) {
	die "please specify 0 or 1 tar file names on command line";
	} elsif ( scalar(@ARGV) == 1 ) {
	open $inf, "<", $ARGV[0]
	or die "opening $ARGV[0]: $!";
	} else {
	$inf = \*STDIN;
	}

	my $outf;
	if ( defined($outfname) ) {
	open $outf, ">", $outfname;
	} else {
	$outf = \*STDOUT;
	}

	my $block_cnt = 0; # Count of blocks read

	my $longname;
	my $longlink;
	my $preblock = "";
	while (1) {
	my $block;
	my $retval = read($inf, $block, 512);
	# last if $retval == 0;
	die "short read: $!" unless $retval == 512;
	last if $block eq ("\0"x512);
	$block_cnt++;
	die "wrong magic" unless substr($block, 257, 8) eq "ustar \0";
	my $name = unpack("Z*", substr($block, 0, 100));
	my $size_octal = unpack("Z*", substr($block, 124, 12));
	die "wrong size format" unless $size_octal =~ m/\A[0-7]+\z/;
	my $size = oct($size_octal);
	my $type = substr($block, 156, 1);
	if ( $type eq "L" ) { # Long filename
	# The next block(s) will contain the filename; the actual
	# archive entry will follow.
	die "wrong pseudoname in long filename block" unless $name eq "././\@LongLink";
	die "multiple long filename blocks" if defined($longname);
	my $read_size = ($size + 511) & ~511;
	die "name is absurdly long" if $read_size > 4194304;
	my $nblock;
	die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size;
	$block_cnt += int($read_size/512);
	$longname = unpack("Z*", $nblock);
	$preblock = $preblock . $block . $nblock;
	next;
	}
	if ( $type eq "K" ) { # Long linkname
	# The next block(s) will contain the linkname; the actual
	# archive entry will follow.
	die "wrong pseudoname in long linkname block" unless $name eq "././\@LongLink";
	die "multiple long linkname blocks" if defined($longlink);
	my $read_size = ($size + 511) & ~511;
	die "link is absurdly long" if $read_size > 4194304;
	my $nblock;
	die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size;
	$block_cnt += int($read_size/512);
	$longlink = unpack("Z*", $nblock);
	$preblock = $preblock . $block . $nblock;
	next;
	}
	if ( defined($longname) ) {
	die "name does not match start of longname"
	unless $name eq substr($longname, 0, length($name));
	$name = $longname;
	}
	$longname = undef;
	$longlink = undef;
	# The filtering is done here. (Prepare for possible addition of
	# other conditions.)
	my $matches = 1;
	if ( defined($pattern_re) ) {
	$matches = 0 unless $name =~ $pattern_re;
	}
	# Output header of matching entry.
	if ( $matches ) {
	print STDERR "$name\n" if $verbose;
	print $outf ($preblock . $block);
	}
	$preblock = "";
	# Now read through entry data.
	my $read_size = ($size + 511) & ~511;
	while ( $read_size > 0 ) {
	my $l = $read_size > 4194304 ? 4194304 : $read_size;
	my $dblock;
	die "short read: $!" unless read($inf, $dblock, $l) == $l;
	$block_cnt += int($l/512);
	$read_size -= $l;
	print $outf $dblock if $matches;
	}
	}

	# Add at least 2 padding blocks, and pad to a multiple of 20 blocks
	my $padding_cnt = (int(($block_cnt+2+19)/20))*20 - $block_cnt;
	for ( my $i=0 ; $i<$padding_cnt ; $i++ ) {
	print $outf ("\0"x512);
	}