Last active
September 5, 2024 08:57
-
-
Save Gro-Tsen/76eb389931fcbb5daf6b94ab19b769f1 to your computer and use it in GitHub Desktop.
Filter selected files in a tar archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/local/bin/perl -w | |
## This program takes a GNU tar file and produces on the standard | |
## output another tar file that consists only of the files whose path | |
## (as stored in the archive) matches the specified pattern. No other | |
## treatment is performed and the output tar just consists of the | |
## matching bits of the input tar. | |
## Options are: | |
## -v : Print filenames on standard error as they are being treated. | |
## -p <pattern> : Select pattern for filtering. | |
## This is a Perl regexp that is used to match the path name. | |
## -o <filename> : Choose output filename (instead standard output). | |
## Other than these options, the program expects the tar name on the | |
## command line, otherwise it will use standard input. | |
## Be warned that this program does not try to be smart about | |
## anything: it merely filters and copies the matching entries from | |
## the input archive to the output verbatim. If there are duplicate | |
## entries, they will be copied as such. If there are hardlinks in | |
## the input archive, the program makes no attempt to check whether | |
## the link target has been included as well (so you can end up with | |
## an archive containing a hardlink to a file not included in the | |
## archive: this may be what you want, but probably not). | |
## Written by David A. Madore on 2024-09-04 -- Public Domain | |
## This version 2024-09-05 | |
use strict; | |
use warnings; | |
use Getopt::Std; | |
my %opts; | |
getopts("vp:o:", \%opts); | |
my $verbose = $opts{v}; | |
my $pattern = $opts{p}; | |
my $pattern_re = defined($pattern) ? qr/$pattern/ : undef; | |
my $outfname = $opts{o}; | |
my $inf; | |
if ( scalar(@ARGV) >= 2 ) { | |
die "please specify 0 or 1 tar file names on command line"; | |
} elsif ( scalar(@ARGV) == 1 ) { | |
open $inf, "<", $ARGV[0] | |
or die "opening $ARGV[0]: $!"; | |
} else { | |
$inf = \*STDIN; | |
} | |
my $outf; | |
if ( defined($outfname) ) { | |
open $outf, ">", $outfname; | |
} else { | |
$outf = \*STDOUT; | |
} | |
my $block_cnt = 0; # Count of blocks read | |
my $longname; | |
my $longlink; | |
my $preblock = ""; | |
while (1) { | |
my $block; | |
my $retval = read($inf, $block, 512); | |
# last if $retval == 0; | |
die "short read: $!" unless $retval == 512; | |
last if $block eq ("\0"x512); | |
$block_cnt++; | |
die "wrong magic" unless substr($block, 257, 8) eq "ustar \0"; | |
my $name = unpack("Z*", substr($block, 0, 100)); | |
my $size_octal = unpack("Z*", substr($block, 124, 12)); | |
die "wrong size format" unless $size_octal =~ m/\A[0-7]+\z/; | |
my $size = oct($size_octal); | |
my $type = substr($block, 156, 1); | |
if ( $type eq "L" ) { # Long filename | |
# The next block(s) will contain the filename; the actual | |
# archive entry will follow. | |
die "wrong pseudoname in long filename block" unless $name eq "././\@LongLink"; | |
die "multiple long filename blocks" if defined($longname); | |
my $read_size = ($size + 511) & ~511; | |
die "name is absurdly long" if $read_size > 4194304; | |
my $nblock; | |
die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size; | |
$block_cnt += int($read_size/512); | |
$longname = unpack("Z*", $nblock); | |
$preblock = $preblock . $block . $nblock; | |
next; | |
} | |
if ( $type eq "K" ) { # Long linkname | |
# The next block(s) will contain the linkname; the actual | |
# archive entry will follow. | |
die "wrong pseudoname in long linkname block" unless $name eq "././\@LongLink"; | |
die "multiple long linkname blocks" if defined($longlink); | |
my $read_size = ($size + 511) & ~511; | |
die "link is absurdly long" if $read_size > 4194304; | |
my $nblock; | |
die "short read: $!" unless read($inf, $nblock, $read_size) == $read_size; | |
$block_cnt += int($read_size/512); | |
$longlink = unpack("Z*", $nblock); | |
$preblock = $preblock . $block . $nblock; | |
next; | |
} | |
if ( defined($longname) ) { | |
die "name does not match start of longname" | |
unless $name eq substr($longname, 0, length($name)); | |
$name = $longname; | |
} | |
$longname = undef; | |
$longlink = undef; | |
# The filtering is done here. (Prepare for possible addition of | |
# other conditions.) | |
my $matches = 1; | |
if ( defined($pattern_re) ) { | |
$matches = 0 unless $name =~ $pattern_re; | |
} | |
# Output header of matching entry. | |
if ( $matches ) { | |
print STDERR "$name\n" if $verbose; | |
print $outf ($preblock . $block); | |
} | |
$preblock = ""; | |
# Now read through entry data. | |
my $read_size = ($size + 511) & ~511; | |
while ( $read_size > 0 ) { | |
my $l = $read_size > 4194304 ? 4194304 : $read_size; | |
my $dblock; | |
die "short read: $!" unless read($inf, $dblock, $l) == $l; | |
$block_cnt += int($l/512); | |
$read_size -= $l; | |
print $outf $dblock if $matches; | |
} | |
} | |
# Add at least 2 padding blocks, and pad to a multiple of 20 blocks | |
my $padding_cnt = (int(($block_cnt+2+19)/20))*20 - $block_cnt; | |
for ( my $i=0 ; $i<$padding_cnt ; $i++ ) { | |
print $outf ("\0"x512); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment