Skip to content

Instantly share code, notes, and snippets.

@fgggid
Created September 9, 2021 06:29
Show Gist options
  • Save fgggid/9d528998187cd6ee8264d07a0ac357d1 to your computer and use it in GitHub Desktop.
Save fgggid/9d528998187cd6ee8264d07a0ac357d1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Building a tar file chunk-by-chunk.
#
# taken from https://gist.github.com/chipx86/9598b1e4a9a1a7831054
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece-by-piece. The tarfile is built on-the-fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
import os
import tarfile
import io
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
# Path of the archive file the script writes.
out_filename = './file.tar'
# Directory names pruned from the os.walk() traversal.
exclude_dirs = ['exclude_dir']
# Root directory whose files are added to the archive.
top_dir = './tar_dir'
# Filename suffixes to skip; kept as a tuple because it is passed straight to
# str.endswith().
exclude_files = ('.tar',)
# Number of bytes read from each input file per chunk.
BLOCK_SIZE = 4096
class FileStream(object):
    """Write-only in-memory sink that can be drained piece by piece.

    The tar stream writes its output here; the consumer periodically calls
    pop() to take whatever has accumulated.  ``offset`` counts every byte
    ever written and is not reset by pop().

    The buffer is an ``io.BytesIO`` because the archive data is binary; this
    behaves the same on Python 2 and Python 3.
    """

    def __init__(self):
        self.buffer = io.BytesIO()
        self.offset = 0

    def write(self, s):
        """Append the bytes *s* to the buffer and advance the offset."""
        self.buffer.write(s)
        self.offset += len(s)

    def close(self):
        """Close the underlying buffer."""
        self.buffer.close()

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.offset

    def pop(self):
        """Return everything buffered since the last pop() and reset the buffer."""
        s = self.buffer.getvalue()
        self.buffer.close()
        # Start over with a fresh buffer so the next pop() only returns new data.
        self.buffer = io.BytesIO()
        return s
# In-memory sink shared by the tar stream (which writes into it) and the
# driver loop (which drains it with pop()).
streaming_fp = FileStream()

# 'w|gz' is a gzip-compressed, non-seekable stream; switch the mode to 'w|'
# to produce an uncompressed archive instead.
tar = tarfile.open(
    name=out_filename,
    mode='w|gz',
    fileobj=streaming_fp,
    format=tarfile.PAX_FORMAT
)
def _iter_exact_chunks(in_fp, size, in_filename):
    """Yield chunks read from *in_fp* that total exactly *size* bytes.

    The tar header has already announced *size* bytes of payload, so the
    amount emitted must match it even if the file shrinks, grows or becomes
    unreadable while being copied: surplus data is ignored and missing data
    is replaced with NUL bytes.
    """
    remaining = size
    exhausted = False
    while remaining > 0:
        want = min(BLOCK_SIZE, remaining)
        chunk = b''
        if not exhausted:
            try:
                chunk = in_fp.read(want)
            except (OSError, IOError):
                print('error processing:  %s' % in_filename)
            # An empty read (EOF or failure) means no more real data is coming.
            exhausted = not chunk
        if exhausted:
            chunk = tarfile.NUL * want
        remaining -= len(chunk)
        yield chunk


def stream_build_tar(top_dir):
    """Incrementally add every file under *top_dir* to the module-level ``tar``.

    This is a generator that yields ``None`` after each piece of archive data
    has been written, so the caller can drain the stream's sink between
    steps.  Directories named in ``exclude_dirs`` are not descended into and
    files whose names end with one of ``exclude_files`` are skipped.  Files
    that cannot be stat'ed or opened (e.g. broken symlinks) are reported on
    stdout and skipped.  The archive is closed once the walk finishes.

    NOTE(review): the payload is written straight to ``tar.fileobj`` and
    ``tar.offset`` is adjusted by hand, which relies on tarfile internals.
    Python 3.13+ additionally rejects ``addfile()`` without a ``fileobj`` for
    non-empty regular files -- confirm before porting to Python 3.
    """
    for root, dirs, files in os.walk(top_dir, topdown=True):
        # Prune in place so os.walk() does not descend into excluded directories.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        files[:] = [f for f in files if not f.endswith(exclude_files)]

        for file_ in files:
            in_filename = os.path.join(root, file_)

            # Stat and open BEFORE emitting the header: once the header is in
            # the stream the payload must follow, so a file that cannot be
            # read has to be rejected here.
            try:
                stat = os.stat(in_filename)
                in_fp = open(in_filename, 'rb')
            except (OSError, IOError):
                print('error processing:  %s' % in_filename)
                continue

            with in_fp:
                tar_info = tarfile.TarInfo(in_filename)
                tar_info.mtime = stat.st_mtime
                tar_info.size = stat.st_size
                # Header only; the payload is streamed manually below.
                tar.addfile(tar_info)
                yield

                for chunk in _iter_exact_chunks(in_fp, tar_info.size,
                                                in_filename):
                    tar.fileobj.write(chunk)
                    yield

            # Pad the payload to a whole number of tar blocks and account for
            # it in the archive offset, as addfile() would have done.
            blocks, remainder = divmod(tar_info.size, tarfile.BLOCKSIZE)
            if remainder > 0:
                tar.fileobj.write(tarfile.NUL * (tarfile.BLOCKSIZE - remainder))
                blocks += 1
            tar.offset += blocks * tarfile.BLOCKSIZE
            yield

    tar.close()
    # Give the caller one last chance to drain the end-of-archive data.
    yield
# Drive the generator: after each step, drain whatever the tar stream has
# produced and append it to the output file.  The file is opened in binary
# mode because the archive data is raw (gzip-compressed) bytes.
with open(out_filename, 'wb') as out_fp:
    for _ in stream_build_tar(top_dir):
        block = streaming_fp.pop()
        if block:
            print(' 1 block, size: %d' % len(block))
            out_fp.write(block)
            out_fp.flush()

print('Wrote tar file to %s' % out_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment