Created
September 9, 2021 06:29
-
-
Save fgggid/9d528998187cd6ee8264d07a0ac357d1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Building a tar file chunk-by-chunk.
#
# Taken from https://gist.github.com/chipx86/9598b1e4a9a1a7831054
#
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece-by-piece. The tarfile is built on-the-fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
import os | |
import tarfile | |
import io | |
try: | |
from cStringIO import StringIO | |
except ImportError: | |
from StringIO import StringIO | |
# --- Script configuration ---------------------------------------------------

# Root of the directory tree that gets archived.
top_dir = './tar_dir'

# Path of the archive file produced by this script.
out_filename = './file.tar'

# Directory names that are pruned from the walk (matched by bare name).
exclude_dirs = ['exclude_dir']

# File-name suffixes to skip; kept as a tuple because it is handed straight
# to str.endswith().
exclude_files = ('.tar',)

# Number of bytes read from an input file per step.
BLOCK_SIZE = 4 * 1024
class FileStream(object):
    """Write-only, in-memory file object that can be drained incrementally.

    Bytes passed to ``write()`` accumulate in an ``io.BytesIO`` buffer until
    ``pop()`` hands them back and starts a fresh buffer. ``tell()`` reports
    the total number of bytes ever written, independent of how often the
    buffer has been drained.

    ``io.BytesIO`` is used because the archive data is binary.
    """

    def __init__(self):
        self.buffer = io.BytesIO()
        # Running total of bytes written; never reset by pop().
        self.offset = 0

    def write(self, s):
        """Append the byte string ``s`` to the pending buffer."""
        self.buffer.write(s)
        self.offset += len(s)

    def close(self):
        """Close the current buffer, discarding anything not yet popped."""
        self.buffer.close()

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.offset

    def pop(self):
        """Return all pending bytes and reset the buffer to empty."""
        s = self.buffer.getvalue()
        # Replace the buffer rather than truncating it, so the returned data
        # and the new buffer share nothing.
        self.buffer.close()
        self.buffer = io.BytesIO()
        return s
# In-memory sink that collects the archive bytes so they can be drained
# piece by piece.
streaming_fp = FileStream()

# Open the archive in non-seekable stream mode with gzip compression,
# writing into the sink above. Use mode 'w|' instead for an uncompressed
# archive.
# NOTE(review): the stream is gzip-compressed, yet out_filename ends in
# '.tar' rather than '.tar.gz' -- confirm that this naming is intended.
tar = tarfile.open(
    name=out_filename,
    mode='w|gz',
    fileobj=streaming_fp,
    format=tarfile.PAX_FORMAT,
)
def stream_build_tar(top_dir):
    """Add every eligible file under ``top_dir`` to the module-level ``tar``.

    This is a generator. It yields ``None`` after each step (member header,
    each data chunk, block padding, end of member, and finally after the
    archive is closed) so the caller can drain the output stream between
    steps instead of holding the whole archive in memory.

    Directories named in ``exclude_dirs`` are not descended into, and files
    whose names end with one of ``exclude_files`` are skipped. Files that
    cannot be stat'ed or opened are reported and skipped without writing
    anything to the archive.

    Relies on the module-level names ``tar``, ``exclude_dirs``,
    ``exclude_files`` and ``BLOCK_SIZE``. Closes ``tar`` when the walk is
    complete.
    """
    for root, dirs, files in os.walk(top_dir, topdown=True):
        # Prune in place: with topdown=True this stops os.walk() from
        # descending into excluded directories.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        files[:] = [f for f in files if not f.endswith(exclude_files)]

        for file_ in files:
            in_filename = os.path.join(root, file_)

            # Stat and open BEFORE emitting the header: once a header
            # announcing N bytes is in the stream, N bytes must follow or
            # the archive is corrupt.
            try:
                stat = os.stat(in_filename)
                in_fp = open(in_filename, 'rb')
            except (OSError, IOError):
                # e.g. a broken symlink or an unreadable file
                print('error processing:  %s' % in_filename)
                continue

            with in_fp:
                tar_info = tarfile.TarInfo(in_filename)
                # Only size and mtime are taken from the file; everything
                # else (mode, ownership, ...) keeps the TarInfo defaults.
                tar_info.mtime = stat.st_mtime
                tar_info.size = stat.st_size
                # Without a fileobj, addfile() emits just the member header.
                tar.addfile(tar_info)
                yield

                # Write exactly tar_info.size payload bytes. If the file
                # shrinks or a read fails part-way, zero-fill the rest so the
                # member still matches its header; if the file grew, the
                # surplus is ignored.
                remaining = tar_info.size
                readable = True
                while remaining > 0:
                    wanted = min(BLOCK_SIZE, remaining)
                    s = b''
                    if readable:
                        try:
                            s = in_fp.read(wanted)
                        except (OSError, IOError):
                            print('error processing:  %s' % in_filename)
                        # An empty result means EOF or a failed read; stop
                        # touching the file from here on.
                        readable = len(s) > 0
                    if not s:
                        s = tarfile.NUL * wanted
                    tar.fileobj.write(s)
                    remaining -= len(s)
                    yield

            # Tar members occupy whole 512-byte blocks: pad the last one and
            # advance the archive offset, which addfile() only moved past
            # the header.
            blocks, remainder = divmod(tar_info.size, tarfile.BLOCKSIZE)
            if remainder > 0:
                tar.fileobj.write(tarfile.NUL * (tarfile.BLOCKSIZE - remainder))
                yield
                blocks += 1
            tar.offset += blocks * tarfile.BLOCKSIZE
            yield

    # Writes the end-of-archive blocks and flushes the compressor.
    tar.close()
    yield
# Drive the generator: after every step, drain whatever archive bytes have
# accumulated in the in-memory stream and append them to the output file.
# The file is opened in binary mode ('wb') because the archive is binary
# data; text mode may translate newline bytes on some platforms.
with open(out_filename, 'wb') as out_fp:
    for _ in stream_build_tar(top_dir):
        block = streaming_fp.pop()
        if block:
            print(' 1 block, size: %d' % len(block))
            out_fp.write(block)
            out_fp.flush()

print('Wrote tar file to %s' % out_filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment