Created
November 10, 2015 20:22
-
-
Save tko/ea17bf14d7b2119dcfe1 to your computer and use it in GitHub Desktop.
read concatenated gzip streams individually
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Copyright 2015 Tommi Komulainen | |
# Licensed under the Simplified BSD License | |
"""read concatenated gzip streams individually | |
A gzip file can contain multiple compressed parts concatenated together. | |
As most file formats can not simply be concatenated together this module | |
lets you read each part individually. | |
Example: | |
echo 'Hello ' | gzip > test.gz | |
echo 'World!' | gzip >> test.gz | |
>>> for part in GzipReadMulti(open('test.gz', 'rb')):
...     print(repr(part.read()))
'Hello \\n' | |
'World!\\n' | |
This is in contrast to the standard library 'gzip' module that implicitly | |
concatenates the individual parts together and loses the boundaries: | |
>>> gzip.open('test.gz').read() | |
'Hello \\nWorld!\\n' | |
""" | |
from __future__ import print_function | |
import io | |
import struct | |
import zlib | |
class GzipError(Exception):
    """Raised when gzip data cannot be parsed or fails its integrity checks."""
class _GzipReader(object):
    """Small lookahead buffer in front of a binary file-like object.

    Always read 'blocksize' bytes when more data is needed from the wrapped
    file.  Note that 'readblock' completely bypasses the caching mechanism.

    fp        -- binary file-like object providing read(n)
    blocksize -- number of bytes requested from 'fp' per refill
    """

    def __init__(self, fp, blocksize):
        self._fp = fp
        self.blocksize = blocksize
        # Byte string (not text) so blocks read from 'fp' can be appended on
        # both Python 2 and Python 3.
        self.unused_input = b''

    @property
    def unused_input(self):
        """Bytes that are buffered but have not been handed out by read()."""
        return self._buffer[self._pos:]

    @unused_input.setter
    def unused_input(self, val):
        # Replacing the lookahead data also rewinds the read position.
        self._buffer = val
        self._pos = 0

    def read(self, n):
        """Return the next 'n' buffered bytes, refilling from the file.

        Fewer than 'n' bytes (possibly none) are returned only when the
        wrapped file reaches EOF first.
        """
        # Keep refilling until the request can be satisfied; a single block
        # is not enough when 'blocksize' is smaller than the shortfall.
        while len(self._buffer) - self._pos < n:
            block = self.readblock()
            if not block:  # EOF
                break
            self._buffer += block
        start = self._pos
        # Never move past the end of the buffer, otherwise data appended by
        # a later refill would be skipped.
        self._pos = min(start + n, len(self._buffer))
        return self._buffer[start:self._pos]

    def readblock(self):
        """Read one block straight from the wrapped file, bypassing the buffer."""
        return self._fp.read(self.blocksize)
class GzipReadMulti(object): | |
def __init__(self, readable, rsize=io.DEFAULT_BUFFER_SIZE): | |
self._fp = _GzipReader(readable, rsize) | |
self._z = None | |
self.crc32 = self.size = None | |
def __iter__(self): | |
return self | |
def next(self): | |
assert self._z is None, 'next() called before consuming all output' | |
if not self._check_gzip_header(): | |
raise StopIteration | |
self._z = zlib.decompressobj(-zlib.MAX_WBITS) | |
self.crc32 = self.size = 0 # _check_gzip_trailer | |
return self | |
def read(self, size=io.DEFAULT_BUFFER_SIZE): | |
assert size > 0, 'read(-1) not implemented' | |
if self._z is None: | |
return '' | |
output = None | |
if self._fp.unused_input: # resume unfinished business | |
output = self._z.decompress(self._fp.unused_input, size) | |
if not output: | |
input = self._fp.readblock() | |
if input: | |
output = self._z.decompress(input, size) | |
else: # EOF | |
# FIXME: could produce more than 'size' bytes | |
output = self._z.flush() | |
self._checksum(output) | |
self._fp.unused_input = self._z.unconsumed_tail or self._z.unused_data | |
if self._z.unused_data: # end of gzip member | |
self._check_gzip_trailer() | |
self._z = None | |
return output | |
def _check_gzip_header(self): | |
# +---+---+---+---+---+---+---+---+---+---+ | |
# |ID1|ID2|CM |FLG| MTIME |XFL|OS | | |
# +---+---+---+---+---+---+---+---+---+---+ | |
sig = self._fp.read(2) | |
if not sig: # EOF ok | |
return False | |
if sig != '\037\213': | |
raise GzipError('Not a gzipped file') | |
try: | |
hdr = self._fp.read(8) | |
cm, flg, _, _, _ = struct.unpack('<BBIBB', hdr) | |
except (IOError, struct.error) as e: | |
raise GzipError('Failed to parse gzip header (%s)' % (e,)) | |
if cm != 8: | |
raise GzipError('Unknown compression method %d' % (cm,)) | |
if flg: | |
raise GzipError('Unsupported flags %s' % (hex(flg),)) | |
return True | |
def _check_gzip_trailer(self): | |
assert not self._z.unconsumed_tail | |
# +---+---+---+---+---+---+---+---+ | |
# | CRC32 | ISIZE | | |
# +---+---+---+---+---+---+---+---+ | |
try: | |
trailer = self._fp.read(8) | |
crc32, isize = struct.unpack('<iL', trailer) | |
except (IOError, struct.error) as e: | |
raise GzipError('Failed to parse gzip trailer (%s)' % (e,)) | |
if crc32 != self.crc32: | |
raise GzipError('CRC32 check failed, got %s expected %s' % (hex(self.crc32), hex(crc32))) | |
mysize = self.size & 0xffffffffL | |
if isize != mysize: | |
raise GzipError('Size check failed, got %s expected %s' % (mysize, isize)) | |
def _checksum(self, data): | |
self.size += len(data) | |
self.crc32 = zlib.crc32(data, self.crc32) | |
if __name__ == '__main__':
    # Demo: treat every gzip member of the given file as a tar archive and
    # list the name and modification time of each entry.
    import datetime
    import sys
    import tarfile

    if len(sys.argv) != 2:
        sys.exit('usage: %s FILE' % (sys.argv[0],))

    # Binary mode: compressed data must not pass through text decoding or
    # newline translation.  The 'with' block closes the file on exit.
    with open(sys.argv[1], 'rb') as fp:
        for i, gz in enumerate(GzipReadMulti(fp)):
            print('gzip member', i)
            tar = tarfile.open(name=None, mode='r|', fileobj=gz)
            for info in tar:
                print(info.name, datetime.datetime.fromtimestamp(info.mtime).isoformat())
            # tarfile may stop reading before the member is exhausted (e.g.
            # at the end-of-archive marker); drain the rest so the iterator
            # can advance to the next member.
            while gz.read():
                pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment