tko · November 10, 2015 20:22
diff --git a/gzipmulti.py b/gzipmulti.py
 #!/usr/bin/env python
 # Copyright 2015 Tommi Komulainen
 # Licensed under the Simplified BSD License
 """read concatenated gzip streams individually

 A gzip file can contain multiple compressed parts concatenated together.
 Because most file formats cannot simply be concatenated, this module
 lets you read each part individually.

 Example:
    echo 'Hello ' | gzip > test.gz
    echo 'World!' | gzip >> test.gz

 >>> for part in GzipReadMulti(open('test.gz', 'rb')):
 ...     print(repr(part.read()))
 'Hello \\n'
 'World!\\n'

 This is in contrast to the standard library 'gzip' module that implicitly
 concatenates the individual parts together and loses the boundaries:
 >>> gzip.open('test.gz').read()
 'Hello \\nWorld!\\n'
 """
 from __future__ import print_function
 import io
 import struct
 import zlib


 class GzipError(Exception):
    """Error raised when the gzip header or trailer checks of this module fail."""


 class _GzipReader(object):
    # Always read 'blocksize' bytes when more data is needed from wrapped file.
    # Note that 'readblock' completely bypasses the caching mechanism.
    def __init__(self, fp, blocksize):
        self._fp = fp
        self.blocksize = blocksize

        self.unused_input = ''

    @property
    def unused_input(self):
        return self._buffer[self._pos:]

    @unused_input.setter
    def unused_input(self, val):
        self._buffer = val
        self._pos = 0

    def read(self, n):
        if self._pos + n > len(self._buffer):
            self._buffer += self.readblock()

        pos = self._pos
        self._pos += n
        return self._buffer[pos:self._pos]

    def readblock(self):
        return self._fp.read(self.blocksize)


 class GzipReadMulti(object):
    def __init__(self, readable, rsize=io.DEFAULT_BUFFER_SIZE):
        self._fp = _GzipReader(readable, rsize)

        self._z = None
        self.crc32 = self.size = None

    def __iter__(self):
        return self

    def next(self):
        assert self._z is None, 'next() called before consuming all output'

        if not self._check_gzip_header():
            raise StopIteration

        self._z = zlib.decompressobj(-zlib.MAX_WBITS)
        self.crc32 = self.size = 0  # _check_gzip_trailer
        return self

    def read(self, size=io.DEFAULT_BUFFER_SIZE):
        assert size > 0, 'read(-1) not implemented'

        if self._z is None:
            return ''

        output = None
        if self._fp.unused_input:  # resume unfinished business
            output = self._z.decompress(self._fp.unused_input, size)
        if not output:
            input = self._fp.readblock()
            if input:
                output = self._z.decompress(input, size)
            else:  # EOF
                # FIXME: could produce more than 'size' bytes
                output = self._z.flush()
        self._checksum(output)

        self._fp.unused_input = self._z.unconsumed_tail or self._z.unused_data
        if self._z.unused_data:  # end of gzip member
            self._check_gzip_trailer()
            self._z = None

        return output

    def _check_gzip_header(self):
        # +---+---+---+---+---+---+---+---+---+---+
        # |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
        # +---+---+---+---+---+---+---+---+---+---+
        sig = self._fp.read(2)
        if not sig:  # EOF ok
            return False

        if sig != '\037\213':
            raise GzipError('Not a gzipped file')
        try:
            hdr = self._fp.read(8)
            cm, flg, _, _, _ = struct.unpack('<BBIBB', hdr)
        except (IOError, struct.error) as e:
            raise GzipError('Failed to parse gzip header (%s)' % (e,))
        if cm != 8:
            raise GzipError('Unknown compression method %d' % (cm,))
        if flg:
            raise GzipError('Unsupported flags %s' % (hex(flg),))
        return True

    def _check_gzip_trailer(self):
        assert not self._z.unconsumed_tail
        # +---+---+---+---+---+---+---+---+
        # |     CRC32     |     ISIZE     |
        # +---+---+---+---+---+---+---+---+
        try:
            trailer = self._fp.read(8)
            crc32, isize = struct.unpack('<iL', trailer)
        except (IOError, struct.error) as e:
            raise GzipError('Failed to parse gzip trailer (%s)' % (e,))

        if crc32 != self.crc32:
            raise GzipError('CRC32 check failed, got %s expected %s' % (hex(self.crc32), hex(crc32)))
        mysize = self.size & 0xffffffffL
        if isize != mysize:
            raise GzipError('Size check failed, got %s expected %s' % (mysize, isize))

    def _checksum(self, data):
        self.size += len(data)
        self.crc32 = zlib.crc32(data, self.crc32)


 if __name__ == '__main__':
    # Demo: treat every gzip member of the file named on the command line as
    # a tar archive and list the name and modification time of its entries.
    import datetime
    import sys
    import tarfile

    # gzip data is binary: open in 'rb' so Python 3 yields bytes and no
    # newline translation happens, and close the file when done.
    with open(sys.argv[1], 'rb') as fp:
        for i, gz in enumerate(GzipReadMulti(fp)):
            print('gzip member', i)

            # 'r|' reads the archive as a non-seekable stream via gz.read().
            tar = tarfile.open(name=None, mode='r|', fileobj=gz)
            for info in tar:
                print(info.name, datetime.datetime.fromtimestamp(info.mtime).isoformat())

            # tarfile may stop reading before the end of the gzip member;
            # drain the remainder so the trailer is verified and the next
            # member can be started.
            while gz.read():
                pass
	#!/usr/bin/env python
	# Copyright 2015 Tommi Komulainen
	# Licensed under the Simplified BSD License
	"""read concatenated gzip streams individually

	A gzip file can contain multiple compressed parts concatenated together.
	Because most file formats cannot simply be concatenated, this module
	lets you read each part individually.

	Example:
	    echo 'Hello ' | gzip > test.gz
	    echo 'World!' | gzip >> test.gz

	>>> for part in GzipReadMulti(open('test.gz', 'rb')):
	...     print(repr(part.read()))
	'Hello \\n'
	'World!\\n'

	This is in contrast to the standard library 'gzip' module that implicitly
	concatenates the individual parts together and loses the boundaries:
	>>> gzip.open('test.gz').read()
	'Hello \\nWorld!\\n'
	"""
	from __future__ import print_function
	import io
	import struct
	import zlib


	class GzipError(Exception):
	    """Error raised when the gzip header or trailer checks of this module fail."""


	class _GzipReader(object):
	# Always read 'blocksize' bytes when more data is needed from wrapped file.
	# Note that 'readblock' completely bypasses the caching mechanism.
	def __init__(self, fp, blocksize):
	self._fp = fp
	self.blocksize = blocksize

	self.unused_input = ''

	@property
	def unused_input(self):
	return self._buffer[self._pos:]

	@unused_input.setter
	def unused_input(self, val):
	self._buffer = val
	self._pos = 0

	def read(self, n):
	if self._pos + n > len(self._buffer):
	self._buffer += self.readblock()

	pos = self._pos
	self._pos += n
	return self._buffer[pos:self._pos]

	def readblock(self):
	return self._fp.read(self.blocksize)


	class GzipReadMulti(object):
	def __init__(self, readable, rsize=io.DEFAULT_BUFFER_SIZE):
	self._fp = _GzipReader(readable, rsize)

	self._z = None
	self.crc32 = self.size = None

	def __iter__(self):
	return self

	def next(self):
	assert self._z is None, 'next() called before consuming all output'

	if not self._check_gzip_header():
	raise StopIteration

	self._z = zlib.decompressobj(-zlib.MAX_WBITS)
	self.crc32 = self.size = 0 # _check_gzip_trailer
	return self

	def read(self, size=io.DEFAULT_BUFFER_SIZE):
	assert size > 0, 'read(-1) not implemented'

	if self._z is None:
	return ''

	output = None
	if self._fp.unused_input: # resume unfinished business
	output = self._z.decompress(self._fp.unused_input, size)
	if not output:
	input = self._fp.readblock()
	if input:
	output = self._z.decompress(input, size)
	else: # EOF
	# FIXME: could produce more than 'size' bytes
	output = self._z.flush()
	self._checksum(output)

	self._fp.unused_input = self._z.unconsumed_tail or self._z.unused_data
	if self._z.unused_data: # end of gzip member
	self._check_gzip_trailer()
	self._z = None

	return output

	def _check_gzip_header(self):
	# +---+---+---+---+---+---+---+---+---+---+
	# \|ID1\|ID2\|CM \|FLG\| MTIME \|XFL\|OS \|
	# +---+---+---+---+---+---+---+---+---+---+
	sig = self._fp.read(2)
	if not sig: # EOF ok
	return False

	if sig != '\037\213':
	raise GzipError('Not a gzipped file')
	try:
	hdr = self._fp.read(8)
	cm, flg, _, _, _ = struct.unpack('<BBIBB', hdr)
	except (IOError, struct.error) as e:
	raise GzipError('Failed to parse gzip header (%s)' % (e,))
	if cm != 8:
	raise GzipError('Unknown compression method %d' % (cm,))
	if flg:
	raise GzipError('Unsupported flags %s' % (hex(flg),))
	return True

	def _check_gzip_trailer(self):
	assert not self._z.unconsumed_tail
	# +---+---+---+---+---+---+---+---+
	# \| CRC32 \| ISIZE \|
	# +---+---+---+---+---+---+---+---+
	try:
	trailer = self._fp.read(8)
	crc32, isize = struct.unpack('<iL', trailer)
	except (IOError, struct.error) as e:
	raise GzipError('Failed to parse gzip trailer (%s)' % (e,))

	if crc32 != self.crc32:
	raise GzipError('CRC32 check failed, got %s expected %s' % (hex(self.crc32), hex(crc32)))
	mysize = self.size & 0xffffffffL
	if isize != mysize:
	raise GzipError('Size check failed, got %s expected %s' % (mysize, isize))

	def _checksum(self, data):
	self.size += len(data)
	self.crc32 = zlib.crc32(data, self.crc32)


	if __name__ == '__main__':
	    # Demo: treat every gzip member of the file named on the command line as
	    # a tar archive and list the name and modification time of its entries.
	    import datetime
	    import sys
	    import tarfile

	    # gzip data is binary: open in 'rb' so Python 3 yields bytes and no
	    # newline translation happens, and close the file when done.
	    with open(sys.argv[1], 'rb') as fp:
	        for i, gz in enumerate(GzipReadMulti(fp)):
	            print('gzip member', i)

	            # 'r|' reads the archive as a non-seekable stream via gz.read().
	            tar = tarfile.open(name=None, mode='r|', fileobj=gz)
	            for info in tar:
	                print(info.name, datetime.datetime.fromtimestamp(info.mtime).isoformat())

	            # tarfile may stop reading before the end of the gzip member;
	            # drain the remainder so the trailer is verified and the next
	            # member can be started.
	            while gz.read():
	                pass