Skip to content

Instantly share code, notes, and snippets.

@tko
Created November 10, 2015 20:22
Show Gist options
  • Save tko/ea17bf14d7b2119dcfe1 to your computer and use it in GitHub Desktop.
Save tko/ea17bf14d7b2119dcfe1 to your computer and use it in GitHub Desktop.
read concatenated gzip streams individually
#!/usr/bin/env python
# Copyright 2015 Tommi Komulainen
# Licensed under the Simplified BSD License
"""read concatenated gzip streams individually
A gzip file can contain multiple compressed parts concatenated together.
As most file formats can not simply be concatenated together this module
lets you read each part individually.
Example:
echo 'Hello ' | gzip > test.gz
echo 'World!' | gzip >> test.gz
>>> for part in GzipReadMulti(open('test.gz', 'rb')):
...     print(repr(part.read()))
'Hello \\n'
'World!\\n'
This is in contrast to the standard library 'gzip' module that implicitly
concatenates the individual parts together and loses the boundaries:
>>> gzip.open('test.gz').read()
'Hello \\nWorld!\\n'
"""
from __future__ import print_function
import io
import struct
import zlib
class GzipError(Exception):
pass
class _GzipReader(object):
# Always read 'blocksize' bytes when more data is needed from wrapped file.
# Note that 'readblock' completely bypasses the caching mechanism.
def __init__(self, fp, blocksize):
self._fp = fp
self.blocksize = blocksize
self.unused_input = ''
@property
def unused_input(self):
return self._buffer[self._pos:]
@unused_input.setter
def unused_input(self, val):
self._buffer = val
self._pos = 0
def read(self, n):
if self._pos + n > len(self._buffer):
self._buffer += self.readblock()
pos = self._pos
self._pos += n
return self._buffer[pos:self._pos]
def readblock(self):
return self._fp.read(self.blocksize)
class GzipReadMulti(object):
def __init__(self, readable, rsize=io.DEFAULT_BUFFER_SIZE):
self._fp = _GzipReader(readable, rsize)
self._z = None
self.crc32 = self.size = None
def __iter__(self):
return self
def next(self):
assert self._z is None, 'next() called before consuming all output'
if not self._check_gzip_header():
raise StopIteration
self._z = zlib.decompressobj(-zlib.MAX_WBITS)
self.crc32 = self.size = 0 # _check_gzip_trailer
return self
def read(self, size=io.DEFAULT_BUFFER_SIZE):
assert size > 0, 'read(-1) not implemented'
if self._z is None:
return ''
output = None
if self._fp.unused_input: # resume unfinished business
output = self._z.decompress(self._fp.unused_input, size)
if not output:
input = self._fp.readblock()
if input:
output = self._z.decompress(input, size)
else: # EOF
# FIXME: could produce more than 'size' bytes
output = self._z.flush()
self._checksum(output)
self._fp.unused_input = self._z.unconsumed_tail or self._z.unused_data
if self._z.unused_data: # end of gzip member
self._check_gzip_trailer()
self._z = None
return output
def _check_gzip_header(self):
# +---+---+---+---+---+---+---+---+---+---+
# |ID1|ID2|CM |FLG| MTIME |XFL|OS |
# +---+---+---+---+---+---+---+---+---+---+
sig = self._fp.read(2)
if not sig: # EOF ok
return False
if sig != '\037\213':
raise GzipError('Not a gzipped file')
try:
hdr = self._fp.read(8)
cm, flg, _, _, _ = struct.unpack('<BBIBB', hdr)
except (IOError, struct.error) as e:
raise GzipError('Failed to parse gzip header (%s)' % (e,))
if cm != 8:
raise GzipError('Unknown compression method %d' % (cm,))
if flg:
raise GzipError('Unsupported flags %s' % (hex(flg),))
return True
def _check_gzip_trailer(self):
assert not self._z.unconsumed_tail
# +---+---+---+---+---+---+---+---+
# | CRC32 | ISIZE |
# +---+---+---+---+---+---+---+---+
try:
trailer = self._fp.read(8)
crc32, isize = struct.unpack('<iL', trailer)
except (IOError, struct.error) as e:
raise GzipError('Failed to parse gzip trailer (%s)' % (e,))
if crc32 != self.crc32:
raise GzipError('CRC32 check failed, got %s expected %s' % (hex(self.crc32), hex(crc32)))
mysize = self.size & 0xffffffffL
if isize != mysize:
raise GzipError('Size check failed, got %s expected %s' % (mysize, isize))
def _checksum(self, data):
self.size += len(data)
self.crc32 = zlib.crc32(data, self.crc32)
if __name__ == '__main__':
    # Demo: treat every gzip member of the file given on the command line as
    # a tar archive and list the name and modification time of its entries.
    import datetime
    import sys
    import tarfile

    # gzip data is binary; open accordingly and close the file when done.
    with open(sys.argv[1], 'rb') as fp:
        for i, gz in enumerate(GzipReadMulti(fp)):
            print('gzip member', i)
            tar = tarfile.open(name=None, mode='r|', fileobj=gz)
            try:
                for info in tar:
                    print(info.name, datetime.datetime.fromtimestamp(info.mtime).isoformat())
            finally:
                tar.close()
            # tarfile may stop reading before the member is exhausted (e.g.
            # trailing padding); drain the rest so that advancing to the next
            # member does not trip the "consume all output" check.
            while gz.read():
                pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment