Skip to content

Instantly share code, notes, and snippets.

@mattbillenstein
Created January 29, 2013 22:07
Show Gist options
  • Save mattbillenstein/4668396 to your computer and use it in GitHub Desktop.
Save mattbillenstein/4668396 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import zlib
import bz2
# Number of bytes requested per read and the length of each full block that
# yield_files() hands out: 2**20 bytes, i.e. 1 MiB.
BLOCK_SIZE = 1 << 20
def main():
    '''
    Print the total input size and the summed zlib and bz2 compressed sizes.

    Each block produced by file_data() is compressed on its own with
    zlib.compress and bz2.compress and the compressed lengths are added up,
    so the reported totals are per-block sums rather than the size of one
    continuous compressed stream.
    '''
    size = 0
    gzip_size = 0
    bz2_size = 0
    for block in file_data():
        size += len(block)
        gzip_size += len(zlib.compress(block))
        bz2_size += len(bz2.compress(block))

    # Each print receives one pre-formatted string, so the output is the same
    # whether print is the Python 2 statement or the Python 3 function.
    print('file size %i' % size)
    print('gzip size %i (%i%%)' % (gzip_size, percent(gzip_size, size)))
    print('bz2 size %i (%i%%)' % (bz2_size, percent(bz2_size, size)))
def yield_files(files, block_size=None):
    '''
    yield the concatenated contents of files as a stream of fixed-size
    blocks (1MiB by default), which plays nicely with bzip2 and a possible
    input of a bunch of tiny files...

    files is an iterable of objects with a read(n) method returning bytes
    (i.e. opened in binary mode). Data carries over from one file to the
    next, so every yielded block is exactly block_size bytes long except
    possibly the last one, which holds whatever is left over.

    block_size defaults to the module-level BLOCK_SIZE; it must be a
    positive number of bytes, otherwise ValueError is raised when iteration
    starts.
    '''
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size < 1:
        # read(0) would look like end-of-file and a negative size would read
        # everything at once, so reject both up front.
        raise ValueError('block_size must be a positive number of bytes')

    block = b''
    for f in files:
        while True:
            data = f.read(block_size)
            if not data:
                break
            block += data
            # block held fewer than block_size bytes before this read and
            # read() returns at most block_size, so one yield per read is
            # always enough to bring it back under block_size.
            if len(block) >= block_size:
                yield block[:block_size]
                block = block[block_size:]
    if block:
        yield block
def file_data():
    '''
    Return a generator of data blocks for the inputs named in sys.argv[1:],
    or for standard input when no file names are given.

    Named files are opened in binary mode so the bytes read match what is on
    disk, and each one is closed as soon as yield_files() moves past it.
    '''
    paths = sys.argv[1:]
    if not paths:
        # Python 3 exposes the underlying byte stream as sys.stdin.buffer;
        # where that attribute does not exist, sys.stdin itself is used.
        return yield_files([getattr(sys.stdin, 'buffer', sys.stdin)])

    def opened_files():
        # Hand out one open file at a time; the with-block closes it when
        # the consumer asks for the next file or stops iterating.
        for path in paths:
            with open(path, 'rb') as f:
                yield f

    return yield_files(opened_files())
def percent(part, whole):
    '''
    Return part as a percentage of whole, truncated to an int.

    When whole is 0 (for example, an empty input) there is no meaningful
    ratio, so 0 is returned instead of raising ZeroDivisionError.
    '''
    if not whole:
        return 0
    return int(100.0 * part / whole)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment