Skip to content

Instantly share code, notes, and snippets.

@informationsea
Created February 23, 2012 11:31
Show Gist options
  • Save informationsea/1892476 to your computer and use it in GitHub Desktop.
Save informationsea/1892476 to your computer and use it in GitHub Desktop.
Random Readable Compressed File (Licensed by GPL3 or later)
#!/usr/bin/env python2.7
__author__ = 'informationsea'
__copyright__ = 'Copyright (C) 2012 informationsea All Rights Reserved.'
import bz2
import gzip
import struct
class RandomZip(object):
    """Block-compressed container file that supports random-access reads.

    On-disk layout: the 4-byte magic ``RZIP`` followed by zero or more
    blocks.  Each block is a 16-byte header (two unsigned 64-bit integers:
    compressed length, then uncompressed length) followed by the
    bz2-compressed payload.  Because every block is compressed on its own,
    a reader only has to decompress the block(s) covering the requested
    range.

    Instances can be used as context managers; entering calls ``open()``
    and leaving calls ``close()``.
    """

    _MAGIC = b'RZIP'
    # '=' keeps the native byte order the original 'QQ' format used, but pins
    # the standard 8-byte field size with no padding, so the header is always
    # the 16 bytes the index scan relies on.
    _BLOCK_HEADER = struct.Struct('=QQ')

    def __init__(self, filepath, mode, compress=9,
                 algorithm='bz2', compress_block_size=1024*1024):
        """Store the settings; the file is not touched until ``open()``.

        Arguments:
        - `filepath`: path of the container file.
        - `mode`: ``'r'`` to read an existing container, ``'w'`` to create
          one.  Any other value makes ``open()`` raise.
        - `compress`: bz2 compression level passed to ``bz2.compress``.
        - `algorithm`: stored but not consulted; blocks are always
          compressed with bz2.
        - `compress_block_size`: number of uncompressed bytes per block.

        Raises ``ValueError`` if `compress_block_size` is not positive
        (a non-positive size would make ``write()`` loop forever).
        """
        if compress_block_size <= 0:
            raise ValueError('compress_block_size must be positive')
        self._filepath = filepath
        self._mode = mode
        self._compress = compress
        self._algorithm = algorithm
        self._compress_block_size = compress_block_size
        self._file = None
        # Write mode: bytes not yet flushed as a block.
        # Read mode: the decompressed contents of the current block.
        self._buffer = b''
        # One (file offset of block header, uncompressed start offset,
        # compressed length) tuple per block, in file order.
        self._filepos = []
        self._read_pointer = 0
        self._read_buffer_index = 0
        self._read_pointer_in_buffer = 0
        # False until a block has actually been decompressed into _buffer.
        self._block_loaded = False

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def open(self):
        """Open the underlying file; a no-op once it has been opened.

        In write mode the file is created/truncated and the magic is
        written.  In read mode the magic is verified and every block header
        is scanned to build the block index.

        Raises ``ValueError`` for a mode other than ``'r'``/``'w'`` and
        ``IOError`` if the file does not start with the ``RZIP`` magic.
        """
        if self._file is not None:
            return
        # The mode is validated before the file is touched so that an
        # invalid mode neither creates a file nor leaks a handle.
        if self._mode == 'w':
            handle = open(self._filepath, 'wb')
            handle.write(self._MAGIC)
            self._file = handle
        elif self._mode == 'r':
            handle = open(self._filepath, 'rb')
            try:
                index = self._scan_blocks(handle)
            except Exception:
                handle.close()
                raise
            self._file = handle
            self._filepos = index
            self._buffer = b''
            self._read_pointer = 0
            self._read_buffer_index = 0
            self._read_pointer_in_buffer = 0
            self._block_loaded = False
        else:
            raise ValueError('Uknown mode')

    def _scan_blocks(self, handle):
        """Check the magic and return the block index read from `handle`."""
        if handle.read(len(self._MAGIC)) != self._MAGIC:
            raise IOError('Not rzip')
        header_size = self._BLOCK_HEADER.size
        index = []
        file_offset = len(self._MAGIC)
        raw_offset = 0
        header = handle.read(header_size)
        while header:
            compressed_length, raw_length = self._BLOCK_HEADER.unpack(header)
            index.append((file_offset, raw_offset, compressed_length))
            file_offset += header_size + compressed_length
            raw_offset += raw_length
            # Jump over the payload straight to the next header.
            handle.seek(file_offset)
            header = handle.read(header_size)
        return index

    def close(self):
        """Flush any pending partial block (write mode) and close the file.

        Safe to call when the file was never opened or is already closed.
        """
        if self._file is None or self._file.closed:
            return
        try:
            if self._mode == 'w' and self._buffer:
                self._write_block(self._buffer)
                self._buffer = b''
        finally:
            self._file.close()

    def _write_block(self, raw):
        """Compress `raw` and append it to the file as one block."""
        compressed = bz2.compress(raw, self._compress)
        self._file.write(self._BLOCK_HEADER.pack(len(compressed), len(raw)))
        self._file.write(compressed)

    def write(self, data):
        """Append `data`, emitting a block for every full block size.

        Arguments:
        - `data`: byte string to append.  A trailing partial block is kept
          in memory until more data arrives or ``close()`` is called.

        Raises ``IOError`` if the object was not created in write mode.
        """
        if self._mode != 'w':
            raise IOError('Not write mode')
        pending = self._buffer + data
        block_size = self._compress_block_size
        consumed = 0
        try:
            # Walk the buffer by offset instead of re-slicing the remainder
            # after every block, which would copy the tail each time.
            while len(pending) - consumed >= block_size:
                self._write_block(pending[consumed:consumed + block_size])
                consumed += block_size
        finally:
            # Keep exactly the bytes that have not reached the file, even
            # if writing a block failed part-way through.
            self._buffer = pending[consumed:]

    def _load_block(self, index):
        """Decompress block `index` into the read buffer."""
        file_offset, _, compressed_length = self._filepos[index]
        self._file.seek(file_offset + self._BLOCK_HEADER.size)
        self._buffer = bz2.decompress(self._file.read(compressed_length))
        self._read_buffer_index = index
        self._block_loaded = True

    def seek(self, pos):
        """Move the read position to uncompressed offset `pos`.

        Arguments:
        - `pos`: non-negative offset into the uncompressed data.  A position
          past the end is accepted; subsequent reads return an empty string.

        Raises ``ValueError`` if `pos` is negative.
        """
        if pos < 0:
            raise ValueError('negative seek position')
        self._read_pointer = pos
        if not self._filepos:
            # No blocks at all: there is nothing to load.
            self._buffer = b''
            self._read_buffer_index = 0
            self._read_pointer_in_buffer = pos
            self._block_loaded = False
            return
        # The target is the block just before the first one that starts
        # beyond `pos`; if none does, `pos` lies in (or after) the last one.
        target = len(self._filepos) - 1
        for i, (_, raw_start, _) in enumerate(self._filepos):
            if pos < raw_start:
                target = i - 1
                break
        self._load_block(target)
        self._read_pointer_in_buffer = pos - self._filepos[target][1]

    def read(self, size=-1):
        """Return up to `size` uncompressed bytes from the current position.

        Arguments:
        - `size`: maximum number of bytes to return; ``None`` or a negative
          value reads everything up to the end of the data.

        Fewer bytes (possibly none) are returned once the end of the data
        is reached.  Reading without a prior ``seek()`` starts at offset 0.
        """
        unlimited = size is None or size < 0
        if not self._block_loaded:
            if not self._filepos:
                return b''
            # First read without a seek(): start from the first block.
            self._load_block(0)
        remaining = size
        last_index = len(self._filepos) - 1
        pieces = []
        while unlimited or remaining > 0:
            available = len(self._buffer) - self._read_pointer_in_buffer
            if available <= 0:
                if self._read_buffer_index >= last_index:
                    break  # end of data
                self._load_block(self._read_buffer_index + 1)
                self._read_pointer_in_buffer = 0
                continue
            take = available if unlimited else min(available, remaining)
            start = self._read_pointer_in_buffer
            pieces.append(self._buffer[start:start + take])
            # Both pointers advance by exactly what was handed out, so a
            # read at the end of the data can never return bytes twice.
            self._read_pointer_in_buffer += take
            self._read_pointer += take
            if not unlimited:
                remaining -= take
        return b''.join(pieces)
if __name__ == '__main__':
    # Command-line front end: compress each file to "<file>.rz", or with -d
    # print a byte range of the uncompressed content of each container.
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Random accessable zip')
    parser.add_argument('files', help='files', nargs='+')
    parser.add_argument('-d', '--decompress', help='decompress', action='store_true')
    parser.add_argument('-s', '--start-from', help='decompress start position', type=int, default=0)
    parser.add_argument('-l', '--length', help='decompress length', type=int, default=100)
    parser.add_argument('-c', '--compress-block-size', type=int, default=1024*1024)
    options = parser.parse_args()

    if options.decompress:
        # Python 3 exposes the byte stream as sys.stdout.buffer; Python 2's
        # sys.stdout accepts byte strings directly.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        for one in options.files:
            r = RandomZip(one, 'r')
            r.open()
            try:
                r.seek(options.start_from)
                out.write(r.read(options.length))
                # Trailing newline, as the original print statement emitted.
                out.write(b'\n')
            finally:
                r.close()
    else:
        for one in options.files:
            # Binary mode: the input is compressed byte-for-byte, so no
            # newline translation may be applied while reading it.
            with open(one, 'rb') as f:
                r = RandomZip(one+'.rz', 'w', compress_block_size=options.compress_block_size)
                r.open()
                try:
                    buf = f.read(1000*1000)
                    while buf:
                        r.write(buf)
                        buf = f.read(1000*1000)
                finally:
                    # close() flushes the final partial block.
                    r.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment