Skip to content

Instantly share code, notes, and snippets.

@msikma
Created February 28, 2016 17:59
Show Gist options
  • Save msikma/48620960eaa73a108e64 to your computer and use it in GitHub Desktop.
Save msikma/48620960eaa73a108e64 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import codecs
from bz2 import BZ2Decompressor
class BZ2TextFileStreamer:
    '''
    Iterates through a bzip2 compressed text file, decoding as it goes along
    and yielding lines one by one until the end of the file. Since the file
    is not completely contained in memory, it can be used to decode
    and process very large text files line-by-line efficiently.

    Each instance owns its own decompressor, so several files can be
    streamed independently (one after another or interleaved). The object
    can also be used as a context manager, which closes the file on exit.
    '''

    def __init__(self, bz2_file, encoding='utf-8', read_size=512, lb='\n'):
        '''
        Sets up the streamer.

        :param bz2_file: path to the bzip2 compressed file
        :param encoding: Text encoding
        :param read_size: Number of bytes to read at a time
        :param lb: Linebreak to look for (\\n by default)
        '''
        # Pre-set so close() and __del__ stay safe if open() raises below.
        self.file = None
        # The decompressor is stateful and single-use, so it must belong to
        # the instance; sharing one across instances breaks every stream
        # after the first.
        self.bz2de = BZ2Decompressor()
        self.encoding = encoding
        self.read_size = read_size
        self.linebreak = lb
        self.file = open(bz2_file, 'rb')

    def __enter__(self):
        '''
        Returns the streamer itself so it can be used in a `with` block.
        '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''
        Closes the file when leaving a `with` block; exceptions propagate.
        '''
        self.close()
        return False

    def close(self):
        '''
        Closes the file. Safe to call more than once.
        '''
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()

    def __del__(self):
        '''
        Ensures that the file is closed on destruction.
        '''
        self.close()

    def __iter__(self):
        '''
        Iterator that runs through the compressed file and yields only
        full lines explicitly terminated by a linebreak. Text remaining
        after the last linebreak is not yielded.

        :raises UnicodeDecodeError: if the data is not valid in the
            configured encoding (including a truncated final character)
        '''
        # An incremental decoder keeps the leading bytes of a multi-byte
        # character that is split across two decompressed chunks, instead of
        # failing on the incomplete sequence.
        decoder = codecs.getincrementaldecoder(self.encoding)()
        # String buffer that will contain decompressed text.
        buffer = ''
        while True:
            # Read a series of compressed bytes.
            file_bytes = self.file.read(self.read_size)
            if not file_bytes:
                # File has reached EOF, so close it and stop iteration.
                self.close()
                # Flush the decoder so a character cut off at the very end
                # of the data is still reported as a decoding error.
                decoder.decode(b'', final=True)
                return
            # Decompress bytes and interpret using the correct encoding.
            buffer += decoder.decode(self.bz2de.decompress(file_bytes))
            # Everything before the last linebreak is a complete line; the
            # final piece (possibly empty) stays buffered for the next read.
            *lines, buffer = buffer.split(self.linebreak)
            yield from lines
# Demo: stream a bzip2-compressed SQL dump and show every decoded line.
sql_path = 'dict/jmdict_unittest.sql.bz2'
sql_stream = BZ2TextFileStreamer(sql_path)
for line in sql_stream:
    # repr() keeps quotes and escape sequences visible in the output.
    print(repr(line))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment