Created
February 28, 2016 17:59
-
-
Save msikma/48620960eaa73a108e64 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from bz2 import BZ2Decompressor | |
class BZ2TextFileStreamer:
    '''
    Iterates through a bzip2 compressed text file, decoding as it goes along
    and yielding lines one by one until the end of the file. Since the file
    is not completely contained in memory, it can be used to decode
    and process very large text files line-by-line efficiently.

    Each instance owns its own decompressor and text decoder, so several
    streamers can be created and used independently. The streamer can also
    be used as a context manager, which closes the file on exit.
    '''

    def __init__(self, bz2_file, encoding='utf-8', read_size=512, lb='\n'):
        '''
        Sets up the streamer.

        :param bz2_file: path to the bzip2 compressed file
        :param encoding: Text encoding
        :param read_size: Number of bytes to read at a time
        :param lb: Linebreak to look for (\\n by default)
        :raises LookupError: if the encoding is unknown
        :raises OSError: if the file cannot be opened
        '''
        # Imported here so the class only relies on the module-level
        # BZ2Decompressor import that already exists.
        import codecs

        self.encoding = encoding
        self.read_size = read_size
        self.linebreak = lb
        # Per-instance state: a decompressor cannot be reused once it has
        # reached the end of a stream, so it must never be shared.
        self.bz2de = BZ2Decompressor()
        # An incremental decoder keeps the leading bytes of a multi-byte
        # character that is split across two decompressed chunks.
        self._decoder = codecs.getincrementaldecoder(encoding)()
        # Opened last so that a bad encoding does not leak a file handle.
        self.file = open(bz2_file, 'rb')

    def __enter__(self):
        '''
        Returns the streamer itself for use in a ``with`` statement.
        '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''
        Closes the file when leaving a ``with`` block.
        '''
        self.close()

    def close(self):
        '''
        Closes the file. Safe to call more than once, and safe to call on
        an instance whose file was never opened.
        '''
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()

    def __del__(self):
        '''
        Ensures that the file is closed on destruction.
        '''
        self.close()

    def _decompress(self, data):
        '''
        Decompresses a chunk of raw bytes and returns the decompressed bytes.
        When a bzip2 stream ends inside the chunk, decompression continues
        with a fresh decompressor so concatenated streams are supported.
        '''
        pieces = []
        while data:
            if self.bz2de.eof:
                # The previous stream is finished; start the next one.
                self.bz2de = BZ2Decompressor()
            pieces.append(self.bz2de.decompress(data))
            # Bytes following the end of a stream belong to the next stream.
            data = self.bz2de.unused_data if self.bz2de.eof else b''
        return b''.join(pieces)

    def __iter__(self):
        '''
        Iterator that runs through the compressed file and yields only
        full lines explicitly terminated by a linebreak. Text following the
        last linebreak in the file is not yielded. The file is closed once
        its end has been reached.
        '''
        # String buffer that will contain decompressed text.
        buffer = ''
        while True:
            # Read a series of compressed bytes.
            file_bytes = self.file.read(self.read_size)
            if not file_bytes:
                # File has reached EOF, so close it and stop iteration.
                self.close()
                # Flush the decoder so that a truncated multi-byte sequence
                # at the end of the data is reported instead of ignored.
                self._decoder.decode(b'', final=True)
                return
            # Decompress bytes and interpret using the correct encoding.
            buffer += self._decoder.decode(self._decompress(file_bytes))
            # Yield lines if we have any; put the remainder back in the buffer.
            if self.linebreak in buffer:
                *lines, buffer = buffer.split(self.linebreak)
                yield from lines
if __name__ == '__main__':
    # Demo: stream a bzip2 compressed SQL dump and print each line's repr.
    # Guarded so that importing this module for the class alone does not
    # try to open the hard-coded path.
    sql_path = 'dict/jmdict_unittest.sql.bz2'
    sql_stream = BZ2TextFileStreamer(sql_path)
    try:
        for line in sql_stream:
            print(repr(line))
    finally:
        # Release the file handle even if decoding or printing fails midway.
        sql_stream.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment