Created
November 11, 2013 21:40
-
-
Save msukmanowsky/7420914 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
try: | |
import cStringIO as StringIO | |
except ImportError: | |
import StringIO | |
class EscapedLineReader(object):
    r"""Iterate over the lines of a file whose quoted fields may contain
    line delimiters.

    A line delimiter only terminates a line when every quote character seen
    so far on that line has occurred an even number of times, i.e. when no
    quoted section is still open.

    Limitations:

    * The per-character loop is slow; this is not an efficient
      implementation.
    * ``line_delimiter`` must be a single character, so Windows style
      ``\r\n`` endings are not treated as one delimiter.
    * Quote characters are tracked independently of each other, so a quote
      character that appears inside a section quoted with a *different*
      character (``"it's"``) leaves the line unbalanced.
    * ``fp`` must support ``tell()`` and ``seek()``, and ``seek()`` must
      accept ``tell() + <number of characters read>``. NOTE(review): that
      holds for in-memory text buffers; confirm it for the file objects and
      encodings actually used.

    >>> with open('somefile.txt', 'r') as fp:
    ...     reader = EscapedLineReader(fp)
    ...     for line in reader:
    ...         do_stuff()
    """

    # Number of characters requested from ``fp`` per read() call.
    MAX_BYTES = 1000

    def __init__(self, fp, quote_chars=None, line_delimiter='\n'):
        """Wrap ``fp`` for quote-aware line iteration.

        :param fp: file-like object providing ``read``, ``tell`` and ``seek``.
        :param quote_chars: characters that open/close a quoted section;
            defaults to a fresh ``["'", '"']`` list for each instance.
        :param line_delimiter: single character that ends a line.
        """
        self.fp = fp
        # A new list per instance: a list literal used as the default
        # argument would be shared (and mutable) across all readers.
        self.quote_chars = ['\'', '"'] if quote_chars is None else quote_chars
        self.line_delimiter = line_delimiter

    def __iter__(self):
        return self

    def _get_line(self):
        """Return the next logical line, including its trailing delimiter.

        Text is read in ``MAX_BYTES`` sized chunks. Once the terminating
        delimiter is found, the file position is moved back so that it sits
        immediately after the returned line.

        :raises StopIteration: when the file is exhausted and nothing is
            left to return.
        """
        start = self.fp.tell()  # position of the first character of the line
        pieces = []             # chunks (or chunk prefixes) forming the line
        consumed = 0            # characters held in fully buffered chunks
        open_quotes = set()     # quote chars seen an odd number of times

        while True:
            chars = self.fp.read(self.MAX_BYTES)
            if not chars:
                # EOF. Hand back whatever is buffered (a final line without
                # a trailing delimiter, or with an unclosed quote) instead
                # of dropping it; the following call then stops iteration.
                if pieces:
                    return ''.join(pieces)
                raise StopIteration()

            for index, char in enumerate(chars):
                if char in self.quote_chars:
                    # Toggle: equivalent to keeping a count per quote char
                    # and testing that every count is even.
                    if char in open_quotes:
                        open_quotes.discard(char)
                    else:
                        open_quotes.add(char)
                if char == self.line_delimiter and not open_quotes:
                    # Properly terminated line: keep the chunk up to and
                    # including the delimiter and rewind the file pointer
                    # to just past it, so the next call starts there.
                    end = index + 1
                    pieces.append(chars[:end])
                    self.fp.seek(start + consumed + end)
                    return ''.join(pieces)

            # No terminator in this chunk; keep all of it and read on.
            pieces.append(chars)
            consumed += len(chars)

    def next(self):
        """Return the next line (Python 2 iterator protocol)."""
        return self._get_line()

    # Python 3 iterator protocol.
    __next__ = next
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment