Created
April 12, 2014 20:51
-
-
Save salmoni/10556194 to your computer and use it in GitHub Desktop.
Parses a single line of CSV with a set (multiple!) of delimiters and a set (multiple!) of quotation characters. Embedded quotes are kept honest (see examples at bottom of file).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ParseLine(line, delims, quotes):
    """
    Parse a single line of delimited text into a list of tokens.

    Several delimiter characters and several quotation characters may be
    active at once.

    Parameters:
        line   -- the text to split.
        delims -- string of characters, each of which separates tokens.
        quotes -- string of characters, each of which may open a quoted
                  section.

    Returns:
        A list of token strings, in the order they appear in ``line``.

    Rules:
        * Runs of adjacent delimiters collapse; they never yield empty tokens.
        * A quoted section is closed only by the same character that opened
          it. Delimiters and other quote characters inside it are kept as
          literal text.
        * Closing a quoted section always ends the current token, even when
          that token is empty (so ``''`` yields one empty token).
        * Text immediately before an opening quote is joined onto the quoted
          text (``ab"cd"`` yields ``abcd``).
        * An unterminated quoted section runs to the end of the line; its
          text is returned only if it is non-empty.
    """
    tokens = []        # completed tokens
    pending = []       # characters of the token currently being built
    open_quote = None  # quote character that opened the current section, if any
    for char in line:
        if open_quote is not None:
            if char == open_quote:
                # Matching close quote: emit the token (possibly empty) and
                # drop the quote character itself.
                tokens.append(''.join(pending))
                pending = []
                open_quote = None
            else:
                # Anything else inside quotes, including delimiters and
                # non-matching quote characters, is literal text.
                pending.append(char)
        elif char in delims:
            # Only emit when there is content; adjacent delimiters are skipped.
            if pending:
                tokens.append(''.join(pending))
                pending = []
        elif char in quotes:
            # Remember which character opened the section so that only the
            # same character can close it.
            open_quote = char
        else:
            pending.append(char)
    # Flush whatever is left after the last delimiter (or an unclosed quote).
    if pending:
        tokens.append(''.join(pending))
    return tokens
# Some slightly gnarly test data exercising mixed delimiters and quotes.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
delims = ",; "  # comma, semi-colon and space as delimiters
quotes = '"' + "'"  # double and single quotes together

# A double quote inside single quotes, and delimiters inside double quotes,
# are kept as literal text.
# Expected: ['Col1', 'col2"', 'col3', 'col4', 'col5,;', 'col6']
line = """Col1, 'col2"' col3; col4 "col5,;", col6"""
print(ParseLine(line, delims, quotes))

# Expected: ['1', '2', '3.3', '4', '5', '6']
line = """1,2 3.3;4 5,6"""
print(ParseLine(line, delims, quotes))

# Expected: ['6', '5', '4', '3', '2.2', '1']
line = """6,5 4;3 2.2,1"""
print(ParseLine(line, delims, quotes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment