Created
September 17, 2015 15:02
-
-
Save gregorykremler/35351d3f118321098538 to your computer and use it in GitHub Desktop.
Script to explore source encoding and parse CSV data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import cStringIO | |
import csv | |
import sys | |
import unicodecsv | |
from bs4 import UnicodeDammit | |
######################################################################## | |
# STANDARD LIBRARY UNICODE READER | |
# https://docs.python.org/2/library/csv.html | |
######################################################################## | |
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """Yield the rows of unicode CSV input as lists of unicode cells.

    The input lines are passed through ``utf_8_encoder`` before being given
    to ``csv.reader``, and every parsed cell is decoded from UTF-8 again.

    unicode_csv_data -- iterable of unicode lines.
    dialect          -- csv dialect forwarded to ``csv.reader``.
    kwargs           -- extra formatting parameters forwarded to
                        ``csv.reader`` (e.g. ``delimiter``).
    """
    # csv.py doesn't do Unicode, so the parser only ever sees UTF-8 bytes.
    utf8_lines = utf_8_encoder(unicode_csv_data)
    for utf8_row in csv.reader(utf8_lines, dialect=dialect, **kwargs):
        # Turn each parsed cell back into unicode before handing the row on.
        yield [unicode(field, 'utf-8') for field in utf8_row]
def utf_8_encoder(unicode_csv_data):
    """Lazily yield each item of ``unicode_csv_data`` encoded as UTF-8."""
    for text_line in unicode_csv_data:
        utf8_line = text_line.encode('utf-8')
        yield utf8_line
def read_unicode_csv(fp, delimiter, *args, **kwargs):
    """Print every row of the CSV file at ``fp`` as a list of unicode cells.

    fp        -- path of the CSV file to read.
    delimiter -- one-character field separator forwarded to the reader.
    args      -- optional; the first extra positional argument is taken as
                 the file's encoding (``main`` passes it in that position).
    kwargs    -- optional; ``encoding`` may be supplied by keyword instead.

    The encoding defaults to UTF-8 when neither form is given.
    """
    if args:
        encoding = args[0]
    else:
        encoding = kwargs.get('encoding', 'utf-8')
    # unicode_csv_reader needs unicode lines.  A plain open() yields byte
    # strings, and calling .encode('utf-8') on those makes Python 2 decode
    # them as ASCII first, which fails on any non-ASCII byte.  Decode while
    # reading instead.
    with codecs.open(fp, 'r', encoding=encoding) as fh:
        for row in unicode_csv_reader(fh, delimiter=delimiter):
            print(row)
######################################################################## | |
# STANDARD LIBRARY EXPLICIT ENCODING READER | |
# https://docs.python.org/2/library/csv.html#examples | |
######################################################################## | |
class UTF8Recoder: | |
""" | |
Iterator that reads an encoded stream and reencodes the input to UTF-8. | |
""" | |
def __init__(self, f, encoding): | |
self.reader = codecs.getreader(encoding)(f) | |
def __iter__(self): | |
return self | |
def next(self): | |
return self.reader.next().encode("utf-8") | |
class UnicodeReader:
    """
    CSV reader over the file "f", whose contents are in the given encoding;
    iterating it produces one list of unicode cells per CSV row.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # csv.reader is fed UTF-8 bytes regardless of the source encoding.
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        utf8_cells = next(self.reader)
        return [unicode(cell, "utf-8") for cell in utf8_cells]
def read_encoded_csv(fp, delimiter, encoding):
    """Print every row of the CSV file at ``fp``, decoded from ``encoding``.

    fp        -- path of the CSV file to read.
    delimiter -- field separator forwarded to ``UnicodeReader``.
    encoding  -- name of the codec the file's bytes are stored in.
    """
    # Binary mode: the bytes are decoded by UnicodeReader, so the platform's
    # text-mode newline handling must not touch them first (the Python 2 csv
    # documentation asks for the 'b' flag for the same reason).
    with open(fp, 'rb') as fh:
        for row in UnicodeReader(fh, delimiter=delimiter, encoding=encoding):
            print(row)
######################################################################## | |
# UNICODECSV EXPLICIT ENCODING READER | |
# https://github.com/jdunck/python-unicodecsv | |
######################################################################## | |
def read_encoded_unicodecsv(fp, delimiter, encoding):
    """Print every row of the CSV file at ``fp`` using ``unicodecsv``.

    fp        -- path of the CSV file to read.
    delimiter -- field separator forwarded to ``unicodecsv.reader``.
    encoding  -- name of the codec forwarded to ``unicodecsv.reader``.
    """
    # Binary mode: the reader is given the encoding and does the decoding,
    # so it must receive the file's bytes untouched by text-mode newline
    # translation.
    with open(fp, 'rb') as fh:
        rows = unicodecsv.reader(fh, delimiter=delimiter, encoding=encoding)
        for row in rows:
            print(row)
######################################################################## | |
# UNICODE, DAMMIT IMPLICIT ENCODING READER | |
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit | |
######################################################################## | |
def read_encoded_unicodedammit(fp, delimiter, *args, **kwargs):
    """Print every row of the CSV file at ``fp`` after guessing its encoding.

    The file's bytes are handed to ``UnicodeDammit``; the suspected encoding
    is printed, then the decoded text is re-encoded as UTF-8 and parsed with
    ``csv.reader``.  Rows are printed as parsed, i.e. with UTF-8 encoded
    cells.

    fp           -- path of the CSV file to read.
    delimiter    -- field separator forwarded to ``csv.reader``.
    args, kwargs -- accepted so the call signature matches the other
                    readers; not used.

    Raises ValueError when no decoded text is available.
    """
    # First get the unicode text.  Read in binary mode so the detector sees
    # the file's bytes exactly as stored.
    with open(fp, 'rb') as fh:
        dammit = UnicodeDammit(fh.read())
    print("Suspected original encoding is: {}\n".format(
        dammit.original_encoding))
    # Without decoded text there is nothing to parse; fail with a clear
    # message instead of an AttributeError on None below.
    if dammit.unicode_markup is None:
        raise ValueError(
            "Could not determine the encoding of {!r}.".format(fp))
    # Then load the re-encoded text into a buffer for the reader.
    buff = cStringIO.StringIO(dammit.unicode_markup.encode('utf-8'))
    # Finally, parse and print the rows.
    for row in csv.reader(buff, delimiter=delimiter):
        print(row)
def main():
    """Parse the command line and run the selected reader on the given file.

    Expects exactly four arguments after the script name: filename,
    delimiter, reader and encoding.  Exits with status 1 when the argument
    count is wrong or the reader name is not one of the known options.
    """
    if len(sys.argv) != 5:
        print(
            "Missing filename, delimiter, reader, and/or encoding option(s)."
            "\n\nReader options include: { csv | encoded_csv | unicodecsv"
            " | unicode_dammit }\n")
        sys.exit(1)
    filename, delimiter, reader, encoding = sys.argv[1:]

    # Command-line reader name -> function that reads and prints the file.
    readers = {
        'csv': read_unicode_csv,
        'encoded_csv': read_encoded_csv,
        'unicodecsv': read_encoded_unicodecsv,
        'unicode_dammit': read_encoded_unicodedammit,
    }
    try:
        read = readers[reader]
    except KeyError:
        print("Please select a valid reader option.")
        # An unknown reader is a usage error too, so report failure to the
        # caller just like the argument-count check above.
        sys.exit(1)
    read(filename, delimiter, encoding)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment