Skip to content

Instantly share code, notes, and snippets.

@gregorykremler
Created September 17, 2015 15:02
Show Gist options
  • Save gregorykremler/35351d3f118321098538 to your computer and use it in GitHub Desktop.
Save gregorykremler/35351d3f118321098538 to your computer and use it in GitHub Desktop.
Script to explore source encoding and parse CSV data.
import codecs
import cStringIO
import csv
import sys
import unicodecsv
from bs4 import UnicodeDammit
########################################################################
# STANDARD LIBRARY UNICODE READER
# https://docs.python.org/2/library/csv.html
########################################################################
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """
    Generator yielding CSV rows as lists of unicode cells.

    Each item of ``unicode_csv_data`` is passed through ``utf_8_encoder``
    (so the csv module only ever sees UTF-8 byte strings), parsed by
    ``csv.reader`` with the given ``dialect`` and any extra ``kwargs``,
    and every resulting cell is decoded from UTF-8 back to unicode.
    """
    encoded_lines = utf_8_encoder(unicode_csv_data)
    byte_rows = csv.reader(encoded_lines, dialect=dialect, **kwargs)
    for byte_row in byte_rows:
        unicode_row = []
        for cell in byte_row:
            # the csv module handed back UTF-8 bytes; restore unicode
            unicode_row.append(unicode(cell, 'utf-8'))
        yield unicode_row
def utf_8_encoder(unicode_csv_data):
    """
    Generator that encodes each item of ``unicode_csv_data`` to UTF-8.

    Items are encoded lazily, one at a time, in the order the input
    iterable produces them.
    """
    for text_line in unicode_csv_data:
        encoded_line = text_line.encode('utf-8')
        yield encoded_line
def read_unicode_csv(fp, delimiter, *args, **kwargs):
    """
    Print every row of the CSV file at path ``fp`` as a list of unicode cells.

    fp        -- path of the CSV file to read
    delimiter -- one-character field delimiter forwarded to the csv reader
    *args     -- the first extra positional argument, when present, is the
                 source encoding (this is how ``main`` passes it)
    **kwargs  -- an ``encoding`` keyword is honoured when no positional
                 encoding is given; the default is UTF-8

    ``unicode_csv_reader`` expects unicode lines. A plain ``open()`` yields
    byte strings, and encoding those to UTF-8 triggers an implicit ASCII
    decode that fails on any non-ASCII byte. The file is therefore decoded
    while reading.
    """
    # Function-scope import: ``io`` is not among the file-level imports.
    import io

    if args:
        encoding = args[0]
    else:
        encoding = kwargs.get('encoding', 'utf-8')

    # newline='' keeps line endings untranslated so the csv module can
    # handle newlines embedded in quoted fields itself.
    with io.open(fp, encoding=encoding, newline='') as fh:
        for row in unicode_csv_reader(fh, delimiter=delimiter):
            print(row)
########################################################################
# STANDARD LIBRARY EXPLICIT ENCODING READER
# https://docs.python.org/2/library/csv.html#examples
########################################################################
class UTF8Recoder:
    """
    Iterator wrapping a byte stream in some source encoding; every item it
    produces is the next chunk of that stream re-encoded as UTF-8.
    """

    def __init__(self, f, encoding):
        # Look up the codec's stream reader, then wrap the raw stream in it.
        make_stream_reader = codecs.getreader(encoding)
        self.reader = make_stream_reader(f)

    def __iter__(self):
        return self

    def next(self):
        # The wrapped reader yields decoded text; hand it on as UTF-8 bytes.
        decoded = self.reader.next()
        return decoded.encode("utf-8")
class UnicodeReader:
    """
    Iterator over the rows of the CSV file "f", whose bytes are in the
    given encoding. Each row is returned as a list of unicode cells.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Re-encode the source to UTF-8 before the csv module sees it.
        utf8_lines = UTF8Recoder(f, encoding)
        self.reader = csv.reader(utf8_lines, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        byte_row = self.reader.next()
        unicode_row = []
        for cell in byte_row:
            # cells come back as UTF-8 bytes; convert them to unicode
            unicode_row.append(unicode(cell, "utf-8"))
        return unicode_row
def read_encoded_csv(fp, delimiter, encoding):
    """
    Print every row of the CSV file at path ``fp``, decoding it from
    ``encoding`` with the standard-library ``UnicodeReader`` recipe.

    fp        -- path of the CSV file to read
    delimiter -- one-character field delimiter forwarded to the csv reader
    encoding  -- name of the codec the file's bytes are stored in
    """
    # Binary mode: the codec reader must receive the file's bytes untouched.
    # Text mode may translate line endings on some platforms, which breaks
    # multi-byte encodings such as UTF-16.
    with open(fp, 'rb') as fh:
        reader = UnicodeReader(fh, delimiter=delimiter, encoding=encoding)
        for row in reader:
            print(row)
########################################################################
# UNICODECSV EXPLICIT ENCODING READER
# https://github.com/jdunck/python-unicodecsv
########################################################################
def read_encoded_unicodecsv(fp, delimiter, encoding):
    """
    Print every row of the CSV file at path ``fp``, decoding it from
    ``encoding`` with the third-party ``unicodecsv`` reader.

    fp        -- path of the CSV file to read
    delimiter -- one-character field delimiter forwarded to the reader
    encoding  -- name of the codec the file's bytes are stored in
    """
    # Binary mode: the reader does its own decoding and must receive the
    # file's bytes untouched (text mode may translate line endings).
    with open(fp, 'rb') as fh:
        reader = unicodecsv.reader(fh, delimiter=delimiter, encoding=encoding)
        for row in reader:
            print(row)
########################################################################
# UNICODE, DAMMIT IMPLICIT ENCODING READER
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
########################################################################
def read_encoded_unicodedammit(fp, delimiter, *args, **kwargs):
    """
    Print the encoding guessed by ``UnicodeDammit`` for the file at path
    ``fp``, then print every CSV row parsed from its content.

    fp        -- path of the CSV file to read
    delimiter -- one-character field delimiter forwarded to ``csv.reader``
    *args, **kwargs -- accepted so the function can be called like the
                 other readers (``main`` passes an encoding); unused here
                 because the encoding is detected, not supplied

    Rows are printed as lists of UTF-8 byte strings, since the detected
    text is re-encoded to UTF-8 before being handed to ``csv.reader``.
    """
    # Read raw bytes: encoding detection should see the file exactly as
    # stored, without any platform newline translation.
    with open(fp, 'rb') as fh:
        dammit = UnicodeDammit(fh.read())
    print("Suspected original encoding is: {}\n".format(
        dammit.original_encoding))

    markup = dammit.unicode_markup
    if markup is None:
        # NOTE(review): guards the case where no candidate encoding could
        # decode the data; confirm against the installed bs4 version.
        print("Unable to decode {}; no rows to parse.".format(fp))
        return

    # csv.reader wants byte strings, so feed it the text re-encoded as UTF-8.
    buff = cStringIO.StringIO(markup.encode('utf-8'))
    for row in csv.reader(buff, delimiter=delimiter):
        print(row)
def main():
    """
    Command-line entry point.

    Expects exactly four arguments after the script name: filename,
    delimiter, reader and encoding. The reader option selects which of the
    reader functions is called with (filename, delimiter, encoding).

    Exits with status 1 after printing a message when the argument count
    is wrong or the reader option is unknown.
    """
    if len(sys.argv) != 5:
        print(
            "Missing filename, delimiter, reader, and/or encoding option(s)."
            "\n\nReader options include: { csv | encoded_csv | unicodecsv"
            " | unicode_dammit }\n")
        sys.exit(1)

    filename, delimiter, reader, encoding = sys.argv[1:]

    readers = {
        'csv': read_unicode_csv,
        'encoded_csv': read_encoded_csv,
        'unicodecsv': read_encoded_unicodecsv,
        'unicode_dammit': read_encoded_unicodedammit,
    }
    read_rows = readers.get(reader)
    if read_rows is None:
        # An unknown reader is a usage error, so fail with the same
        # non-zero status as the argument-count check above.
        print("Please select a valid reader option.")
        sys.exit(1)

    read_rows(filename, delimiter, encoding)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment