Skip to content

Instantly share code, notes, and snippets.

@1328
Created May 16, 2014 14:12
Show Gist options
  • Select an option

  • Save 1328/9263ae8cfb4cdd342461 to your computer and use it in GitHub Desktop.

Select an option

Save 1328/9263ae8cfb4cdd342461 to your computer and use it in GitHub Desktop.
Poor man's CSV readers
import re
from pprint import pprint
def option1(fn):
    '''Read a comma-separated file into a list of rows, splitting by hand.

    The original author notes that the screwy quoting in these files breaks
    the csv module, so each line is simply split on commas.

    fn -- path of the file to read.

    Returns a list with one entry per non-blank line; each entry is the list
    of raw string fields from that line (quotes and brackets are kept).
    '''
    data = []
    with open(fn, mode='r') as fh:
        for r in fh:
            # Lines read from a file keep their trailing newline, so the raw
            # line is never falsy; strip first so blank lines really are
            # skipped instead of turning into [''] rows.
            line = r.strip()
            if line:
                data.append(line.split(','))
    return data
'''generators are like functions, except they yield their results piece by piece'''
'''this is really useful when processing files'''
'''so let's do this again, using a generator to step through the file line by line'''
def gen_option2(fn):
    '''Generator for option 2: yield one row per non-blank line of the file.

    fn -- path of the file to read.

    Each yielded row is the list of comma-separated fields of one line, with
    any leading/trailing square brackets and single or double quotes
    stripped from every field.
    '''
    with open(fn, mode='r') as fh:
        for r in fh:
            # A raw line always contains at least its newline, so it has to
            # be stripped before the emptiness test can skip blank lines.
            line = r.strip()
            if line:
                yield [i.strip('[\'\"]') for i in line.split(',')]
def option2(fn):
    '''Return every row produced by gen_option2(fn), collected into a list.

    Using the generator keeps the file-reading logic in one place; this
    wrapper only materialises its output.
    '''
    return list(gen_option2(fn))
'''ok, now let's really clean up the data'''
'''this option uses a helper function, cleaner(item), that cleans each item'''
'''the cleaner function takes in a string and:
1. changes blanks to None
2. changes strings that are floats, like "1.23", into actual floats, e.g. 1.23
3. changes whole numbers into ints, so "1.0" (where 1.0 == 1) comes back as int 1
4. collapses extra whitespace inside strings
'''
def gen_option3(fn):
    '''Generator for option 3: like option 2, but runs cleaner on each field.

    fn -- path of the file to read.

    Yields, for every non-blank line, the list obtained by splitting the
    line on commas and passing each field through cleaner().
    '''
    with open(fn, mode='r') as fh:
        for r in fh:
            # Strip before testing: the raw line still holds its newline and
            # would otherwise never be considered empty, so blank lines used
            # to come out as [None] rows.
            line = r.strip()
            if line:
                yield [cleaner(i) for i in line.split(',')]
def cleaner(i):
    '''Convert one raw field string into the most specific Python value.

    i -- the raw field text.

    Surrounding square brackets and single/double quotes are stripped, then:
    * an empty field becomes None;
    * text that parses as a whole number becomes an int (e.g. "1.0" -> 1);
    * text that parses as any other float becomes a float (this includes
      "nan" and "inf", which are returned as floats);
    * anything else stays a string, with each run of whitespace collapsed
      to a single space.
    '''
    i = i.strip('[\'\"]')
    if not i:
        return None
    try:
        number = float(i)
    except ValueError:
        # Not numeric: keep it as text and squeeze runs of whitespace.
        return re.sub(r'\s+', ' ', i)
    # is_integer() is False for nan/inf, so those never reach int(), which
    # would raise on them.
    if number.is_integer():
        try:
            # Parse the text directly so large integers keep every digit
            # instead of being rounded through a float.
            return int(i)
        except ValueError:
            # Forms such as "1.0" or "1e3" are whole numbers but are not
            # integer literals; convert the parsed float instead.
            return int(number)
    return number
def option3(fn):
    '''Return every cleaned row produced by gen_option3(fn) as a list.

    This extends the generator approach with the cleaner function to pretty
    things up some more.
    '''
    return list(gen_option3(fn))
def main():
    '''Parse the two sample files with option3 and print the second result.'''
    # The first file is parsed but its rows are not used afterwards.
    option3('FILE2.COMPAS')
    rows = option3('file1.REAL')
    pprint(rows)
    # Show the very first field of the first row.
    first_row = rows[0]
    print(first_row[0])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment