Created
November 20, 2014 23:52
-
-
Save Morasta/68c2b3ba969685b3394d to your computer and use it in GitHub Desktop.
CSVToLookups
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Separates columns in a CSV into individual files with unique values per field | |
# Useful for generating lookup tables with unique values from the original csv's field | |
# | |
# Room for implementation improvement, but this works as a quick and dirty solution | |
#!/usr/bin/python | |
import csv | |
from distutils.util import strtobool | |
input_filename = raw_input('Enter input filename: ') | |
insert_ids = None | |
while insert_ids is None: | |
try: | |
insert_ids = strtobool(raw_input('Include ID column in output? (y/n) ')) | |
except ValueError: | |
print "Error: Invalid selection." | |
include_header = None | |
while include_header is None: | |
try: | |
include_header = strtobool(raw_input('Include column/field names in output? (y/n) ')) | |
except ValueError: | |
print "Error: Invalid selection." | |
delimiter = ',' | |
quote_char = '"' | |
escape_char = '\\' | |
csv_fp = open(input_filename, 'rU') | |
csv_reader = csv.DictReader(csv_fp, fieldnames=[], restkey='undefined-fieldnames', delimiter=delimiter, quotechar=quote_char) | |
results = dict() #output dict format (cola: [val1, val2, ..., valn], colb: [val1, val2, ...], ...) | |
header_row = True | |
for row in csv_reader: | |
if header_row == True: | |
header_row = False | |
csv_reader.fieldnames = row['undefined-fieldnames'] | |
for field in csv_reader.fieldnames: | |
results.setdefault(field) | |
else: | |
for key in results: | |
if results[key] == None: | |
results[key] = [row[key]] | |
else: | |
if row[key] not in results[key]: | |
results[key].append(row[key]) | |
for key in results: | |
if insert_ids == True: | |
initial_id = 1 | |
with open("output/"+key+'_output.csv', 'wb') as csvfile: | |
output = csv.writer(csvfile, delimiter = delimiter, quotechar=quote_char, quoting=csv.QUOTE_NONE, escapechar=escape_char) | |
if include_header == True: | |
if insert_ids == True: | |
output.writerow(['id', u''.join(key).encode('utf8').strip()]) | |
else: | |
output.writerow([u''.join(key).encode('utf8').strip()]) | |
results[key].sort() | |
for record in results[key]: | |
if record is not "": | |
if insert_ids == True: | |
output.writerow([initial_id, u''.join(record).encode('utf8').strip()]) | |
initial_id += 1 | |
else: | |
output.writerow([u''.join(record).encode('utf8').strip()]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment