Created
March 2, 2015 19:59
-
-
Save cds-amal/c94c2e32be21f0a96bae to your computer and use it in GitHub Desktop.
csv to json script for city of record data -- this is a starting point
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the sample "city of record" CSV exports to JSON.
#
# For every entry in `files` the CSV is read from `dbfolder`, each non-empty
# value is decoded from cp1250, the optional HTML field is tidied and turned
# into plain text, and the rows are dumped to a .json file (same base name)
# in the current working directory.
#
# NOTE: runtime semantics target Python 2 -- the csv module yields byte
# strings there, which is why each value is decoded explicitly. The syntax
# below (``except ... as``, single-argument ``print(...)``) is also valid on
# Python 3, but ``str.decode`` does not exist there; a Python 3 port would
# pass ``encoding='cp1250'`` to ``open`` instead.

import csv
import json
import os.path

import html2text as h2t
from tidylib import tidy_document

dbfolder = '../CROL-PDF/Sample Database'

# (filename, field to clean or None, csv delimiter)
files = [
    ('procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv',
     'AdditionalDescription', ','),
    ('procPublicationRequestDMSSPortal Oct-Dec 2014.csv', None, '|'),
    ('procPublicationRequest_Oct-Dec_2014_clean.csv',
     'AdditionalDescription', ','),
    ('procPublicationRequest_pipes.csv', 'AdditionalDescription', '|'),
]

for fn, field, delimiter in files:
    # Swap only the trailing extension; str.replace would also rewrite any
    # '.csv' occurring earlier in the file name.
    out = os.path.splitext(fn)[0] + '.json'
    path = os.path.join(dbfolder, fn)
    items = []

    with open(path) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        for row in reader:
            for k, v in row.items():
                if v:
                    # decode using a codepage that handles 0x91, 92, 93 etc
                    row[k] = v.decode('cp1250')

            if field:
                # Best effort: a row whose HTML cannot be cleaned is reported
                # and kept with its (decoded) original value.
                try:
                    doc, errors = tidy_document(row[field])
                    row[field] = h2t.html2text(doc)
                except Exception as e:
                    print('oopsie')
                    print(e)
                    print(row[field])

            items.append(row)

    with open(out, 'w') as outfile:
        json.dump(items, outfile, indent=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment