Skip to content

Instantly share code, notes, and snippets.

@binhngoc17
Created April 9, 2014 20:34
Show Gist options
  • Save binhngoc17/10311749 to your computer and use it in GitHub Desktop.
Save binhngoc17/10311749 to your computer and use it in GitHub Desktop.
Generate arff file for analysis in Weka (Prudential Challenge)
from datetime import datetime
import csv
# ARFF header printed before the data rows. The attribute order here must
# match the column order produced by format_row().
label = """
@relation whatever
@attribute AGE numeric
@attribute GENDER {M,F}
@attribute HOSPITAL string
@attribute DIAGNOSISCODE string
@attribute DATEOFADM date "yyyy-MM-dd HH:mm:ss"
@attribute LENOFADM numeric
@attribute BILLCAT {IN,OU,DY,NA}
@attribute WARDTYPE {A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Z,8,NA}
@attribute HOSPITALBILL numeric
@data"""

# Format of the DATEOFADM / DATEDISCHARGE columns in the input CSV.
INPUT_DATE_FORMAT = '%d/%m/%Y'
# Format of the DATEOFADM attribute declared in the ARFF header above.
ARFF_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
# Placeholder written for an empty BILLCAT or WARDTYPE cell.
MISSING_CATEGORY = 'NA'


def build_label_mapping(labels):
    """Map each CSV header name to its column index.

    If a name occurs more than once, the index of its last occurrence wins.
    """
    return {name: index for index, name in enumerate(labels)}


def format_row(values, label_mapping):
    """Convert one CSV record into one ARFF data line.

    Args:
        values: list of cell strings for a single CSV record.
        label_mapping: header name -> column index, as returned by
            build_label_mapping().

    Returns:
        The comma-separated ARFF line, or None when the admission or
        discharge date is absent from the record or does not match
        INPUT_DATE_FORMAT (such records are skipped by the caller).

    Raises:
        KeyError: a required column name is not in ``label_mapping``.
        ValueError: the YMDOB cell is not an integer.
        IndexError: the record has both dates but lacks a later column.
    """
    admit_col = label_mapping['DATEOFADM']
    discharge_col = label_mapping['DATEDISCHARGE']
    try:
        date_of_admit = datetime.strptime(values[admit_col], INPUT_DATE_FORMAT)
        date_of_discharge = datetime.strptime(values[discharge_col],
                                              INPUT_DATE_FORMAT)
    except (ValueError, IndexError):
        # Unparseable or truncated record: skip it rather than abort the run.
        return None

    def cell(name):
        return values[label_mapping[name]]

    len_of_stay = date_of_discharge - date_of_admit
    # Admission year minus the integer in YMDOB
    # (presumably a birth year -- TODO confirm against the data set).
    patient_age = date_of_admit.year - int(cell('YMDOB'))

    return '%s,%s,"%s","%s","%s",%s,%s,%s,%s' % (
        patient_age,
        cell('GENDER'),
        cell('HOSPITAL'),
        cell('DIAGNOSISCODE'),
        date_of_admit.strftime(ARFF_DATE_FORMAT),
        len_of_stay.days,
        # Empty nominal cells are replaced by the 'NA' member declared in
        # the header.
        cell('BILLCAT') or MISSING_CATEGORY,
        cell('WARDTYPE') or MISSING_CATEGORY,
        cell('HOSPITALBILL'),
    )


def iter_arff_rows(rows):
    """Yield ARFF data lines for an iterable of CSV records.

    The first record is taken as the header row. Records whose dates cannot
    be parsed are dropped. An empty iterable yields nothing.
    """
    rows = iter(rows)
    header = next(rows, None)
    if header is None:
        return
    label_mapping = build_label_mapping(header)
    for values in rows:
        line = format_row(values, label_mapping)
        if line is not None:
            yield line


def main(path='core/final_set.csv'):
    """Print the ARFF header followed by one data line per usable CSV record.

    Args:
        path: location of the input CSV file.
    """
    print(label)
    with open(path, 'r') as infile:
        reader = csv.reader(infile, delimiter=',', quotechar='"')
        for line in iter_arff_rows(reader):
            print(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment