Last active
August 29, 2015 14:15
-
-
Save phaustin/7e080a90934546457e41 to your computer and use it in GitHub Desktop.
openpyxl to pandas with bad values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import print_function | |
from builtins import next | |
from openpyxl import load_workbook | |
import glob,os | |
import pandas as pd | |
import numpy as np | |
import logging | |
logging.basicConfig(level=logging.DEBUG) | |
home=os.environ['HOME'] | |
the_file=glob.glob('{}/Dropbox/e340_2014_spring/Exams/Mid_term_1/grades/*cl*201.xlsx'.format(home))[0] | |
wb=load_workbook(the_file,data_only=True,use_iterators=True) | |
combine,=wb.get_sheet_names() | |
sheet=wb[combine] | |
row_iter=sheet.iter_rows() | |
headers = [c.value for c in next(row_iter) if c.value] | |
headers=[item.replace(' ','_').lower() for item in headers] | |
grades = np.empty(1, dtype=[('id','int_'),('last','unicode_'),('first','unicode_'), | |
('pref','unicode_'),('group_num','int_'), | |
('group_mark','int_'),('ind_mark','int_')]) | |
df=pd.DataFrame(grades) | |
count=0 | |
for count,row in enumerate(row_iter): | |
out=[item.value for item in row] | |
out[0]=int(out[0]) | |
if len(out[3]) == 0: | |
out[3]='nan' | |
for cell,number in enumerate(out[-3:]): | |
try: | |
out[-3 + cell]=int(number) | |
except (ValueError,TypeError): | |
out[-3 + cell]=int(-999) | |
df.loc[count]=out | |
print(df) | |
with pd.HDFStore('store.h5','w') as store: | |
store.put('mid_term1',df,format='table') | |
with pd.HDFStore('store.h5','r') as store: | |
new_df=store['mid_term1'] | |
* convert a row to a dictionary and subset | |
the_file='classlists_2014W_UBC_EOSC_340_201-1.xlsx' | |
wb=load_workbook(the_file,data_only=True,use_iterators=True) | |
names=wb.get_sheet_names() | |
sheet=wb['Combined'] | |
row_iter=sheet.iter_rows() | |
colnames=next(row_iter) | |
colnames=[c.value for c in colnames] | |
the_list=[] | |
for row in row_iter: | |
values=[c.value for c in row] | |
the_dict=dict(zip(colnames,values)) | |
newnames={'Student Number':'studentid','Given Name':'firstname','Surname':'lastname','Preferred Name':'preferred_name'} | |
prune_keys=newnames.keys() | |
sub_dict={newnames[k]: the_dict[k] for k in prune_keys} | |
the_list.append(sub_dict) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment