Created
April 26, 2020 10:40
-
-
Save beneon/8f828e4a401241d0a601846fffc43967 to your computer and use it in GitHub Desktop.
数据清理-规陪带教数据
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 2020-4-26编码,将三个文件处理以后合并成一个excel | |
import os | |
import pandas as pd | |
import re | |
# 从第三行开始,第一列医生名称,第二列进修生带教数量,第三列规陪数量,第四列月份,第五列年份 | |
xl_file_path = os.path.join('datafile','summary.xlsx') | |
df = pd.read_excel(xl_file_path) | |
#原始表其实是两个表格的联合,先拆成basic和detail两个表 | |
#原始数据有空行,我把姓名给向下填充了,但是其他的内容没有向下填充,所以这里就使用groupby加sum把nan给当0消耗掉 | |
df_basic = df[['姓名','year','进修','规陪']].groupby(['姓名','year']).sum() | |
#对于detail这个表,遍历拆分出月份和数量两个信息 | |
df_detail = df[['姓名','year','月份']] | |
def extract_month_quant(month_str): | |
def gen_pd_series(months,quant): | |
return pd.Series({'months':months,'quant':quant}) | |
if pd.isna(month_str): | |
return gen_pd_series(months=0,quant=0) | |
# 1个月: 1 | |
re_month_format = re.compile(r'(\d+)个月[::]\s*(\d+)') | |
mo = re_month_format.match(month_str.strip()) | |
if mo: | |
return gen_pd_series(months=int(mo.group(1)),quant=int(mo.group(2))) | |
else: | |
raise Exception(f"{month_str}'s format is wrong for extract_month_quant") | |
df_detail[['规陪带教时长(月)','人数']] = df_detail['月份'].apply(lambda e:extract_month_quant(e)) | |
df_detail['规陪带教人月数'] = df_detail['规陪带教时长(月)']*df_detail['人数'] | |
df_detail_select = df_detail[['姓名','year','规陪带教人月数']].drop_duplicates().set_index(['姓名','year']) | |
df_final = df_basic.join(df_detail_select) | |
# reset index | |
df_basic = df_basic.reset_index() | |
df_final = df_final.reset_index() | |
xlwriter = pd.ExcelWriter(os.path.join('datafile','2017-2019带教统计.xlsx'),engine='xlsxwriter') | |
df_final.to_excel(excel_writer=xlwriter,sheet_name='统计信息',index=False) | |
df_basic.to_excel(excel_writer=xlwriter,sheet_name='带教基本信息',index=False) | |
df_detail.to_excel(excel_writer=xlwriter,sheet_name='规陪带教情况细则',index=False) | |
xlwriter.save() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment