Last active
July 19, 2019 13:29
-
-
Save michaelchughes/1647fb91c08a86ae59d858de93a11522 to your computer and use it in GitHub Desktop.
Convert Ordered ICD DataFrame to Indicator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import argparse | |
def convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df,
        list_of_possible_icd_code_values=None,
        list_of_possible_icd_code_names=None):
    ''' Convert rank-ordered diagnosis codes into indicator format

    Args
    ----
    ordered_dx_df : Pandas Dataframe
        One column for each possible rank
        E.g. 1st column is most important dx, 2nd col is next important, ...
        Must hold object (string) values. Empty strings and missing
        entries (None / NaN) mean "no diagnosis at this rank".
    list_of_possible_icd_code_values : iterable of str, optional
        All codes that may appear in ordered_dx_df. Any iterable works
        (list, array, map object, ...). Empty strings are dropped.
        If None, the codes are detected from the contents of ordered_dx_df.
    list_of_possible_icd_code_names : iterable of str, optional
        Column name for each (non-empty) possible code, in the same order
        as the code values. If None, the name for code X is 'icd_X'.

    Returns
    -------
    indicator_df : Pandas Dataframe
        One row per row of ordered_dx_df (with a fresh 0..n-1 index).
        One int32 column for each possible unique diagnosis code,
        ordered by sorted code value. Entry is 1 if the code appears
        anywhere in that row, else 0.

    Raises
    ------
    TypeError
        If ordered_dx_df does not hold object values.
    ValueError
        If the number of names differs from the number of non-empty codes,
        or if ordered_dx_df contains a code that is not among the
        possible code values.
    '''
    if ordered_dx_df.values.dtype != object:
        raise TypeError(
            "ordered_dx_df must hold object (string) values, got dtype %s"
            % ordered_dx_df.values.dtype)

    def _as_code(cell):
        # Missing entries (None / NaN) are treated like the empty string
        if cell is None or (isinstance(cell, float) and cell != cell):
            return ''
        return str(cell)

    code_grid = [
        [_as_code(cell) for cell in row] for row in ordered_dx_df.values]

    if list_of_possible_icd_code_values is None:
        code_values = {code for row in code_grid for code in row}
    else:
        code_values = [
            _as_code(val) for val in list_of_possible_icd_code_values]
    code_values = [val for val in code_values if len(val) > 0]

    if list_of_possible_icd_code_names is None:
        code_names = ['icd_%s' % (val) for val in code_values]
    else:
        code_names = [str(name) for name in list_of_possible_icd_code_names]
        if len(code_names) != len(code_values):
            raise ValueError(
                "Got %d code names for %d non-empty code values"
                % (len(code_names), len(code_values)))

    # Sort values and names together so each column keeps its own name
    sort_ids = sorted(range(len(code_values)), key=code_values.__getitem__)
    sorted_values = [code_values[ii] for ii in sort_ids]
    sorted_names = [code_names[ii] for ii in sort_ids]

    # Exact-match lookup from code to column position.
    # setdefault keeps the first column if a code value is repeated.
    col_of_code = dict()
    for col_id, val in enumerate(sorted_values):
        col_of_code.setdefault(val, col_id)

    indicator_arr = np.zeros(
        (len(code_grid), len(sorted_values)), dtype=np.int32)
    for row_id, row in enumerate(code_grid):
        for cur_code in row:
            if cur_code == '':
                continue
            if cur_code not in col_of_code:
                raise ValueError(
                    "Row %d contains code %r which is not among the "
                    "possible ICD code values" % (row_id, cur_code))
            indicator_arr[row_id, col_of_code[cur_code]] = 1

    indicator_df = pd.DataFrame(indicator_arr, columns=sorted_names)
    return indicator_df
if __name__ == '__main__':
    # Demo: rebuild the indicator table from the rank-ordered CSV
    # and print it next to the ground-truth indicator CSV.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--ordered_dx_csv_file',
        default='ordered_dx_simple.csv',
        type=str)
    cli.add_argument(
        '--indicator_dx_csv_file',
        default='indicator_dx_simple.csv',
        type=str)
    cli.add_argument('--n_icd_codes', default=10, type=int)
    args = cli.parse_args()

    # Read every cell as a string; absent diagnoses become ''
    ordered_dx_df = pd.read_csv(args.ordered_dx_csv_file, dtype=object)
    ordered_dx_df = ordered_dx_df.fillna('')

    # Version 1: the full set of possible codes is known up front
    known_codes = [str(code_id) for code_id in range(args.n_icd_codes)]
    est_ind_df = convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df, known_codes)

    # Version 2: infer the possible codes from the file contents
    est2_ind_df = convert_ordered_dx_df_to_indicator_df(ordered_dx_df)

    true_ind_df = pd.read_csv(args.indicator_dx_csv_file, dtype=np.int32)

    n_true_cols = true_ind_df.shape[1]
    print("True indicator_df has %d columns" % n_true_cols)
    print("True indicator_df: first 5 rows")
    print(true_ind_df.head())

    n_est_cols = est_ind_df.shape[1]
    print("Estimated indicator_df has %d columns" % n_est_cols)
    print("Estimated indicator_df: first 5 rows")
    print(est_ind_df.head())

    print("Estimated version 2:")
    print(est2_ind_df.head())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import argparse | |
from collections import OrderedDict | |
if __name__ == '__main__':
    # Simulate a toy dataset of ICD diagnosis codes and write it as two CSVs:
    #   ordered_dx_<nickname>.csv   : one column per rank (dx_00, dx_01, ...)
    #   indicator_dx_<nickname>.csv : one 0/1 column per possible code
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_subjects', type=int, default=5)
    parser.add_argument('--n_icd_codes', type=int, default=10)
    parser.add_argument('--min_codes_per_subject', type=int, default=1)
    parser.add_argument('--max_codes_per_subject', type=int, default=4)
    parser.add_argument('--seed', type=int, default=54321)
    parser.add_argument('--nickname', type=str, default='simple')
    args = parser.parse_args()

    n_subjects = args.n_subjects
    n_icd_codes = args.n_icd_codes
    min_codes_per_subject = args.min_codes_per_subject
    max_codes_per_subject = args.max_codes_per_subject

    # Fail early with a clear message instead of an obscure numpy error
    if n_subjects < 0:
        parser.error("--n_subjects must be >= 0")
    if min_codes_per_subject < 0:
        parser.error("--min_codes_per_subject must be >= 0")
    if max_codes_per_subject < min_codes_per_subject:
        parser.error(
            "--max_codes_per_subject must be >= --min_codes_per_subject")
    if max_codes_per_subject > n_icd_codes:
        # Codes are drawn without replacement from n_icd_codes options
        parser.error("--max_codes_per_subject must be <= --n_icd_codes")

    prng = np.random.RandomState(args.seed)

    ordered_dx_dicts = list()
    indicator_arr = np.zeros((n_subjects, n_icd_codes), dtype=np.int32)

    # Loop over each subject, generate a random set of icd codes
    for subj_id in range(n_subjects):
        # How many distinct codes to draw for this subj.
        # randint excludes `high`, so add 1 to make the max reachable
        # (and to allow min == max).
        n_codes = prng.randint(
            low=min_codes_per_subject, high=max_codes_per_subject + 1)

        # Select codes uniformly from all available codes without replacement
        code_list = prng.choice(n_icd_codes, n_codes, replace=False)

        # Store representation with ordered priority of dx
        dx_dict = OrderedDict(
            ('dx_%02d' % ii, code) for ii, code in enumerate(code_list))
        ordered_dx_dicts.append(dx_dict)

        # Store representation as an indicator vector
        indicator_arr[subj_id, code_list] = 1

    # Subjects with fewer codes get NaN in the trailing dx columns, which
    # makes those columns float; '%.0f' writes the codes back as integers.
    ordered_dx_df = pd.DataFrame(ordered_dx_dicts)
    ordered_dx_df.to_csv(
        'ordered_dx_%s.csv' % args.nickname,
        float_format='%.0f',
        index=False)

    indicator_dx_df = pd.DataFrame(
        indicator_arr,
        columns=['icd_%s' % i for i in range(n_icd_codes)])
    indicator_dx_df.to_csv(
        'indicator_dx_%s.csv' % args.nickname,
        float_format='%.0f',
        index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Expected output: