Last active
July 19, 2019 13:29
-
-
Save michaelchughes/1647fb91c08a86ae59d858de93a11522 to your computer and use it in GitHub Desktop.
Convert Ordered ICD DataFrame to Indicator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import argparse | |
def convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df,
        list_of_possible_icd_code_values=None,
        list_of_possible_icd_code_names=None):
    ''' Convert rank-ordered diagnosis codes into indicator format

    Args
    ----
    ordered_dx_df : Pandas Dataframe
        One column for each possible rank
        E.g. 1st column is most important dx, 2nd col is next important, ...
        Must hold object-dtype values (codes as strings).
        Cells equal to '' (or None / NaN) are treated as "no code".
    list_of_possible_icd_code_values : iterable of str, optional
        Every code that may appear in ordered_dx_df.
        Empty strings are dropped.
        If None, the distinct codes found in ordered_dx_df are used.
    list_of_possible_icd_code_names : iterable of str, optional
        Output column name for each code, aligned with
        list_of_possible_icd_code_values.
        If None, each code gets the name 'icd_<code>'.

    Returns
    -------
    indicator_df : Pandas Dataframe
        One row per row of ordered_dx_df.
        One int32 column for each possible unique diagnosis code,
        ordered by sorted code value.
        Entry is 1 if the code appears anywhere in that row, else 0.

    Raises
    ------
    ValueError
        If the number of names differs from the number of codes, or if
        ordered_dx_df contains a code that is not a possible code.
    '''
    dx_values = ordered_dx_df.values
    assert dx_values.dtype == object, \
        "ordered_dx_df must hold object-dtype (string) codes"

    # A cell carries no code if it is None/NaN or the empty string
    is_missing = pd.isnull(dx_values)

    if list_of_possible_icd_code_values is None:
        list_of_possible_icd_code_values = sorted(set(
            code
            for code, missing in zip(dx_values.ravel(), is_missing.ravel())
            if not missing))
    # The comprehension also materializes one-shot iterables (e.g. map objects)
    code_values = np.asarray(
        [x for x in list_of_possible_icd_code_values if len(x) > 0],
        dtype=str)

    if list_of_possible_icd_code_names is None:
        list_of_possible_icd_code_names = [
            'icd_%s' % (code) for code in code_values]
    code_names = np.asarray(list(list_of_possible_icd_code_names), dtype=str)
    if code_names.size != code_values.size:
        raise ValueError(
            "Got %d code names for %d code values"
            % (code_names.size, code_values.size))

    # Columns of the result are ordered by sorted code value
    sort_ids = np.argsort(code_values)
    code_values = code_values[sort_ids]
    code_names = code_names[sort_ids]

    # Exact-match lookup from code to column position.
    # setdefault keeps the left-most column when a code is listed twice.
    column_of_code = dict()
    for col_id, code in enumerate(code_values.tolist()):
        column_of_code.setdefault(code, col_id)

    indicator_arr = np.zeros(
        (dx_values.shape[0], code_values.size), dtype=np.int32)
    for row_id in range(dx_values.shape[0]):
        for cur_code, missing in zip(dx_values[row_id], is_missing[row_id]):
            if missing or cur_code == '':
                continue
            if cur_code not in column_of_code:
                # Refuse to guess: an insertion-point lookup would
                # silently flag some other code's column instead.
                raise ValueError(
                    "Code %r at row %d is not among the possible icd codes"
                    % (cur_code, row_id))
            indicator_arr[row_id, column_of_code[cur_code]] = 1

    indicator_df = pd.DataFrame(indicator_arr, columns=code_names)
    return indicator_df
if __name__ == '__main__':
    # Command-line options: where to find the two CSV files written by
    # create_random_icd_dataset.py, and how many distinct codes exist.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--ordered_dx_csv_file', type=str, default='ordered_dx_simple.csv')
    cli.add_argument(
        '--indicator_dx_csv_file', type=str, default='indicator_dx_simple.csv')
    cli.add_argument('--n_icd_codes', type=int, default=10)
    args = cli.parse_args()

    # Read every cell as a string; blank cells become '' so the converter
    # can recognize them as "no diagnosis at this rank".
    ordered_dx_df = pd.read_csv(args.ordered_dx_csv_file, dtype=object)
    ordered_dx_df.fillna('', inplace=True)

    # Estimate 1: indicators over the full, known list of possible codes
    known_codes = [str(code) for code in range(args.n_icd_codes)]
    est_ind_df = convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df, known_codes)

    # Estimate 2: indicators over only the codes that occur in the file
    est2_ind_df = convert_ordered_dx_df_to_indicator_df(ordered_dx_df)

    # Ground truth written to disk by the dataset generator
    true_ind_df = pd.read_csv(args.indicator_dx_csv_file, dtype=np.int32)

    print("True indicator_df has %d columns" % true_ind_df.shape[1])
    print("True indicator_df: first 5 rows")
    print(true_ind_df.head())

    print("Estimated indicator_df has %d columns" % est_ind_df.shape[1])
    print("Estimated indicator_df: first 5 rows")
    print(est_ind_df.head())

    print("Estimated version 2:")
    print(est2_ind_df.head())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import argparse | |
from collections import OrderedDict | |
if __name__ == '__main__':
    # Command-line options controlling the size and randomness of the
    # synthetic dataset, plus a nickname used in the output file names.
    parser = argparse.ArgumentParser()
    for flag, default_value in [
            ('--n_subjects', 5),
            ('--n_icd_codes', 10),
            ('--min_codes_per_subject', 1),
            ('--max_codes_per_subject', 4),
            ('--seed', 54321)]:
        parser.add_argument(flag, type=int, default=default_value)
    parser.add_argument('--nickname', type=str, default='simple')
    args = parser.parse_args()

    n_subjects = args.n_subjects
    n_icd_codes = args.n_icd_codes
    min_codes_per_subject = args.min_codes_per_subject
    max_codes_per_subject = args.max_codes_per_subject

    # Seeded generator so the same arguments always yield the same files
    prng = np.random.RandomState(args.seed)

    codes_per_subject = dict()
    ordered_dx_dicts = list()
    indicator_vectors = list()

    # Draw a random set of icd codes for each subject in turn
    for subj_id in range(n_subjects):
        # Number of distinct codes for this subject.
        # NOTE(review): randint excludes `high`, so a subject never
        # receives max_codes_per_subject codes -- confirm this is intended.
        n_codes = prng.randint(
            low=min_codes_per_subject, high=max_codes_per_subject)

        # Uniform draw without replacement from codes 0 .. n_icd_codes-1
        code_list = prng.choice(n_icd_codes, n_codes, replace=False)
        codes_per_subject[subj_id] = code_list.tolist()

        # Ordered representation: position in code_list is the dx rank
        ordered_dx_dicts.append(OrderedDict(
            ('dx_%02d' % rank, code) for rank, code in enumerate(code_list)))

        # Indicator representation: 1 at the index of every drawn code
        vec = np.zeros(n_icd_codes, dtype=np.int32)
        vec[code_list] = 1
        indicator_vectors.append(vec)

    # Subjects with fewer codes leave the higher-rank columns empty
    ordered_dx_df = pd.DataFrame(ordered_dx_dicts)
    ordered_dx_df.to_csv(
        'ordered_dx_%s.csv' % args.nickname,
        float_format='%.0f',
        index=False)

    indicator_columns = ['icd_%s' % i for i in range(n_icd_codes)]
    indicator_dx_df = pd.DataFrame(
        np.vstack(indicator_vectors), columns=indicator_columns)
    indicator_dx_df.to_csv(
        'indicator_dx_%s.csv' % args.nickname,
        float_format='%.0f',
        index=False)
Expected output:
True indicator_df has 10 columns
True indicator_df: first 5 rows
icd_0 icd_1 icd_2 icd_3 icd_4 icd_5 icd_6 icd_7 icd_8 icd_9
0 0 1 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 0 0 1 0
2 0 1 0 0 0 1 0 0 0 1
3 0 0 1 0 0 0 0 1 1 0
4 0 0 0 0 0 1 0 0 0 0
Estimated indicator_df has 10 columns
Estimated indicator_df: first 5 rows
icd_0 icd_1 icd_2 icd_3 icd_4 icd_5 icd_6 icd_7 icd_8 icd_9
0 0 1 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 0 0 1 0
2 0 1 0 0 0 1 0 0 0 1
3 0 0 1 0 0 0 0 1 1 0
4 0 0 0 0 0 1 0 0 0 0
Estimated version 2:
icd_1 icd_2 icd_5 icd_7 icd_8 icd_9
0 1 0 0 0 1 0
1 0 0 0 0 1 0
2 1 0 1 0 0 1
3 0 1 0 1 1 0
4 0 0 1 0 0 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use the function convert_ordered_dx_df_to_indicator_df to convert a dataframe containing columns of ordered diagnosis codes into a new dataframe with an indicator column for each possible icd code.
To test the functionality:
$ python create_random_icd_dataset.py
OUTCOME: creates CSV files in current folder showing an "ordered" representation and an "indicator" representation
$ python convert_ordered_to_indicator.py
OUTCOME: prints out the "true" indicator (read from disk) and the "estimated" indicator (computed by our convert function)