Skip to content

Instantly share code, notes, and snippets.

@michaelchughes
Last active July 19, 2019 13:29
Show Gist options
  • Save michaelchughes/1647fb91c08a86ae59d858de93a11522 to your computer and use it in GitHub Desktop.
Save michaelchughes/1647fb91c08a86ae59d858de93a11522 to your computer and use it in GitHub Desktop.
Convert Ordered ICD DataFrame to Indicator
import numpy as np
import pandas as pd
import argparse
def convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df,
        list_of_possible_icd_code_values=None,
        list_of_possible_icd_code_names=None):
    ''' Convert rank-ordered diagnosis codes into indicator format

    Args
    ----
    ordered_dx_df : Pandas Dataframe
        One column for each possible rank
        E.g. 1st column is most important dx, 2nd col is next important, ...
        Values must have object dtype. Empty strings and missing values
        (None / NaN) mean "no code at this rank" and are skipped.
    list_of_possible_icd_code_values : iterable of str, optional
        All codes that may appear in ordered_dx_df. Any iterable is accepted
        (list, array, generator, map object, ...).
        If None, the distinct non-empty codes found in ordered_dx_df are used.
    list_of_possible_icd_code_names : iterable of str, optional
        Output column name for each entry of list_of_possible_icd_code_values,
        aligned by position. If None, names are 'icd_<code>'.

    Returns
    -------
    indicator_df : Pandas Dataframe
        One column for each possible unique diagnosis code, ordered by
        sorted code value. One row per row of ordered_dx_df (default integer
        index). Entries are int32: 1 if the code appears in that row, else 0.

    Raises
    ------
    ValueError
        If the number of names does not match the number of code values,
        or if ordered_dx_df contains a code that is not among the possible
        code values.
    '''
    dx_values = ordered_dx_df.values
    assert dx_values.dtype == object, \
        "ordered_dx_df must contain object (string) values"

    def _is_missing(code):
        # Blank cells may show up as '' (after fillna) or as None / NaN
        return code is None or code == '' or bool(pd.isnull(code))

    if list_of_possible_icd_code_values is None:
        code_values = sorted(set(
            code for code in dx_values.ravel() if not _is_missing(code)))
    else:
        # Materialize first: np.asarray cannot consume a lazy iterator
        code_values = list(list_of_possible_icd_code_values)
    # dtype=str is valid on both Python 2 and 3 (the old `unicode` name is not)
    code_values = np.asarray(code_values, dtype=str)

    if list_of_possible_icd_code_names is None:
        code_names = ['icd_%s' % (code) for code in code_values]
    else:
        code_names = list(list_of_possible_icd_code_names)
    code_names = np.asarray(code_names, dtype=str)
    if len(code_names) != len(code_values):
        raise ValueError(
            "Got %d code names for %d code values; they must align 1-to-1"
            % (len(code_names), len(code_values)))

    # Output columns are ordered by sorted code value
    sort_ids = np.argsort(code_values)
    code_values = code_values[sort_ids]
    code_names = code_names[sort_ids]

    # Exact-match lookup from code to output column.
    # (np.searchsorted only returns an insertion point, so an unknown code
    # would silently mark a neighboring column or index out of bounds.)
    col_of_code = dict()
    for col_id, code in enumerate(code_values.tolist()):
        col_of_code.setdefault(code, col_id)

    indicator_arr = np.zeros(
        (dx_values.shape[0], len(code_values)), dtype=np.int32)
    for row_id, row_codes in enumerate(dx_values):
        for cur_code in row_codes:
            if _is_missing(cur_code):
                continue
            if cur_code not in col_of_code:
                raise ValueError(
                    "Code %r in row %d is not among the possible icd codes"
                    % (cur_code, row_id))
            indicator_arr[row_id, col_of_code[cur_code]] = 1

    indicator_df = pd.DataFrame(indicator_arr, columns=code_names)
    return indicator_df
if __name__ == '__main__':
    # Demo: read the rank-ordered dx CSV, convert it to indicator format in
    # two ways, and print the results next to the "true" indicator CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument('--ordered_dx_csv_file', type=str,
                        default='ordered_dx_simple.csv')
    parser.add_argument('--indicator_dx_csv_file', type=str,
                        default='indicator_dx_simple.csv')
    parser.add_argument('--n_icd_codes', type=int, default=10)
    args = parser.parse_args()

    # Read every cell as a string; blank cells become '' (meaning "no code")
    ordered_dx_df = pd.read_csv(args.ordered_dx_csv_file, dtype=object)
    ordered_dx_df.fillna('', inplace=True)

    # Create estimate of indicators
    # Using known list of possible codes
    # Build a concrete list: on Python 3, map() returns a lazy iterator,
    # which cannot be used as a sequence of codes.
    known_code_values = [str(code) for code in range(args.n_icd_codes)]
    est_ind_df = convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df, known_code_values)

    # Create estimate of indicators version 2
    # Detect possible codes by what's in the file
    est2_ind_df = convert_ordered_dx_df_to_indicator_df(
        ordered_dx_df)

    true_ind_df = pd.read_csv(args.indicator_dx_csv_file, dtype=np.int32)

    print("True indicator_df has %d columns" % true_ind_df.shape[1])
    print("True indicator_df: first 5 rows")
    print(true_ind_df.head())
    print("Estimated indicator_df has %d columns" % est_ind_df.shape[1])
    print("Estimated indicator_df: first 5 rows")
    print(est_ind_df.head())
    print("Estimated version 2:")
    print(est2_ind_df.head())
import numpy as np
import pandas as pd
import argparse
from collections import OrderedDict
if __name__ == '__main__':
    # Generate a toy dataset of random icd codes per subject and save it as
    # two CSV files in the current folder:
    #   ordered_dx_<nickname>.csv    one column per dx rank (dx_00, dx_01, ...)
    #   indicator_dx_<nickname>.csv  one 0/1 column per possible icd code
    parser = argparse.ArgumentParser()
    int_options = [
        ('--n_subjects', 5),
        ('--n_icd_codes', 10),
        ('--min_codes_per_subject', 1),
        ('--max_codes_per_subject', 4),
        ('--seed', 54321),
    ]
    for flag, default_value in int_options:
        parser.add_argument(flag, type=int, default=default_value)
    parser.add_argument('--nickname', type=str, default='simple')
    args = parser.parse_args()

    n_subjects = args.n_subjects
    n_icd_codes = args.n_icd_codes
    min_codes_per_subject = args.min_codes_per_subject
    max_codes_per_subject = args.max_codes_per_subject

    # Seeded generator so repeated runs produce the same dataset
    prng = np.random.RandomState(args.seed)

    codes_per_subject = dict()
    ordered_dx_dicts = list()
    indicator_vectors = list()

    # One pass per subject: draw a random set of distinct icd codes
    for subj_id in range(n_subjects):
        # Number of codes for this subject.
        # NOTE: randint excludes `high`, so the count is always strictly
        # less than max_codes_per_subject.
        n_codes = prng.randint(
            low=min_codes_per_subject, high=max_codes_per_subject)
        # Uniform draw without replacement from codes 0 .. n_icd_codes-1
        drawn_codes = prng.choice(n_icd_codes, n_codes, replace=False)
        codes_per_subject[subj_id] = drawn_codes.tolist()

        # Ordered representation: draw order doubles as dx priority
        ordered_dx_dicts.append(OrderedDict(
            ('dx_%02d' % rank, code)
            for rank, code in enumerate(drawn_codes)))

        # Indicator representation: 1 at each drawn code's position
        indicator = np.zeros(n_icd_codes, dtype=np.int32)
        indicator[drawn_codes] = 1
        indicator_vectors.append(indicator)

    # Subjects with fewer codes have no entry for the later dx_XX columns
    # (presumably written as blank cells; '%.0f' keeps codes free of decimals)
    ordered_csv_path = 'ordered_dx_%s.csv' % args.nickname
    ordered_dx_df = pd.DataFrame(ordered_dx_dicts)
    ordered_dx_df.to_csv(ordered_csv_path, float_format='%.0f', index=False)

    indicator_csv_path = 'indicator_dx_%s.csv' % args.nickname
    indicator_col_names = ['icd_%s' % i for i in range(n_icd_codes)]
    indicator_dx_df = pd.DataFrame(
        np.vstack(indicator_vectors), columns=indicator_col_names)
    indicator_dx_df.to_csv(
        indicator_csv_path, float_format='%.0f', index=False)
@michaelchughes
Copy link
Author

Use the function convert_ordered_dx_df_to_indicator_df to convert a dataframe containing columns of ordered diagnosis codes into a new dataframe with an indicator column for each possible icd code.

To test the functionality:

  1. Create a sample toy dataset

$ python create_random_icd_dataset.py

OUTCOME: creates CSV files in current folder showing an "ordered" representation and an "indicator" representation

  2. Try reading in the ordered representation and applying the convert function

$ python convert_ordered_to_indicator.py

OUTCOME: prints out the "true" indicator (read from disk) and the "estimated" indicator (computed by our convert function)

@michaelchughes
Copy link
Author

michaelchughes commented Jul 19, 2019

Expected output:

True indicator_df has 10 columns
True indicator_df: first 5 rows
   icd_0  icd_1  icd_2  icd_3  icd_4  icd_5  icd_6  icd_7  icd_8  icd_9
0      0      1      0      0      0      0      0      0      1      0
1      0      0      0      0      0      0      0      0      1      0
2      0      1      0      0      0      1      0      0      0      1
3      0      0      1      0      0      0      0      1      1      0
4      0      0      0      0      0      1      0      0      0      0
Estimated indicator_df has 10 columns
Estimated indicator_df: first 5 rows
   icd_0  icd_1  icd_2  icd_3  icd_4  icd_5  icd_6  icd_7  icd_8  icd_9
0      0      1      0      0      0      0      0      0      1      0
1      0      0      0      0      0      0      0      0      1      0
2      0      1      0      0      0      1      0      0      0      1
3      0      0      1      0      0      0      0      1      1      0
4      0      0      0      0      0      1      0      0      0      0
Estimated version 2:
   icd_1  icd_2  icd_5  icd_7  icd_8  icd_9
0      1      0      0      0      1      0
1      0      0      0      0      1      0
2      1      0      1      0      0      1
3      0      1      0      1      1      0
4      0      0      1      0      0      0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment