Created
May 30, 2019 09:50
-
-
Save jessefreeman/da01989fdabd73f3c9f2ce54ddd59fff to your computer and use it in GitHub Desktop.
A Python script to convert ChestXray14 CSV labels into meta.json files to use with MissingLink.ai's Data Volumes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import random

import pandas as pd
import tqdm
# Integer class id -> finding name.  The ids are simply the positions of the
# names in the tuple below (0 through 13).
class_mapping = dict(enumerate((
    'Atelectasis',
    'Cardiomegaly',
    'Effusion',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pneumonia',
    'Pneumothorax',
    'Consolidation',
    'Edema',
    'Emphysema',
    'Fibrosis',
    'Pleural_Thickening',
    'Hernia',
)))
# Load the per-image label table and normalise it in one chain:
#   * rename(index=str, ...) turns the row labels into strings (the labelling
#     loop further down addresses rows as str(count)) and replaces the raw
#     headers containing spaces, '#', '[' and ']' with underscore-separated
#     names;
#   * assign(...) appends the derived columns with their starting values:
#     Single_Finding=False, Paper_Split='train', type='Image', and Index_No as
#     a copy of Image_Index.
df = (
    pd.read_csv('Data_Entry_2017.csv')
    .rename(
        index=str,
        columns={
            "Image Index": "Image_Index",
            "Finding Labels": "Finding_Labels",
            "Follow-up #": "Follow_up_Number",
            "Patient ID": "Patient_ID",
            "Patient Age": "Patient_Age",
            "Patient Gender": "Patient_Gender",
            "View Position": "View_Position",
            "OriginalImage[Width": "OriginalImage_Width",
            "Height]": "OriginalImage_Height",
            "OriginalImagePixelSpacing[x": "OriginalImagePixelSpacing_x",
            "y]": "OriginalImagePixelSpacing_y",
            "Unnamed: 11": "Unnamed_11",
        },
    )
    .assign(
        Single_Finding=False,
        Paper_Split='train',
        type='Image',
        Index_No=lambda frame: frame['Image_Index'],
    )
)
# Row-wise snapshot of the label table (one dict per row).  Bound to `records`
# rather than `dict` so the builtin `dict` stays usable in the rest of the
# script.
records = df.to_dict(orient='records')

# Image file names, in the same order as the rows of df.
image_names = df['Image_Index'].values.tolist()

# Official split lists.
# NOTE(review): assumes each list file is a bare column of file names with no
# header row -- confirm against your copy of the dataset.  header=None matters:
# with pandas' default header inference the first file name in each list
# would be consumed as a column title and silently dropped from the split.
train_val_list_pd = pd.read_csv('train_val_list.txt', header=None)
train_val_list = train_val_list_pd.iloc[:, 0].tolist()

test_list_pd = pd.read_csv('test_list.txt', header=None)
test_list = test_list_pd.iloc[:, 0].tolist()

# Images on the train/val list are assigned at random to one of these splits.
choose = ['train', 'validation']

# Demo shortcut: only the first 500 images are processed below.
# Remove (or comment out) this line to process the full dataset.
image_names = image_names[:500]

# Views over class_mapping used by the labelling loop.
class_mapping_keys = class_mapping.keys()
class_mapping_items = class_mapping.items()
# Fill in the derived columns (Index_No, Single_Finding, Paper_Split) for every
# name in image_names.
#
# The lookup structures are built once, before the loop, so each iteration
# costs a few hash lookups instead of a full-frame scan plus two list searches.
known_findings = set(class_mapping.values())
train_val_images = set(train_val_list)
test_images = set(test_list)
# First Finding_Labels value recorded for each image name.
labels_by_image = (
    df.drop_duplicates(subset='Image_Index')
    .set_index('Image_Index')['Finding_Labels']
)

for count, image in enumerate(tqdm.tqdm(image_names)):
    # Rows are addressed by the string form of their position because the
    # frame's index was converted to strings when the columns were renamed.
    row = str(count)

    # Index_No is the file name up to its first dot.
    df.at[row, 'Index_No'] = image.split(".")[0]

    # Single_Finding is set only when the whole label string is exactly one
    # of the names in class_mapping; any other label string leaves the
    # default of False in place.
    if labels_by_image[image] in known_findings:
        df.at[row, 'Single_Finding'] = True

    # Train/val images are split at random; the test check runs last so an
    # image that appears on both lists ends up in 'test'.
    if image in train_val_images:
        df.at[row, 'Paper_Split'] = random.choice(choose)
    if image in test_images:
        df.at[row, 'Paper_Split'] = 'test'

print(df.head())
# Output files are written relative to the directory the script is run from.
pwd = os.getcwd()

# Bounding-box table: normalise the raw headers, then keep only the image
# name, the box's finding label and the four box columns (the renamed
# "Unnamed" columns are discarded by the selection).
bbox_column_names = {
    "Image Index": "Image_Index",
    "Finding Label": "Finding_Labels_Bbox",
    "Bbox [x": "Bbox_x",
    "y": "Bbox_y",
    "w": "Bbox_w",
    "h]": "Bbox_h",
    "Unnamed: 6": "Unnamed_6",
    "Unnamed: 7": "Unnamed_7",
    "Unnamed: 8": "Unnamed_8",
}
df_bb = pd.read_csv('BBox_List_2017.csv')
df_bb = df_bb.rename(index=str, columns=bbox_column_names)
df_bb = df_bb[[
    'Image_Index',
    'Finding_Labels_Bbox',
    'Bbox_x',
    'Bbox_y',
    'Bbox_w',
    'Bbox_h',
]]

# Outer join on the image name, so rows from either table are kept even when
# the other table has no match for that image.
df_merged = df.merge(df_bb, how="outer", on="Image_Index")

# Persist the merged table as JSON Lines (one record per line).
df_merged.to_json('df_merged.json', orient='records', lines=True)
# Write one '<image>.metadata.json' file per image into ./meta_bbox_temp.

# The output directory is created once, up front, rather than being checked
# on every iteration.
temp_dir = os.path.join(pwd, 'meta_bbox_temp')
os.makedirs(temp_dir, exist_ok=True)

# Rows are looked up by image name, not by position: the outer merge yields
# one row per bounding box and may reorder rows, so position `count` in
# df_merged is not guaranteed to belong to image_names[count].  When an image
# has several merged rows, the first one is the one written.
rows_by_image = (
    df_merged.drop_duplicates(subset='Image_Index')
    .set_index('Image_Index', drop=False)
)

for image in tqdm.tqdm(image_names):
    name = os.path.join(temp_dir, image + '.metadata.json')
    rows_by_image.loc[image].to_json(str(name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment