Created
June 17, 2024 03:32
-
-
Save jalotra/4cc501a96228863d1dc6ee267b3bdedf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
from pathlib import Path | |
import pathlib | |
import os | |
import random | |
DATA_FOLDER = "../data" | |
def strip_line(line: List[str]):
    """Return a new list with surrounding whitespace removed from every item.

    Args:
        line: The string fields to clean (for example, the pieces of a
            comma-split CSV row).

    Returns:
        A list of the same length whose elements are ``str.strip()``-ed.
    """
    return list(map(str.strip, line))
def resolve_path(path: Path):
    """Return the resolved (absolute) form of *path* as a plain string.

    Args:
        path: The path to resolve.

    Returns:
        ``str`` of ``path.resolve()``.
    """
    absolute = path.resolve()
    return str(absolute)
def read_train_series_description():
    """Build a reverse index over ``train_series_descriptions.csv``.

    The file is read from ``DATA_FOLDER``. Its first line is skipped as a
    header, and every other line is split on commas into exactly three
    whitespace-stripped fields.

    Returns:
        A nested dict ``{study_id: {series_id: series_description}}`` whose
        keys and values are all strings.

    Raises:
        ValueError: If a data line does not split into exactly three fields.
    """
    csv_path = Path(DATA_FOLDER) / "train_series_descriptions.csv"
    with open(csv_path, "r") as f:
        rows = f.readlines()[1:]

    orientation = {}
    for row in rows:
        study_id, series_id, series_description = strip_line(row.split(","))
        # Create the per-study mapping on first sight, then record the series.
        orientation.setdefault(study_id, {})[series_id] = series_description
    return orientation
def read_train_label_coors():
    """Index the rows of ``train_label_coordinates.csv`` by study, series and instance.

    The file is read from ``DATA_FOLDER``. Its first line is skipped as a
    header, and every other line is split on commas into exactly seven
    whitespace-stripped fields.

    Returns:
        A nested dict
        ``{study_id: {series_id: {instance_number: {"condition", "level",
        "x_coor", "y_coor"}}}}``. Every key and value is a string, because
        no numeric conversion is performed.

    Raises:
        ValueError: If a data line does not split into exactly seven fields.
    """
    csv_path = Path(DATA_FOLDER) / "train_label_coordinates.csv"
    with open(csv_path, "r") as f:
        rows = f.readlines()[1:]

    label_coors = {}
    for row in rows:
        (
            study_id,
            series_id,
            instance_number,
            condition,
            level,
            x_coor,
            y_coor,
        ) = strip_line(row.split(","))
        per_series = label_coors.setdefault(study_id, {}).setdefault(series_id, {})
        # NOTE(review): rows that share study/series/instance overwrite each
        # other here, so only the last such row survives -- confirm whether
        # the CSV can contain several labels for one instance.
        per_series[instance_number] = {
            "condition": condition,
            "level": level,
            "x_coor": x_coor,
            "y_coor": y_coor,
        }
    return label_coors
def read_image_data(study_id: str, series_id: str):
    """List the absolute paths of every image stored for one series.

    Looks in ``DATA_FOLDER/train_images/<study_id>/<series_id>``.

    Args:
        study_id: Name of the study directory.
        series_id: Name of the series directory inside the study.

    Returns:
        A list of absolute path strings, one per directory entry, in the
        order returned by ``os.listdir``.

    Raises:
        Exception: If the series directory contains no entries.
        OSError: If the series directory cannot be listed (propagated from
            ``os.listdir``).
    """
    folder_path = Path(DATA_FOLDER) / "train_images" / study_id / series_id
    images = os.listdir(folder_path)
    if not images:
        raise Exception(f"Not able to find any images at this path : {folder_path}")
    # A Path cannot be concatenated to a str with "+" (that raises
    # TypeError); join with "/" and convert to str instead. The folder is
    # resolved once rather than once per image.
    resolved_folder = folder_path.resolve()
    return [str(resolved_folder / img_name) for img_name in images]
def check_file_path(path: pathlib.PosixPath):
    """Return the resolved path string of an existing regular file.

    Args:
        path: Candidate file location.

    Returns:
        ``str(path.resolve())`` when *path* exists and is a file.

    Raises:
        Exception: If *path* does not exist or is not a regular file.
    """
    is_usable_file = path.exists() and path.is_file()
    if not is_usable_file:
        raise Exception(f"Not able to get the file from path : {path}")
    return str(path.resolve())
def read_train_csv_data():
    """Load ``train.csv`` into a dict keyed by the first column of each row.

    The file is read from ``DATA_FOLDER``. The first line supplies the
    column names, and every later line becomes one entry. Fields are split
    on commas and kept as strings.

    Returns:
        ``{first_field: {column_name: field_value}}`` covering every column
        after the first. A later row with the same first field replaces the
        earlier one.

    Raises:
        IndexError: If the file is empty, or a row has fewer fields than
            the header.
    """
    with open(Path(DATA_FOLDER) / "train.csv", "r") as f:
        rows = [raw.strip("\n") for raw in f.readlines()]

    columns = rows[0].split(",")
    outputs = {}
    for row in rows[1:]:
        fields = row.split(",")
        record = {}
        outputs[fields[0]] = record
        # Index explicitly so a short row still fails loudly with IndexError.
        for position, col in enumerate(columns[1:], start=1):
            record[col] = fields[position]
    return outputs
def check(study_id: str, series_id: str, data: list[dict]):
    """Tell whether every mapping in *data* knows the given study and series.

    Args:
        study_id: Outer key looked up in each mapping.
        series_id: Inner key looked up in ``mapping[study_id]``.
        data: Nested ``{study_id: {series_id: ...}}`` mappings to test.

    Returns:
        ``True`` when each mapping contains ``study_id`` and, beneath it,
        ``series_id`` (vacuously ``True`` for an empty *data*); ``False``
        otherwise.
    """
    return all(
        study_id in mapping and series_id in mapping[study_id]
        for mapping in data
    )
# This is the main DS that loads all the metadata into memory
def data_load_ds(total_study_ids_needed: int | None):
    """Assemble the per-study metadata for everything under ``train_images``.

    Combines the three CSV readers with the directory layout
    ``DATA_FOLDER/train_images/<study_id>/<series_id>/<image>``. As a side
    effect, the parsed ``train.csv`` content is dumped to
    ``DATA_FOLDER/train.json``.

    Args:
        total_study_ids_needed: Maximum number of study directories to
            process, taken from the name-sorted directory listing. ``None``
            processes every study.

    Returns:
        A dict shaped like::

            {study_id: {
                "labels": <train.csv row for the study>,
                series_id: {
                    "orientation": <series description>,
                    "images": [{"instance_number": ..., "path": ...}, ...],
                    "labels": [{"condition": ..., "level": ..., "x_coor": ...,
                                "y_coor": ..., "instance_number": ...}, ...],
                },
            }}

        A series is only included when it appears in both the label
        coordinates and the series descriptions.

    Raises:
        ValueError: If ``total_study_ids_needed`` is negative.
        KeyError: If a study directory has no matching row in ``train.csv``.
    """
    # Imported locally: the module only imports json under its __main__
    # guard, so the name is missing when this function is called after a
    # plain import of the module.
    import json

    if total_study_ids_needed is not None and total_study_ids_needed < 0:
        raise ValueError("total_study_ids_needed must be non-negative or None")

    description = read_train_series_description()
    labels = read_train_label_coors()
    train = read_train_csv_data()
    with open(Path(DATA_FOLDER) / "train.json", "w") as f:
        f.write(json.dumps(train))

    images_root = Path(DATA_FOLDER) / "train_images"
    # os.listdir order is arbitrary; sorting makes the optional limit (and
    # the output ordering) reproducible between runs.
    study_ids = sorted(os.listdir(images_root))
    if total_study_ids_needed is not None:
        study_ids = study_ids[:total_study_ids_needed]

    meta_output = {}
    for study_id in study_ids:
        study_dir = images_root / study_id
        study_entry = {"labels": train[study_id]}
        meta_output[study_id] = study_entry

        for series_id in sorted(os.listdir(study_dir)):
            # Skip series lacking either label coordinates or a description.
            if not check(
                study_id=study_id, series_id=series_id, data=[labels, description]
            ):
                continue

            series_dir = study_dir / series_id
            images = [
                {
                    # File name up to the first dot, e.g. "12.dcm" -> "12".
                    "instance_number": image.split(".")[0],
                    "path": resolve_path(series_dir / image),
                }
                for image in os.listdir(series_dir)
            ]
            # Add labels values if available
            # Note : These are not available for all .dcms
            series_labels = [
                {**instance_value, "instance_number": instance_number}
                for instance_number, instance_value in labels[study_id][
                    series_id
                ].items()
            ]
            study_entry[series_id] = {
                "orientation": description[study_id][series_id],
                "images": images,
                "labels": series_labels,
            }
    return meta_output
if __name__ == "__main__":
    from pprint import pprint
    import json

    # Build the dataset BEFORE opening the output file: opening with "w"
    # truncates immediately, so a failure while loading would otherwise
    # leave an empty or clobbered ds.json behind.
    data = data_load_ds(None)
    # Debugging aid: pprint(data[next(iter(data))]) shows the first study.
    with open(Path(DATA_FOLDER) / "ds.json", "w") as f:
        f.write(json.dumps(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment