Skip to content

Instantly share code, notes, and snippets.

@jalotra
Created June 17, 2024 03:32
Show Gist options
  • Save jalotra/4cc501a96228863d1dc6ee267b3bdedf to your computer and use it in GitHub Desktop.
Save jalotra/4cc501a96228863d1dc6ee267b3bdedf to your computer and use it in GitHub Desktop.
import csv
import json
import os
import pathlib
import random
from pathlib import Path
from typing import List
# Directory that holds the CSV metadata files and the ``train_images`` tree.
# It is a relative path, so it is interpreted against the current working
# directory of the running process.
DATA_FOLDER = "../data"
def strip_line(line: List[str]) -> List[str]:
    """Return a copy of ``line`` with surrounding whitespace removed per field.

    Args:
        line: The fields of one already-split CSV row.

    Returns:
        A new list, in the same order, with ``str.strip`` applied to each
        field. The input list is not modified.
    """
    return [field.strip() for field in line]
def resolve_path(path: Path) -> str:
    """Return ``path`` made absolute via ``Path.resolve()``, as a string.

    Args:
        path: Any path; it does not have to exist.

    Returns:
        ``str(path.resolve())``.
    """
    # str() is the idiomatic spelling of calling __str__() directly.
    return str(path.resolve())
def read_train_series_description():
    """Build a ``study_id -> series_id -> series_description`` lookup.

    Reads ``train_series_descriptions.csv`` from ``DATA_FOLDER``. The first
    row is treated as a header and skipped. Every other non-blank row must
    have exactly three columns, in this order: study id, series id, series
    description. All values are kept as whitespace-stripped strings.

    Returns:
        dict: ``{study_id: {series_id: series_description}}``. If the same
        (study, series) pair appears more than once, the last row wins.

    Raises:
        OSError: If the CSV file cannot be opened.
        ValueError: If a non-blank row does not have exactly three columns.
    """
    # Reverse index on train_series_descriptions
    orientation = {}
    csv_path = Path(DATA_FOLDER) / "train_series_descriptions.csv"
    # newline="" lets the csv module deal with line endings and quoted fields.
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not any(cell.strip() for cell in row):
                # Blank or whitespace-only lines (e.g. a trailing newline)
                # carry no data and must not break the 3-way unpacking.
                continue
            study_id, series_id, series_description = (
                cell.strip() for cell in row
            )
            orientation.setdefault(study_id, {})[series_id] = series_description
    return orientation
def read_train_label_coors():
    """Build a per-instance lookup of labelled coordinates.

    Reads ``train_label_coordinates.csv`` from ``DATA_FOLDER``. The first row
    is treated as a header and skipped. Every other non-blank row must have
    exactly seven columns, in this order: study id, series id, instance
    number, condition, level, x coordinate, y coordinate. All values are kept
    as whitespace-stripped strings (coordinates are not converted to numbers).

    Returns:
        dict: ``{study_id: {series_id: {instance_number: {"condition": ...,
        "level": ..., "x_coor": ..., "y_coor": ...}}}}``.

    Raises:
        OSError: If the CSV file cannot be opened.
        ValueError: If a non-blank row does not have exactly seven columns.
    """
    label_coors = {}
    csv_path = Path(DATA_FOLDER) / "train_label_coordinates.csv"
    # newline="" lets the csv module deal with line endings and quoted fields.
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not any(cell.strip() for cell in row):
                # Blank or whitespace-only lines carry no data.
                continue
            (
                study_id,
                series_id,
                instance_number,
                condition,
                level,
                x_coor,
                y_coor,
            ) = (cell.strip() for cell in row)
            series_labels = label_coors.setdefault(study_id, {}).setdefault(
                series_id, {}
            )
            # NOTE(review): entries are keyed by instance number only, so a
            # later row for the same (study, series, instance) replaces the
            # earlier one. Confirm the CSV never labels one instance twice;
            # the return shape is kept as-is because callers iterate it.
            series_labels[instance_number] = {
                "condition": condition,
                "level": level,
                "x_coor": x_coor,
                "y_coor": y_coor,
            }
    return label_coors
def read_image_data(study_id: str, series_id: str):
    """List the absolute paths of every entry in one series folder.

    The folder is ``DATA_FOLDER/train_images/<study_id>/<series_id>``.

    Args:
        study_id: Name of the study directory under ``train_images``.
        series_id: Name of the series directory inside the study directory.

    Returns:
        list[str]: One resolved path string per directory entry, in the
        order returned by ``os.listdir``.

    Raises:
        OSError: If the series folder cannot be listed.
        Exception: If the series folder exists but is empty.
    """
    folder_path = Path(DATA_FOLDER) / "train_images" / study_id / series_id
    images = os.listdir(folder_path)
    if not images:
        raise Exception(f"Not able to find any images at this path : {folder_path}")
    # Resolve once, then join with "/" on Path objects: adding a str to a
    # Path with "+" raises TypeError.
    resolved_folder = folder_path.resolve()
    return [str(resolved_folder / img_name) for img_name in images]
def check_file_path(path: pathlib.Path):
    """Return the resolved path string of ``path`` if it is an existing file.

    Args:
        path: Path to validate. Annotated as ``pathlib.Path`` rather than
            ``PosixPath`` so the signature is not tied to one platform.

    Returns:
        str: ``str(path.resolve())``.

    Raises:
        Exception: If ``path`` does not exist or is not a regular file.
    """
    # is_file() is already False for a missing path, so no separate
    # exists() check is needed.
    if not path.is_file():
        raise Exception(f"Not able to get the file from path : {path}")
    return str(path.resolve())
def read_train_csv_data():
    """Load ``train.csv`` into a ``{first_column_value: {column: value}}`` dict.

    Reads ``train.csv`` from ``DATA_FOLDER``. The first row supplies the
    column names; the first column of every following row becomes the outer
    key and the remaining header columns become the inner keys. Values are
    kept as strings exactly as they appear in the file.

    Returns:
        dict: One entry per data row; empty if the file has no rows at all.
        If the same first-column value appears twice, the last row wins.

    Raises:
        OSError: If the CSV file cannot be opened.
        IndexError: If a non-blank row has fewer cells than the header.
    """
    outputs = {}
    # newline="" lets the csv module deal with line endings and quoted fields.
    with open(Path(DATA_FOLDER) / "train.csv", "r", newline="") as f:
        reader = csv.reader(f)
        columns = next(reader, None)
        if columns is None:
            # Completely empty file: nothing to index.
            return outputs
        for row in reader:
            if not any(cell.strip() for cell in row):
                # A blank line (e.g. at end of file) is not a data row.
                continue
            # Index by header position so cells beyond the header are
            # ignored and a short row still fails loudly.
            outputs[row[0]] = {
                col: row[idx] for idx, col in enumerate(columns[1:], start=1)
            }
    return outputs
def check(study_id: str, series_id: str, data: list[dict]) -> bool:
    """Tell whether a (study, series) pair is present in every given index.

    Args:
        study_id: Outer key to look for.
        series_id: Inner key to look for under ``study_id``.
        data: Nested dicts shaped ``{study_id: {series_id: ...}}``.

    Returns:
        True if every dict in ``data`` contains ``study_id`` and its entry
        contains ``series_id``; False otherwise. An empty ``data`` list
        yields True.
    """
    return all(
        study_id in index and series_id in index[study_id] for index in data
    )
# This is the main DS that loads all the metadata into memory
def data_load_ds(total_study_ids_needed: int | None):
    """Assemble per-study metadata for the directories under ``train_images``.

    Loads the three CSV indexes (series descriptions, label coordinates and
    ``train.csv``), writes the ``train.csv`` index to
    ``DATA_FOLDER/train.json`` as a side effect, then walks
    ``DATA_FOLDER/train_images/<study_id>/<series_id>``.

    A series is included only when it is present in both the label
    coordinates and the series descriptions; other series are skipped.

    Args:
        total_study_ids_needed: Maximum number of studies to include, taken
            in sorted order of study id. ``None`` means all studies.

    Returns:
        dict: ``{study_id: {"labels": <train.csv row>, series_id: {
        "orientation": <series description>, "images": [{"instance_number",
        "path"}, ...], "labels": [{"condition", "level", "x_coor", "y_coor",
        "instance_number"}, ...]}}}``.

    Raises:
        ValueError: If ``total_study_ids_needed`` is negative.
        KeyError: If a study directory has no row in ``train.csv``.
        OSError: If a metadata file or image directory cannot be read.
    """
    description = read_train_series_description()
    labels = read_train_label_coors()
    train = read_train_csv_data()

    data_root = Path(DATA_FOLDER)
    with open(data_root / "train.json", "w") as f:
        f.write(json.dumps(train))

    images_root = data_root / "train_images"
    # Sorted so the output order, and the truncation below, are reproducible.
    # Non-directory entries are skipped because they cannot hold series.
    study_ids = sorted(
        entry for entry in os.listdir(images_root) if (images_root / entry).is_dir()
    )
    if total_study_ids_needed is not None:
        if total_study_ids_needed < 0:
            raise ValueError(
                "total_study_ids_needed must be None or a non-negative integer"
            )
        study_ids = study_ids[:total_study_ids_needed]

    meta_output = {}
    for study_id in study_ids:
        study_meta = {"labels": train[study_id]}
        meta_output[study_id] = study_meta
        for series_id in sorted(os.listdir(images_root / study_id)):
            if not check(
                study_id=study_id, series_id=series_id, data=[labels, description]
            ):
                continue
            series_dir = images_root / study_id / series_id
            study_meta[series_id] = {
                "orientation": description[study_id][series_id],
                "images": [
                    {
                        # File name up to the first dot, e.g. "12" for "12.dcm".
                        "instance_number": image.split(".")[0],
                        "path": resolve_path(series_dir / image),
                    }
                    for image in os.listdir(series_dir)
                ],
                # Add labels values if available
                # Note : These are not available for all .dcms
                "labels": [
                    {**instance_value, "instance_number": instance_number}
                    for instance_number, instance_value in labels[study_id][
                        series_id
                    ].items()
                ],
            }
    return meta_output
if __name__ == "__main__":
    # Build the whole structure before opening the output file: opening with
    # "w" truncates immediately, so a failure inside data_load_ds would
    # otherwise leave an empty ds.json behind.
    data = data_load_ds(None)
    with open(Path(DATA_FOLDER) / "ds.json", "w") as f:
        f.write(json.dumps(data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment