Created
June 2, 2025 09:41
-
-
Save lucasgautheron/ca89dd00e34669135ed92ec277904986 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datalad.api | |
from ChildProject.projects import ChildProject | |
from ChildProject.annotations import AnnotationManager | |
import pandas as pd | |
from os.path import basename, exists | |
def process_dataset(dataset):
    """Install one dataset if needed and attach summary statistics to its record.

    Args:
        dataset: dict describing one dataset. It must contain the key
            "gin link"; when that link is non-empty it must also contain
            "Dataset", whose basename is used as the local install path.

    Returns:
        The same dict. When the link is empty, or when the metadata files
        cannot be retrieved, it is returned without statistics. Otherwise
        the keys "n_children", "n_recordings", "total_duration" and
        "<set>_duration" for each of "vtc", "vcm", "alice" and "its" are
        added in place.
    """
    if dataset["gin link"] == "":
        return dataset

    path = basename(dataset["Dataset"])

    if not exists(path):
        repo_name = basename(dataset["gin link"].rstrip("/"))
        # SSH clone URL of the repository inside the LAAC-LSCP organization.
        url = f"git@gin.g-node.org:/LAAC-LSCP/{repo_name}.git"
        datalad.api.install(dataset=".", path=path, source=url)

    try:
        for md in ["recordings", "children", "annotations"]:
            if not exists(f"{path}/metadata/{md}.csv"):
                datalad.api.get([f"{path}/metadata/{md}.csv"])
    except Exception:
        # Best effort: a dataset whose metadata cannot be fetched is reported
        # and skipped. Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit still stop the script.
        print(f"failed to retrieve metadata from dataset {path}!")
        return dataset

    project = ChildProject(path)
    project.read()

    dataset["n_children"] = project.children["child_id"].nunique()
    dataset["n_recordings"] = project.recordings["recording_filename"].nunique()
    # The /3600/1000 conversion implies durations stored in milliseconds and
    # reported in hours -- NOTE(review): confirm against the metadata schema.
    dataset["total_duration"] = (
        (project.recordings["duration"] / 3600 / 1000).sum()
        if "duration" in project.recordings.columns
        else 0
    )

    am = AnnotationManager(project)
    am.read()

    for annotation_set in ["vtc", "vcm", "alice", "its"]:
        annotations = am.annotations[am.annotations["set"] == annotation_set]
        # Total annotated span of this set, same unit conversion as above.
        duration = (
            (annotations["range_offset"] - annotations["range_onset"]) / 3600 / 1000
        ).sum()
        dataset[f"{annotation_set}_duration"] = duration

    return dataset
# Read the dataset index, keeping only the rows that have a GIN link.
datasets = pd.read_csv("datasets.csv").dropna(subset=["gin link"])
datasets = [process_dataset(dataset) for dataset in datasets.to_dict(orient="records")]
datasets = pd.DataFrame(datasets)

# Print the totals across all datasets. reindex() is used instead of plain
# column selection so that a statistic no dataset produced (e.g. every
# metadata retrieval failed) counts as 0 instead of raising KeyError.
print(
    datasets.reindex(
        columns=[
            "n_children",
            "n_recordings",
            "total_duration",
            "vtc_duration",
            "its_duration",
        ]
    )
    .fillna(0)
    .sum(axis=0)
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
n_children 2740.000000
n_recordings 22079.000000
total_duration 87201.515153
vtc_duration 65156.314276
its_duration 55138.649785
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment