Skip to content

Instantly share code, notes, and snippets.

@lucasgautheron
Created June 2, 2025 09:41
Show Gist options
  • Save lucasgautheron/ca89dd00e34669135ed92ec277904986 to your computer and use it in GitHub Desktop.
Save lucasgautheron/ca89dd00e34669135ed92ec277904986 to your computer and use it in GitHub Desktop.
import datalad.api
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
import pandas as pd
from os.path import basename, exists
def process_dataset(dataset):
    """Install a dataset if needed and add summary statistics to its record.

    Args:
        dataset: Mapping describing one row of the dataset index. It must
            provide the ``"gin link"`` and ``"Dataset"`` keys. The mapping is
            updated in place.

    Returns:
        The same ``dataset`` mapping. When the GIN link is empty, or when the
        metadata files cannot be retrieved, it is returned without any added
        statistics. Otherwise it gains ``n_children``, ``n_recordings``,
        ``total_duration`` and one ``<set>_duration`` entry for each of the
        ``vtc``, ``vcm``, ``alice`` and ``its`` annotation sets.
    """
    if dataset["gin link"] == "":
        return dataset

    # The local checkout lives in the working directory, named after the
    # last component of the "Dataset" field.
    path = basename(dataset["Dataset"])

    if not exists(path):
        repo_name = basename(dataset["gin link"].rstrip("/"))
        # NOTE(review): the host part of this URL was obfuscated in the
        # published copy of this script; git@gin.g-node.org is the expected
        # GIN SSH remote -- confirm before relying on it.
        url = f"git@gin.g-node.org:/LAAC-LSCP/{repo_name}.git"
        datalad.api.install(dataset=".", path=path, source=url)

    # Best effort: a dataset whose metadata cannot be fetched is reported and
    # returned without statistics instead of aborting the whole run.
    try:
        for md in ["recordings", "children", "annotations"]:
            if not exists(f"{path}/metadata/{md}.csv"):
                datalad.api.get([f"{path}/metadata/{md}.csv"])
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit still stop the script.
        print(f"failed to retrieve metadata from dataset {path}!")
        return dataset

    project = ChildProject(path)
    project.read()

    dataset["n_children"] = project.children["child_id"].nunique()
    dataset["n_recordings"] = project.recordings["recording_filename"].nunique()
    # Dividing by 3600 and 1000 yields hours if durations are stored in
    # milliseconds (presumably the ChildProject convention -- confirm).
    dataset["total_duration"] = (
        (project.recordings["duration"] / 3600 / 1000).sum()
        if "duration" in project.recordings.columns
        else 0
    )

    am = AnnotationManager(project)
    am.read()

    # Sum the annotated range covered by each annotation set of interest,
    # using the same unit conversion as for the recordings above.
    for annotation_set in ["vtc", "vcm", "alice", "its"]:
        annotations = am.annotations[am.annotations["set"] == annotation_set]
        duration = (
            (annotations["range_offset"] - annotations["range_onset"]) / 3600 / 1000
        ).sum()
        dataset[f"{annotation_set}_duration"] = duration

    return dataset
# Load the dataset index, keeping only the rows that provide a GIN link.
records = (
    pd.read_csv("datasets.csv").dropna(subset=["gin link"]).to_dict(orient="records")
)

# Process every dataset record and gather the results into one table.
datasets = pd.DataFrame(list(map(process_dataset, records)))

# Print column totals; statistics missing for a dataset count as zero.
summary_columns = [
    "n_children",
    "n_recordings",
    "total_duration",
    "vtc_duration",
    "its_duration",
]
print(datasets[summary_columns].fillna(0).sum(axis=0))
n_children 2740.000000
n_recordings 22079.000000
total_duration 87201.515153
vtc_duration 65156.314276
its_duration 55138.649785
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment