Created
June 20, 2022 18:53
-
-
Save JasperVanDenBosch/ee8acd25579e7d867e1f5e42cbcc2632 to your computer and use it in GitHub Desktop.
Find openneuro datasets that have fmriprep results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Find openneuro datasets that have an fmriprep derivative | |
This is a fairly brute-force method. Takes about one hour, and 8GB of HD space. | |
requirements: | |
datalad | |
pandas | |
requests | |
""" | |
from time import sleep | |
from tempfile import TemporaryDirectory | |
from os.path import join, isdir | |
import json | |
import datalad.api as datalad | |
import requests, pandas | |
# GitHub credentials used for basic auth on the API requests below.
github_username = '<YOUR USERNAME>'
github_token = '<YOUR TOKEN>'
# Safety limit on the number of listing pages requested from GitHub.
max_pages = 50
# Repositories requested per page; 100 is the maximum the GitHub API allows.
per_page = 100
base_url = 'https://api.github.com/orgs/OpenNeuroDatasets/repos'


def list_repo_ids():
    """Return the sorted names of all repos in the OpenNeuroDatasets org.

    Pages through the GitHub listing until an empty page is returned.

    Raises:
        RuntimeError: if GitHub answers with a status other than 200.
        ValueError: if no empty page was seen within ``max_pages`` pages.
    """
    ids = []
    print('Querying github', end='')
    for p in range(1, max_pages+1):
        print('.', end='', flush=True)
        response = requests.get(
            base_url,
            params={'page': p, 'per_page': per_page},
            auth=(github_username, github_token),
        )
        # Explicit check instead of `assert`, which is stripped under -O.
        if response.status_code != 200:
            raise RuntimeError(
                f'GitHub returned HTTP {response.status_code} for page {p}'
            )
        repos = response.json()
        if not repos:
            # An empty page means the previous page was the last one.
            break
        ids.extend(repo['name'] for repo in repos)
        sleep(0.2)  # be kind to GitHub
    else:
        # The loop ran out of pages without ever seeing an empty one.
        raise ValueError(f'More than {max_pages} pages.')
    ids.sort()
    print(f'\nFound {len(ids)} repos in {p-1} pages')
    return ids


def describe_dataset(repo_dir, repo_id):
    """Build the summary record for one cloned dataset.

    Args:
        repo_dir: path of the local clone.
        repo_id: dataset identifier, stored as ``did``.

    Returns:
        dict with keys ``did``, ``name`` and ``desc``. ``desc`` is the content
        of the README file, or ``'NO_README'`` if it cannot be read. ``name``
        comes from dataset_description.json, or is ``'NO_NAME'`` if the file
        or the field is unavailable.
    """
    try:
        with open(join(repo_dir, 'README'), encoding='utf-8',
                  errors='replace') as fhandle:
            readme = fhandle.read()
    except OSError:
        # Not every dataset ships a plain `README`; don't abort the whole run.
        readme = 'NO_README'
    try:
        with open(join(repo_dir, 'dataset_description.json'),
                  encoding='utf-8') as fhandle:
            metadata = json.load(fhandle)
    except (OSError, ValueError):
        # Missing file or invalid JSON (JSONDecodeError is a ValueError).
        metadata = {}
    if not isinstance(metadata, dict):
        metadata = {}
    # BIDS spells the field `Name`; the lowercase key is kept as a fallback.
    name = metadata.get('Name', metadata.get('name', 'NO_NAME'))
    return dict(did=repo_id, name=name, desc=readme)


def find_fmriprep_datasets(ids):
    """Clone every dataset in *ids* and collect those with fmriprep output.

    Returns:
        tuple ``(fmriprep_datasets, cloning_errors)``: a list of records as
        built by ``describe_dataset`` and a list of ids that failed to clone.
    """
    fmriprep_datasets = []
    cloning_errors = []
    with TemporaryDirectory(ignore_cleanup_errors=True) as tmp_dir:
        for r, repo_id in enumerate(ids, start=1):
            repo_dir = join(tmp_dir, repo_id)
            try:
                datalad.clone(f'///openneuro/{repo_id}', repo_dir)
            except Exception:
                # `Exception` rather than a bare except, so that Ctrl-C
                # still stops the run.
                cloning_errors.append(repo_id)
                continue
            if isdir(join(repo_dir, 'derivatives', 'fmriprep')):
                fmriprep_datasets.append(describe_dataset(repo_dir, repo_id))
            print(f'{r}/{len(ids)}')
            sleep(0.2)  # be kind to openneuro.org
    return fmriprep_datasets, cloning_errors


def main():
    """List all datasets, inspect each clone, and write the results to CSV."""
    ids = list_repo_ids()
    fmriprep_datasets, cloning_errors = find_fmriprep_datasets(ids)
    print(f'Found {len(fmriprep_datasets)} datasets with fmriprep')
    print(f'While encountering {len(cloning_errors)} errors while cloning')
    # Explicit columns keep the CSV header intact when nothing was found.
    df = pandas.DataFrame(fmriprep_datasets, columns=['did', 'name', 'desc'])
    df.to_csv('fmriprep_datasets.csv')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment