Last active
October 12, 2021 22:48
-
-
Save nutjob4life/6dfabc67b284f892d3300fdb3ecfac4d to your computer and use it in GitHub Desktop.
LabCAS Download Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
#
# Sample LabCAS Download script
#
# To run this, you'll need Python 3 with the `requests` package. The easiest
# way to do this is with a "virtual environment" by running:
#
#     $ python3 -m venv venv
#     $ cd venv
#     $ bin/pip install --quiet --upgrade pip requests
#
# Now, use bin/python instead of your usual system python.
#
# Then, set environment variables as follows:
#
# - LABCAS_ID: the name of the collection or dataset you want to download; if
#   unset, it defaults to `Barrett's_Esophagus_Methylation_Profile_Dataset`
# - TARGET_DIR: the local directory in which to save the data; if unset, it
#   defaults to the current directory
# - EDRN_USERNAME: the username of the EDRN account to use to fetch the data
# - EDRN_PASSWORD: the credential that authenticates `EDRN_USERNAME`
#
# And finally run this script:
#
#     $ bin/python labcas-download.py
#
# Created by Asitang Mishra, [email protected]
# Refined by https://github.com/nutjob4life
import os
import urllib.parse

import requests
# Use environment variables
# `or` (rather than a getenv default) makes a variable that is set but empty
# behave exactly like one that is unset, so the documented defaults apply.
labcas_id = os.getenv('LABCAS_ID') or "Barrett's_Esophagus_Methylation_Profile_Dataset"
target_dir = os.getenv('TARGET_DIR') or '.'
# Credentials are optional; both default to the empty string when not provided.
edrn_username = os.getenv('EDRN_USERNAME', '')
edrn_password = os.getenv('EDRN_PASSWORD', '')
# ============= list all files
# Send HTTP basic-auth credentials only when both a username and a password
# were supplied; otherwise make the requests without credentials.
if edrn_username and edrn_password:
    auth = (edrn_username, edrn_password)
else:
    auth = None

url = "https://edrn-labcas.jpl.nasa.gov/data-access-api/"
# An ID without a slash is requested from the `collections` endpoint; an ID
# containing a slash is requested from the `datasets` endpoint.
request_type = 'datasets' if '/' in labcas_id else 'collections'
request_url = url + request_type + '/list?rows=20000000&q=id:' + labcas_id
response = requests.get(request_url, timeout=10, auth=auth)

if response.ok:
    # The response body is one download URL per line.  Keep the stripped form
    # so stray whitespace (e.g. a trailing "\r") never ends up inside a URL.
    files_list = [line.strip() for line in response.text.split('\n') if line.strip()]
else:
    # Do not try to interpret an error page as a list of URLs.
    print('Listing request failed with HTTP status', response.status_code)
    files_list = []

if files_list:
    print('Preparing', len(files_list), 'files to be downloaded....')
else:
    print('No files present or accessible in LabCAS for:', labcas_id)
    print('Please visit https://edrn-labcas.jpl.nasa.gov/ to find the correct id or check your access to the dataset.')
# ============= download files
for file_url in files_list:
    # The relative path is the percent-encoded text that follows the first
    # "id" in the URL plus one separator character.  Split only once so that
    # paths which themselves contain "id" (e.g. "slide_01.svs") are kept whole
    # instead of being truncated at the second occurrence.
    pieces = file_url.split('id', 1)
    if len(pieces) < 2:
        print('Skipping unrecognized entry:', file_url)
        continue
    file_rel_path = urllib.parse.unquote(pieces[1][1:])
    print('Downloading:', file_rel_path)

    # NOTE(review): file_rel_path comes from the server response and is joined
    # onto target_dir unchecked — confirm the service never returns ".." parts.
    destination = os.path.join(target_dir, file_rel_path)

    # Using the response as a context manager releases the streamed
    # connection even when the download is skipped or fails part-way.
    with requests.get(file_url, stream=True, auth=auth) as response:
        if not response.ok:
            # Never save an HTTP error body to disk as if it were the data file.
            print('Failed to download', file_rel_path, '- HTTP status', response.status_code)
            continue

        parent_dir = os.path.dirname(destination)
        if parent_dir:  # makedirs('') raises, so skip when there is no parent
            os.makedirs(parent_dir, exist_ok=True)

        # `with` guarantees the file is flushed and closed after each download.
        with open(destination, "wb") as handle:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    handle.write(chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment