Last active
February 6, 2024 16:31
-
-
Save nvictus/d1a04c2c2d3e949e75e12aa03b35485f to your computer and use it in GitHub Desktop.
Dump fastq-dump! Download NCBI-SRA FASTQs directly from the European Nucleotide Archive.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import hashlib | |
import os.path as op | |
import os | |
import re | |
import warnings | |
from contextlib import closing | |
from urllib.parse import urlsplit | |
from urllib.request import urlopen | |
import requests | |
import tqdm | |
# API docs PDF available here: https://www.ebi.ac.uk/ena/portal/api/doc | |
ENA_BASE_URL = "https://www.ebi.ac.uk/ena/portal/api" | |
def _get_run_report(accession):
    """
    Query the ENA Portal API file report for an accession.

    Parameters
    ----------
    accession : str
        Accession to look up; it is converted with ``str`` before sending.

    Returns
    -------
    list of dict
        One record per run as decoded from the JSON response. In each
        record the ``fastq_ftp``, ``fastq_md5`` and ``fastq_bytes`` fields
        are converted from ';'-separated strings into lists, with
        ``fastq_bytes`` holding ints. A field that is empty or absent
        becomes an empty list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    params = {
        "accession": str(accession),
        "result": "read_run",
        "fields": "run_accession,sample_title,fastq_ftp,fastq_md5,fastq_bytes",
        "format": "json",
    }
    r = requests.get(ENA_BASE_URL + "/filereport", params=params)
    r.raise_for_status()
    records = r.json()
    for record in records:
        for field in ("fastq_ftp", "fastq_md5", "fastq_bytes"):
            raw = record.get(field) or ""
            # "".split(";") yields [""], which would make int("") fail below;
            # drop empty items so a run without files gives empty lists.
            record[field] = [item for item in raw.split(";") if item]
        record["fastq_bytes"] = [int(size) for size in record["fastq_bytes"]]
    return records
def _copyfileobj_with_progbar(fsrc, fdst, filesize, bufsize=16 * 1024, desc=None):
    """
    Copy ``fsrc`` to ``fdst`` in chunks, showing a progress bar.

    Parameters
    ----------
    fsrc : file-like
        Source object; read with ``fsrc.read(bufsize)`` until it returns
        an empty result.
    fdst : file-like
        Destination object; each chunk is passed to ``fdst.write``.
    filesize : int
        Expected total size in bytes, used as the progress bar total.
    bufsize : int, optional
        Maximum number of bytes requested per read.
    desc : str, optional
        Label displayed next to the progress bar.

    Returns
    -------
    str
        Hexadecimal MD5 digest of all data that was copied.
    """
    file_hash = hashlib.md5()
    # The context manager closes the bar even if a read or write fails.
    with tqdm.tqdm(total=filesize, desc=desc, unit="B", unit_scale=True) as progbar:
        while True:
            buf = fsrc.read(bufsize)
            if not buf:
                break
            fdst.write(buf)
            file_hash.update(buf)
            # Advance by the bytes actually received: a read may return
            # fewer than bufsize bytes (always true for the final chunk).
            progbar.update(len(buf))
    return file_hash.hexdigest()
def _download_fastq(report, dest_dir):
    """
    Fetch every FASTQ file listed in a single run record.

    Parameters
    ----------
    report : dict
        Run record whose ``fastq_ftp``, ``fastq_bytes`` and ``fastq_md5``
        entries are parallel lists (location, size in bytes, MD5 hex digest).
    dest_dir : str
        Directory the files are written into.

    Returns
    -------
    list of str
        Local path of each file, in the order listed in the record.

    Notes
    -----
    A checksum mismatch is reported with ``warnings.warn``; the file is kept
    and the download continues.
    """
    saved = []
    listing = zip(report["fastq_ftp"], report["fastq_bytes"], report["fastq_md5"])
    for link, nbytes, expected_md5 in listing:
        # Entries may lack a scheme; prepend one when it is missing.
        source = link if link.startswith("ftp://") else "ftp://" + link
        name = urlsplit(source).path.rpartition("/")[2]
        target = op.join(dest_dir, name)
        saved.append(target)
        with closing(urlopen(source)) as remote:
            with open(target, "wb") as local:
                actual_md5 = _copyfileobj_with_progbar(
                    remote, local, nbytes, desc=target
                )
        if actual_md5 != expected_md5:
            warnings.warn(
                f"Checksum mismatch for {name}: {actual_md5} != {expected_md5}"
            )
    return saved
def download_fastq(run_accession, dest_dir=""):
    """
    Download FASTQ files for a sequencing run from ENA.

    Sequencing runs may be from NCBI-SRA (SRR), EMBL-SRA (ERR), DDBJ-SRA (DRR).

    Parameters
    ----------
    run_accession : str
        Sequencing run accession, having format (E|D|S)RR[0-9]{6,}.
    dest_dir : str, optional
        Destination directory for download. Default is the cwd.

    Returns
    -------
    dict or list of dict
        Download metadata, including sample name, urls, file sizes and md5
        hashes, plus a ``fastq_download`` entry listing the local paths.
        A single dict is returned when the report holds exactly one
        record; otherwise the list of records is returned unchanged.

    Examples
    --------
    >>> download_fastq('SRR001030', '/tmp')
    {'sample_accession': 'SAMN00000119',
     'run_accession': 'SRR001030',
     'sample_title': 'Generic sample from Homo sapiens',
     'fastq_ftp': ['ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001030/SRR001030.fastq.gz'],
     'fastq_md5': ['56ad9495ef258a7fd589ef384130797f'],
     'fastq_bytes': [19778064],
     'fastq_download': ['/tmp/SRR001030.fastq.gz']}
    """
    records = _get_run_report(run_accession)
    for entry in records:
        print("Downloading:", entry["sample_title"])
        entry["fastq_download"] = _download_fastq(entry, dest_dir)

    # Unwrap the common one-run case so callers get the record directly.
    is_single = isinstance(records, list) and len(records) == 1
    return records[0] if is_single else records
if __name__ == "__main__":
    # Command-line entry point: download the FASTQ files of one accession
    # and optionally add symlinks named after the sample title.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "accession", help="Sequencing run accession, having format (E|D|S)RR[0-9]{6,}."
    )
    parser.add_argument(
        "-d", "--dest-dir", default="", help="Specify the download directory."
    )
    parser.add_argument(
        "-s", "--symlink-title",
        action="store_true",
        help="Create (hopefully) informative symlinks from the sample's title.",
    )
    args = parser.parse_args()

    report = download_fastq(args.accession, args.dest_dir)

    if args.symlink_title:
        # download_fastq returns a bare dict for exactly one record and a
        # list otherwise; normalise so both shapes are handled.
        records = report if isinstance(report, list) else [report]
        for record in records:
            # A "/" in the title would be interpreted as a path separator.
            sample_title = record["sample_title"].replace("/", "_")
            for i, download_path in enumerate(record["fastq_download"]):
                dirpath, filename = op.split(download_path)
                # Keep the optional read-side suffix (_1/_2) and everything
                # from ".fastq" onwards; fall back to a running number.
                match = re.search(r"((_[12])?(\.fastq\S*))", filename)
                if match is None:
                    side_ext = "." + str(i + 1)
                else:
                    side_ext = match.group(1)
                    if side_ext.startswith("_"):
                        side_ext = "." + side_ext[1:]
                # The link lives next to the downloaded file, and a relative
                # link target is resolved against the link's own directory,
                # so point at the bare filename rather than download_path.
                os.symlink(filename, op.join(dirpath, sample_title + side_ext))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment