Skip to content

Instantly share code, notes, and snippets.

@thanhleviet
Created August 8, 2025 09:46
Show Gist options
  • Save thanhleviet/ac91c8bff715197c59b3d0a1ef48dc5f to your computer and use it in GitHub Desktop.
Save thanhleviet/ac91c8bff715197c59b3d0a1ef48dc5f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import argparse
import subprocess
import sys
import os
def download_refseq_genomes(taxids, output_dir=".", reference_only=True):
    """Download RefSeq genomes for one or more taxa with the NCBI ``datasets`` CLI.

    Runs ``datasets download genome taxon <taxids...>`` restricted to RefSeq
    assemblies and writes the archive to
    ``<output_dir>/<taxid>[_<taxid>...]_refseq.zip``.

    Args:
        taxids (str | int | iterable): A single taxid, a comma-separated
            string of taxids, or an iterable of taxids. Items are converted
            to ``str`` and stripped; empty items are ignored.
        output_dir (str): Directory for the downloaded archive. It is created
            if it does not exist.
        reference_only (bool): When True, pass ``--reference`` so only
            reference genomes are requested. If the tool then reports that no
            genome assemblies were found, the download is retried once with
            ``reference_only=False``.

    Returns:
        bool: True if the ``datasets`` command exited successfully, False if
        no taxid was supplied, the command failed, or the ``datasets``
        executable could not be found.
    """
    # Normalize every accepted input shape to a clean list of strings so the
    # joins below never see ints and the CLI never receives empty arguments.
    if taxids is None:
        raw_items = []
    elif isinstance(taxids, (str, int)):
        raw_items = str(taxids).split(",")
    else:
        raw_items = taxids
    taxid_list = [t for t in (str(item).strip() for item in raw_items) if t]

    if not taxid_list:
        print("Error: no taxid provided.")
        return False

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # One naming rule covers both the single- and multi-taxid case.
    archive_stem = "_".join(taxid_list)
    output_file = os.path.join(output_dir, f"{archive_stem}_refseq.zip")

    # Build the command in its final order instead of patching it with
    # positional inserts. Without --assembly-source the tool also returns
    # GenBank (GCA_) assemblies, which contradicts the RefSeq-only contract.
    cmd = [
        "datasets",
        "download",
        "genome",
        "taxon",
        *taxid_list,
        "--assembly-source",
        "RefSeq",
    ]
    if reference_only:
        cmd.append("--reference")
    cmd.extend(["--filename", output_file])

    joined_taxids = ", ".join(taxid_list)
    if reference_only:
        print(f"Downloading RefSeq reference genomes for taxid(s): {joined_taxids}...")
    else:
        print(f"Downloading all RefSeq genomes for taxid(s): {joined_taxids}...")

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        stdout = e.stdout or ""
        stderr = e.stderr or ""
        if reference_only and "no genome assemblies were found" in stderr:
            # No reference genome exists for these taxa: widen the request.
            print("No reference genomes found. Trying to download all available genomes...")
            return download_refseq_genomes(taxid_list, output_dir, reference_only=False)
        print(f"Error downloading genomes: {e}")
        if stdout.strip():
            print(f"Command output: {stdout}")
        if stderr.strip():
            print(f"Command error: {stderr}")
        return False
    except FileNotFoundError:
        print("Error: 'datasets' command not found. Please install NCBI datasets tool.")
        print("See: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/")
        return False

    print(f"Successfully downloaded to: {output_file}")
    if result.stdout.strip():
        print(result.stdout)
    return True
def main():
    """Command-line entry point.

    Parses the taxid(s), the output directory and the ``--all`` switch, runs
    the download, and exits with status 1 when the download reports failure.
    """
    cli = argparse.ArgumentParser(description="Download RefSeq genomes using taxid(s)")
    cli.add_argument(
        "taxids",
        help="Taxonomic ID(s) - single taxid or comma-separated list",
    )
    cli.add_argument(
        "-o",
        "--output-dir",
        default=".",
        help="Output directory (default: current directory)",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        help="Download all genomes instead of just reference genomes",
    )
    opts = cli.parse_args()

    # --all turns off the reference-only restriction.
    want_reference_only = not opts.all
    ok = download_refseq_genomes(
        opts.taxids,
        opts.output_dir,
        reference_only=want_reference_only,
    )
    if ok:
        return
    sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment