Created
August 8, 2025 09:46
-
-
Save thanhleviet/ac91c8bff715197c59b3d0a1ef48dc5f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import subprocess | |
import sys | |
import os | |
def download_refseq_genomes(taxids, output_dir=".", reference_only=True):
    """Download genome packages for one or more taxids via the NCBI ``datasets`` CLI.

    Runs ``datasets download genome taxon <taxid...> [--reference] --filename <zip>``
    and stores the archive as ``<output_dir>/<taxids joined by '_'>_refseq.zip``.
    When a reference-only request fails because the tool reports that no genome
    assemblies were found, the download is retried once without ``--reference``.

    Args:
        taxids (str | int | iterable): A single taxid, a comma-separated string
            of taxids, or an iterable of taxids (strings or integers). Blank
            entries are ignored.
        output_dir (str): Directory for the downloaded archive; created if missing.
        reference_only (bool): Pass ``--reference`` to restrict the download to
            reference genomes.

    Returns:
        bool: True when the ``datasets`` command exits successfully; False when
        no usable taxid was supplied, the command fails, or the ``datasets``
        executable cannot be found.
    """
    # Normalise every accepted input shape to a list of non-empty strings so the
    # later str.join calls and the subprocess argument list never see ints or "".
    if isinstance(taxids, str):
        raw_ids = taxids.split(",")
    elif isinstance(taxids, int):
        raw_ids = [taxids]
    else:
        raw_ids = taxids
    taxid_list = [str(t).strip() for t in raw_ids]
    taxid_list = [t for t in taxid_list if t]

    if not taxid_list:
        print("Error: no taxid provided.")
        return False

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # A single taxid joins to itself, so one expression covers both cases.
    output_file = os.path.join(output_dir, f"{'_'.join(taxid_list)}_refseq.zip")

    # NOTE(review): no --assembly-source filter is passed, so whether results are
    # limited to RefSeq depends on the tool's defaults -- confirm.
    cmd = ["datasets", "download", "genome", "taxon", *taxid_list]
    if reference_only:
        cmd.append("--reference")
    cmd.extend(["--filename", output_file])

    if reference_only:
        print(f"Downloading RefSeq reference genomes for taxid(s): {', '.join(taxid_list)}...")
    else:
        print(f"Downloading all RefSeq genomes for taxid(s): {', '.join(taxid_list)}...")

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        stderr = e.stderr or ""
        stdout = e.stdout or ""
        if reference_only and "no genome assemblies were found" in stderr:
            # Fall back to every available genome when no reference one exists.
            print("No reference genomes found. Trying to download all available genomes...")
            return download_refseq_genomes(taxid_list, output_dir, reference_only=False)
        print(f"Error downloading genomes: {e}")
        if stdout.strip():
            print(f"Command output: {stdout}")
        if stderr.strip():
            print(f"Command error: {stderr}")
        return False
    except FileNotFoundError:
        print("Error: 'datasets' command not found. Please install NCBI datasets tool.")
        print("See: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/")
        return False

    print(f"Successfully downloaded to: {output_file}")
    if result.stdout.strip():
        print(result.stdout)
    return True
def main():
    """Command-line entry point: parse arguments, run the download, exit 1 on failure."""
    cli = argparse.ArgumentParser(description="Download RefSeq genomes using taxid(s)")
    cli.add_argument(
        "taxids",
        help="Taxonomic ID(s) - single taxid or comma-separated list",
    )
    cli.add_argument(
        "-o",
        "--output-dir",
        default=".",
        help="Output directory (default: current directory)",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        help="Download all genomes instead of just reference genomes",
    )
    opts = cli.parse_args()

    # --all switches off the reference-only restriction.
    only_reference = not opts.all
    ok = download_refseq_genomes(
        opts.taxids, opts.output_dir, reference_only=only_reference
    )
    if ok:
        return
    sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment