thanhleviet · August 8, 2025 09:46
diff --git a/download_genomes.py b/download_genomes.py
 #!/usr/bin/env python3

 import argparse
 import subprocess
 import sys
 import os

 def download_refseq_genomes(taxids, output_dir=".", reference_only=True):
    """
    Download genomes for the given taxid(s) using the NCBI datasets tool.

    Runs ``datasets download genome taxon <taxids...> [--reference]
    --filename <output_dir>/<taxids joined by "_">_refseq.zip``. If a
    reference-only request fails and stderr contains "no genome assemblies
    were found", the download is retried once without ``--reference``.

    Args:
        taxids (str or list): Taxonomic ID(s) - a single taxid, a
            comma-separated string, or an iterable of taxids (str or int).
            Surrounding whitespace is stripped and empty entries are dropped.
        output_dir (str): Output directory for downloaded files; created if
            it does not exist.
        reference_only (bool): Whether to download only reference genomes

    Returns:
        bool: True when the download command succeeded; False when no usable
        taxid was given, the command failed, or 'datasets' is not installed.
    """
    # Normalise every accepted input shape to a clean list of strings, so
    # that int taxids work and stray commas never become empty CLI arguments.
    if taxids is None:
        raw_taxids = []
    elif isinstance(taxids, (str, int)):
        raw_taxids = str(taxids).split(',')
    else:
        raw_taxids = taxids
    taxid_list = [t for t in (str(raw).strip() for raw in raw_taxids) if t]

    # Reject empty input before touching the filesystem or the external tool.
    if not taxid_list:
        print("Error: no valid taxid provided.")
        return False

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # One archive per request, named after all requested taxids
    joined_taxids = '_'.join(taxid_list)
    output_file = os.path.join(output_dir, f"{joined_taxids}_refseq.zip")

    # Build the command in its final order: taxids, optional --reference,
    # then --filename <path>.
    cmd = ["datasets", "download", "genome", "taxon"]
    cmd.extend(taxid_list)
    if reference_only:
        cmd.append("--reference")
        print(f"Downloading RefSeq reference genomes for taxid(s): {', '.join(taxid_list)}...")
    else:
        print(f"Downloading all RefSeq genomes for taxid(s): {', '.join(taxid_list)}...")
    cmd.extend(["--filename", output_file])

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        stderr_text = e.stderr or ""
        stdout_text = e.stdout or ""
        if reference_only and "no genome assemblies were found" in stderr_text:
            # No reference assembly exists for this taxon: retry once without
            # the --reference filter (the retry cannot recurse again).
            print("No reference genomes found. Trying to download all available genomes...")
            return download_refseq_genomes(taxid_list, output_dir, reference_only=False)
        print(f"Error downloading genomes: {e}")
        if stdout_text.strip():
            print(f"Command output: {stdout_text}")
        if stderr_text.strip():
            print(f"Command error: {stderr_text}")
        return False
    except FileNotFoundError:
        print("Error: 'datasets' command not found. Please install NCBI datasets tool.")
        print("See: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/")
        return False

    print(f"Successfully downloaded to: {output_file}")
    if result.stdout.strip():
        print(result.stdout)
    return True

 def main():
    """
    Command-line entry point.

    Parses the positional taxid argument plus the -o/--output-dir and --all
    options, runs the download, and exits with status 1 when it fails.
    """
    cli = argparse.ArgumentParser(description="Download RefSeq genomes using taxid(s)")
    cli.add_argument(
        "taxids",
        help="Taxonomic ID(s) - single taxid or comma-separated list",
    )
    cli.add_argument(
        "-o",
        "--output-dir",
        default=".",
        help="Output directory (default: current directory)",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        help="Download all genomes instead of just reference genomes",
    )
    options = cli.parse_args()

    # --all switches off the reference-only filter
    want_reference_only = not options.all
    if download_refseq_genomes(
        options.taxids, options.output_dir, reference_only=want_reference_only
    ):
        return
    sys.exit(1)

 # Run the command-line interface only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import argparse
	import subprocess
	import sys
	import os

	def download_refseq_genomes(taxids, output_dir=".", reference_only=True):
	    """
	    Download genomes for the given taxid(s) using the NCBI datasets tool.

	    Runs ``datasets download genome taxon <taxids...> [--reference]
	    --filename <output_dir>/<taxids joined by "_">_refseq.zip``. If a
	    reference-only request fails and stderr contains "no genome assemblies
	    were found", the download is retried once without ``--reference``.

	    Args:
	        taxids (str or list): Taxonomic ID(s) - a single taxid, a
	            comma-separated string, or an iterable of taxids (str or int).
	            Surrounding whitespace is stripped and empty entries are dropped.
	        output_dir (str): Output directory for downloaded files; created if
	            it does not exist.
	        reference_only (bool): Whether to download only reference genomes

	    Returns:
	        bool: True when the download command succeeded; False when no usable
	        taxid was given, the command failed, or 'datasets' is not installed.
	    """
	    # Normalise every accepted input shape to a clean list of strings, so
	    # that int taxids work and stray commas never become empty CLI arguments.
	    if taxids is None:
	        raw_taxids = []
	    elif isinstance(taxids, (str, int)):
	        raw_taxids = str(taxids).split(',')
	    else:
	        raw_taxids = taxids
	    taxid_list = [t for t in (str(raw).strip() for raw in raw_taxids) if t]

	    # Reject empty input before touching the filesystem or the external tool.
	    if not taxid_list:
	        print("Error: no valid taxid provided.")
	        return False

	    # Create output directory if it doesn't exist
	    os.makedirs(output_dir, exist_ok=True)

	    # One archive per request, named after all requested taxids
	    joined_taxids = '_'.join(taxid_list)
	    output_file = os.path.join(output_dir, f"{joined_taxids}_refseq.zip")

	    # Build the command in its final order: taxids, optional --reference,
	    # then --filename <path>.
	    cmd = ["datasets", "download", "genome", "taxon"]
	    cmd.extend(taxid_list)
	    if reference_only:
	        cmd.append("--reference")
	        print(f"Downloading RefSeq reference genomes for taxid(s): {', '.join(taxid_list)}...")
	    else:
	        print(f"Downloading all RefSeq genomes for taxid(s): {', '.join(taxid_list)}...")
	    cmd.extend(["--filename", output_file])

	    try:
	        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
	    except subprocess.CalledProcessError as e:
	        stderr_text = e.stderr or ""
	        stdout_text = e.stdout or ""
	        if reference_only and "no genome assemblies were found" in stderr_text:
	            # No reference assembly exists for this taxon: retry once without
	            # the --reference filter (the retry cannot recurse again).
	            print("No reference genomes found. Trying to download all available genomes...")
	            return download_refseq_genomes(taxid_list, output_dir, reference_only=False)
	        print(f"Error downloading genomes: {e}")
	        if stdout_text.strip():
	            print(f"Command output: {stdout_text}")
	        if stderr_text.strip():
	            print(f"Command error: {stderr_text}")
	        return False
	    except FileNotFoundError:
	        print("Error: 'datasets' command not found. Please install NCBI datasets tool.")
	        print("See: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/")
	        return False

	    print(f"Successfully downloaded to: {output_file}")
	    if result.stdout.strip():
	        print(result.stdout)
	    return True

	def main():
	    """
	    Command-line entry point.

	    Parses the positional taxid argument plus the -o/--output-dir and --all
	    options, runs the download, and exits with status 1 when it fails.
	    """
	    parser = argparse.ArgumentParser(description="Download RefSeq genomes using taxid(s)")
	    parser.add_argument("taxids", help="Taxonomic ID(s) - single taxid or comma-separated list")
	    parser.add_argument("-o", "--output-dir", default=".",
	                        help="Output directory (default: current directory)")
	    parser.add_argument("--all", action="store_true",
	                        help="Download all genomes instead of just reference genomes")

	    args = parser.parse_args()

	    # --all switches off the reference-only filter
	    success = download_refseq_genomes(args.taxids, args.output_dir, reference_only=not args.all)
	    if not success:
	        sys.exit(1)

	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()