thanhleviet · March 14, 2022 13:56
diff --git a/download_pubmlst_contigs.py b/download_pubmlst_contigs.py
 #!/usr/bin/env python
 import requests
 import time
 import pathlib
 from joblib import Parallel, delayed
 from pqdm.processes import pqdm

 def url_template(id):
     """Return the PubMLST contig-export URL for a single isolate.

     The URL targets the ``pubmlst_campylobacter_isolates`` database and its
     ``Contigs`` plugin in text format; ``id`` is inserted as the value of the
     ``isolate_id`` query parameter, followed by the fixed trailing options.

     Args:
         id: Isolate identifier; it is formatted into the URL as-is.

     Returns:
         The complete request URL as a string.
     """
     prefix = "https://pubmlst.org/bigsdb?db=pubmlst_campylobacter_isolates&page=plugin&name=Contigs&format=text&isolate_id="
     suffix = "&match=1&pc_untagged=0&min_length=&header=1"
     # format(id) applies the same default formatting an f-string field would.
     return prefix + format(id) + suffix

 def download(id):
     """Download the contigs of one isolate into ``./download/<id>.fa``.

     No request is made when the target file already exists. Otherwise the
     URL built by ``url_template(id)`` is fetched and the response text is
     saved only when the HTTP status is 200. The function sleeps one second
     after every request. Any error is printed instead of raised, so a single
     failing id does not abort a batch run.

     Args:
         id: Isolate identifier; used in the request URL and as the file stem.

     Returns:
         None.
     """
     target = pathlib.Path(f"./download/{id}.fa")
     try:
         if target.exists():
             return
         # The output directory may not exist yet; exist_ok keeps this safe
         # when several workers reach this line at the same time.
         target.parent.mkdir(parents=True, exist_ok=True)
         # Without a timeout a stalled connection would block this worker
         # indefinitely.
         r = requests.get(url_template(id), timeout=60)
         if r.status_code == 200:
             # Write to a temporary name and rename afterwards, so that an
             # interrupted run never leaves a truncated .fa file that the
             # exists() check above would later mistake for a finished one.
             partial = target.with_name(target.name + ".part")
             partial.write_text(r.text, encoding="utf-8")
             partial.replace(target)
         time.sleep(1)
     except Exception as e:
         # Best-effort by design: report which id failed and carry on.
         print(f"{id}: {e}")
 def main(id_file="test.txt"):
     """Read isolate ids from ``id_file`` and download them in parallel.

     Args:
         id_file: Path to a text file of genome ids, one id per line.
             Surrounding whitespace is stripped and blank lines are skipped.
     """
     with open(id_file, "r") as _fh:
         # Skip blank lines so that no request is made for an empty id.
         ids = [line.strip() for line in _fh if line.strip()]

     print(f"Starting download... {len(ids)} files from {id_file}")
     # Alternative kept from the original script:
     # Parallel(n_jobs=10)(delayed(download)(id) for id in ids)
     pqdm(ids, download, n_jobs=10)


 # pqdm.processes runs download() in worker processes. Where workers are
 # started by re-importing this module, unguarded top-level code would run
 # again in every worker, so the entry point must be guarded.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python
	import requests
	import time
	import pathlib
	from joblib import Parallel, delayed
	from pqdm.processes import pqdm

	def url_template(id):
	    """Return the PubMLST contig-export URL for a single isolate.

	    The URL targets the ``pubmlst_campylobacter_isolates`` database and its
	    ``Contigs`` plugin in text format; ``id`` is inserted as the value of
	    the ``isolate_id`` query parameter, followed by the fixed trailing
	    options.

	    Args:
	        id: Isolate identifier; it is formatted into the URL as-is.

	    Returns:
	        The complete request URL as a string.
	    """
	    url0 = "https://pubmlst.org/bigsdb?db=pubmlst_campylobacter_isolates&page=plugin&name=Contigs&format=text&isolate_id="
	    url1 = "&match=1&pc_untagged=0&min_length=&header=1"
	    return f"{url0}{id}{url1}"

	def download(id):
	    """Download the contigs of one isolate into ``./download/<id>.fa``.

	    No request is made when the target file already exists. Otherwise the
	    URL built by ``url_template(id)`` is fetched and the response text is
	    saved only when the HTTP status is 200. The function sleeps one second
	    after every request. Any error is printed instead of raised, so a
	    single failing id does not abort a batch run.

	    Args:
	        id: Isolate identifier; used in the request URL and as the file
	            stem.

	    Returns:
	        None.
	    """
	    target = pathlib.Path(f"./download/{id}.fa")
	    try:
	        if target.exists():
	            return
	        # The output directory may not exist yet; exist_ok keeps this safe
	        # when several workers reach this line at the same time.
	        target.parent.mkdir(parents=True, exist_ok=True)
	        # Without a timeout a stalled connection would block this worker
	        # indefinitely.
	        r = requests.get(url_template(id), timeout=60)
	        if r.status_code == 200:
	            # Write to a temporary name and rename afterwards, so that an
	            # interrupted run never leaves a truncated .fa file that the
	            # exists() check above would later mistake for a finished one.
	            partial = target.with_name(target.name + ".part")
	            partial.write_text(r.text, encoding="utf-8")
	            partial.replace(target)
	        time.sleep(1)
	    except Exception as e:
	        # Best-effort by design: report which id failed and carry on.
	        print(f"{id}: {e}")
	def main(id_file="test.txt"):
	    """Read isolate ids from ``id_file`` and download them in parallel.

	    Args:
	        id_file: Path to a text file of genome ids, one id per line.
	            Surrounding whitespace is stripped and blank lines are skipped.
	    """
	    with open(id_file, "r") as _fh:
	        # Skip blank lines so that no request is made for an empty id.
	        ids = [line.strip() for line in _fh if line.strip()]

	    print(f"Starting download... {len(ids)} files from {id_file}")
	    # Alternative kept from the original script:
	    # Parallel(n_jobs=10)(delayed(download)(id) for id in ids)
	    pqdm(ids, download, n_jobs=10)


	# pqdm.processes runs download() in worker processes. Where workers are
	# started by re-importing this module, unguarded top-level code would run
	# again in every worker, so the entry point must be guarded.
	if __name__ == "__main__":
	    main()