Skip to content

Instantly share code, notes, and snippets.

@thanhleviet
Created March 14, 2022 13:56
Show Gist options
  • Save thanhleviet/29cfe0955917652dc57c0a6423d03186 to your computer and use it in GitHub Desktop.
Save thanhleviet/29cfe0955917652dc57c0a6423d03186 to your computer and use it in GitHub Desktop.
Download (Campylobacter) genomes from pubmlst.org in parallel
#!/usr/bin/env python
import requests
import time
import pathlib
from joblib import Parallel, delayed
from pqdm.processes import pqdm
def url_template(id):
    """Return the PubMLST Contigs-plugin URL for one isolate.

    The URL targets the ``pubmlst_campylobacter_isolates`` database and
    requests text output (``format=text``) for the given isolate.

    Args:
        id: Isolate identifier (int or str). It is converted to ``str`` and
            percent-encoded so that stray characters such as ``&`` or
            whitespace cannot corrupt the query string.

    Returns:
        The full request URL as a string.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    url0 = "https://pubmlst.org/bigsdb?db=pubmlst_campylobacter_isolates&page=plugin&name=Contigs&format=text&isolate_id="
    url1 = "&match=1&pc_untagged=0&min_length=&header=1"
    return f"{url0}{quote(str(id), safe='')}{url1}"
def download(id):
    """Download the contigs of one isolate to ``./download/<id>.fa``.

    The download is skipped when the target file already exists. The
    response body is written only when the server answers with HTTP 200;
    any other status is ignored silently. Every exception is caught and
    printed, so one failing id does not abort a parallel run.

    Args:
        id: Isolate identifier, passed to ``url_template`` and used as the
            output file name.

    Returns:
        None.
    """
    try:
        file_name = pathlib.Path("./download") / f"{id}.fa"
        if not file_name.exists():
            # Create the output directory on demand; without it every
            # write would fail.
            file_name.parent.mkdir(parents=True, exist_ok=True)
            # A timeout stops a stalled connection from blocking this
            # worker forever.
            r = requests.get(url_template(id), timeout=60)
            if r.status_code == 200:
                # Context manager closes the file deterministically.
                with open(file_name, "w", encoding="utf-8") as fh:
                    fh.write(r.text)
            # Pause after each request -- presumably to avoid hammering
            # the server; confirm the intended rate limit.
            time.sleep(1)
    except Exception as e:
        # Best-effort: report which id failed and carry on.
        print(f"{id}: {e}")
def main():
    """Read isolate ids from the input file and download them in parallel.

    Each non-blank line of the input file is treated as one genome id.
    The ids are passed to ``download`` through ``pqdm`` using 10 jobs.
    """
    # A file of genome ids, each line is a single genome
    input_file = "test.txt"
    with open(input_file, "r") as _fh:
        # Skip blank lines so that no request is made with an empty id.
        ids = [line.strip() for line in _fh if line.strip()]
    print(f"Starting download... {len(ids)} files from {input_file}")
    # joblib alternative:
    #   Parallel(n_jobs=10)(delayed(download)(id) for id in ids)
    pqdm(ids, download, n_jobs=10)


# The guard is required for process-based parallelism: with the "spawn"
# start method each worker re-imports this module, and unguarded
# top-level code would start the downloads again in every worker.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment