Skip to content

Instantly share code, notes, and snippets.

@faruqsandi
Last active September 21, 2024 04:13
Show Gist options
  • Save faruqsandi/fe215c449c391d8c08a8aa01c833d206 to your computer and use it in GitHub Desktop.
Save faruqsandi/fe215c449c391d8c08a8aa01c833d206 to your computer and use it in GitHub Desktop.
GATK Collecting Coverage Counts
from tabnanny import verbose
import pandas as pd
import pooch
from tqdm import tqdm
import time
tqdm.pandas()
import os
from pathlib import Path
from datetime import datetime
download_table = pd.read_csv("igsr_30x GRCh38.tsv", sep="\t")
# download_table = download_table.iloc[::-1]
total_len = len(download_table)
last_timestamp = datetime.utcnow()
temp_download_dir = "/mnt/disks/biodb/temp_cram_download"
# temp_download_dir = "./temp_cram_download"
output_path_folder = "/mnt/disks/biodb/copyreadcounts_output"
os.makedirs(temp_download_dir, exist_ok=True)
for i, (url, md5, sample) in download_table[["url", "md5", "Sample"]].iterrows():
print(datetime.utcnow(), datetime.utcnow()-last_timestamp)
last_timestamp = datetime.utcnow()
print(f"{i}/{total_len} ({round(i/total_len, 3)})) ", url, md5, sample)
output_path = os.path.join(output_path_folder, f"{sample}.counts.hdf5")
if os.path.isfile(output_path):
print('skipping')
continue
# break
try:
downloaded_cram = os.path.join(temp_download_dir, os.path.basename(url))
# os.system(f'wget -O {downloaded_cram} {url}')
# downloaded_cram = pooch.retrieve(url, known_hash=md5, path=temp_download_dir, progressbar=True)
os.system(f'axel -a -n 8 --output {downloaded_cram} {url}')
except Exception as e:
print(str(e))
time.sleep(600)
continue
print(datetime.utcnow(), datetime.utcnow()-last_timestamp)
last_timestamp = datetime.utcnow()
os.makedirs(output_path_folder, exist_ok=True)
# os.path.basename(downloaded_cram)
command = [
"./gatk-4.4.0.0/gatk CollectReadCounts",
f"-I {downloaded_cram}",
"-L ../galaxy_workflow/hg38_exome_v2.0.2_targets_sorted_validated.re_annotated.bed",
"-R /mnt/disks/biodb/references/hg38/Homo_sapiens_assembly38.fasta",
f"-O {output_path}",
"-imr OVERLAPPING_ONLY",
]
command1 = " ".join(command)
print(datetime.utcnow(), datetime.utcnow()-last_timestamp)
last_timestamp = datetime.utcnow()
os.system(f"samtools index {downloaded_cram}")
print(datetime.utcnow(), datetime.utcnow()-last_timestamp)
last_timestamp = datetime.utcnow()
os.system(command1)
print(datetime.utcnow(), datetime.utcnow()-last_timestamp)
last_timestamp = datetime.utcnow()
Path(downloaded_cram).unlink(missing_ok=True)
Path(downloaded_cram+'.crai').unlink(missing_ok=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment