Last active
June 7, 2021 00:46
-
-
Save mbk0asis/ae1d3b697e1462bbc1b30a4a62372f19 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################### | |
# # | |
# to report base composition at all mapped sites # | |
# # | |
#################################################### | |
# only when NM tag is missing in BAM | |
$ samtools calmd BAM ref.fasta | samtools view -b - -o BAM.NMtag.bam | |
# bam-readcount (use "-l" for regions of interest, "-w 0" for no warning) | |
$ ls *.NMtag.bam | parallel '~/bin/bam-readcount/bin/bam-readcount -w 0 -f ../ref_Mut/target_genes.fasta -l ../ref_Mut/amplicons.narrow.bed {} > {}.bamCnt' & | |
# extract data from bam-readcount results | |
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 1-2,5,6 > $b.position & done | |
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 1 14 1000) > $b.type & done | |
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 2 14 1000) > $b.cnt & done | |
# combine extracted data | |
$ for b in *.bamCnt; do paste $b.position $b.type $b.cnt > $b.baseComp & done | |
# Calculate freqeuncy of base composition | |
$ for b in *.baseComp; do awk 'OFS="\t"{gsub(/:/,"\t",$6); print $0}' $b | awk '{for(i=6;i<=NF;i++) printf $i/$4":"; print ""}' | sed 's/:$//g' > $b.freq & done | |
# combine w/ base composition data | |
$ for b in *.baseComp; do paste $b $b.freq > $b.final & done | |
# Detection of Minor variants | |
# (e.g. count the number of site or genomic position containing at least one of minor variants with frequency between 1 ~ 20%) | |
$ for b in *.final; do awk '{print $1"_"$2"\t"$3"\t",$7}' $b | sed 's/:/\t/g' | awk '{if($1>=0.01 && $1<0.2) print $0"\tA\t"$1; else if($2>=0.01 && $2<0.2) print $0"\tC\t"$2; else if($3>=0.01 && $3<0.2) print $0"\tG\t"$3; else if($4>=0.01 && $4<0.2) print $0"\tT\t"$4}' | awk 'OFS="\t"{print $1,$2">"$7,$3,$4,$5,$6}' | sort > $b.1-20pct.altBase; done | |
$ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.3 -t $'\t' | sort | uniq > $b.minor.var.freq & done | |
$ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.2 -t $'\t' | sort | uniq > $b.minor.var.base & done | |
$ paste *.1-20pct.altBase.minor.var.freq | cut -f1,$(seq -s , 2 2 1000) | sed 's/\t/,/g' > ../All.minor.var.1-20pct.csv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment