Skip to content

Instantly share code, notes, and snippets.

@mbk0asis
Last active June 7, 2021 00:46
Show Gist options
  • Save mbk0asis/ae1d3b697e1462bbc1b30a4a62372f19 to your computer and use it in GitHub Desktop.
Save mbk0asis/ae1d3b697e1462bbc1b30a4a62372f19 to your computer and use it in GitHub Desktop.
####################################################
# #
# to report base composition at all mapped sites #
# #
####################################################
# only when NM tag is missing in BAM
$ samtools calmd BAM ref.fasta | samtools view -b - -o BAM.NMtag.bam
# bam-readcount (use "-l" for regions of interest, "-w 0" for no warning)
$ ls *.NMtag.bam | parallel '~/bin/bam-readcount/bin/bam-readcount -w 0 -f ../ref_Mut/target_genes.fasta -l ../ref_Mut/amplicons.narrow.bed {} > {}.bamCnt' &
# extract data from bam-readcount results
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 1-2,5,6 > $b.position & done
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 1 14 1000) > $b.type & done
$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 2 14 1000) > $b.cnt & done
# combine extracted data
$ for b in *.bamCnt; do paste $b.position $b.type $b.cnt > $b.baseComp & done
# Calculate freqeuncy of base composition
$ for b in *.baseComp; do awk 'OFS="\t"{gsub(/:/,"\t",$6); print $0}' $b | awk '{for(i=6;i<=NF;i++) printf $i/$4":"; print ""}' | sed 's/:$//g' > $b.freq & done
# combine w/ base composition data
$ for b in *.baseComp; do paste $b $b.freq > $b.final & done
# Detection of Minor variants
# (e.g. count the number of site or genomic position containing at least one of minor variants with frequency between 1 ~ 20%)
$ for b in *.final; do awk '{print $1"_"$2"\t"$3"\t",$7}' $b | sed 's/:/\t/g' | awk '{if($1>=0.01 && $1<0.2) print $0"\tA\t"$1; else if($2>=0.01 && $2<0.2) print $0"\tC\t"$2; else if($3>=0.01 && $3<0.2) print $0"\tG\t"$3; else if($4>=0.01 && $4<0.2) print $0"\tT\t"$4}' | awk 'OFS="\t"{print $1,$2">"$7,$3,$4,$5,$6}' | sort > $b.1-20pct.altBase; done
$ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.3 -t $'\t' | sort | uniq > $b.minor.var.freq & done
$ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.2 -t $'\t' | sort | uniq > $b.minor.var.base & done
$ paste *.1-20pct.altBase.minor.var.freq | cut -f1,$(seq -s , 2 2 1000) | sed 's/\t/,/g' > ../All.minor.var.1-20pct.csv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment