mbk0asis · June 7, 2021 00:46
diff --git a/base compostion b/base compostion
 ####################################################
 #                                                  #
 #  to report base composition at all mapped sites  #
 #                                                  #
 ####################################################

 # only when NM tag is missing in BAM
 $ samtools calmd BAM ref.fasta | samtools view -b - -o BAM.NMtag.bam

 # bam-readcount (use "-l" for regions of interest, "-w 0" for no warning)
 $ ls *.NMtag.bam | parallel '~/bin/bam-readcount/bin/bam-readcount -w 0 -f ../ref_Mut/target_genes.fasta -l ../ref_Mut/amplicons.narrow.bed {} > {}.bamCnt' &


 # extract data from bam-readcount results
 $ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 1-2,5,6 > $b.position & done
 $ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 1 14 1000) > $b.type & done
 $ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b | cut -f 8- | sed 's/\t/:/g' | cut -d: -f $(seq -s , 2 14 1000) > $b.cnt & done 


 # combine extracted data
 $ for b in *.bamCnt; do paste $b.position $b.type $b.cnt > $b.baseComp & done


 # Calculate freqeuncy of base composition
 $ for b in *.baseComp; do awk 'OFS="\t"{gsub(/:/,"\t",$6); print $0}' $b | awk '{for(i=6;i<=NF;i++) printf $i/$4":"; print ""}' | sed 's/:$//g' > $b.freq & done


 # combine w/ base composition data
 $ for b in *.baseComp; do paste $b $b.freq > $b.final & done


 # Detection of Minor variants 
 # (e.g. count the number of site or genomic position containing at least one of minor variants with frequency between 1 ~ 20%)
 $ for b in *.final; do awk '{print $1"_"$2"\t"$3"\t",$7}' $b | sed 's/:/\t/g' | awk '{if($1>=0.01 && $1<0.2) print $0"\tA\t"$1; else if($2>=0.01 && $2<0.2) print $0"\tC\t"$2; else if($3>=0.01 && $3<0.2) print $0"\tG\t"$3; else if($4>=0.01 && $4<0.2) print $0"\tT\t"$4}' | awk 'OFS="\t"{print $1,$2">"$7,$3,$4,$5,$6}' | sort > $b.1-20pct.altBase; done
 $ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.3 -t $'\t' | sort | uniq > $b.minor.var.freq & done
 $ for b in *.1-20pct.altBase; do cat *.1-20pct.altBase | cut -f 1 | sort | uniq | join - $b -a1 -e - -o 1.1,2.2 -t $'\t' | sort | uniq > $b.minor.var.base & done
 $ paste *.1-20pct.altBase.minor.var.freq | cut -f1,$(seq -s , 2 2 1000) | sed 's/\t/,/g' > ../All.minor.var.1-20pct.csv
	####################################################
	# #
	# to report base composition at all mapped sites #
	# #
	####################################################

	# only when NM tag is missing in BAM
	$ samtools calmd BAM ref.fasta \| samtools view -b - -o BAM.NMtag.bam

	# bam-readcount (use "-l" for regions of interest, "-w 0" for no warning)
	$ ls *.NMtag.bam \| parallel '~/bin/bam-readcount/bin/bam-readcount -w 0 -f ../ref_Mut/target_genes.fasta -l ../ref_Mut/amplicons.narrow.bed {} > {}.bamCnt' &


	# extract data from bam-readcount results
	$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b \| cut -f 1-2,5,6 > $b.position & done
	$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b \| cut -f 8- \| sed 's/\t/:/g' \| cut -d: -f $(seq -s , 1 14 1000) > $b.type & done
	$ for b in *.bamCnt; do awk 'OFS="\t"{print $1,$2,$0}' $b \| cut -f 8- \| sed 's/\t/:/g' \| cut -d: -f $(seq -s , 2 14 1000) > $b.cnt & done


	# combine extracted data
	$ for b in *.bamCnt; do paste $b.position $b.type $b.cnt > $b.baseComp & done


	# Calculate freqeuncy of base composition
	$ for b in *.baseComp; do awk 'OFS="\t"{gsub(/:/,"\t",$6); print $0}' $b \| awk '{for(i=6;i<=NF;i++) printf $i/$4":"; print ""}' \| sed 's/:$//g' > $b.freq & done


	# combine w/ base composition data
	$ for b in *.baseComp; do paste $b $b.freq > $b.final & done


	# Detection of Minor variants
	# (e.g. count the number of site or genomic position containing at least one of minor variants with frequency between 1 ~ 20%)
	$ for b in *.final; do awk '{print $1"_"$2"\t"$3"\t",$7}' $b \| sed 's/:/\t/g' \| awk '{if($1>=0.01 && $1<0.2) print $0"\tA\t"$1; else if($2>=0.01 && $2<0.2) print $0"\tC\t"$2; else if($3>=0.01 && $3<0.2) print $0"\tG\t"$3; else if($4>=0.01 && $4<0.2) print $0"\tT\t"$4}' \| awk 'OFS="\t"{print $1,$2">"$7,$3,$4,$5,$6}' \| sort > $b.1-20pct.altBase; done
	$ for b in .1-20pct.altBase; do cat .1-20pct.altBase \| cut -f 1 \| sort \| uniq \| join - $b -a1 -e - -o 1.1,2.3 -t $'\t' \| sort \| uniq > $b.minor.var.freq & done
	$ for b in .1-20pct.altBase; do cat .1-20pct.altBase \| cut -f 1 \| sort \| uniq \| join - $b -a1 -e - -o 1.1,2.2 -t $'\t' \| sort \| uniq > $b.minor.var.base & done
	$ paste *.1-20pct.altBase.minor.var.freq \| cut -f1,$(seq -s , 2 2 1000) \| sed 's/\t/,/g' > ../All.minor.var.1-20pct.csv