-
-
Save aseetharam/d1b68c3100280182bd2499c432046403 to your computer and use it in GitHub Desktop.
Add padding up and/or downstream of TSS coordinates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| BEDTOOLS=$(dirname $(which bedtools)) | |
| FEATURECOUNTS=$(which featureCounts) | |
| R=$(which R) | |
| IDIR=$(pwd) | |
| DIR=$(pwd)/GeneBed_output | |
| GTF=gencode.vM30.annotation.gtf | |
| CHR_LEN=mm10.chrom.sizes | |
| ## check if file exists | |
| if [ -f "${CHR_LEN}" ];then | |
| echo "File ${CHR_LEN} exists" | |
| else | |
| echo "File ${CHR_LEN} not exists" | |
| echo "please download the chromosome length file from http://hgdownload.cse.ucsc.edu/goldenPath/mm10/bigZips/" | |
| fi | |
| ## check if file exists and not empty | |
| if [ -f "${GTF}" ];then | |
| if [ -s "${GTF}" ];then | |
| echo "File ${GTF} exists and not empty" | |
| else | |
| echo "File ${GTF} exists but empty" | |
| echo "please download the GTF file from http://www.gencodegenes.org/releases/current.html" | |
| fi | |
| else | |
| echo "File ${GTF} not exists" | |
| fi | |
| grep -v "chrUn" $CHR_LEN | grep -v "random" > ${DIR}/mm10.chrom.sizes | |
| ## extract TSS coordinates and convert to BED and SAF: GeneID,Chr,Start,End,Strand | |
| ## add slop downstream TSS: for transcripts <= slop; use len transcript | |
| ## NOTE: use transcript ID (some genes have alt transcripts where len is > & < slop | |
| mkdir -p ${DIR} | |
| LEN=3000 | |
| GENOME=mm10 | |
| ############### | |
| ## TSS based ## | |
| ############### | |
| ## Extract coord for transcripts < $LEN kb; use actual coords | |
| grep -w transcript ${GTF} | awk -v len=$LEN '($5-$4 <= len)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk '{OFS="\t"} {print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.short | |
| ## Extract TSS coordinates for fwd strand genes > $LEN kb | |
| grep -w transcript ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.long+ | |
| ## Extract TSS coordinates for reverse strand genes > kb | |
| grep -w transcript ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.long- | |
| ## Add padding to TSS for transcripts > $LEN | |
| cat ${DIR}/tmp.tss.long+ ${DIR}/tmp.tss.long- | sort -k1,1 -k 2n,2 |\ | |
| ${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} -s -l 0 -r $LEN > ${DIR}/tmp.tss.long | |
| cat ${DIR}/tmp.tss.long ${DIR}/tmp.tss.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\ | |
| grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_tss_plus_${LEN}.bed | |
| rm -rf ${DIR}/tmp.tss* | |
| ################ | |
| ## gene-based ## | |
| ################ | |
| ## keep gene_id and gene_name | |
| awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/gencode.vM13.gene.bed | |
| GENE_BED=${DIR}/gencode.vM13.gene.bed | |
| ## gene bed > saf | |
| awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' ${GENE_BED} > ${DIR}/gencode.vM13.gene.saf | |
| ## strand-specific gene start coords for plotting TSS | |
| awk 'BEGIN{FS=OFS="\t"}($6=="+"){print $1,$2-1,$2,$4,$5,$6}' ${GENE_BED} > ${DIR}/tmp.bed | |
| awk 'BEGIN{FS=OFS="\t"}($6=="-"){print $1,$3,$3+1,$4,$5,$6}' ${GENE_BED} >> ${DIR}/tmp.bed | |
| sort -k1,1 -k2n,2 ${DIR}/tmp.bed > ${DIR}/mm10_gene_tss.bed | |
| rm -rf ${DIR}/tmp.bed | |
| ## gene_id only | |
| ## TSS | |
| cat ${DIR}/mm10_gene_tss.bed | tr '_' '\t' | cut -f 5 --complement > ${DIR}/mm10_gene_tss.gene_id.bed | |
| ##------------- | |
| ## gene body coordinates | |
| awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6,$5,$4}' > ${DIR}/gencode.vM13.gene_id.bed | |
| ##------------- | |
| ## TSS +/- kb | |
| GENE_TSS_BED=${DIR}/mm10_gene_tss.bed | |
| ${BEDTOOLS}/slopBed -i ${GENE_TSS_BED} -g ${CHR_LEN} -s -l $LEN -r $LEN | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' > ${DIR}/mm10_genes_${LEN}.saf | |
| ##------------- | |
| ## TSS +kb | |
| ## Extract coord for transcripts < $LEN; use actual coords | |
| awk '($3=="gene")' ${GTF} | awk '($5-$4 <= 3000)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.short | |
| ## Extract TSS coordinates for fwd strand genes > $LEN | |
| awk '($3=="gene")' ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long+ | |
| ## Extract TSS coordinates for reverse strand genes > 3kb | |
| awk '($3=="gene")' ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
| cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
| sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long- | |
| ## add slop to TSS for genes > slop | |
| SLOP_OPTS='-s -l 0 -r 3000' | |
| cat ${DIR}/tmp.gene.long+ ${DIR}/tmp.gene.long- | sort -k1,1 -k 2n,2 | ${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} ${SLOP_OPTS} > ${DIR}/tmp.gene.long | |
| cat ${DIR}/tmp.gene.long ${DIR}/tmp.gene.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\ | |
| grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_genes_plus_${LEN}.bed | |
| rm -rf ${DIR}/tmp.gene* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment