Skip to content

Instantly share code, notes, and snippets.

@aseetharam
Forked from dinovski/geneBed.sh
Last active March 30, 2023 16:44
Show Gist options
  • Save aseetharam/d1b68c3100280182bd2499c432046403 to your computer and use it in GitHub Desktop.
Save aseetharam/d1b68c3100280182bd2499c432046403 to your computer and use it in GitHub Desktop.
Add padding up and/or downstream of TSS coordinates
#!/bin/bash
BEDTOOLS=$(dirname $(which bedtools))
FEATURECOUNTS=$(which featureCounts)
R=$(which R)
IDIR=$(pwd)
DIR=$(pwd)/GeneBed_output
GTF=gencode.vM30.annotation.gtf
CHR_LEN=mm10.chrom.sizes
## check if file exists
if [ -f "${CHR_LEN}" ];then
echo "File ${CHR_LEN} exists"
else
echo "File ${CHR_LEN} not exists"
echo "please download the chromosome length file from http://hgdownload.cse.ucsc.edu/goldenPath/mm10/bigZips/"
fi
## check if file exists and not empty
if [ -f "${GTF}" ];then
if [ -s "${GTF}" ];then
echo "File ${GTF} exists and not empty"
else
echo "File ${GTF} exists but empty"
echo "please download the GTF file from http://www.gencodegenes.org/releases/current.html"
fi
else
echo "File ${GTF} not exists"
fi
grep -v "chrUn" $CHR_LEN | grep -v "random" > ${DIR}/mm10.chrom.sizes
## extract TSS coordinates and convert to BED and SAF: GeneID,Chr,Start,End,Strand
## add slop downstream TSS: for transcripts <= slop; use len transcript
## NOTE: use transcript ID (some genes have alt transcripts where len is > & < slop
mkdir -p ${DIR}
LEN=3000
GENOME=mm10
###############
## TSS based ##
###############
## Extract coord for transcripts < $LEN kb; use actual coords
grep -w transcript ${GTF} | awk -v len=$LEN '($5-$4 <= len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.short
## Extract TSS coordinates for fwd strand genes > $LEN kb
grep -w transcript ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.long+
## Extract TSS coordinates for reverse strand genes > kb
grep -w transcript ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.long-
## Add padding to TSS for transcripts > $LEN
cat ${DIR}/tmp.tss.long+ ${DIR}/tmp.tss.long- | sort -k1,1 -k 2n,2 |\
${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} -s -l 0 -r $LEN > ${DIR}/tmp.tss.long
cat ${DIR}/tmp.tss.long ${DIR}/tmp.tss.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_tss_plus_${LEN}.bed
rm -rf ${DIR}/tmp.tss*
################
## gene-based ##
################
## keep gene_id and gene_name
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/gencode.vM13.gene.bed
GENE_BED=${DIR}/gencode.vM13.gene.bed
## gene bed > saf
awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' ${GENE_BED} > ${DIR}/gencode.vM13.gene.saf
## strand-specific gene start coords for plotting TSS
awk 'BEGIN{FS=OFS="\t"}($6=="+"){print $1,$2-1,$2,$4,$5,$6}' ${GENE_BED} > ${DIR}/tmp.bed
awk 'BEGIN{FS=OFS="\t"}($6=="-"){print $1,$3,$3+1,$4,$5,$6}' ${GENE_BED} >> ${DIR}/tmp.bed
sort -k1,1 -k2n,2 ${DIR}/tmp.bed > ${DIR}/mm10_gene_tss.bed
rm -rf ${DIR}/tmp.bed
## gene_id only
## TSS
cat ${DIR}/mm10_gene_tss.bed | tr '_' '\t' | cut -f 5 --complement > ${DIR}/mm10_gene_tss.gene_id.bed
##-------------
## gene body coordinates
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6,$5,$4}' > ${DIR}/gencode.vM13.gene_id.bed
##-------------
## TSS +/- kb
GENE_TSS_BED=${DIR}/mm10_gene_tss.bed
${BEDTOOLS}/slopBed -i ${GENE_TSS_BED} -g ${CHR_LEN} -s -l $LEN -r $LEN | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' > ${DIR}/mm10_genes_${LEN}.saf
##-------------
## TSS +kb
## Extract coord for transcripts < $LEN; use actual coords
awk '($3=="gene")' ${GTF} | awk '($5-$4 <= 3000)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.short
## Extract TSS coordinates for fwd strand genes > $LEN
awk '($3=="gene")' ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long+
## Extract TSS coordinates for reverse strand genes > 3kb
awk '($3=="gene")' ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long-
## add slop to TSS for genes > slop
SLOP_OPTS='-s -l 0 -r 3000'
cat ${DIR}/tmp.gene.long+ ${DIR}/tmp.gene.long- | sort -k1,1 -k 2n,2 | ${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} ${SLOP_OPTS} > ${DIR}/tmp.gene.long
cat ${DIR}/tmp.gene.long ${DIR}/tmp.gene.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_genes_plus_${LEN}.bed
rm -rf ${DIR}/tmp.gene*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment