This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract gene lengths from GTF file | |
# Download GTF file from https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files | |
echo "Gene_ID, Gene_name, Gene_type, Chr, Start, End, Gene_length" > gdc_gencode_v36_annotation.csv | |
awk '{if($3 == "gene") print $0}' gencode.v36.annotation.gtf | awk -F '[\t ;]' '{print $10","$16","$13","$1","$4","$5","$5-$4}' >> gdc_gencode_v36_annotation.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract mature and precursor miRNA list from miRBase gff3 file | |
# Can be used for interconversion | |
grep -v '#' hsa.gff3 | grep -w "miRNA" | cut -f9 > temp | |
paste <(cut -d ';' -f1 temp | cut -d '=' -f1) <(cut -d ';' -f2 temp | cut -d '=' -f1) <(cut -d ';' -f3 temp | cut -d '=' -f1) <(cut -d ';' -f4 temp | cut -d '=' -f1) | head -n1 >> miRbaseIDs.tsv | |
paste <(cut -d ';' -f1 temp | cut -d '=' -f2) <(cut -d ';' -f2 temp | cut -d '=' -f2) <(cut -d ';' -f3 temp | cut -d '=' -f2) <(cut -d ';' -f4 temp | cut -d '=' -f2) >> miRbaseIDs.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract list of unique Gene IDs in GTF file | |
awk '{if($3 == "gene") print $0}' ../Homo_sapiens.GRCh38.84.gtf | cut -f9 | cut -d ';' -f1 | cut -d ' ' -f2 | sort | uniq | wc -l | |
# Extract gene list of particular biotype | |
grep 'gene_biotype "protein_coding"' ../Homo_sapiens.GRCh38.84.gtf | awk '{if($3 == "gene") print $0}' | cut -f9 | cut -d ';' -f1 | cut -d ' ' -f2 | sort | uniq | wc -l | |
# Create subset of Ensembl GTF file based on gene biotype |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The PANTHER GO output doesnot directly support visualisation by REVIGO. | |
# Using this single line code in linux terminal we can make it compatible for REVIGO | |
# Gene Ontology enrichment analysis done using PANTHER (interface from: http://geneontology.org/) | |
# The results are exported as Table | |
# The initial few lines are cleaned to keep only the result table on which we execute the following: | |
paste <(cut -f1 PantherOut.tsv | cut -d "(" -f2 | cut -d ")" -f1) <(cut -f7 List.tsv) >> RevigoIn.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code to generate tx2gene from reference transcriptome for use with tximport() in R | |
# Reference transcriptome: wget ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz | |
zcat Homo_sapiens.GRCh38.cdna.all.fa.gz | grep '>' | cut -d ' ' -f1,4,7 > temp | |
paste <(cut -d '>' -f2 temp | cut -d ' ' -f1) <(cut -d ' ' -f2 temp | cut -d ':' -f2) <(cut -d ' ' -f3 temp | cut -d ':' -f2) >> tx2gene.txt | |
rm temp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dendextend) | |
countsPC <- read.table("CountsPC_Gene_MultimapOverlap.tsv", header=T, row.names=c(1)) | |
factors <- as.data.frame(read.table("factors.txt", sep="\t", header = TRUE, row.names=c(1))) | |
dist <- dist(t(countsPC), method="euclidean") | |
cluster <- hclust(dist) | |
# main, sub, xlab, ylab |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################### | |
# Code to extract GC% of genes from GFF file | |
# using bedtools | |
# - Arindam Ghosh (22 August 2019) | |
########################################### | |
bedtools nuc -fi Homo_sapiens.GRCh38.dna.primary_assembly.fa -bed Homo_sapiens.GRCh38.84.gff3 | grep ID=gene:ENSG > temp.txt | |
echo -e "GeneID\tpct_GC" > GRCh38_GeneGC.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# make_rRNA.sh | |
# Kamil Slowikowski | |
# December 12, 2014 | |
# | |
# Modified: Arindam Ghosh (July 24, 2019 ) | |
# | |
# | |
# Referenc Genome: GRCh38.p5 Ensembl release 84 | |
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Code to cdeate heatmap of riboswitch counts in different bacteria. | |
library(pheatmap) | |
data<- as.matrix(read.csv("heat_map_table.csv", header=T, row.names=c(1))) | |
col_key = c("#800000", "#e6194B", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4", "#42d4f4", "#f032e6", "#bfef45", "#fabebe", "#469990", "#fffac8") | |
#Since max count was 12 we used 12 distinct colours | |
pheatmap(data, treeheight_row = 0, treeheight_col = 0, cluster_rows=FALSE, cluster_cols=FALSE, color=rev(col_key), cellwidth=16, cellheight=15, height=15, width=10,filename="Heatmap.png") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Using NCBI's efetch to retrive GSM id (sample id) from a given SRR id(run id) | |
efetch -db sra -id SRR2453355 -format xml | xmllint --xpath "EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE/EXPERIMENT/@alias" - | cut -d \" -f2 |
NewerOlder