ag1805x’s gists

ag1805x / gtf_2_geneLength

Created June 22, 2022 12:57

	# Extract gene lengths from GTF file
	# Download GTF file from https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
	echo "Gene_ID, Gene_name, Gene_type, Chr, Start, End, Gene_length" > gdc_gencode_v36_annotation.csv
	awk '{if($3 == "gene") print $0}' gencode.v36.annotation.gtf \| awk -F '[\t ;]' '{print $10","$16","$13","$1","$4","$5","$5-$4}' >> gdc_gencode_v36_annotation.csv

ag1805x / miRBase_GFF2IDlist

Last active August 15, 2020 16:26

Extract mature and precursor miRNA list from miRBase gff3 file. Can be used for inter-conversion

	# Extract mature and precursor miRNA list from miRBase gff3 file
	# Can be used for interconversion

	grep -v '#' hsa.gff3 \| grep -w "miRNA" \| cut -f9 > temp
	paste <(cut -d ';' -f1 temp \| cut -d '=' -f1) <(cut -d ';' -f2 temp \| cut -d '=' -f1) <(cut -d ';' -f3 temp \| cut -d '=' -f1) <(cut -d ';' -f4 temp \| cut -d '=' -f1) \| head -n1 >> miRbaseIDs.tsv
	paste <(cut -d ';' -f1 temp \| cut -d '=' -f2) <(cut -d ';' -f2 temp \| cut -d '=' -f2) <(cut -d ';' -f3 temp \| cut -d '=' -f2) <(cut -d ';' -f4 temp \| cut -d '=' -f2) >> miRbaseIDs.tsv

ag1805x / exploreGTF

Last active July 13, 2020 07:00

One liners to explore GTF file

	# Extract list of unique Gene IDs in GTF file

	awk '{if($3 == "gene") print $0}' ../Homo_sapiens.GRCh38.84.gtf \| cut -f9 \| cut -d ';' -f1 \| cut -d ' ' -f2 \| sort \| uniq \| wc -l

	# Extract gene list of particular biotype

	grep 'gene_biotype "protein_coding"' ../Homo_sapiens.GRCh38.84.gtf \| awk '{if($3 == "gene") print $0}' \| cut -f9 \| cut -d ';' -f1 \| cut -d ' ' -f2 \| sort \| uniq \| wc -l

	# Create subset of Ensembl GTF file based on gene biotype

ag1805x / Pather2Revigo

Created December 24, 2019 06:41

Using Panther GO into Revigo

	# The PANTHER GO output doesnot directly support visualisation by REVIGO.
	# Using this single line code in linux terminal we can make it compatible for REVIGO
	# Gene Ontology enrichment analysis done using PANTHER (interface from: http://geneontology.org/)
	# The results are exported as Table
	# The initial few lines are cleaned to keep only the result table on which we execute the following:

	paste <(cut -f1 PantherOut.tsv \| cut -d "(" -f2 \| cut -d ")" -f1) <(cut -f7 List.tsv) >> RevigoIn.tsv

ag1805x / tx2gene_fasta

Created September 18, 2019 07:44

Code to generate tx2gene from reference transcriptome for use with tximport() in R

	# Code to generate tx2gene from reference transcriptome for use with tximport() in R
	# Reference transcriptome: wget ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz


	zcat Homo_sapiens.GRCh38.cdna.all.fa.gz \| grep '>' \| cut -d ' ' -f1,4,7 > temp
	paste <(cut -d '>' -f2 temp \| cut -d ' ' -f1) <(cut -d ' ' -f2 temp \| cut -d ':' -f2) <(cut -d ' ' -f3 temp \| cut -d ':' -f2) >> tx2gene.txt
	rm temp

ag1805x / Sample_hClust.R

Created September 16, 2019 06:41

Using hclust() to cluster RNA-seq samples and visualize dendrogram

	library(dendextend)

	countsPC <- read.table("CountsPC_Gene_MultimapOverlap.tsv", header=T, row.names=c(1))
	factors <- as.data.frame(read.table("factors.txt", sep="\t", header = TRUE, row.names=c(1)))

	dist <- dist(t(countsPC), method="euclidean")
	cluster <- hclust(dist)


	# main, sub, xlab, ylab

ag1805x / GFF2GCcontent

Created August 22, 2019 10:35

Code to extract GC% of genes from GFF file using bedtools

	###########################################
	# Code to extract GC% of genes from GFF file
	# using bedtools
	# - Arindam Ghosh (22 August 2019)
	###########################################


	bedtools nuc -fi Homo_sapiens.GRCh38.dna.primary_assembly.fa -bed Homo_sapiens.GRCh38.84.gff3 \| grep ID=gene:ENSG > temp.txt

	echo -e "GeneID\tpct_GC" > GRCh38_GeneGC.txt

ag1805x / GRCh38.rRNA.interval_list

Created July 24, 2019 11:06

	#!/usr/bin/env bash
	# make_rRNA.sh
	# Kamil Slowikowski
	# December 12, 2014
	#
	# Modified: Arindam Ghosh (July 24, 2019 )
	#
	#
	# Referenc Genome: GRCh38.p5 Ensembl release 84
	#

ag1805x / Heatmap_BacterialRiboswitchCount.R

Created June 10, 2019 11:11

#Code to cdeate heatmap of riboswitch counts in different bacteria.

	#Code to cdeate heatmap of riboswitch counts in different bacteria.


	library(pheatmap)
	data<- as.matrix(read.csv("heat_map_table.csv", header=T, row.names=c(1)))
	col_key = c("#800000", "#e6194B", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4", "#42d4f4", "#f032e6", "#bfef45", "#fabebe", "#469990", "#fffac8")
	#Since max count was 12 we used 12 distinct colours
	pheatmap(data, treeheight_row = 0, treeheight_col = 0, cluster_rows=FALSE, cluster_cols=FALSE, color=rev(col_key), cellwidth=16, cellheight=15, height=15, width=10,filename="Heatmap.png")

ag1805x / SRRtoGSM

Last active March 24, 2023 11:43

Extract GSM id for a SRR id.

	#Using NCBI's efetch to retrive GSM id (sample id) from a given SRR id(run id)

	efetch -db sra -id SRR2453355 -format xml \| xmllint --xpath "EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE/EXPERIMENT/@alias" - \| cut -d \" -f2

	# Extract mature and precursor miRNA list from miRBase gff3 file
	# Can be used for interconversion

	grep -v '#' hsa.gff3 \| grep -w "miRNA" \| cut -f9 > temp
	paste <(cut -d ';' -f1 temp \| cut -d '=' -f1) <(cut -d ';' -f2 temp \| cut -d '=' -f1) <(cut -d ';' -f3 temp \| cut -d '=' -f1) <(cut -d ';' -f4 temp \| cut -d '=' -f1) \| head -n1 >> miRbaseIDs.tsv
	paste <(cut -d ';' -f1 temp \| cut -d '=' -f2) <(cut -d ';' -f2 temp \| cut -d '=' -f2) <(cut -d ';' -f3 temp \| cut -d '=' -f2) <(cut -d ';' -f4 temp \| cut -d '=' -f2) >> miRbaseIDs.tsv

Arindam Ghosh ag1805x