dinovski’s gists

dinovski / gtex_tpm.py

Last active April 20, 2023 14:14

Calculate median transcript-level TPM by GTEx tissue

	#!/usr/bin/env python

	import pandas as pd
	import numpy as np
	import csv
	import statistics

	# mapping of sample ID to tissue:
	#curl https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt -o GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
	# TPM per sample per transcript:

dinovski / geneBed.sh

Created April 27, 2020 20:19

Add padding up and/or downstream of TSS coordinates

	#!/bin/bash

	BEDTOOLS=/local/build/BEDTools_2.21.0/bin
	FEATURECOUNTS=/local/build/subread/subread-1.5.1-Linux-x86_64/bin/featureCounts
	R=/local/build/R/R-3.4.0/bin/R

	IDIR=/in/path
	DIR=/out/path

	GTF=gencode.vM13.annotation.gtf

dinovski / dispersion.R

Last active December 23, 2020 09:16

estimate the biological coefficient of variation across biological replicates

	#!/usr/bin/Rscript

	library(edgeR)
	library(limma)

	## The BCV is the relative variability of expression between biological replicates
	## The square root of the negative binomial dispersion for a gene
	## is the biological coefficient of variation (BCV) across replicates (stdev/mean)

	## input: ge (raw count table filtered for lowly expressed genes, eg CPM > 1)

dinovski / MSIanalysis.R

Last active September 28, 2018 09:29

reanalysis of GSE26682

	#!/usr/bin/Rscript
	#source("http://bioconductor.org/biocLite.R")
	#biocLite("curatedCRCData")
	#biocLite("inSilicoMerging")
	library(Biobase)
	library(GEOquery)
	library(rgl)
	library(inSilicoMerging)
	library(gdata)
	library(scatterplot3d)

dinovski / inferAncestry.sh

Last active October 19, 2018 07:32

imputation, phasing, and file wrangling for local ancestry inference with RFMix

	#!/bin/bash

	## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
	## and phasing with Shapeit (http://www.shapeit.fr/)
	## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
	##
	## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
	## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html

	## Bin paths

dinovski / VCFcompare.py

Last active September 8, 2018 21:32

compare genotypes for the same sample from different sequencing experiments (eg. control data versus 1000 Genomes data for the same HapMap samples)

	#!/usr/bin/python

	import vcf
	import sys

	Usage="""
	python VCFcompare.py ${1KG_VCF} ${CONTROL_VCF} ${CHR} ${OUTFILE}
	"""

	if len(sys.argv) < 5:

dinovski / autoFACS.R

Last active December 23, 2020 09:19

import cell sorting data and perform automated gating using unsupervised learning (k-means)

	#!/usr/bin/Rscript

	## Perform automated gating of FACS data using unsupervised clustering and regression analysis.
	## Data is first filtered to exclude debris (low FSC and high SSC) and doublets (FSC-A v FSC-H)
	## and the CSV is exported using FlowJo (flowjo.com).

	## This analysis was designed to assess the effects of various microsatellite lengths on gene expression
	## using a reporter assay. Briefly, varying 'eSTRs' (gBlock gene fragments) were cloned upstream of a CMV
	## promoter driving GFP reporter gene expression and co-transfected with an RFP expression plasmid at a
	## raio of 2:1. Reference samples of GFP:RFP were co-transfected at various ratios (1:1, 2:1, 3:1, 4:1) and

dinovski / uniqueGenes.R

Last active September 7, 2018 19:59

get unique min/max gene coordinates from gene bed

	inDir="/bioinfo/users/dzielins/dbase/mouse/mm10/"
	genome="mm10"
	geneBed="gencode.vM13.annotation_gene.bed"

	inBed <- read.table(paste0(inDir, geneBed), header = FALSE, stringsAsFactors = FALSE, sep = "\t")
	colnames(inBed) <- c("chr", "start", "end", "gene", "score", "strand")

	## Keep min start and max end for each gene
	start <- aggregate(start ~ chr + gene + strand, inBed, min)
	end <- aggregate(end ~ chr + gene + strand, inBed, max)

dinovski / fasta2bed.py

Last active September 10, 2022 16:07

case insensitive motif search of a FASTA file (eg. find all 'GATC' sites for binning damID reads).

	#!/usr/bin/env python

	from Bio import SeqIO
	import re
	import os
	import sys
	import csv

	usage="""
	Perform case insensitive motif search of a FASTA file and output 3 BED files: motif_sites.bed (motif coordinates), motif_fragments.bed (coordinates between each motif), and motif_fragments_full.bed (coordinates between and including each motif)

dinovski / qualDist.py

Last active September 8, 2018 21:38

quality score distribution from fastq

	#!/usr/bin/env python

	usage="""
	## quality score distribution from fastq
	gunzip -c fastq.gz \| python qualDist.py
	"""

	import sys

	num = 0

Dina dinovski