indapa’s gists

indapa / bed_complement_basewise.py

Created December 8, 2010 16:38

determine intervals in file2.bed not covered by the file1.bed, given a length file of sources

	#!/usr/bin/python
	from optparse import OptionParser

	from bx.bitset import *
	from bx.bitset_builders import *

	def read_len( lengthfh ):
	"""Read a 'LEN' file and return a mapping from chromosome to length"""
	mapping = dict()
	for line in lengthfh:

indapa / hwe_markov_chain.R

Created March 21, 2011 18:44

simulate Hardy-Weinberg Equilibrium using Markov chain

	hwe_markov <- function(theta,g) {

	#Taken from Ch7 question 7.9 from
	# E.A. Suess and B.E. Trumbo
	#Introduction to Probability and Gibbs Sampling with R 2010


	# simulate Hardy Weinberg equlibrium using Markov chains

	#Let the genotypes

indapa / w-f.R

Created March 22, 2011 18:06

simulate Wright-Fisher model in R

	#simulate Wright Fisher model
	#see this for more info: http://www.genetics.wustl.edu/bio5488/lecture_notes_2004/popgen_1.pdf


	wright_fisher <- function(N,m,j) {

	#Preliminaries
	#number of chromosomes
	N=N

indapa / ploidy.py

Created March 4, 2012 03:26

Using itertools to count the number of possible genotypes given a certain ploidy

	#!/usr/bin/env python
	from itertools import *




	#print the number of possible genotypes with various ploidies
	ploidys=[2,3,4,5,6,7,8,9,10]
	for p in ploidys:
	print p, len([combo for combo in combinations_with_replacement(['A','C','G','T'],p) ])

indapa / reshape.R

Created March 9, 2012 19:31

Using the reshape module on R

	library(reshape)

	#suppose I read a file with the header lines of sample and 13 target ids
	# the values represent median target coverage from an exome capture experiment
	median <- read.table("median.txt", header=T)
	> median
	sample t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13
	1 s1 10 34 31 66 34 61 40 46 117 29 55 96 33
	2 s2 20 29 25 36 27 53 34 26 89 28 40 70 26
	3 s3 8 31 31 66 33 57 31 36 93 36 40 74 27

indapa / student_reasoning.py

Created March 17, 2012 17:51

CPD computations for Bayesian network

	#I'm taking the STanford on line course Probabilistic Graphical Models <pgm-class.org>
	# They have a simple example of a Bayesian network for a Student letter of reference
	# the code below was originally posted by a fellow classmate taking the class
	# it really helped me in understanding how to do computations on conditional probability tables (CPDs) that
	# form the heart of Bayesian networks

	from numpy import *
	P_I = array([
	[0.7,0.3]
	])

indapa / generator_while.py

Created May 3, 2012 17:06

Python generators and itertools.izip

	"""http://docs.python.org/library/itertools.html#itertools.izip"""
	#thanks Brent Pedersen


	vcf_gen1=vcfobj1.yieldVcfRecordwithGenotypes(vcfh1)
	vcf_gen2=vcfobj1.yieldVcfRecordwithGenotypes(vcfh2)


	for vrec1, vrec2 in itertools.izip(vcf_gen1, vcf_gen2):
	print vrec1.toStringwithGenotypes()

indapa / fastq_summary.py

Created June 3, 2012 21:22

get the mean base composition for a fastq file after running fastq_stats.py from Galaxy

indapa / ecdf.R

Created June 14, 2012 15:31

Plot empirical CDF function in R

	#plot the empirical CDF

	x=rnorm(100) # 100 random draws from standard normal
	f=ecdf(x) # ecdf returns a function
	percentiles=f(x) # return the percentiles of the data points
	plot(percentiles, x)

	#see ?ecdf for more information

indapa / enumerateGenotypes.py

Created July 6, 2012 02:51

Enumerate the possible genotypes for a given ploidy

	from itertools import *

	ploidys=[2,3,4]

	for p in ploidys:
	l=[ combo for combo in combinations_with_replacement(['A','C','G','T'],p) ]
	for g in l:
	print "".join( list(g) ), "ploidy: ", p
	print

Amit Indap indapa