Stephen Turner stephenturner

Data scientist in biotech, former academic, Principal Scientist and Head of Genomic Strategy at Colossal Biosciences

stephenturner / qqbase.r

Created November 17, 2010 15:29

qqbase.r

	# Originally posted at http://gettinggeneticsdone.blogspot.com/2010/07/qq-plots-of-p-values-in-r-using-base.html

	# Define the function
	ggd.qqplot = function(pvector, main=NULL, ...) {
	o = -log10(sort(pvector,decreasing=F))
	e = -log10( 1:length(o)/length(o) )
	plot(e,o,pch=19,cex=1, main=main, ...,
	xlab=expression(Expected~~-log[10](italic(p))),
	ylab=expression(Observed~~-log[10](italic(p))),
	xlim=c(0,max(e)), ylim=c(0,max(o)))

stephenturner / logisticcurve.r

Created November 23, 2010 21:18

logisticcurve.r

	x=seq(-5,5,.01)
	invlogit=function(x) exp(x)/(1+exp(x))
	y=invlogit(x)
	plot(x,y,pch=16,ylab=expression(paste(logit^{-1},(x))))
	abline(v=0)
	abline(h=.5)
	text(.55,.55,expression(paste("Slope is ",beta/4)),adj=c(0,0))

stephenturner / pvalue-from-lm-object.r

Created November 30, 2010 17:46

pvalue-from-lm-object.r

	# Function to extract the overall ANOVA p-value out of a linear model object
	lmp <- function (modelobject) {
	if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
	f <- summary(modelobject)$fstatistic
	p <- pf(f[1],f[2],f[3],lower.tail=F)
	attributes(p) <- NULL
	return(p)
	}

	# simulate some data

stephenturner / forestplot.r

Created February 11, 2011 02:59

forestplot.r

	# d is a data frame with 4 columns
	# d$x gives variable names
	# d$y gives center point
	# d$ylo gives lower limits
	# d$yhi gives upper limits
	forestplot <- function(d, xlab="Odds Ratio", ylab="Study"){
	require(ggplot2)
	p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) +
	geom_pointrange() +
	coord_flip() +

stephenturner / randomforestdemo.r

Created February 15, 2011 22:53

randomforestdemo.r

	rm(list=ls(all=TRUE))
	library(randomForest)

	###############Classification################
	data(iris)
	head(iris)
	iris.rf <- randomForest(Species~., data=iris, importance=T, proximity=T)
	iris.rf.subset <- randomForest(Species~., data=iris[c(1:3,5)], importance=T, proximity=T)
	iris.rf.subset2 <- randomForest(Species~. -Petal.Length -Petal.Width, data=iris, importance=T, proximity=T)
	print(iris.rf)

stephenturner / 2011-02-15 rf adiposity.r

Created February 16, 2011 00:59

2011-02-15 rf adiposity.r

	library(randomForest)

	###############################################################################
	############################## load functions #################################
	###############################################################################

	# need to document this!
	rfr2 = function(randomForestModel) {
	printoutput = capture.output(print(randomForestModel))
	varline = grep("explained",printoutput,value=TRUE)

stephenturner / permute_column.r

Created February 17, 2011 21:21

permute_column.r

	# permutes a column in a data.frame, sets seed optionally
	permute <- function (dataframe, columnToPermute="column", seed=NULL) {
	if (!is.null(seed)) set.seed(seed)
	colindex <- which(names(dataframe)==columnToPermute)
	permutedcol <- dataframe[ ,colindex][sample(1:nrow(dataframe))]
	dataframe[colindex] <- permutedcol
	return(dataframe)
	}

stephenturner / ggd_rf_example.r

Created February 18, 2011 21:56

ggd_rf_example.r

	#load the iris data
	data(iris)

	# this data has 150 rows
	nrow(iris)

	# look at the first few
	head(iris)

	# splitdf function will return a list of training and testing sets

stephenturner / splitdf.randomize.r

Created February 19, 2011 02:01

splitdf.randomize.r

	#splitdf splits a data frame into a training and testing set.
	#returns a list of two data frames: trainset and testset.
	#you can optionally apply a random seed.
	splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 \| trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	index <- 1:nrow(dataframe)
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	trainset <- dataframe[trainindex, ]
	testset <- dataframe[-trainindex, ]

stephenturner / propmiss.r

Created February 24, 2011 03:09

propmiss.r

	propmiss <- function(dataframe) {
	m <- sapply(dataframe, function(x) {
	data.frame(
	nmiss=sum(is.na(x)),
	n=length(x),
	propmiss=sum(is.na(x))/length(x)
	)
	})
	d <- data.frame(t(m))
	d <- sapply(d, unlist)