stephenturner’s gists

stephenturner / RF vs lm and FS 1.r

Created March 1, 2011 19:31

RF vs lm and FS 1.r

	> #First define the function
	> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
	>
	> # first, fit a stepwise model and test it on new data
	> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
	> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
	> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
	> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.6991431
	>

stephenturner / splitbycol.pl

Created March 10, 2011 21:41

splitbycol.pl

	#!/usr/bin/perl

	# Thursday, March 10, 2011
	# Stephen D. Turner

	chomp(my $pwd = `pwd`);
	my $help = "\nUsage: $0 <file to split> <# columns in each split>\n\n";
	die $help if @ARGV!=2;

	#sub trim($);

stephenturner / gwas_in_r.R

Created March 14, 2011 20:20

gwas_in_r.R

	##########################################################################
	############################ Documentation ###############################
	##########################################################################

	# There are essentially two functions in this code.

	# (1)lm_wrapper/glm_wrapper: these functions take an outcome vector y, a
	# data frame containing only SNPs, and optionally a data frame of
	# covariates. lm_wrapper runs linear regression while glm_wrapper will run
	# logistic regression, on every SNP in the SNP data frame.

stephenturner / forlani.r

Created March 18, 2011 22:02

forlani.r

	#-----------------------------------------------------------------------------------------------
	#
	# Calculate PCs when the number of SNPs (m>4000) is large
	# From G with SNPs on the columns, keep the first 20 PCs
	# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
	# X'X can't be handled directly.
	# INSTEAD,
	# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
	# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
	# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant

stephenturner / pickSNPs.r

Created March 24, 2011 03:14

pickSNPs.r

	#-------------------------------------------------------------------------------------
	# Select a set of SNPs > dist bp (default 100kb) apart
	# Map is a matrix with colnames "chrom" and "position".
	# Matrix MUST BE sorted by chrom,position
	# Can have more columns but the colnames "chrom" and "position" can't be changed
	# The function returns a vector of row indices corresponding
	# to the rows you should pick for your subset.
	#-------------------------------------------------------------------------------------
	pickSNPs<-function(map, dist = 100000) {
	t=as.data.frame(table(map$chrom))

stephenturner / 2011-03-30 igf1 stufy maf vs 1000g maf.r

Created March 31, 2011 03:22

2011-03-30 igf1 stufy maf vs 1000g maf.r

	d=query("-- DISCOVERED IN STUDY, BUT SOME MIGHT NOT BE IN 1000 GENOMES
	select distinct *, CASE WHEN refvar_study!=refvar_1000g and maf1000g IS NOT NULL THEN 1 ELSE 0 END as mismatch from
	(SELECT a.chr, a.pos18, c.pos19, a.major\|\|a.minor AS refvar_study, a.maxpoolmaf, c.major\|\|c.minor AS refvar_1000g, c.maf as maf1000g
	FROM igf1_pooledmaf a
	LEFT JOIN igf1_map1819 b ON a.pos18=b.hg18
	LEFT JOIN igf1_1000g_freq c ON b.hg19=c.pos19);")

	nrow(d)
	ht(d)

stephenturner / 2011-04-01 igf1 how many snps.sql

Created April 2, 2011 02:18

2011-04-01 igf1 how many snps.sql

	-- ARGH.. SQLite doesn't implement a full outer join, OR a right join.
	-- For this reason it might be a good idea to transition over to MySQL.
	-- Here's a goofy workaround using two left-joins, taken from Wikipedia:
	-- http://en.wikipedia.org/wiki/Join_%28SQL%29#Full_outer_join
	CREATE VIEW igf1_union AS
	SELECT *, CASE
	WHEN meanmaf IS NULL THEN maf1000g
	WHEN maf1000g IS NULL THEN meanmaf
	WHEN (meanmaf IS NOT NULL and maf1000g IS NOT NULL) THEN max(meanmaf, maf1000g)
	END AS bestmaf

stephenturner / igf1-db-ex.txt

Created April 4, 2011 20:07

igf1-db-ex.txt

	sqlite> select * from igf1_studydata limit 10;
	chr pos18 pos19 ref var refvar novel offtarget thresh maxmaf meanmaf highqual superhighqual
	---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- -------------
	12 101266026 102741896 C A C/A 0 0 0.05 0.362 0.252 1 1
	12 101266373 102742243 C A C/A 0 0 0.05 0.0805 0.0131 1 1
	12 101267346 102743216 T G T/G 0 0 0.05 0.6937 0.4321 1 1
	12 101267684 102743554 C T C/T 0 0 0.05 0.0772 0.0353 1 1
	12 101268423 102744293 A G A/G 0

stephenturner / snp distribution histograms.r

Created April 4, 2011 21:58

snp distribution histograms.r

	myquery="SELECT * FROM igf1_union_nomismatch WHERE highqual=1 OR (highqual IS NULL AND maf1000g>0.01);"
	d=query(myquery)
	main <- paste(nrow(d),"SNPs")
	p=qplot(d$pos18, binw=2000, xlab="Chromosome 12 position (hg18)", ylab="SNP Density", main=main)
	ggsave("2011-04-04 SNP Density 728 SNPs.png", p, w=6, h=6, dpi=100)


	myquery="SELECT * FROM igf1_union_nomismatch WHERE maxmaf>=0.05 OR (meanmaf IS NULL AND maf1000g>0.01);"
	d=query(myquery)
	main <- paste(nrow(d),"SNPs")

stephenturner / getflank.r

Created April 12, 2011 02:07

getflank.r

	# Load the BSgenome package and download the hg19 reference sequence
	# For documentation see http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
	source("http://www.bioconductor.org/biocLite.R")
	biocLite("BSgenome")
	biocLite("BSgenome.Hsapiens.UCSC.hg19") #installs the human genome (~850 MB download).

	library('BSgenome.Hsapiens.UCSC.hg19')

	# Function to pull flanking sequence. Defaults to +/- 10 bp
	getflank <- function(position, alleles="[N/N]", chr="chr12", offset=10) {

Stephen Turner stephenturner