stephenturner’s gists

stephenturner / 2011-04-12 union nomismatch priority.sql

Created April 13, 2011 00:21

2011-04-12 union nomismatch prioritypriority.sql

	DROP TABLE IF EXISTS union_nomismatch;
	CREATE TABLE union_nomismatch
	SELECT "12" as chrom, pos18, pos19, if(alleles_study IS NOT NULL, alleles_study, alleles_1000g) as alleles, maf1000g, maxmaf, meanmaf, novel, offtarget, thresh, highqual, superhighqual,
	CASE
	WHEN meanmaf IS NULL THEN maf1000g
	WHEN maf1000g IS NULL THEN meanmaf
	WHEN (meanmaf IS NOT NULL and maf1000g IS NOT NULL) THEN GREATEST(meanmaf, maf1000g)
	END AS bestmaf
	FROM (
	SELECT

stephenturner / 2011-04-12 get flanking seq for igf1.r

Created April 13, 2011 01:24

2011-04-12 get flanking seq for igf1.r

	## Load the BSgenome package and download the hg19 reference sequence
	## http://www.bioconductor.org/packages/release/bioc/html/BSgenome.html
	# source("http://www.bioconductor.org/biocLite.R")
	# biocLite("BSgenome")
	# biocLite("BSgenome.Hsapiens.UCSC.hg19")

	library('BSgenome.Hsapiens.UCSC.hg19')
	installed.genomes()

	getflank <- function(position, alleles="[N/N]", chr="chr12", offset=10) {

stephenturner / 2011-04-15 SNP Overlap.sql

Created April 15, 2011 22:33

2011-04-15 SNP Overlap.sql

	#---------------
	# Sample overlap
	#---------------

	# Samples where in_gwas=1. n=2503
	SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

	# Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
	SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;

stephenturner / qqman.r

Created April 19, 2011 21:22

qqman.r

	# Stephen Turner
	# http://StephenTurner.us/
	# http://GettingGeneticsDone.blogspot.com/
	# See license at http://gettinggeneticsdone.blogspot.com/p/copyright.html

	# Last updated: Tuesday, April 19, 2011
	# R code for making manhattan plots and QQ plots from plink output files.
	# manhattan() with GWAS data this can take a lot of memory, recommended for use on 64bit machines only, for now.
	# Altnernatively, use bmanhattan() , i.e., base manhattan. uses base graphics. way faster.

stephenturner / picking snps for pca.r

Created April 20, 2011 00:44

picking snps for pca.r

	#-------------------------------------------------------------------------------------
	# If you split the dataset (3195094 snps) into 320 files of 10,000 snps
	# (the last file will have 5094 snps), then you can use this code to generate
	# a "map" that will map the original column numbers (1:3195094) to a file number
	# (1:1000) and a column number (1:10000).
	#-------------------------------------------------------------------------------------
	# nfiles <- 320
	# ncolsperfile <- 10000
	# ncols <- nfiles*ncolsperfile
	# column <- 1:ncols

stephenturner / PCA on 4k markers.r

Created April 20, 2011 00:49

PCA on 4k markers.r

	#-----------------------------------------------------------------------------------------------
	#
	# Calculate PCs when the number of SNPs (m>4000) is large
	# From G with SNPs on the columns, keep the first 20 PCs
	# X = normed G, we want PCs (=Xv) where v is eigenvector of X'X (the covariance matrix)
	# X'X can't be handled directly.
	# INSTEAD,
	# If XX'v= lv, X'XX'v = lX'v => if find eigenv for XX' then X'v is eigenv for X'X
	# SIMILARLY, if X'Xv = lv, then XX'(Xv) = l(Xv) => the eigenv of XX' must be PCs that we want
	# OF COURSE, the final result of the matrix with Xv must be orthonormal up to some constant

stephenturner / pca plot results.r

Created April 20, 2011 00:50

pca plot results.r

	load("D:/2011-02-09 MEC GWAS DATA/2011-03-23 PCA/pcs-2-without-d.RData")
	eigenvalues <- pcs[[2]]
	pcs <- data.frame(pcs[[1]])
	pcs <- cbind(query("select * from mec_genotyped"),pcs)
	ht(pcs)
	dbWriteTable(con, "pcs", pcs, overwrite=T)

	d<-read.csv("D:/2011-02-09 MEC GWAS DATA/PHENOTYPE DATA/2011-03-23 initial data from christian.csv",T)
	dbWriteTable(con, "mec_pheno", d, overwrite=T)

stephenturner / 2011-04-14 GWAS Overlap.sql2011-04-14 GWAS Overlap.sql

Created April 27, 2011 01:44

2011-04-14 GWAS Overlap.sql

	-- Sample overlap

	-- Samples where in_gwas=1. n=2503
	SELECT count(*) FROM igf1.sample_aapc_gwas where in_gwas=1;

	-- Samples where in_gwas=1 and that are actually in the GWAS data. n=2130.
	SELECT count(*) FROM igf1.sample_aapc_gwas a JOIN mec.sample_genotyped b on a.m_id=b.m_id where in_gwas=1;

	-- What about the missing 373?
	SELECT a.* FROM igf1.sample_aapc_gwas a LEFT JOIN mec.sample_genotyped b on a.m_id=b.m_id WHERE in_gwas=1 AND b.m_id IS NULL;

stephenturner / load and count whr.sql

Created June 4, 2011 02:03

load and count whr.sql

	drop table pheno4;
	create table pheno4 (sampleindex int, m_id varchar(25), area varchar(1), q1_age_cohent int, q3prelim_waist int, q3prelim_hip int, age_calc float);
	load data local infile 'C:/cygwin/home/sturner/mec_gwas/phenotypes/waisthip.csv'
	into table pheno4
	fields terminated by ','
	ignore 1 lines
	(sampleindex, m_id, area, q1_age_cohent, q3prelim_waist, q3prelim_hip, age_calc)
	;
	alter table pheno4 add column q3prelim_whr float after q3prelim_hip;
	update pheno4 set q3prelim_whr=q3prelim_waist/q3prelim_hip;

stephenturner / iona-pull-711-snps-priority-3.r

Created June 21, 2011 18:55

iona-pull-711-snps-priority-3.r

	d <- query(" #new
	SELECT
	if(b.snp='.' OR b.snp IS NULL, CONCAT_WS('_','chr12',a.pos19), b.snp) as Locus_Name,
	'SNP' as Target_Type,
	NULL as Sequence,
	a.chrom AS Chromosome,
	a.pos19 as Coordinate,
	'19' as Genome_Build_Version,
	'HSapiensReferenceSequence' as Source,
	'19' as Source_version,

Stephen Turner stephenturner