monogenea’s gists

monogenea / 2-poissonGWAS.R

Last active December 2, 2019 14:22

	# Load lipid datasets & match SNP-Lipidomics samples
	lipidsMalay <- read.delim("public/Lipidomic/117Malay_282lipids.txt", row.names = 1)
	lipidsIndian <- read.delim("public/Lipidomic/120Indian_282lipids.txt", row.names = 1)
	lipidsChinese <- read.delim("public/Lipidomic/122Chinese_282lipids.txt", row.names = 1)

	all(Reduce(intersect, list(colnames(lipidsMalay),
	colnames(lipidsIndian),
	colnames(lipidsChinese))) == colnames(lipidsMalay)) # TRUE
	lip <- rbind(lipidsMalay, lipidsIndian, lipidsChinese)

monogenea / 3-poissonGWAS.R

Created October 7, 2019 18:48

	library(snpStats)
	library(doParallel)
	library(SNPRelate)
	library(GenABEL)
	library(dplyr)
	source("GWASfunction.R")
	load("PhenoGenoMap.RData")

	# Use SNP call rate of 100%, MAF of 0.1 (very stringent)
	maf <- 0.1

monogenea / 4-poissonGWAS.R

Created October 7, 2019 18:48

	# Sample call rate & heterozygosity
	callMat <- !is.na(genData$SNP)
	Sampstats <- row.summary(genData$SNP)
	hetExp <- callMat %% (2 SNPstats$MAF * (1 - SNPstats$MAF)) # Hardy-Weinberg heterozygosity (expected)
	hetObs <- with(Sampstats, Heterozygosity * (ncol(genData$SNP)) * Call.rate)
	Sampstats$hetF <- 1-(hetObs/hetExp)
	# Use sample call rate of 100%, het threshold of 0.1 (very stringent)
	het <- 0.1 # Set cutoff for inbreeding coefficient;
	het_call <- with(Sampstats, abs(hetF) < het & Call.rate == 1)
	genData$SNP <- genData$SNP[het_call,]

monogenea / 5-poissonGWAS.R

Created October 7, 2019 18:49

	# LD and kinship coeff
	ld <- .2
	kin <- .1
	snpgdsBED2GDS(bed.fn = "convertGDS.bed", bim.fn = "convertGDS.bim",
	fam.fn = "convertGDS.fam", out.gdsfn = "myGDS",
	cvt.chr = "char")
	genofile <- snpgdsOpen("myGDS", readonly = F)
	gds.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
	gds.ids <- sub("-1", "", gds.ids)
	add.gdsn(genofile, "sample.id", gds.ids, replace = T)

monogenea / 6-poissonGWAS.R

Created October 7, 2019 18:50

	# PCA
	set.seed(100)
	pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids,
	snp.id = snpset.ibd, num.thread = 1)
	pctab <- data.frame(sample.id = pca$sample.id,
	PC1 = pca$eigenvect[,1],
	PC2 = pca$eigenvect[,2],
	stringsAsFactors = F)

	# Subset and/or reorder origin accordingly

monogenea / 7-poissonGWAS.R

Created October 7, 2019 18:51

	# Choose trait for association analysis, use colnames(genData$LIP) for listing
	# NOTE: Ignore the first column of genData$LIP (gender)
	target <- "Cholesterol"

	phenodata <- data.frame("id" = rownames(genData$LIP),
	"phenotype" = scale(genData$LIP[,target]), stringsAsFactors = F)

	# Conduct GWAS (will take a while)
	start <- Sys.time()
	GWAA(genodata = genData$SNP, phenodata = phenodata, filename = paste(target, ".txt", sep = ""))

monogenea / 8-poissonGWAS.R

Created October 7, 2019 18:52

	# Manhattan plot
	GWASout <- read.table(paste(target, ".txt", sep = ""), header = T, colClasses = c("character", rep("numeric",4)))
	GWASout$type <- rep("typed", nrow(GWASout))
	GWASout$Neg_logP <- -log10(GWASout$p.value)
	GWASout <- merge(GWASout, genData$MAP[,c("SNP", "chr", "position")])
	GWASout <- GWASout[order(GWASout$Neg_logP, decreasing = T),]

	png(paste(target, ".png", sep = ""), height = 500,width = 1000)
	GWAS_Manhattan(GWASout)
	dev.off()

monogenea / 9-poissonGWAS.R

Created October 7, 2019 18:53

	# QQ plot using GenABEL estlambda function
	png(paste(target, "_QQplot.png", sep = ""), width = 500, height = 500)
	lambda <- estlambda(GWASout$t.value**2, plot = T, method = "median")
	dev.off()

monogenea / 1-poissonPLS.R

Created October 7, 2019 18:56

	# Load caret, install if necessary
	library(caret)
	arcene <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data",
	sep = " ",
	colClasses = c(rep("numeric", 10000), "NULL"))
	# Add the labels as an additional column
	arcene$class <- factor(scan("https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels",
	sep = "\t"))

monogenea / 2-poissonPLS.R

Created October 7, 2019 18:57

	# Compile cross-validation settings
	set.seed(100)
	myfolds <- createMultiFolds(arcene$class, k = 5, times = 10)
	control <- trainControl("repeatedcv", index = myfolds, selectionFunction = "oneSE")

	# Train PLS model
	mod1 <- train(class ~ ., data = arcene,
	method = "pls",
	metric = "Accuracy",
	tuneLength = 20,

Francisco Lima monogenea