Skip to content

Instantly share code, notes, and snippets.

View monogenea's full-sized avatar

Francisco Lima monogenea

View GitHub Profile
# Load lipid datasets & match SNP-Lipidomics samples
lipidsMalay <- read.delim("public/Lipidomic/117Malay_282lipids.txt", row.names = 1)
lipidsIndian <- read.delim("public/Lipidomic/120Indian_282lipids.txt", row.names = 1)
lipidsChinese <- read.delim("public/Lipidomic/122Chinese_282lipids.txt", row.names = 1)
all(Reduce(intersect, list(colnames(lipidsMalay),
colnames(lipidsIndian),
colnames(lipidsChinese))) == colnames(lipidsMalay)) # TRUE
lip <- rbind(lipidsMalay, lipidsIndian, lipidsChinese)
library(snpStats)
library(doParallel)
library(SNPRelate)
library(GenABEL)
library(dplyr)
source("GWASfunction.R")
load("PhenoGenoMap.RData")
# Use SNP call rate of 100%, MAF of 0.1 (very stringent)
maf <- 0.1
# Sample call rate & heterozygosity
callMat <- !is.na(genData$SNP)
Sampstats <- row.summary(genData$SNP)
hetExp <- callMat %*% (2 * SNPstats$MAF * (1 - SNPstats$MAF)) # Hardy-Weinberg heterozygosity (expected)
hetObs <- with(Sampstats, Heterozygosity * (ncol(genData$SNP)) * Call.rate)
Sampstats$hetF <- 1-(hetObs/hetExp)
# Use sample call rate of 100%, het threshold of 0.1 (very stringent)
het <- 0.1 # Set cutoff for inbreeding coefficient;
het_call <- with(Sampstats, abs(hetF) < het & Call.rate == 1)
genData$SNP <- genData$SNP[het_call,]
# LD and kinship coeff
ld <- .2
kin <- .1
snpgdsBED2GDS(bed.fn = "convertGDS.bed", bim.fn = "convertGDS.bim",
fam.fn = "convertGDS.fam", out.gdsfn = "myGDS",
cvt.chr = "char")
genofile <- snpgdsOpen("myGDS", readonly = F)
gds.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
gds.ids <- sub("-1", "", gds.ids)
add.gdsn(genofile, "sample.id", gds.ids, replace = T)
# PCA
set.seed(100)
pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids,
snp.id = snpset.ibd, num.thread = 1)
pctab <- data.frame(sample.id = pca$sample.id,
PC1 = pca$eigenvect[,1],
PC2 = pca$eigenvect[,2],
stringsAsFactors = F)
# Subset and/or reorder origin accordingly
# Choose trait for association analysis, use colnames(genData$LIP) for listing
# NOTE: Ignore the first column of genData$LIP (gender)
target <- "Cholesterol"
phenodata <- data.frame("id" = rownames(genData$LIP),
"phenotype" = scale(genData$LIP[,target]), stringsAsFactors = F)
# Conduct GWAS (will take a while)
start <- Sys.time()
GWAA(genodata = genData$SNP, phenodata = phenodata, filename = paste(target, ".txt", sep = ""))
# Manhattan plot
GWASout <- read.table(paste(target, ".txt", sep = ""), header = T, colClasses = c("character", rep("numeric",4)))
GWASout$type <- rep("typed", nrow(GWASout))
GWASout$Neg_logP <- -log10(GWASout$p.value)
GWASout <- merge(GWASout, genData$MAP[,c("SNP", "chr", "position")])
GWASout <- GWASout[order(GWASout$Neg_logP, decreasing = T),]
png(paste(target, ".png", sep = ""), height = 500,width = 1000)
GWAS_Manhattan(GWASout)
dev.off()
# QQ plot using GenABEL estlambda function
png(paste(target, "_QQplot.png", sep = ""), width = 500, height = 500)
lambda <- estlambda(GWASout$t.value**2, plot = T, method = "median")
dev.off()
# Load caret, install if necessary
library(caret)
arcene <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data",
sep = " ",
colClasses = c(rep("numeric", 10000), "NULL"))
# Add the labels as an additional column
arcene$class <- factor(scan("https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels",
sep = "\t"))
# Compile cross-validation settings
set.seed(100)
myfolds <- createMultiFolds(arcene$class, k = 5, times = 10)
control <- trainControl("repeatedcv", index = myfolds, selectionFunction = "oneSE")
# Train PLS model
mod1 <- train(class ~ ., data = arcene,
method = "pls",
metric = "Accuracy",
tuneLength = 20,