Skip to content

Instantly share code, notes, and snippets.

View monogenea's full-sized avatar

Francisco Lima monogenea

View GitHub Profile
# Generate scaled 4*5 matrix with random std normal samples
set.seed(101)
mat <- scale(matrix(rnorm(20), 4, 5))
dimnames(mat) <- list(paste("Sample", 1:4), paste("Var", 1:5))
# Perform PCA
myPCA <- prcomp(mat, scale. = F, center = F)
myPCA$rotation # loadings
myPCA$x # scores
plot(varImp(mod1), 10, main = "PLS-DA")
plot(varImp(mod2), 10, main = "PCA-DA")
# Compile models and compare performance
models <- resamples(list("PLS-DA" = mod1, "PCA-DA" = mod2, "RF" = mod3))
bwplot(models, metric = "Accuracy")
# PCA-DA
mod2 <- train(class ~ ., data = arcene,
method = "lda",
metric = "Accuracy",
trControl = control,
preProc = c("zv","center","scale","pca"))
# RF
mod3 <- train(class ~ ., data = arcene,
method = "ranger",
# Compile cross-validation settings
set.seed(100)
myfolds <- createMultiFolds(arcene$class, k = 5, times = 10)
control <- trainControl("repeatedcv", index = myfolds, selectionFunction = "oneSE")
# Train PLS model
mod1 <- train(class ~ ., data = arcene,
method = "pls",
metric = "Accuracy",
tuneLength = 20,
# Load caret, install if necessary
library(caret)
arcene <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data",
sep = " ",
colClasses = c(rep("numeric", 10000), "NULL"))
# Add the labels as an additional column
arcene$class <- factor(scan("https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels",
sep = "\t"))
# QQ plot using GenABEL estlambda function
png(paste(target, "_QQplot.png", sep = ""), width = 500, height = 500)
lambda <- estlambda(GWASout$t.value**2, plot = T, method = "median")
dev.off()
# Manhattan plot
GWASout <- read.table(paste(target, ".txt", sep = ""), header = T, colClasses = c("character", rep("numeric",4)))
GWASout$type <- rep("typed", nrow(GWASout))
GWASout$Neg_logP <- -log10(GWASout$p.value)
GWASout <- merge(GWASout, genData$MAP[,c("SNP", "chr", "position")])
GWASout <- GWASout[order(GWASout$Neg_logP, decreasing = T),]
png(paste(target, ".png", sep = ""), height = 500,width = 1000)
GWAS_Manhattan(GWASout)
dev.off()
# Choose trait for association analysis, use colnames(genData$LIP) for listing
# NOTE: Ignore the first column of genData$LIP (gender)
target <- "Cholesterol"
phenodata <- data.frame("id" = rownames(genData$LIP),
"phenotype" = scale(genData$LIP[,target]), stringsAsFactors = F)
# Conduct GWAS (will take a while)
start <- Sys.time()
GWAA(genodata = genData$SNP, phenodata = phenodata, filename = paste(target, ".txt", sep = ""))
# PCA
set.seed(100)
pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids,
snp.id = snpset.ibd, num.thread = 1)
pctab <- data.frame(sample.id = pca$sample.id,
PC1 = pca$eigenvect[,1],
PC2 = pca$eigenvect[,2],
stringsAsFactors = F)
# Subset and/or reorder origin accordingly