Forked from FloWuenne/gist:b4d1729b5ec2ceecfb4ce532e0fd8d67
Created
April 9, 2020 20:24
-
-
Save krishnakatyal/66dcd23525138f2b5cf89d46bbbed9dc to your computer and use it in GitHub Desktop.
Plot PCA for ethnicity from any given VCF file combined with 1000 genomes data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Plot PCA for ethnicity of a given VCF file | |
## Import publication theme | |
## A nice R ggplot theme that I found online. Original can be found here: | |
## https://rpubs.com/Koundy/71792 | |
theme_Publication <- function(base_size=14, base_family="helvetica") { | |
library(grid) | |
library(ggthemes) | |
(theme_foundation(base_size=base_size, base_family=base_family) | |
+ theme(plot.title = element_text(face = "bold", | |
size = rel(1.2), hjust = 0.5), | |
text = element_text(), | |
panel.background = element_rect(colour = NA), | |
plot.background = element_rect(colour = NA), | |
panel.border = element_rect(colour = NA), | |
axis.title = element_text(face = "bold",size = rel(1)), | |
axis.title.y = element_text(angle=90,vjust =2), | |
axis.title.x = element_text(vjust = -0.2), | |
axis.text = element_text(), | |
axis.line = element_line(colour="black"), | |
axis.ticks = element_line(), | |
panel.grid.major = element_line(colour="#f0f0f0"), | |
panel.grid.minor = element_blank(), | |
legend.key = element_rect(colour = NA), | |
legend.position = "bottom", | |
legend.direction = "horizontal", | |
legend.key.size= unit(0.2, "cm"), | |
legend.margin = unit(0, "cm"), | |
legend.title = element_text(face="italic"), | |
plot.margin=unit(c(10,5,5,5),"mm"), | |
strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"), | |
strip.text = element_text(face="bold") | |
)) | |
} | |
scale_fill_Publication <- function(...){ | |
library(scales) | |
discrete_scale("fill","Publication",manual_pal(values = c("#386cb0","#fdb462","#7fc97f","#ef3b2c","#662506","#a6cee3","#fb9a99","#984ea3","#ffff33")), ...) | |
} | |
scale_colour_Publication <- function(...){ | |
library(scales) | |
discrete_scale("colour","Publication",manual_pal(values = c("#386cb0","#fdb462","#7fc97f","#ef3b2c","#662506","#a6cee3","#fb9a99","#984ea3","#ffff33")), ...) | |
} | |
## Import libraries | |
library(gdsfmt) | |
library(SNPRelate) | |
library(ggplot2) | |
library(ggdendro) | |
library(RColorBrewer) | |
library(plyr) | |
library(dplyr) | |
## Set working directory | |
## Set color palette for populations | |
color_palette <- c("#912CEE","#f40000","#4DBD33","#4885ed","#ffc300","#2b2b28") | |
## Specify your VCF file | |
vcf.fn <- "my.vcf" | |
# Transform the VCF file into gds format | |
snpgdsVCF2GDS(vcf.fn, "Variants_1000G_exomes.gds", method="biallelic.only") | |
## Data Analysis | |
## open the newly created gds file | |
genofile <- snpgdsOpen('Variants_1000G_exomes.gds') | |
## LD based SNP pruning | |
set.seed(1000) | |
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2) | |
snpset.id <- unlist(snpset) | |
## Perform PCA on the SNP data | |
pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=4) | |
pc.percent <- pca$varprop*100 | |
head(round(pc.percent, 2)) | |
### Plotting | |
### Plot results from PCA analysis of variant data | |
tab12 <- data.frame(sample.id = pca$sample.id, | |
EV1 = pca$eigenvect[,1], # the first eigenvector | |
EV2 = pca$eigenvect[,2], # the second eigenvector | |
stringsAsFactors = FALSE) | |
tab34 <- data.frame(sample.id = pca$sample.id, | |
EV3 = pca$eigenvect[,3], # the first eigenvector | |
EV4 = pca$eigenvect[,4], # the second eigenvector | |
stringsAsFactors = FALSE) | |
plot(tab34$EV2, tab34$EV1, xlab="eigenvector 3", ylab="eigenvector 4") | |
## Assign ethnicity description from 1000G file | |
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id")) | |
populations <- read.table("All_samples_populations.tab",header=T) | |
tab <- data.frame(sample.id = pca$sample.id, | |
EV1 = pca$eigenvect[,1], # the first eigenvector | |
EV2 = pca$eigenvect[,2], # the second eigenvector | |
stringsAsFactors = FALSE) | |
colnames(tab) <- c("sample.id","EV1","EV2") | |
PCA_allVariants2 <- ggplot(tab,aes(x=EV1,y=EV2)) + | |
geom_point(size=2) + | |
scale_x_continuous("PC1") + | |
scale_y_continuous("PC2") + | |
scale_colour_manual(values=color_palette[1:length(unique(tab$pop))]) + | |
theme_Publication() | |
PCA_allVariants2 | |
ggsave(file="Variants_PCA_pointsize2.svg",plot=PCA_allVariants2) | |
ggsave(file="Variants_PCA_pointsize1.svg",plot=PCA_allVariants1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment