explodecomputer · December 18, 2020 12:35
diff --git a/gwas.sh b/gwas.sh
 #===================================#
 #          GWAS Practical           #
 #      iBSC Genomic Medicine        #
 #       Lavinia Paternoster         #
 #           08/11/2018              #
 #===================================#


 # Run the GWAS without PCs but using the cleaned genetic data

 cd ~/ibsc_unit2/data

 plink \
  --bfile geno_qc \
  --pheno phen_clean.txt \
  --pheno-name CRP \
  --covar covs.txt \
  --covar-name age,sex \
  --linear \
  --out ../output/gwas/CRP_nopc

 awk 'NR==1 || /ADD/' ../output/gwas/precomputed/CRP_nopc.assoc.linear > ../output/gwas/CRP_nopc.assoc.linear.add

 # Run the GWAS with PCs

 plink \
    --bfile geno_qc \
    --pheno phen_clean.txt \
    --pheno-name CRP \
    --covar covs.txt \
    --covar-name PC1-PC10,age,sex \
    --linear \
    --out ../output/gwas/CRP_allcovs

 awk 'NR==1 || /ADD/' ../output/gwas/precomputed/CRP_allcovs.assoc.linear > ../output/gwas/CRP_allcovs.assoc.linear.add

 # See which hits are independent using 'clump'

 plink \
  --bfile geno_qc \
  --clump ../output/gwas/precomputed/CRP_allcovs.assoc.linear.add \
  --clump-kb 10000 \
  --clump-r2 0.001 \
  --clump-p1 5e-8 \
  --clump-p2 5e-8 \
  --out ../output/gwas/CRP_allcovs.assoc.linear.add

 # Generate allele freqs for var explained formula

 plink \
  --bfile geno_qc \
  --freq \
  --out geno_qc_freq
diff --git a/gwas_plots_interpretation.R b/gwas_plots_interpretation.R
 #=====================#
 # Interpreting a GWAS #
 # Lavinia Paternoster #
 #     08/11/2018      #
 #=====================#

 install.packages("qqman")
 install.packages("GenABEL")

 #==================#
 #load GWAS results #
 #==================#
 results <- read.table("~/Documents/ibsc_unit2/output/gwas/precomputed/CRP_nopc.assoc.linear.add", header=T)

 #============#
 # Make plots #
 #============#

 #make QQ and manhattan
 library(qqman)
 png('~/Documents/ibsc_unit2/output/gwas/CRP_man_nopc.png')
 manhattan(results, ylim=c(0,20))
 dev.off()

 png('~/Documents/ibsc_unit2/output/gwas/CRP_qq_nopc.png')
 qq(results$P)
 dev.off()

 #estimate lambda
 library(GenABEL)
 estlambda(results$P, method="median")

 #===========================#
 #load adjusted GWAS results #
 #===========================#
 results <- read.table("~/Documents/ibsc_unit2/output/gwas/precomputed/CRP_allcovs.assoc.linear.add", header=T)


 #==============================#
 # Make plots for adjusted GWAS #
 #==============================#

 #make QQ and manhattan
 library(qqman)
 png('~/Documents/ibsc_unit2/output/gwas/CRP_man_withpc.png')
 manhattan(results, ylim=c(0,20))
 dev.off()

 png('~/Documents/ibsc_unit2/output/gwas/CRP_qq_withpc.png')
 qq(results$P)
 dev.off()

 #estimate lambda
 library(GenABEL)
 estlambda(results$P, method="median")


 #=============================#
 # Look at significant results #
 #=============================#

 #how many significant results are there?
 length(results$P[which(results$P<5e-8)])

 #what is the top signal?
 results[which(results$P<5e-8), ]

 # I now want to save this as a table that I can output
 table <- results[which(results$P<5e-8), ]
 write.table(table, file="~/Documents/ibsc_unit2/output/gwas/CRP_results_table.txt")

 #==================#
 # Independent hits #
 #==================#

 # We now want to look at only the clumped results - Run plink --clump command
 # load in the clumped results
 crp_clumped <- read.table("~/Documents/ibsc_unit2/output/gwas/CRP_allcovs.assoc.linear.add.clumped", header=T)

 #subset the results to only include the independent results
 clumped_results <- subset(results, SNP %in% crp_clumped$SNP)
 #have a look at the new table
 clumped_results

 #output this table
 table2 <- clumped_results[which(clumped_results$P<5e-8), ]
 write.table(table2, file="~/Documents/ibsc_unit2/output/gwas/crp_clumped_results_table.txt")

 #==================================#
 # calculate variance explained     #
 #==================================#

 # You use the formula from the lecture:
 # r2 = (2p(1-p)beta^2)/Var(P)
 # so we need to find each item (p, beta and Var(P))

 # calculate Var(P)
 # load in phen_clean & then calculate variance of CRP
 phen_clean <- read.table("~/Documents/ibsc_unit2/data/phen_clean.txt", header=T)
 var <- var(phen_clean$CRP, na.rm=TRUE) ## This doesn't change between SNPs

 # For each SNP you need the beta - already in the clumped_results object

 # For each SNP you need p - we will need to generate this in plink and then load in.

 # Plink code:
 #  plink \
 #    --bfile ~/Documents/ibsc_unit2/data/geno_qc \
 #    --freq \
 #    --out ~/Documents/ibsc_unit2/data/geno_qc_freq

 # Then load in the freqs
 freqs <- read.table("~/Documents/ibsc_unit2/data/geno_qc_freq.frq", header=T)

 # get just the freqs in the clumped set
 clumped_freqs <- subset(freqs, SNP %in% crp_clumped$SNP)

 # now we have everything we need to run the formula

 # we'll create a new r2 column in clumped_results
 clumped_results$r2 <- (clumped_results$BETA^2*2*clumped_freqs$MAF*(1- clumped_freqs$MAF))/var
 clumped_results ###take a look at the new column

 # we can add up the r2s to get total r2 explained by all independent significant SNPs
 sum(clumped_results$r2)

 # you should get 0.025 - the SNPs together explain 2.5% of the variance in CRP
	#===================================#
	# GWAS Practical #
	# iBSC Genomic Medicine #
	# Lavinia Paternoster #
	# 08/11/2018 #
	#===================================#


	# Run the GWAS without PCs but using the cleaned genetic data

	cd ~/ibsc_unit2/data

	plink \
	--bfile geno_qc \
	--pheno phen_clean.txt \
	--pheno-name CRP \
	--covar covs.txt \
	--covar-name age,sex \
	--linear \
	--out ../output/gwas/CRP_nopc

	awk 'NR==1 \|\| /ADD/' ../output/gwas/precomputed/CRP_nopc.assoc.linear > ../output/gwas/CRP_nopc.assoc.linear.add

	# Run the GWAS with PCs

	plink \
	--bfile geno_qc \
	--pheno phen_clean.txt \
	--pheno-name CRP \
	--covar covs.txt \
	--covar-name PC1-PC10,age,sex \
	--linear \
	--out ../output/gwas/CRP_allcovs

	awk 'NR==1 \|\| /ADD/' ../output/gwas/precomputed/CRP_allcovs.assoc.linear > ../output/gwas/CRP_allcovs.assoc.linear.add

	# See which hits are independent using 'clump'

	plink \
	--bfile geno_qc \
	--clump ../output/gwas/precomputed/CRP_allcovs.assoc.linear.add \
	--clump-kb 10000 \
	--clump-r2 0.001 \
	--clump-p1 5e-8 \
	--clump-p2 5e-8 \
	--out ../output/gwas/CRP_allcovs.assoc.linear.add

	# Generate allele freqs for var explained formula

	plink \
	--bfile geno_qc \
	--freq \
	--out geno_qc_freq
	#=====================#
	# Interpreting a GWAS #
	# Lavinia Paternoster #
	# 08/11/2018 #
	#=====================#

	install.packages("qqman")
	install.packages("GenABEL")

	#==================#
	#load GWAS results #
	#==================#
	results <- read.table("~/Documents/ibsc_unit2/output/gwas/precomputed/CRP_nopc.assoc.linear.add", header=T)

	#============#
	# Make plots #
	#============#

	#make QQ and manhattan
	library(qqman)
	png('~/Documents/ibsc_unit2/output/gwas/CRP_man_nopc.png')
	manhattan(results, ylim=c(0,20))
	dev.off()

	png('~/Documents/ibsc_unit2/output/gwas/CRP_qq_nopc.png')
	qq(results$P)
	dev.off()

	#estimate lambda
	library(GenABEL)
	estlambda(results$P, method="median")

	#===========================#
	#load adjusted GWAS results #
	#===========================#
	results <- read.table("~/Documents/ibsc_unit2/output/gwas/precomputed/CRP_allcovs.assoc.linear.add", header=T)


	#==============================#
	# Make plots for adjusted GWAS #
	#==============================#

	#make QQ and manhattan
	library(qqman)
	png('~/Documents/ibsc_unit2/output/gwas/CRP_man_withpc.png')
	manhattan(results, ylim=c(0,20))
	dev.off()

	png('~/Documents/ibsc_unit2/output/gwas/CRP_qq_withpc.png')
	qq(results$P)
	dev.off()

	#estimate lambda
	library(GenABEL)
	estlambda(results$P, method="median")


	#=============================#
	# Look at significant results #
	#=============================#

	#how many significant results are there?
	length(results$P[which(results$P<5e-8)])

	#what is the top signal?
	results[which(results$P<5e-8), ]

	# I now want to save this as a table that I can output
	table <- results[which(results$P<5e-8), ]
	write.table(table, file="~/Documents/ibsc_unit2/output/gwas/CRP_results_table.txt")

	#==================#
	# Independent hits #
	#==================#

	# We now want to look at only the clumped results - Run plink --clump command
	# load in the clumped results
	crp_clumped <- read.table("~/Documents/ibsc_unit2/output/gwas/CRP_allcovs.assoc.linear.add.clumped", header=T)

	#subset the results to only include the independent results
	clumped_results <- subset(results, SNP %in% crp_clumped$SNP)
	#have a look at the new table
	clumped_results

	#output this table
	table2 <- clumped_results[which(clumped_results$P<5e-8), ]
	write.table(table2, file="~/Documents/ibsc_unit2/output/gwas/crp_clumped_results_table.txt")

	#==================================#
	# calculate variance explained #
	#==================================#

	# You use the formula from the lecture:
	# r2 = (2p(1-p)beta^2)/Var(P)
	# so we need to find each item (p, beta and Var(P))

	# calculate Var(P)
	# load in phen_clean & then calculate variance of CRP
	phen_clean <- read.table("~/Documents/ibsc_unit2/data/phen_clean.txt", header=T)
	var <- var(phen_clean$CRP, na.rm=TRUE) ## This doesn't change between SNPs

	# For each SNP you need the beta - already in the clumped_results object

	# For each SNP you need p - we will need to generate this in plink and then load in.

	# Plink code:
	# plink \
	# --bfile ~/Documents/ibsc_unit2/data/geno_qc \
	# --freq \
	# --out ~/Documents/ibsc_unit2/data/geno_qc_freq

	# Then load in the freqs
	freqs <- read.table("~/Documents/ibsc_unit2/data/geno_qc_freq.frq", header=T)

	# get just the freqs in the clumped set
	clumped_freqs <- subset(freqs, SNP %in% crp_clumped$SNP)

	# now we have everything we need to run the formula

	# we'll create a new r2 column in clumped_results
	clumped_results$r2 <- (clumped_results$BETA^22clumped_freqs$MAF*(1- clumped_freqs$MAF))/var
	clumped_results ###take a look at the new column

	# we can add up the r2s to get total r2 explained by all independent significant SNPs
	sum(clumped_results$r2)

	# you should get 0.025 - the SNPs together explain 2.5% of the variance in CRP