explodecomputer · July 18, 2018 15:21
diff --git a/hwe_sim.r b/hwe_sim.r
 ---
 title: Calculating HWE
 ---

 Choose a set of parameters. For each allele frequency and sample size, run 100 simulations

 ```{r}
 library(tidyr)
 library(ggplot2)
 library(dplyr)
 param <- expand.grid(
 	sim=1:100,
 	n=c(100,1000),
 	p=seq(0.01, 0.99, by=0.01),
 	cs = NA,
 	f = NA
 )
 ```

 Generate data and perform HWE tests


 ```{r}
 for(i in 1:nrow(param))
 {
 	# Simulate genotype
 	d <- rbinom(param$n[i], 2, param$p[i])

 	# Observed genotype counts
 	o <- c(sum(d == 2), sum(d == 1), sum(d == 0))

 	# Expected genotype counts
 	e <- c(param$p[i]^2, 2 * param$p[i] * (1-param$p[i]), (1-param$p[i])^2) * param$n[i]

 	# HWE tests
 	param$cs[i] <- chisq.test(rbind(o, e))$p.value
 	param$fe[i] <- fisher.test(rbind(o, e))$p.value
 }
 param$index <- 1:nrow(param)
 param <- gather(param, key="test", value="pvalue", cs, fe)
 param$test[param$test == "fe"] <- "Fisher"
 param$test[param$test == "cs"] <- "Chisq"

 ```

 What is the distribution of p-values for the two different tests?

 ```{r}
 ggplot(param, aes(x=pvalue)) +
 geom_histogram() +
 facet_grid(n ~ test)
 ```

 Breakdown p-values by allele frequency and sample size

 ```{r}
 ggplot(param, aes(x=p, y=pvalue)) +
 geom_point(aes(colour=test), alpha=0.1) +
 geom_smooth(aes(colour=test)) +
 facet_grid(n ~ .) +
 labs(x="Allele frequency", y="HWD p-value")
 ```

 Breaking down by allele frequency and sample size, what is the FDR:

 ```{r}
 sparam <- group_by(param, n, p, test) %>%
 summarise(nsim=n(), fdr = sum(pvalue < 0.05) / n())

 ggplot(sparam, aes(x=p, y=fdr)) +
 geom_bar(stat="identity", position="dodge", aes(fill=test)) +
 facet_grid(n ~ .) +
 ylim(0, 1)
 ```
	---
	title: Calculating HWE
	---

	Choose a set of parameters. For each allele frequency and sample size, run 100 simulations

	```{r}
	library(tidyr)
	library(ggplot2)
	library(dplyr)
	param <- expand.grid(
	sim=1:100,
	n=c(100,1000),
	p=seq(0.01, 0.99, by=0.01),
	cs = NA,
	f = NA
	)
	```

	Generate data and perform HWE tests


	```{r}
	for(i in 1:nrow(param))
	{
	# Simulate genotype
	d <- rbinom(param$n[i], 2, param$p[i])

	# Observed genotype counts
	o <- c(sum(d == 2), sum(d == 1), sum(d == 0))

	# Expected genotype counts
	e <- c(param$p[i]^2, 2 * param$p[i] * (1-param$p[i]), (1-param$p[i])^2) * param$n[i]

	# HWE tests
	param$cs[i] <- chisq.test(rbind(o, e))$p.value
	param$fe[i] <- fisher.test(rbind(o, e))$p.value
	}
	param$index <- 1:nrow(param)
	param <- gather(param, key="test", value="pvalue", cs, fe)
	param$test[param$test == "fe"] <- "Fisher"
	param$test[param$test == "cs"] <- "Chisq"

	```

	What is the distribution of p-values for the two different tests?

	```{r}
	ggplot(param, aes(x=pvalue)) +
	geom_histogram() +
	facet_grid(n ~ test)
	```

	Breakdown p-values by allele frequency and sample size

	```{r}
	ggplot(param, aes(x=p, y=pvalue)) +
	geom_point(aes(colour=test), alpha=0.1) +
	geom_smooth(aes(colour=test)) +
	facet_grid(n ~ .) +
	labs(x="Allele frequency", y="HWD p-value")
	```

	Breaking down by allele frequency and sample size, what is the FDR:

	```{r}
	sparam <- group_by(param, n, p, test) %>%
	summarise(nsim=n(), fdr = sum(pvalue < 0.05) / n())

	ggplot(sparam, aes(x=p, y=fdr)) +
	geom_bar(stat="identity", position="dodge", aes(fill=test)) +
	facet_grid(n ~ .) +
	ylim(0, 1)
	```