explodecomputer · February 10, 2017 18:15
diff --git a/ldscore_simplistic.R b/ldscore_simplistic.R
 library(dplyr)

 # Generate some LD blocks for 1000000 SNPs
 # Some blocks have more SNPs than others
 ld <- sample(1:1000, 1000000, replace=TRUE, prob=runif(1000))
 range(table(ld))

 # Generate some xsq stats for each SNP
 xsq <- rchisq(1000000, df=1)

 # How many SNPs per LX block?
 # This is the LD score
 counts <- as.data.frame(table(ld))

 dat <- data.frame(ld=ld, xsq=xsq)
 dat <- merge(dat, counts, by="ld")

 # Generate the "estimated" xsq - that is the combined xsq of all SNPs 
 dat <- group_by(dat, ld) %>%
 	mutate(xsqsum = sum(xsq))

 # The estimate of all SNPs is the same as an estimate of a random set of SNPs
 summary(lm(xsqsum ~ Freq, dat))
 summary(lm(xsqsum ~ Freq, dat[sample(1:1000000, 100),]))

 # Choose the best SNP from each LD block
 dattop <- group_by(dat, ld) %>%
 	dplyr::summarise(Freq=first(Freq), xsqsum = max(xsq))

 # much lower
 summary(lm(xsqsum ~ Freq, dattop))

 # Choose the worst SNP from each LD block
 datbottom <- group_by(dat, ld) %>%
 	dplyr::summarise(Freq=first(Freq), xsqsum = min(xsq))

 # Lower still
 summary(lm(xsqsum ~ Freq, datbottom))


 # Choose a random SNP from each LD block
 datrand <- group_by(dat, ld) %>%
 	dplyr::summarise(Freq=first(Freq), xsqsum = first(xsq))

 # between top and bottom
 summary(lm(xsqsum ~ Freq, datrand))
	library(dplyr)

	# Generate some LD blocks for 1000000 SNPs
	# Some blocks have more SNPs than others
	ld <- sample(1:1000, 1000000, replace=TRUE, prob=runif(1000))
	range(table(ld))

	# Generate some xsq stats for each SNP
	xsq <- rchisq(1000000, df=1)

	# How many SNPs per LX block?
	# This is the LD score
	counts <- as.data.frame(table(ld))

	dat <- data.frame(ld=ld, xsq=xsq)
	dat <- merge(dat, counts, by="ld")

	# Generate the "estimated" xsq - that is the combined xsq of all SNPs
	dat <- group_by(dat, ld) %>%
	mutate(xsqsum = sum(xsq))

	# The estimate of all SNPs is the same as an estimate of a random set of SNPs
	summary(lm(xsqsum ~ Freq, dat))
	summary(lm(xsqsum ~ Freq, dat[sample(1:1000000, 100),]))

	# Choose the best SNP from each LD block
	dattop <- group_by(dat, ld) %>%
	dplyr::summarise(Freq=first(Freq), xsqsum = max(xsq))

	# much lower
	summary(lm(xsqsum ~ Freq, dattop))

	# Choose the worst SNP from each LD block
	datbottom <- group_by(dat, ld) %>%
	dplyr::summarise(Freq=first(Freq), xsqsum = min(xsq))

	# Lower still
	summary(lm(xsqsum ~ Freq, datbottom))


	# Choose a random SNP from each LD block
	datrand <- group_by(dat, ld) %>%
	dplyr::summarise(Freq=first(Freq), xsqsum = first(xsq))

	# between top and bottom
	summary(lm(xsqsum ~ Freq, datrand))