Last active
January 24, 2016 11:44
-
-
Save flovv/be6e1c027409e36ec316 to your computer and use it in GitHub Desktop.
illustrate the relationship between panel-based measurement error and panel sizes for GFK and Nielsen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# a simple script to illustrate the relationship between panel-based measurement error and panel sizes | |
# simple case of party estimate! | |
require(plyr) | |
require(ggplot2) | |
require(ggthemes) | |
n=1000 | |
p=5 | |
test <- c(40, 30, 15, 10 , 5) /100 | |
getDraws <- function(n, prob){ | |
sum(ifelse(runif(n, 0, 1) <= prob, 1,0)) /n | |
} | |
#getDraws(n, .05) | |
getVariance <- function(n, prob, redraws){ | |
replicate(redraws, getDraws(n, prob)) | |
} | |
line <- NULL | |
set.seed(123) | |
for(i in test){ | |
if(exists("line")){ | |
line <- rbind(line, data.frame(sample=getVariance(n, i, 100), trueValue =i)) | |
} | |
else{ | |
line <- data.frame(sample=getVariance(n, i, 100), trueValue =i) | |
} | |
} | |
line$trueValue <- as.factor(line$trueValue) | |
ggplot(line,aes(sample*100, fill=trueValue)) + geom_density(alpha=.4) +facet_wrap(~trueValue, scales="free")+ xlab("") | |
ggplot(line,aes(sample*100, fill=trueValue)) + geom_density(alpha=.4) + xlab("Vote Share in %") + theme_economist() | |
############################################# | |
confidence.interval = function(c.lev, margin=.5, p.ss, population) { | |
z.val = qnorm(.5+c.lev/200) | |
ss = ((population*p.ss-p.ss)/(population-p.ss)) | |
c.interval = sqrt((z.val^2 * margin * (1-margin))/ss) | |
r.cint = round(c.interval*100, digits = 2) | |
return(r.cint) | |
} | |
########################################## | |
confidence.interval(95, 0.4, 1000, 70000000) | |
############################################# | |
############################################# | |
df <- data.frame(m=c(50, 30, 10, 5, 3, 2, 1, .5, .2, .05, 0.01), interval =confidence.interval(99, c(50, 30, 10, 5, 3, 2, 1, .5, .2, .05, 0.01)/100 , 5000, 50000000), order= seq(1,11,1) ) | |
df <- data.frame(m=c(3, 2, 1, .5, .2, .05), interval =confidence.interval(99, c(3, 2, 1, .5, .2, .05)/100 , 5000, 50000000) ) | |
df$m2 <- factor(df$m) | |
#df$m <- reorder(df$m, d$Win) | |
p <- ggplot(df, aes(y=m, x=m2 )) | |
p + theme_bw(base_size = 18)+geom_point() + geom_errorbar(aes(ymin=m-interval, ymax=m+interval), width=.1) + ylab("Market Share (%) and Confidence Intervalls") + xlab("Market Share (%) / Size of the Website") + ggtitle("Panel Size 5000, Population 50 Mio") | |
############### percentage error! | |
df$percError <- df$interval / df$m *100 | |
p <- ggplot(df, aes(y=percError, x=m2)) + geom_point() + ylab("Relative Error, %") + xlab("Market Share (%) / Size of the Website") + ggtitle("Panel Size 5000, Population 50 Mio") | |
p +theme_economist(base_size = 16) | |
######################################### | |
## changing panel size! | |
df <- data.frame(m=c(1000, 3000, 5000, 10000, 20000, 50000), interval =confidence.interval(99, 1/100 , c(1000, 3000, 5000, 10000, 20000, 50000), 50000000), order= seq(1,6,1) ) | |
df$percError <- df$interval / 1 *100 | |
df$m2 <- factor(df$m) | |
p <- ggplot(df, aes(y=percError, x=m2)) + geom_point() + ylab("Relative Error, %") + xlab("Panel Size") + ggtitle("Varying Panel Size, Population 50 Mio, Website Size 1%") | |
p +theme_bw(base_size = 18) | |
p <- ggplot(df, aes(y=1, x=m2 )) | |
p +theme_bw(base_size = 18)+ geom_point() + geom_errorbar(aes(ymin=1-interval, ymax=1+interval), width=.1) + ylab("Market Share (%) and Confidence Intervalls") + xlab("Panel Size") + ggtitle("Varying Panel Size, Population 50 Mio, Website Size 1%") | |
############# facet! | |
df <- data.frame(m=c(1000, 3000, 5000, 10000, 20000, 50000), interval =confidence.interval(99, 1/100 , c(1000, 3000, 5000, 10000, 20000, 50000), 50000000), order= seq(1,6,1), size=1 ) | |
df2 <- data.frame(m=c(1000, 3000, 5000, 10000, 20000, 50000), interval =confidence.interval(99, .5/100 , c(1000, 3000, 5000, 10000, 20000, 50000), 50000000), order= seq(1,6,1), size=.5 ) | |
df3 <- data.frame(m=c(1000, 3000, 5000, 10000, 20000, 50000), interval =confidence.interval(99, .1/100 , c(1000, 3000, 5000, 10000, 20000, 50000), 50000000), order= seq(1,6,1), size=.1 ) | |
hopp <-rbind(df, df2) | |
hopp <- rbind(hopp, df3) | |
hopp$m2 <- factor(hopp$m) | |
#hopp$size <- factor(hopp$size) | |
p <- ggplot(hopp, aes(y=size, x=m2 )) | |
p +theme_bw(base_size = 18)+ geom_point() + geom_errorbar(aes(ymin=size-interval, ymax=size+interval), width=.1) + ylab("Market Share (%) and Confidence Intervalls") + xlab("Panel Size") + ggtitle("Varying Panel Size, Population 50 Mio, Varying Website Size") + facet_grid(~size) | |
hopp$percError <- hopp$interval / hopp$size *100 | |
p <- ggplot(hopp, aes(y=percError, x=m2 )) | |
p +theme_bw(base_size = 18)+ geom_point() + ylab("Relative Error, %") + xlab("Panel Size") + ggtitle("Varying Panel Size, Population 50 Mio, Varying Website Size") + facet_grid(~size) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment