Last active
June 16, 2017 21:01
-
-
Save fwhigh/a68eb626018abb2985af1c2d8b7b93c1 to your computer and use it in GitHub Desktop.
The Streaming Distributed Bootstrap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(ggplot2) | |
thm <- theme_bw() | |
thm <- thm + theme(axis.line = element_line(colour = "black"), | |
panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.border = element_blank(), | |
panel.background = element_blank()) | |
# make fake uniformly distributed data | |
df <- data.table(index=as.factor(1:30),count=1,value=runif(30)) | |
print(df) | |
# plot the number of time each data point appears | |
p <- ggplot(df, aes(index, count)) + geom_bar(stat="identity") | |
p <- p + scale_y_continuous(limits = c(0, 8)) | |
p <- p + thm | |
print(p) | |
ggsave(p, file="example_raw_count_hist.png", bg = "transparent") | |
# bootstrap the mean of the data | |
qoi <- function(dat) { | |
return(mean(dat)) | |
} | |
se.estimator <- function(dat) { | |
return(sd(dat)/sqrt(length(dat))) | |
} | |
m <- 1e4 # number of bootstraps | |
# Do the bootstrap | |
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]}) | |
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n")) | |
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n")) | |
# perform a single bootstrap resampling using multinomial random numbers | |
df$resample=c(rmultinom(1,nrow(df),rep(1/nrow(df),nrow(df)))) | |
# plot the number of time each data point appears | |
p <- ggplot(df, aes(index, resample)) + geom_bar(stat="identity") | |
p <- p + scale_y_continuous(limits = c(0, 8)) | |
p <- p + thm | |
print(p) | |
ggsave(p, file="example_resample_hist.png", bg = "transparent") | |
# simulate 10000 bootstrap iterations and look at the frequency of the first data point per iteration | |
cnts <- data.table(frequency=rmultinom(10000,nrow(df),rep(1/nrow(df),nrow(df)))[1,]) | |
# count the number of times each frequency comes up | |
prob <- data.table(unique(cnts[,count:=.N,by=frequency])) | |
# compute the binomial | |
prob[,binomial:=dbinom(frequency,10000,1/10000)*10000] | |
print(prob[order(frequency),]) | |
# compute the poisson and plot | |
prob[,poisson:=dpois(frequency,1)*10000] | |
prob.melt <- melt(prob,id.vars=c('frequency')) | |
p <- ggplot(prob.melt,aes(frequency,value,color=variable,shape=variable)) + geom_point(size=5) | |
p <- p + scale_shape_discrete(solid=F) | |
p <- p + thm | |
print(p) | |
ggsave(p, file="first_index_freq_pois.png", bg = "transparent") | |
write.table(subset(df,select=c('value','count')),file='small_data.txt',quote=F,sep=" ",row.names=F,col.names=F) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
PIDS=$(ps ax | grep "t stream search" | grep -v grep | sed 's/^[ \t]*//g' | cut -d' ' -f1 | paste -s -d' ' -) | |
if [ "$PIDS" != "" ]; then | |
kill $PIDS | |
fi | |
\ls | grep -ve "png$" | xargs -I {} rm -v {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
cat small_data.txt | \ | |
sdbootstrap_inner.py \ | |
--online_update WeightedMeanUpdater \ | |
--n_boot 10000 > pyboot_out.txt | |
head pyboot_out.txt | |
/usr/bin/env Rscript -<<EOF | |
library(data.table) | |
df<-fread('pyboot_out.txt') | |
df[,mean(V3)] | |
df[,sd(V3)] | |
EOF | |
awk -v n_data=10000 ' | |
BEGIN { | |
for (i=1;i<=n_data;i++) { | |
print rand(),1 | |
} | |
}' > data.txt | |
head data.txt | |
/usr/bin/env Rscript -<<EOF | |
library(data.table) | |
df <- fread('data.txt', col.names=c("value","count")) | |
qoi <- function(dat) { | |
return(mean(dat)) | |
} | |
se.estimator <- function(dat) { | |
return(sd(dat)/sqrt(length(dat))) | |
} | |
m <- 1e4 | |
result <- sapply(1:m, function (i) {df[sample(1:nrow(df), replace = TRUE),qoi(value)]}) | |
cat(paste0("compare bootstrap mean ",mean(result)," to sample quantity of interest ",qoi(df$value),"\n")) | |
cat(paste0("compare bootstrap standard deviation ",sd(result)," to the sample standard error of the quantity of interest ",se.estimator(df$value),"\n")) | |
EOF | |
parallel --pipepart --jobs 6 --block 18k -a data.txt \ | |
sdbootstrap_inner.py --online_update WeightedMeanUpdater --n_boot 10000 | \ | |
sdbootstrap_outer.py > pyboot_out.txt | |
head pyboot_out.txt | |
/usr/bin/env Rscript -<<EOF | |
library(data.table) | |
df <- fread('pyboot_out.txt', col.names=c("master_index","index","value","count")) | |
df[,mean(value)] | |
df[,sd(value)] | |
EOF |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(ggplot2) | |
args <- commandArgs(trailingOnly = TRUE) | |
thm <- theme_bw() | |
thm <- thm + theme(axis.line = element_line(colour = "black"), | |
panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.border = element_blank(), | |
panel.background = element_blank()) | |
i=0 | |
for (term in args[1:3]) { | |
df.raw=fread(paste0('pyboot_out_',term,'.txt')) | |
df.raw$term=term | |
if (i == 0) { | |
df=df.raw | |
} else { | |
df=rbind(df,df.raw) | |
} | |
i = i + 1 | |
} | |
df[,mean:=mean(V3),by=c('V1','term')] | |
df[,sd:=sd(V3),by=c('V1','term')] | |
df.u=unique(subset(df,select=c('V1','term','mean','sd'))) | |
df.u[,lower:=mean-sd] | |
df.u[,upper:=mean+sd] | |
p <- ggplot(df.u,aes(as.POSIXct(V1, origin="1970-01-01"),mean,color=term)) + geom_line() | |
p <- p + geom_ribbon(data=df.u,aes(ymin=lower,ymax=upper),alpha=0.3) | |
p <- p + xlab("time") + ylab("mean seconds between tweets, cumulative") | |
p <- p + thm | |
ggsave(p, file=args[4]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
SEARCH1=$(t trends | awk 'NR==1' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]') | |
SEARCH2=$(t trends | awk 'NR==2' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]') | |
SEARCH3=$(t trends | awk 'NR==3' | tr -d '#' | tr '[:upper:]' '[:lower:]' | tr -d '[:blank:]') | |
function t_stream() { | |
echo $1; | |
t stream search -c -l $1 | \ | |
gstdbuf -i0 -o0 -e0 awk -F, '$2 ~ /^2017/ {b=$2;gsub(/[:-]/," ",b);if (a) {print mktime(b)-mktime(a),1}; a=b}' | \ | |
python -u $(which sdbootstrap_inner.py) \ | |
--online_update WeightedMeanUpdater \ | |
--n_boot 200 \ | |
--update_every 10 \ | |
--no_flush_on_update > "pyboot_out_${1}.txt"; | |
} | |
t_stream $SEARCH1 & | |
t_stream $SEARCH2 & | |
t_stream $SEARCH3 & | |
# find process ID's to kill using | |
#ps ax | grep "t stream search" | |
Rscript twitter_example.R $SEARCH1 $SEARCH2 $SEARCH3 tweet_rate.png | |
open tweet_rate.png |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment