Last active
November 26, 2018 20:05
-
-
Save tdunning/26ce5f109cd0ea044caaf3ebfd21eb99 to your computer and use it in GitHub Desktop.
A simplified implementation of a merging t-digest in R with some visualization of the results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### x is either a vector of numbers or a data frame with sums and weights. Digest is a data frame. | |
merge = function(x, digest, compression=100) { | |
## Force the digest to be a data.frame, possibly empty | |
if (!is.data.frame(digest) && is.na(digest)) { | |
digest = data.frame(sum=c(), weight=c()) | |
} | |
## and coerce the incoming data likewise ... a vector of points have default weighting of 1 | |
if (!is.data.frame(x)) { | |
x = data.frame(sum=x, weight=1) | |
} | |
## glue and sort all of the data | |
merged = rbind(x, digest) | |
merged = merged[order(merged$sum/merged$weight),] | |
## and now merge into the result data.frame r | |
n = sum(merged$weight) | |
j = 1 # index for output | |
r = merged[j,] # starts with first row of merged data | |
q = 0 # weight before now is nil | |
for (i in 2:dim(merged)[1]) { | |
## increment in q is out pending output plus a candidate addition | |
dq = (r[j, "weight"] + merged[i,"weight"])/n | |
## check to see if it is acceptable | |
k.size = compression * (q.to.k(q + dq) - q.to.k(q)) | |
if (k.size > 1) { | |
## nope... commit to this output row start the next one | |
r = rbind(r, merged[i,]) | |
q = q + r$weight[j]/n | |
j = j+1 | |
} else { | |
## yep... it fits | |
r[j,] = r[j,] + merged[i,] | |
} | |
} | |
return(r) | |
} | |
### returns the center of a single centroid | |
center = function(td, i) { | |
td[i, "sum"] / td[i, "weight"] | |
} | |
interpolate = function(values, weights) { | |
return (sum(values * weights) / sum(weights)) | |
} | |
### computes the estimated cdf of a point | |
cdf = function(td, x) { | |
if (x < center(td,1)) { | |
return (0); | |
} else if (x > center(td,dim(td)[1])) { | |
return (1); | |
} else { | |
i = which(td$sum/td$weight > x)[1] - 1 | |
boundary = interpolate(center(td, i:(i+1)), td$weight[i:(i+1)]) | |
if (x < boundary) { | |
## on the right side of a centroid | |
base = sum(td$weight[1:(i-1)]) | |
local = td$weight[i] | |
## base + local/2 is total weight at the center of centroid i | |
## base + local is the weight on the right side of that same centroid | |
return (interpolate(c(base + local/2, base + local)/sum(td$weight), c(boundary-x, x-center(td,i)))) | |
} else { | |
## on the left side | |
base = sum(td$weight[1:i]) | |
local = td$weight[i+1] | |
## base is the center, base - local/2 is the left edge | |
return (interpolate(c(base - local/2, base)/sum(td$weight), c(center(td,i+1)-x, x-boundary))) | |
} | |
} | |
} | |
### returns the distribution as estimated by the centroids | |
dist = function(td) { | |
n = dim(td)[1] | |
## each centroid is assumed to be split evenly at its center | |
d = data.frame(x = td$sum / td$weight, cdf = (cumsum(td$weight) - td$weight/2)/sum(td$weight)) | |
return(d) | |
} | |
### Converts q to index function | |
q.to.k = function(q) { | |
asin(2*min(q,1)-1)/pi + 0.5 | |
} | |
M = 200 | |
N = 50 | |
### KS distribution 50%-ile and 95%-iles | |
### these approximations are accurate to 2-3 significant digits | |
### relative to values computed by | |
### Richard Simard's program. | |
### http://www.iro.umontreal.ca/~simardr/ksdir/KolmogorovSmirnovDist.java | |
zx = data.frame(n=(1:100), p50=0, p95=0) | |
for (i in 1:100) { | |
zx$p50[i] = 0.02599203/sqrt(i*M)*sqrt(M) | |
zx$p95[i] = 0.0427621/sqrt(i*M)*sqrt(M) | |
} | |
### plot the evolution of a t-digest over a bunch of additions of a thousand points | |
ks.summary = data.frame(i=c(), ks=c(), k=c(), ks1=c(), ks2=c()) | |
for (k in 1:10) { | |
xx = seq(0,1,by=0.01) | |
yy = 4 * sqrt(xx * (1-xx)) | |
td.x = NA | |
ks = rep(0, N+1) | |
layout(matrix(c(1,2), nrow=1)) | |
ks.details = data.frame(i=rep(0,N), ks1=0, ks2=0) | |
data.set = runif(N * M) | |
for (i in 1:N) { | |
batch = data.set[(i-1)*M + 1:M] | |
td.x = merge(batch, td.x) | |
plot(td.x$sum / td.x$weight, td.x$weight, type='b', cex=0.7, main=i, xlab='q', ylab='weight') | |
lines(xx,sum(td.x$sum) * yy / 100 * pi/2) | |
dx = dist(td.x) | |
ks[i] = max(abs(dx$cdf - dx$x)) | |
ks.details$ks1[i] = ks[i] | |
ks.details$ks2[i] = ks.test(data.set[1:(i*M)], punif)$statistic | |
plot(ks, type='l', xlab="Step", main=k) | |
print(c(k, dim(td.x)[1])) | |
## Mark roughly the median of the KS distribution | |
lines(p50 ~ n, zx, col='blue') | |
lines(p95 ~ n, zx, col='lightblue', lty=1) | |
} | |
ks.summary = rbind(ks.summary, data.frame(i=1:N, ks=ks[1:N], k=k, ks1=ks.details$ks1, ks2=ks.details$ks2)) | |
} | |
pdf('ks.evolution.pdf', width=6, height=5) | |
boxplot(ks ~ i, ks.summary) | |
lines(p50 ~ n, zx, col='blue') | |
lines(p95 ~ n, zx, col='lightblue', lty=1) | |
dev.off() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment