Created
October 30, 2015 00:30
-
-
Save zackw/f2e74a8d7b31baa88002 to your computer and use it in GitHub Desktop.
Compute the growth rate of Unicode.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/Rscript | |
suppressPackageStartupMessages({ | |
library(ggplot2) | |
library(scales) | |
library(grid) | |
}) | |
# Observed growth of Unicode: the number of assigned code points on
# each listed date.
# Data from https://en.wikipedia.org/wiki/Unicode#Versions
# as of 29 Oct 2015
ugrowth.d <- local({
  # One entry per observation, written as "date" = count so that each
  # date sits next to its own figure.
  observed <- c(
    "1991-10-01" =   7161,
    "1992-06-01" =  28359,
    "1993-06-01" =  34233,
    "1996-07-01" =  38950,
    "1998-05-01" =  38952,
    "1999-09-01" =  49259,
    "2001-03-01" =  94205,
    "2002-03-01" =  95221,
    "2003-04-01" =  96447,
    "2005-03-01" =  97720,
    "2006-07-01" =  99089,
    "2008-04-01" = 100713,
    "2009-10-01" = 107361,
    "2010-10-01" = 109449,
    "2012-01-01" = 110181,
    "2012-09-01" = 110182,
    "2013-09-01" = 110187,
    "2014-06-01" = 113021,
    "2015-06-01" = 120737
  )
  data.frame(
    date = as.Date(names(observed)),
    nchars = unname(observed),
    # Series label for the plot legend; the single value is recycled
    # to every row.
    what = factor("Actual")
  )
})
# Cutoff for the "recent" fit: only observations dated after this are
# used for the second model.  It marks the last large jump in the
# number of assigned code points.
mat.cutoff <- as.Date("2000-01-01")

# The total number of *publicly assignable* code points:
#   1,114,112  code points reachable by UTF-16
#        - 66  permanently unassigned
#     - 2,048  surrogates
#   - 137,468  reserved for private use
max.codepoint <- 1114112 - 66 - 2048 - 137468

# The same accounting for the BMP alone:
#   65,536  code points
#     - 34  permanently unassigned
#  - 2,048  surrogates
#  - 6,400  reserved for private use
max.bmp <- 65536 - 34 - 2048 - 6400

# Every supplemental plane has 65,536 code points, 2 of which are
# permanently unassigned.  At present two of those planes are entirely
# private-use; they are already accounted for in max.codepoint above.
max.sup <- 65536 - 2
# Two straight-line fits of assigned code points against date:
# one over every observation, one over only those after mat.cutoff.
m.all <- lm(nchars ~ date, data = ugrowth.d)
m.mat <- lm(nchars ~ date, data = subset(ugrowth.d, date > mat.cutoff))

# Dates to extrapolate to: every ten years across a full millennium.
# The second model is only evaluated at dates after its cutoff.
p.dates <- data.frame(
  date = seq.Date(as.Date("1991-10-01"), as.Date("2991-10-01"),
                  by = "10 years")
)
p.mdate <- subset(p.dates, date > mat.cutoff)

# `newdata` is spelled out in full rather than abbreviated to `new`,
# so the call does not depend on partial argument matching.
# With interval = "prediction" the result has three columns (fitted
# value, lower bound, upper bound), renamed here to the names the
# plotting code expects.
p.all <- as.data.frame(
  predict(m.all, newdata = p.dates, interval = "prediction")
)
p.mat <- as.data.frame(
  predict(m.mat, newdata = p.mdate, interval = "prediction")
)
colnames(p.all) <- c("nchars", "lb", "ub")
colnames(p.mat) <- c("nchars", "lb", "ub")

# Series labels for the legend; each single value is recycled to
# every row of its data frame.
p.all$what <- factor(c("Predicted (all)"))
p.mat$what <- factor(c("Predicted (since 2000)"))

# One long data frame holding both prediction series, each row tagged
# with its date.
ugrowth.p <- rbind(cbind(p.dates, p.all), cbind(p.mdate, p.mat))
# Layers and styling shared by the full-size plot and the inset.
# Built up one step at a time, starting from an empty plot.
baseplot <- ggplot()

# Observed values, drawn as points.
baseplot <- baseplot +
  geom_point(
    mapping = aes(x = date, y = nchars, colour = what, fill = what),
    data = ugrowth.d
  )

# Predictions: stat = "identity" makes geom_smooth draw the supplied
# line and lb/ub band as-is instead of fitting a smoother of its own.
baseplot <- baseplot +
  geom_smooth(
    mapping = aes(x = date, y = nchars, ymin = lb, ymax = ub,
                  colour = what, fill = what),
    data = ugrowth.p,
    stat = "identity"
  )

# Identical three-colour palette for outlines and fills, one colour
# per series.
baseplot <- baseplot +
  scale_colour_manual(values = c("#000000", "#66c2a5", "#8da0cb")) +
  scale_fill_manual(values = c("#000000", "#66c2a5", "#8da0cb"))

# Untitled legend on a light background, anchored by its top-left
# corner at the top-left of the plot.
baseplot <- baseplot +
  theme(
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#f8f8f8"),
    legend.position = c(0, 1),
    legend.justification = c("left", "top")
  )
# Full-size plot: the long-range extrapolation, with the y axis marked
# at zero, at the end of the BMP, and at the end of each further
# supplemental plane up to the total assignable code point count.
fullplot <- local({
  x.range <- as.Date(c("1991-10-01", "2541-10-01"))

  # max.bmp + 14 * max.sup is exactly max.codepoint, so the explicit
  # max.codepoint entry repeats the last plane boundary.  unique()
  # keeps the top break even if the constants are ever changed, while
  # never handing the scale the same break twice.
  y.breaks <- unique(c(0, max.bmp, max.bmp + (1:14) * max.sup,
                       max.codepoint))

  baseplot +
    scale_y_continuous("Assigned codepoints", expand = c(0, 0),
                       breaks = y.breaks) +
    scale_x_date("Year", labels = date_format("%Y"),
                 expand = c(0.01, 0),
                 limits = x.range,
                 breaks = seq.Date(x.range[1], x.range[2],
                                   length.out = 6)) +
    # Show the whole assignable range plus a quarter-plane margin
    # above and below.
    coord_cartesian(ylim = c(-max.sup / 4, max.codepoint + max.sup / 4))
})
# Inset plot: a close-up of the first thirty years, covering the BMP
# and one supplemental plane on the y axis.  It carries no legend and
# no axis titles.
insetplot <- local({
  x.range <- as.Date(c("1991-10-01", "2021-10-01"))

  # Breaks at zero and at the middle and end of the BMP and of the
  # first supplemental plane.
  y.breaks <- c(0, max.bmp / 2, max.bmp,
                max.bmp + max.sup / 2, max.bmp + max.sup)

  zoomed <- baseplot +
    scale_y_continuous("Assigned codepoints", expand = c(0, 0),
                       breaks = y.breaks) +
    scale_x_date("Year", labels = date_format("%Y"),
                 expand = c(0.01, 0),
                 limits = x.range,
                 breaks = seq.Date(x.range[1], x.range[2],
                                   length.out = 6)) +
    # A twelfth of a plane of margin above and below the visible range.
    coord_cartesian(ylim = c(-max.sup / 12,
                             max.bmp + max.sup + max.sup / 12))

  zoomed +
    theme(legend.position = "none",
          axis.title = element_blank())
})
# Render the full plot to a PNG file with the inset drawn in its
# lower-right corner.
#
# annotation_custom ... just doesn't work, and I don't understand why.
# We do it the hard way instead: draw the full plot on a fresh grid
# page, then push a viewport and draw the inset inside it.
png("unicode-growth-rate.png", width = 1900, height = 1100, res = 96)
invisible(tryCatch({
  grid.newpage()
  grid.draw(ggplotGrob(fullplot))
  # `width` and `height` are spelled out in full rather than as `w`
  # and `h`, so the call does not depend on partial argument matching.
  # The viewport's bottom-right corner is anchored at (0.98, 0.06).
  pushViewport(viewport(x = 0.98, y = 0.06, width = 0.38, height = 0.36,
                        just = c("right", "bottom")))
  grid.draw(ggplotGrob(insetplot))
  popViewport()
}, finally = dev.off()))  # always close the device, even if drawing fails
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment