Last active
January 28, 2021 17:16
-
-
Save geotheory/5748388 to your computer and use it in GitHub Desktop.
ggplot2 functionality for mapping to hexagon size and colour aesthetics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' GGPLOT2 FUNCTIONALITY FOR MAPPING TO HEXAGON SIZE AND COLOUR AESTHETICS | |
#' by Robin Edwards, 2013 (geotheory.co.uk, @geotheory) | |
#' This has been adapted from the ggplot bin_hex.R script that underpins geom_hex, etc | |
#' (see https://github.com/hadley/densityvis/blob/master/R/bin-hex.r). | |
#' | |
#' These functions implement aesthetic mapping to hexagon size (area), in addition to the existing | |
#' colour-mapping functionality. The key change is the addition of a new fourth variable (var4) | |
#' to hex_bin(), which complements the inbuilt hexagon binning functionality. The 'frequency.to.area' | |
#' argument enables the default mappings of binned data to colour and var4 to size to be interchanged. | |
#' The hmin/hmax arguments [0,1] set area mapping constraints (hmax can exceed 1). | |
#' xlim/xlat enable hexagon tesselation to be constrained independently of data range. | |
#' There may be some bugs in the implementation. A legend for hexagon size has not been implemented. | |
require(ggplot2) | |
require(plyr) | |
#' Bin data into hexagons (2d). | |
#' | |
#' @param x a numeric vector of x positions | |
#' @param y a numeric vector of y positions | |
#' @param weight \code{NULL} or a numeric vector providing weights for each | |
#' observation | |
#' @param height height of each hexagon, if \code{NULL} computed from ybins | |
#' @param width width of each hexagon, if \code{NULL} computed from ybins | |
#' @param xbins number of horizontal bins, if \code{width} unspecified | |
#' @param ybins number of vertical bins, if \code{height} unspecified | |
#' @param na.rm If \code{TRUE} missing values will be silently removed, | |
#' otherwise they will be removed with a warning. | |
#' @export | |
#' @seealso \code{\link{hex_pos}} for algorithm that finds hexagon center | |
#' closest to each point and \code{\link{hex_coord}} that generates | |
#' coordinates of each hexagon. | |
#' @return A data frame with columns \code{x}, \code{y} and \code{freq}, | |
#' and attributes \code{width} and \code{height}. | |
#' @S3method plot bin_hex | |
#' @examples | |
#' plot(hex_bin(runif(1e4), runif(1e4))) | |
#' plot(hex_bin(rnorm(1e4), rnorm(1e4))) | |
#' | |
#' data(baseball, package = "plyr") | |
#' bin <- hex_bin(baseball$g, baseball$ab) | |
hex_bin <- function(x, y, weight = NULL, var4 = NULL, width = NULL, height = NULL, | |
xbins = 20, ybins = 20, frequency.to.area = FALSE, na.rm = FALSE, | |
hmin = 0, hmax = 1, xlim = NULL, ylim = NULL, ...) { | |
if(hmax > 1) warning("hmax > 1 is likely to result in some hexagon overplotting") | |
cleaned <- clean_xy(x, y, weight, var4, xlim=xlim, ylim=ylim) | |
if (is.null(xlim)) xlim <- range(cleaned$x) | |
if (is.null(ylim)) ylim <- range(cleaned$y) | |
if (is.null(width)) width <- diff(xlim) / xbins | |
if (is.null(height)) height <- diff(ylim) / ybins | |
height <- height * sqrt(3) | |
pos <- hex_pos(cleaned$x, cleaned$y, width, height) | |
cleaned$x <- pos[,1]; cleaned$y <- pos[,2] | |
# bin values by hexagon | |
binned <- count(cleaned, c("x", "y"), "weight") | |
var4_sum <- aggregate(cleaned$var4, by=list(cleaned$x, cleaned$y), FUN=mean) | |
names(var4_sum) = c('x','y','var4') | |
# cols must match order of binned | |
binned$var4 <- var4_sum$var4[match(paste(binned$x, binned$y), paste(var4_sum$x, var4_sum$y))] | |
# swap size/frequency variables to map respectively to colour/size | |
names(binned) <- c('x','y','col','size') | |
if(frequency.to.area) binned <- transform(binned, size=col, col=size) | |
# 'freq' field now definitely maps to hex area | |
if(!is.null(var4) & min(binned$size)<0) warning("size vector cannot include negative values") | |
# scale size variable and min-max parameters from hexagon area to side | |
binned$size = hex_side(binned$size) | |
hmax_a = hex_side(hmax)/hex_side(1) | |
hmin_a = hex_side(hmin)/hex_side(1) | |
hrange = hmax_a - hmin_a | |
# normalise and rescale to custom min-max parameters | |
binned$size <- binned$size * hrange / max(binned$size) + hmin_a | |
structure( | |
binned, | |
width = width, | |
height = height, | |
class = c("bin_hex", "data.frame") | |
) | |
} | |
clean_xy <- function(x, y, weight=NULL, var4=NULL, na.rm=TRUE, xlim=NULL, ylim=NULL) { | |
# If !na.rm, remove missing values with a warning. | |
# Otherwise just remove them | |
missing <- !is.finite(x) | !is.finite(y) | |
nmissing <- sum(missing) | |
if (na.rm && nmissing > 0) { | |
warning("Removing ", nmissing, " missing values") | |
} | |
# Check weights, and throw out missing values and zero-weight observations | |
if (is.null(weight)) { | |
weight <- rep.int(1, length(x)) | |
} else { | |
weight[is.na(weight)] <- 0 | |
} | |
# Check sizes, and throw out missing values and zero-weight observations | |
if (is.null(var4)) { | |
var4 <- rep.int(1, length(x)) | |
} else { | |
var4[is.na(var4)] <- 0 | |
} | |
ok <- !missing & weight > 0 & var4 > 0 | |
if (all(ok)) { data.frame(x = x, y = y, weight = weight, var4 = var4) | |
} else { | |
x <- x[ok] | |
y <- y[ok] | |
var4 <- var4[ok] | |
weight <- weight[ok] | |
} | |
out <- data.frame(x = x, y = y, weight = weight, var4 = var4) | |
if (is.null(xlim)) xlim <- range(out$x); if (is.null(ylim)) ylim <- range(out$y) | |
out[out$x >= min(xlim) & out$x <= max(xlim) & out$y >= min(ylim) & out$y <= max(ylim),] | |
} | |
plot.bin_hex <- function(x, ...) { | |
if (!require("scales")) { | |
message("Scales package required for plotting 2d densities") | |
return() | |
} | |
if(all(x$col == x$col[1])) { # no variation in colour variable | |
col <- rep("black", nrow(x)) | |
} else col <- cscale(x$col, seq_gradient_pal(low = "grey70", high = "red")) | |
hexes <- hex_coord(x=x$x, y=x$y, width=attr(x, "width"), height=attr(x, "height"), | |
size=x$size, ...) | |
plot(hexes[,1], hexes[,2], type = "n", ...) | |
polygon(hexes, col = col, border = NA) | |
} | |
# Binning algorithms are available for various lattices in dimensions 2-8 | |
# (Conway and Sloane 1982). The following subroutine is a fast FORTRAN | |
# implementation of hexagonal binning. The key observation is that hexagon | |
# centers fall on two staggered lattices whose cells are rectangles. Presuming | |
# the long side of the rectangle is in the y direction, dividing the y | |
# coordinates by square root (3) [SQRT(3)] makes the cells square. Thus the | |
# algorithm uses two lattices with square cells. The first lattice has points | |
# at the integers with [0, 0] as the lower left value. The second lattice is | |
# shifted so that the lower left value is at [.5 , .5]. The x and y vectors | |
# are scaled into [0, SIZE] and [0, SIZE / SQRT(3)], respectively. SIZE | |
# determines the portions of the lattices that are used. For each data point, | |
# binning consists of finding one candidate lattice point from each lattice | |
# and then selecting the nearest of the two candidates. | |
#' Find centre of closest hexagon. | |
#' | |
#' @param x numeric x position | |
#' @param y numeric y position | |
#' @param width of hexagon | |
#' @param height of hexagon | |
#' @return matrix giving position of closest hexagon center | |
#' @keywords internal | |
#' @export | |
#' @examples | |
#' x <- runif(1e4) | |
#' y <- runif(1e4) | |
#' res <- hex_pos(x, y, 0.5, 0.5) | |
#' plot(x, y, type = "n") | |
#' segments(x, y, res[, 1], res[, 2], col = "grey80") | |
#' points(unique(res), pch = 20, cex = 2) | |
hex_pos <- function(x, y, width, height) { | |
height <- height / sqrt(3) | |
minx <- min(x, na.rm = TRUE) | |
miny <- min(y, na.rm = TRUE) | |
# Scale to [0, nrows/ncols] | |
sx <- (x - minx) / width | |
sy <- (y - miny) / height | |
# Find closest center: [0, 0] or [0.5, 0.5]? | |
fx <- round(sx) | |
fy <- round(sy) | |
dist_0 <- 3 * (sx - fx)^2 + (sy - fy)^2 | |
dist_1 <- 3 * (sx - fx + 0.5)^2 + (sy - fy + 0.5)^2 | |
dist_2 <- 3 * (sx - fx + 0.5)^2 + (sy - fy - 0.5)^2 | |
dist_3 <- 3 * (sx - fx - 0.5)^2 + (sy - fy + 0.5)^2 | |
dist_4 <- 3 * (sx - fx - 0.5)^2 + (sy - fy - 0.5)^2 | |
dist_smallest <- pmin(dist_0, dist_1, dist_2, dist_3, dist_4) | |
x_offset <- rep(0, length(x)) | |
x_offset[dist_smallest == dist_1] <- +0.5 | |
x_offset[dist_smallest == dist_2] <- +0.5 | |
x_offset[dist_smallest == dist_3] <- -0.5 | |
x_offset[dist_smallest == dist_4] <- -0.5 | |
y_offset <- rep(0, length(y)) | |
y_offset[dist_smallest == dist_1] <- +0.5 | |
y_offset[dist_smallest == dist_2] <- -0.5 | |
y_offset[dist_smallest == dist_3] <- +0.5 | |
y_offset[dist_smallest == dist_4] <- -0.5 | |
# Transform back to original coordinates | |
cbind(x = (fx - x_offset) * width + minx, y = (fy - y_offset) * height + miny) | |
} | |
#' Generate hexagon coordinates. | |
#' | |
#' Long axis is horizontal. Edges clock-wise from far-left, separated by | |
#' row of missing values. | |
#' | |
#' @param x horizontal position of center | |
#' @param y vertical position of center | |
#' @param width hex width | |
#' @param height hex height | |
#' @export | |
#' @keywords internal | |
#' @return A two column matrix with 7 times as many rows as input. | |
#' @examples | |
#' x <- runif(1000) | |
#' y <- runif(1000) | |
#' res <- unique(hex_pos(x, y, 0.5, 0.5)) | |
#' hexes <- hex_coord(res[, 1], res[, 2], 0.6, 0.5) | |
#' | |
#' hexes <- hex_coord(res[, 1], res[, 2], rnorm(1000,.5,.3), rnorm(1000,.5,.3)) | |
#' | |
#' plot(hexes, type = "n") | |
#' polygon(hexes) | |
#' points(res) | |
hex_coord <- function(x, y, width, height, size = 1) { | |
dx <- size * width / 6 | |
dy <- size * height / 2 / sqrt(3) | |
hex_x <- rbind(x - 2 * dx, x - dx, x + dx, x + 2 * dx, x + dx, x - dx, NA) | |
hex_y <- rbind(y, y + dy, y + dy, y, y - dy, y - dy, NA) | |
cbind(as.vector(hex_x), as.vector(hex_y)) | |
} | |
hex_coord_df <- function(x, y, width, height, size = 1) { | |
# like hex_coord but returns a dataframe of vertices grouped by an id variable | |
dx <- size * width / 6 | |
dy <- size * height / 2 / sqrt(3) | |
hex_x <- rbind(x - 2 * dx, x - dx, x + dx, x + 2 * dx, x + dx, x - dx) | |
hex_y <- rbind(y, y + dy, y + dy, y, y - dy, y - dy) | |
id <- rep(1:length(x), each=6) | |
data.frame(cbind(x=as.vector(hex_x), y=as.vector(hex_y), id)) | |
} | |
## Functions for calculating hexagon geometries | |
# hexagon side from area | |
hex_side = function(area) sqrt(2 * area / (sqrt(3)*3)) | |
# hexagon area from side (not used) | |
hex_area = function(side) side^2 * sqrt(3) * 3/2 | |
# quick function for gg-plotting with 1 line (limited functionality) | |
qplothex = function(x, y, var4 = NULL, f.to.a = FALSE, ...){ | |
bin = hex_bin(x=x, y=y, var4=var4, frequency.to.area=f.to.a, ...) | |
hexes = hex_coord_df(x=bin$x, y=bin$y, width=attr(bin, "width"), height=attr(bin, "height"), size=bin$size) | |
hexes$col = rep(bin$col, each=6) | |
ggplot(hexes, aes(x=x, y=y)) + geom_polygon(aes(fill=col, group=id)) | |
} | |
## EXAMPLES | |
# normal distributions example | |
df = data.frame(x=c(rnorm(6000,0,2), rnorm(3000,3,1)), y=c(rnorm(6000,0,2), rnorm(3000,3,1))) | |
df$sizex = df$x - min(df$x); df$sizey = df$y - min(df$y) | |
ggplot(df, aes(x=x,y=y)) + geom_hex() # existing ggplot function | |
# plotting with plot.bin_hex(), with bin counts mapping to colour (geom_hex equivalent) | |
plot(hex_bin(df$x, df$y)) | |
# bin counts map to area | |
plot(hex_bin(df$x, df$y, frequency.to.area=TRUE)) | |
# using a 4th variable (var4) to map to area | |
plot(hex_bin(df$x, df$y, var4=df$sizey)) | |
# 4th variable mapped to colour and frequency to area | |
plot(hex_bin(df$x, df$y, var4=df$sizey, frequency.to.area=TRUE)) | |
# using 'weight' argument to map uniformly distributed raster-type data | |
plot(hex_bin(rep(1:100,100), rep(1:100,each=100))) | |
plot(hex_bin(rep(1:100,100), rep(1:100,each=100), weight=rep(1:100,100))) | |
plot(hex_bin(rep(1:100,100), rep(1:100,each=100), weight=rep(1:100,100), var4=rep(1:100,each=100))) | |
# setting minimum and maximum area parameters | |
plot(hex_bin(df$x, df$y, frequency.to.area=TRUE, hmin=.2)) | |
plot(hex_bin(df$x, df$y, frequency.to.area=TRUE, hmax=2)) | |
# mapping frequency to colour / 4th variable to size | |
plot(hex_bin(df$x, df$y, var4=df$sizex, frequency.to.area=FALSE)) | |
bin = hex_bin(df$x, df$y, var4=df$sizex, frequency.to.area=FALSE) | |
hexes = hex_coord_df(x=bin$x, y=bin$y, width=attr(bin,"width"), height=attr(bin,"height"), size=bin$size) | |
hexes$col = rep(bin$col, each=6) | |
ggplot(hexes, aes(x=x, y=y)) + geom_polygon(aes(fill=col, group=id)) | |
# mapping frequency to size / 4th variable to colour | |
plot(hex_bin(df$x, df$y, var4=df$sizex, frequency.to.area=TRUE)) | |
bin = hex_bin(df$x, df$y, var4=df$sizex, frequency.to.area=TRUE) | |
hexes = hex_coord_df(x=bin$x, y=bin$y, width=attr(bin,"width"), height=attr(bin,"height"), size=bin$size) | |
hexes$rightness = rep(bin$col, each=6) | |
ggplot(hexes, aes(x=x, y=y)) + geom_polygon(aes(fill=rightness, group=id)) | |
# Economics dataset example | |
ggplot(economics, aes(x=uempmed, y=unemploy, col=pop)) + geom_point() | |
plot(hex_bin(x=economics$uempmed, y=economics$unemploy, var4=economics$pop)) | |
plot(hex_bin(x=economics$uempmed, y=economics$unemploy, var4=economics$pop, frequency.to.area=TRUE)) | |
bin = hex_bin(economics$uempmed, economics$unemploy, var4=economics$pop, frequency.to.area=TRUE) | |
hexes = hex_coord_df(x=bin$x, y=bin$y, width=attr(bin, "width"), height=attr(bin, "height"), size=bin$size) | |
hexes$population = rep(bin$col, each=6) | |
names(hexes)[1:2] = c('uempmed','unemploy') | |
ggplot(hexes, aes(x=uempmed, y=unemploy)) + geom_polygon(aes(fill=population, group=id)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment