Skip to content

Instantly share code, notes, and snippets.

@geotheory
Last active October 28, 2020 23:42
Show Gist options
  • Save geotheory/d55b5154fa186b9cd17ae4ac11e3e15f to your computer and use it in GitHub Desktop.
Save geotheory/d55b5154fa186b9cd17ae4ac11e3e15f to your computer and use it in GitHub Desktop.
Value binning and histogram implementation in PostgresSQL/SQL with R comparison
require(tidyverse)
require(DBI)
require(RPostgres)
con = dbConnect(RPostgres::Postgres(), dbname = "xxx")
dbWriteTable(con, "diamonds", diamonds, row.names=FALSE, append=FALSE)
sql_method = dbGetQuery(con, "
with price_stats as (
select min(price) as min,
max(price) as max
from diamonds
),
histogram as (
select width_bucket(price, min, max, 11) as bucket,
int4range(min(price), max(price), '[]') as range,
count(*) as freq
from diamonds, price_stats
group by bucket
order by bucket
)
select bucket, range, freq
from histogram;
") %>% as_tibble() %>%
mutate(brk_min = str_extract(range, '[0-9]+(?=,)') %>% as.numeric(),
brk_max = str_extract(range, '(?<=,)[0-9]+') %>% as.numeric(),
n = as.integer(freq)) %>%
select(bucket, brk_min, brk_max, n) %>%
mutate(method = 'SQL')
brks = seq(min(diamonds$price), max(diamonds$price), length.out = 12)
r_method = diamonds %>% select(price) %>% mutate(bin = cut(price, brks, include.lowest = TRUE)) %>%
count(bin) %>%
mutate(range = as.character(bin) %>% str_remove_all('[\\[\\]()]'),
bucket = as.integer(bin),
method = 'R') %>%
separate(range, c('brk_min', 'brk_max'), sep = ',', convert = TRUE) %>%
select(bucket, brk_min, brk_max, n, method)
d = rbind(sql_method, r_method)
d %>% ggplot(aes(brk_max, n, col = method)) + geom_line()
@geotheory
Copy link
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment