Last active
October 24, 2019 00:06
-
-
Save dholstius/71fa74cdb24538f8c7bd5e464fc9a72a to your computer and use it in GitHub Desktop.
Summary Tree Tool, demonstrated using the default BY2011 category hierarchy (and BY2011 emissions).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' | |
#' Summary Tree Tool | |
#' | |
#' Discover which categories --- or groups of categories --- emit the largest | |
#' share(s) of a particular pollutant. | |
#' | |
#' Version history: | |
#' | |
#' - Created 2015-12-01 by dholstius | |
#' - Last updated 2019-10-23 by dholstius | |
#' | |
#' References: | |
#' | |
#' - Kenny Shirley and Howard Karloff (2019). `summarytrees`: Compute and Visualize | |
#' Summary Trees. R package version 0.1. | |
#' | |
#' - Kenny Shirley (2015). Visualizing Maximum Entropy Summary Trees Using R and d3. | |
#' Presented at the Joint Statistical Meeting (JSM), Seattle, WA. | |
#' http://www.kennyshirley.com/jsm2015/ | |
#' | |
library(inventory) | |
library(BY2011) | |
library(summarytrees) | |
#' | |
#' The hierarchy that you pick here constrains the solution space. | |
#' | |
#' Any "summary tree" solution has to be a tree that can be formed by iterative | |
#' consolidation, starting from the leaves of this "maximal tree". | |
#' | |
#' This will make more sense when you get to the end! | |
#' | |
summarytree_hierarchy <- | |
BY2011_category_hierarchy | |
#' | |
#' The data should be compatible with the hierarchy. | |
#' | |
summarytree_emission_data <- | |
BY2011_annual_emission_data | |
#' | |
#' The "weightiest" leaf nodes (single categories) will depend on which | |
#' pollutant you pick, and also on the year. | |
#' | |
#' Edit the code below where it says `NOx` and/or `CY(2011)`. Try using | |
#' `CO2`, `PM2.5`, or `ROG` instead of `NOx`. | |
#' | |
summarytree_scope <- | |
tibble( | |
year = CY(2011), | |
pol_abbr = "NOx", | |
cat_h0 = "Area sources") | |
#' | |
#' You shouldn't need to change the code below, unless: | |
#' | |
#' - you want `weight` to vary non-linearly, or with something other than emission (`ems_qty`); or | |
#' - you want to pre-aggreate to some granularity other than `cat_id`; or | |
#' - you want the tree to be based on something other than categories (e.g. SIC codes, or pollutants). | |
#' | |
node_weight_data <- | |
summarytree_emission_data %>% | |
with_hierarchy( | |
summarytree_hierarchy) %>% | |
semi_join( | |
summarytree_scope) %>% | |
group_by( | |
cat_id) %>% | |
summarise( | |
weight = total(ems_qty, digits = 4)) %>% | |
filter( | |
weight > 0) | |
#' | |
#' Construct the input data for the search algorithm. | |
#' | |
summarytree_input_data <- local({ | |
summarytree_structure <- | |
summarytree_hierarchy %>% | |
semi_join( | |
node_weight_data, | |
by = c("cat_id")) %>% | |
extract_tree() | |
weighted_adjacency_data <- | |
summarytree_structure %>% | |
as.data.frame( | |
form = "adjacency_list") %>% | |
left_join( | |
mutate_at(node_weight_data, vars(cat_id), as.character), | |
by = c("label" = "cat_id")) %>% | |
ensure( | |
!all_true(is.na(.$weight))) | |
summarytree_data <- | |
weighted_adjacency_data %>% | |
replace_na( | |
list(parent = 0, weight = 0)) %>% | |
transmute( | |
node = as.numeric(node), | |
parent = as.numeric(parent), | |
weight = as.numeric(weight), | |
label = str_replace(label, "^([[:digit:]]+)$", "#\\1"), | |
level = as.numeric(depth)) | |
}) | |
#' | |
#' For each `k` in `{1, 2, ..., MAX_TREE_SIZE}`, pre-compute the optimal (most | |
#' balanced) summary tree of size `k`. | |
#' | |
#' - The size `k` is just the number of nodes in a given summary tree. | |
#' - The tool will let you thumb through these pre-computed trees in order. | |
#' | |
summarytree_solutions <- local({ | |
#' | |
#' Increase `MAX_TREE_SIZE` if you find that you cannot explore deeply enough. | |
#' | |
MAX_TREE_SIZE <- 150 | |
#' | |
#' If you're a perfectionist, and have time to spare, you can use `optimal()` | |
#' instead of `greedy()` here. | |
#' | |
#' If you do, you'll have to pick a value for `epsilon`. This is the | |
#' nonnegative upper bound of the additive error when using the approximate | |
#' algorithm. Each `k`-node summary tree returned by the approximate algorithm | |
#' will have entropy within `epsilon` of the maximal entropy of a `k`-node | |
#' summary tree. | |
#' | |
#' See `?summarytrees::optimal` and `?summarytrees::greedy` for details. | |
#' | |
summarytree_solutions <- | |
with( | |
summarytree_input_data, | |
summarytrees::greedy( | |
node, parent, weight, label, | |
#epsilon = 1.0, | |
K = min( | |
MAX_TREE_SIZE, | |
n_distinct(summarytree_data$node)))) | |
}) | |
#' | |
#' Prepare JSON for the interactive visualization. | |
#' | |
#' You can change some things here, like `units`, `node.width`, `node.height`, | |
#' `print.weights`, and `color.level`. There doesn't seem to be a great way to | |
#' get the labels to wrap --- an injected `</br>` just gets escaped --- but if | |
#' you discover a way, please send me an email, and I'll update it here! | |
#' | |
summarytree_json <- | |
with( | |
summarytree_solutions, | |
summarytrees::prepare.vis( | |
tree.list = summary.trees, | |
labels = str_wrap(data[, "label"], width = 10), | |
tree = tree, | |
legend.width = 100, | |
node.width = 100, | |
node.height = 18, | |
units = str_c("tons/yr ", summarytree_scope$pol_abbr), | |
print.weights = FALSE, | |
legend.color = "lightsteelblue", | |
color.level = 1)) | |
#' | |
#' Actually show the visualization. | |
#' | |
summarytrees::draw.vis( | |
json.object = summarytree_json, | |
out.dir = tempfile(), | |
open.browser = TRUE) | |
#' | |
#' Pause here. | |
#' | |
readline("Press any key to stop and clean up ...") | |
#' | |
#' Clean up. This is required if you want to re-run with new parameters; | |
#' otherwise, you'll see the old tree and wonder why nothing changed. | |
#' | |
for (srv_id in servr::daemon_list()) { | |
servr::daemon_stop(srv_id) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment