Skip to content

Instantly share code, notes, and snippets.

@seasmith
Last active August 20, 2017 17:11
Show Gist options
  • Save seasmith/3c7c12ba1f8cb79e2a821c75e2cc41e9 to your computer and use it in GitHub Desktop.
Save seasmith/3c7c12ba1f8cb79e2a821c75e2cc41e9 to your computer and use it in GitHub Desktop.
Downloading and visualizing North Dakota horizontal-well oil production with the networkD3 package in R
# Load dependencies -------------------------------------------------------
library(httr)
library(rvest)
library(plyr)
library(tibble)
library(dplyr)
library(stringi)
library(igraph)
library(ggraph)
library(networkD3)
# County codes and abbreviations needed for joins
counties <- structure(list(CountyName = c("adams", "barnes", "benson", "billings",
"bottineau", "bowman", "burke", "burleigh", "cass", "cavalier",
"dickey", "divide", "dunn", "eddy", "emmons", "foster", "golden valley",
"grand forks", "grant", "griggs", "hettinger", "kidder", "lamoure",
"logan", "mchenry", "mcintosh", "mckenzie", "mclean", "mercer",
"morton", "mountrail", "nelson", "oliver", "pembina", "pierce",
"ramsey", "ransom", "renville", "richland", "rolette", "sargent",
"sheridan", "sioux", "slope", "stark", "steele", "stutsman",
"towner", "traill", "walsh", "ward", "wells", "williams"), Code = c(1L,
3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L,
29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L, 51L, 53L,
55L, 57L, 59L, 61L, 63L, 65L, 67L, 69L, 71L, 73L, 75L, 77L, 79L,
81L, 83L, 85L, 87L, 89L, 91L, 93L, 95L, 97L, 99L, 101L, 103L,
105L), Abbreviation = c("ADA", "BAR", "BEN", "BIL", "BOT", "BOW",
"BUR", "BUR", "CAS", "CAV", "DIC", "DIV", "DUN", "EDD", "EMM",
"FOS", "GOL", "GRA", "GRA", "GRI", "HET", "KID", "LAM", "LOG",
"MCH", "MCI", "MCK", "MCL", "MER", "MOR", "MOU", "NEL", "OLI",
"PEM", "PIE", "RAM", "RAN", "REN", "RIC", "ROL", "SAR", "SHE",
"SIO", "SLO", "STA", "STE", "STU", "TOW", "TRA", "WAL", "WAR",
"WEL", "WIL"), region = c(38001L, 38003L, 38005L, 38007L, 38009L,
38011L, 38013L, 38015L, 38017L, 38019L, 38021L, 38023L, 38025L,
38027L, 38029L, 38031L, 38033L, 38035L, 38037L, 38039L, 38041L,
38043L, 38045L, 38047L, 38049L, 38051L, 38053L, 38055L, 38057L,
38059L, 38061L, 38063L, 38065L, 38067L, 38069L, 38071L, 38073L,
38075L, 38077L, 38079L, 38081L, 38083L, 38085L, 38087L, 38089L,
38091L, 38093L, 38095L, 38097L, 38099L, 38101L, 38103L, 38105L
)), .Names = c("CountyName", "Code", "Abbreviation", "region"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-53L), spec = structure(list(cols = structure(list(CountyName = structure(list(), class = c("collector_character",
"collector")), Code = structure(list(), class = c("collector_integer",
"collector")), Abbreviation = structure(list(), class = c("collector_character",
"collector")), region = structure(list(), class = c("collector_integer",
"collector"))), .Names = c("CountyName", "Code", "Abbreviation",
"region")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
# Get form info -----------------------------------------------------------
url <- "https://www.dmr.nd.gov/oilgas/bakkenwells.asp"
form <- url %>%
read_html() %>%
html_nodes("select") %>%
html_nodes("option")
form_values <- form %>%
html_attr("value") %>%
.[-1]
form_text <- form %>%
html_text() %>%
.[-1] %>%
gsub("\\s|/", "_", x = .)
# Create and read addresses -----------------------------------------------
queries <- paste0("menu1=", form_values)
addr <- lapply(queries, function(x) modify_url(url = url, query = x))
html <- addr %>% lapply(read_html)
# Get tables --------------------------------------------------------------
summary_target <- "Bakken Wells data table"
table_target <- html %>% lapply(function(x) {
x %>%
html_nodes("table") %>%
html_attr("summary") %>%
`==`(summary_target) %>%
which()
})
header <- html[[1]] %>%
html_nodes("table") %>%
`[[`(table_target[[1]]) %>%
html_nodes("thead") %>%
html_nodes("th") %>%
html_text() %>%
gsub("\\s|/", "_", x = .)
prods <- html %>% seq() %>% lapply(function(x) {
y <- html[[x]]
y <- html_nodes(y, "table")
y <- y[[table_target[[x]]]]
y <- html_table(y)
names(y) <- header
y <- mutate(y,
Completion_Date = as.Date(Completion_Date, format = "%m/%d/%Y"),
Last_Prod_Rpt_Date = as.Date(Last_Prod_Rpt_Date, format = "%m/%d/%Y"))
as_tibble(y)
})
names(prods) <- form_text
prod <- prods %>%
ldply() %>%
as_tibble()
# API parsing function.
# Copied and pasted from
# my misc pkg (seasmith)
parse_API <- function(x) {
old_scipen <- getOption("scipen")
options(scipen = 100)
len <- stringi::stri_length(x)
if (!(len == 10 | len == 12 | len == 14)) {
stop("Must have a valid 10 or 14 digit API number.")
}
state <- stringi::stri_sub(x, 1L, 2L)
county <- stringi::stri_sub(x, 3L, 5L)
uid <- stringi::stri_sub(x, 6L, 10L)
sidetrack <- stringi::stri_sub(x, 11L, 12L)
event <- stringi::stri_sub(x, 13L, 14L)
options(scipen = old_scipen)
return(list(state = state,
county = county,
uid = uid,
sidetrack = sidetrack,
event = event))
}
prod <- prod$API_No %>%
parse_API %$% county %>%
as.numeric() %>%
add_column(prod, County = .)
prod <- prod %>%
left_join(counties %>% select(CountyName, Code), by = c("County" = "Code")) %>%
mutate(County = stri_trans_totitle(CountyName)) %>%
select(-CountyName)
# Summarize data ----------------------------------------------------------
prod_nd <- prod %>%
summarise(Oil = sum(Cum_Oil))
prod_counties <- prod %>%
group_by(County) %>%
summarise(Oil = sum(Cum_Oil))
prod_counties_operators <- prod %>%
group_by(County, Operator) %>%
summarise(Oil = sum(Cum_Oil)) %>%
rename(from = County) %>%
mutate(to = paste(from, Operator, sep = "_"))
prod_counties <- prod_counties %>%
add_column(from = "North Dakota") %>%
select(from, everything()) %>%
rename(to = County)
prod_nd <- prod_nd %>%
add_column(from = "North Dakota", to = NA) %>%
select(from, to, Oil)
# Create network tables ---------------------------------------------------
## -- Separate top 10 and everyting else
# Grab the 10th largest producer
# and use their production mark
# as a cut point
prod_summ <- prod %>%
group_by(Operator) %>%
summarize(Oil = sum(Cum_Oil))
cut_mark <- prod_summ %>%
arrange(desc(Oil)) %>%
slice(10) %>%
.$Oil
# Tidy up the names
pco <- prod_counties_operators %>%
mutate(to = stri_replace(to, "", regex = ".*_"),
to = stri_trans_totitle(to))
# Take only `Operators` matching the top_10$Operators
pco_top_10 <- pco %>%
filter(Operator %in% top_10$Operator) %>%
rename(to = Operator)
# Take only `Operators` NOT matching the top_10$Operators
pco_all_others <- pco %>%
filter(!(Operator %in% top_10$Operator)) %>%
group_by(from) %>%
summarize(Oil = sum(Oil)) %>%
add_column(to = "All others") %>%
select(from, to, Oil)
# Stack them
pco_summ <- pco_top_10 %>%
bind_rows(pco_all_others)
## -- Now create network tables
nd <- list()
nd$nodes <- prod_nd$from %>%
c(prod_counties$to) %>%
tibble(name = .) %>%
add_row(name = pco_summ$to %>% unique())
nd$links <- prod_counties %>%
bind_rows(pco_summ)
# Create encodings to use instead of names
nd$encoding <- nd$nodes %>%
add_column(encoding_to = seq_len(nrow(nd$nodes)) - 1)
# Join encodings with names in nd$links
nd$links <- nd$links %>%
left_join(nd$encoding, by = c("to" = "name")) %>%
left_join(nd$encoding %>%
rename(encoding_from = encoding_to),
by = c("from" = "name"))
# Add a categorical column to control
# the coloring of the nodes
nd2$nodes <- nd2$nodes %>%
add_column(id = c("state",
rep("county", 13),
rep("operator", 11)))
# Make the graph ----------------------------------------------------------
sankeyNetwork(Links = nd$links,
Nodes = nd$nodes,
Source = "encoding_from",
Target = "encoding_to",
Value = "Oil",
NodeID = "name",
NodeGroup = "id",
units = "barrels",
fontSize = 16,
fontFamily = "Open Sans",
nodeWidth = 20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment