Last active
August 20, 2017 17:11
-
-
Save seasmith/3c7c12ba1f8cb79e2a821c75e2cc41e9 to your computer and use it in GitHub Desktop.
Downloading and visualizing North Dakota horizontal-well oil production with the networkD3 package in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load dependencies ------------------------------------------------------- | |
library(httr) | |
library(rvest) | |
library(plyr) | |
library(tibble) | |
library(dplyr) | |
library(stringi) | |
library(igraph) | |
library(ggraph) | |
library(networkD3) | |
# County codes and abbreviations needed for joins | |
counties <- structure(list(CountyName = c("adams", "barnes", "benson", "billings", | |
"bottineau", "bowman", "burke", "burleigh", "cass", "cavalier", | |
"dickey", "divide", "dunn", "eddy", "emmons", "foster", "golden valley", | |
"grand forks", "grant", "griggs", "hettinger", "kidder", "lamoure", | |
"logan", "mchenry", "mcintosh", "mckenzie", "mclean", "mercer", | |
"morton", "mountrail", "nelson", "oliver", "pembina", "pierce", | |
"ramsey", "ransom", "renville", "richland", "rolette", "sargent", | |
"sheridan", "sioux", "slope", "stark", "steele", "stutsman", | |
"towner", "traill", "walsh", "ward", "wells", "williams"), Code = c(1L, | |
3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, | |
29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L, 51L, 53L, | |
55L, 57L, 59L, 61L, 63L, 65L, 67L, 69L, 71L, 73L, 75L, 77L, 79L, | |
81L, 83L, 85L, 87L, 89L, 91L, 93L, 95L, 97L, 99L, 101L, 103L, | |
105L), Abbreviation = c("ADA", "BAR", "BEN", "BIL", "BOT", "BOW", | |
"BUR", "BUR", "CAS", "CAV", "DIC", "DIV", "DUN", "EDD", "EMM", | |
"FOS", "GOL", "GRA", "GRA", "GRI", "HET", "KID", "LAM", "LOG", | |
"MCH", "MCI", "MCK", "MCL", "MER", "MOR", "MOU", "NEL", "OLI", | |
"PEM", "PIE", "RAM", "RAN", "REN", "RIC", "ROL", "SAR", "SHE", | |
"SIO", "SLO", "STA", "STE", "STU", "TOW", "TRA", "WAL", "WAR", | |
"WEL", "WIL"), region = c(38001L, 38003L, 38005L, 38007L, 38009L, | |
38011L, 38013L, 38015L, 38017L, 38019L, 38021L, 38023L, 38025L, | |
38027L, 38029L, 38031L, 38033L, 38035L, 38037L, 38039L, 38041L, | |
38043L, 38045L, 38047L, 38049L, 38051L, 38053L, 38055L, 38057L, | |
38059L, 38061L, 38063L, 38065L, 38067L, 38069L, 38071L, 38073L, | |
38075L, 38077L, 38079L, 38081L, 38083L, 38085L, 38087L, 38089L, | |
38091L, 38093L, 38095L, 38097L, 38099L, 38101L, 38103L, 38105L | |
)), .Names = c("CountyName", "Code", "Abbreviation", "region" | |
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, | |
-53L), spec = structure(list(cols = structure(list(CountyName = structure(list(), class = c("collector_character", | |
"collector")), Code = structure(list(), class = c("collector_integer", | |
"collector")), Abbreviation = structure(list(), class = c("collector_character", | |
"collector")), region = structure(list(), class = c("collector_integer", | |
"collector"))), .Names = c("CountyName", "Code", "Abbreviation", | |
"region")), default = structure(list(), class = c("collector_guess", | |
"collector"))), .Names = c("cols", "default"), class = "col_spec")) | |
# Get form info ----------------------------------------------------------- | |
url <- "https://www.dmr.nd.gov/oilgas/bakkenwells.asp" | |
form <- url %>% | |
read_html() %>% | |
html_nodes("select") %>% | |
html_nodes("option") | |
form_values <- form %>% | |
html_attr("value") %>% | |
.[-1] | |
form_text <- form %>% | |
html_text() %>% | |
.[-1] %>% | |
gsub("\\s|/", "_", x = .) | |
# Create and read addresses ----------------------------------------------- | |
queries <- paste0("menu1=", form_values) | |
addr <- lapply(queries, function(x) modify_url(url = url, query = x)) | |
html <- addr %>% lapply(read_html) | |
# Get tables -------------------------------------------------------------- | |
summary_target <- "Bakken Wells data table" | |
table_target <- html %>% lapply(function(x) { | |
x %>% | |
html_nodes("table") %>% | |
html_attr("summary") %>% | |
`==`(summary_target) %>% | |
which() | |
}) | |
header <- html[[1]] %>% | |
html_nodes("table") %>% | |
`[[`(table_target[[1]]) %>% | |
html_nodes("thead") %>% | |
html_nodes("th") %>% | |
html_text() %>% | |
gsub("\\s|/", "_", x = .) | |
prods <- html %>% seq() %>% lapply(function(x) { | |
y <- html[[x]] | |
y <- html_nodes(y, "table") | |
y <- y[[table_target[[x]]]] | |
y <- html_table(y) | |
names(y) <- header | |
y <- mutate(y, | |
Completion_Date = as.Date(Completion_Date, format = "%m/%d/%Y"), | |
Last_Prod_Rpt_Date = as.Date(Last_Prod_Rpt_Date, format = "%m/%d/%Y")) | |
as_tibble(y) | |
}) | |
names(prods) <- form_text | |
prod <- prods %>% | |
ldply() %>% | |
as_tibble() | |
# API parsing function. | |
# Copied and pasted from | |
# my misc pkg (seasmith) | |
parse_API <- function(x) { | |
old_scipen <- getOption("scipen") | |
options(scipen = 100) | |
len <- stringi::stri_length(x) | |
if (!(len == 10 | len == 12 | len == 14)) { | |
stop("Must have a valid 10 or 14 digit API number.") | |
} | |
state <- stringi::stri_sub(x, 1L, 2L) | |
county <- stringi::stri_sub(x, 3L, 5L) | |
uid <- stringi::stri_sub(x, 6L, 10L) | |
sidetrack <- stringi::stri_sub(x, 11L, 12L) | |
event <- stringi::stri_sub(x, 13L, 14L) | |
options(scipen = old_scipen) | |
return(list(state = state, | |
county = county, | |
uid = uid, | |
sidetrack = sidetrack, | |
event = event)) | |
} | |
prod <- prod$API_No %>% | |
parse_API %$% county %>% | |
as.numeric() %>% | |
add_column(prod, County = .) | |
prod <- prod %>% | |
left_join(counties %>% select(CountyName, Code), by = c("County" = "Code")) %>% | |
mutate(County = stri_trans_totitle(CountyName)) %>% | |
select(-CountyName) | |
# Summarize data ---------------------------------------------------------- | |
prod_nd <- prod %>% | |
summarise(Oil = sum(Cum_Oil)) | |
prod_counties <- prod %>% | |
group_by(County) %>% | |
summarise(Oil = sum(Cum_Oil)) | |
prod_counties_operators <- prod %>% | |
group_by(County, Operator) %>% | |
summarise(Oil = sum(Cum_Oil)) %>% | |
rename(from = County) %>% | |
mutate(to = paste(from, Operator, sep = "_")) | |
prod_counties <- prod_counties %>% | |
add_column(from = "North Dakota") %>% | |
select(from, everything()) %>% | |
rename(to = County) | |
prod_nd <- prod_nd %>% | |
add_column(from = "North Dakota", to = NA) %>% | |
select(from, to, Oil) | |
# Create network tables --------------------------------------------------- | |
## -- Separate top 10 and everyting else | |
# Grab the 10th largest producer | |
# and use their production mark | |
# as a cut point | |
prod_summ <- prod %>% | |
group_by(Operator) %>% | |
summarize(Oil = sum(Cum_Oil)) | |
cut_mark <- prod_summ %>% | |
arrange(desc(Oil)) %>% | |
slice(10) %>% | |
.$Oil | |
# Tidy up the names | |
pco <- prod_counties_operators %>% | |
mutate(to = stri_replace(to, "", regex = ".*_"), | |
to = stri_trans_totitle(to)) | |
# Take only `Operators` matching the top_10$Operators | |
pco_top_10 <- pco %>% | |
filter(Operator %in% top_10$Operator) %>% | |
rename(to = Operator) | |
# Take only `Operators` NOT matching the top_10$Operators | |
pco_all_others <- pco %>% | |
filter(!(Operator %in% top_10$Operator)) %>% | |
group_by(from) %>% | |
summarize(Oil = sum(Oil)) %>% | |
add_column(to = "All others") %>% | |
select(from, to, Oil) | |
# Stack them | |
pco_summ <- pco_top_10 %>% | |
bind_rows(pco_all_others) | |
## -- Now create network tables | |
nd <- list() | |
nd$nodes <- prod_nd$from %>% | |
c(prod_counties$to) %>% | |
tibble(name = .) %>% | |
add_row(name = pco_summ$to %>% unique()) | |
nd$links <- prod_counties %>% | |
bind_rows(pco_summ) | |
# Create encodings to use instead of names | |
nd$encoding <- nd$nodes %>% | |
add_column(encoding_to = seq_len(nrow(nd$nodes)) - 1) | |
# Join encodings with names in nd$links | |
nd$links <- nd$links %>% | |
left_join(nd$encoding, by = c("to" = "name")) %>% | |
left_join(nd$encoding %>% | |
rename(encoding_from = encoding_to), | |
by = c("from" = "name")) | |
# Add a categorical column to control | |
# the coloring of the nodes | |
nd2$nodes <- nd2$nodes %>% | |
add_column(id = c("state", | |
rep("county", 13), | |
rep("operator", 11))) | |
# Make the graph ---------------------------------------------------------- | |
sankeyNetwork(Links = nd$links, | |
Nodes = nd$nodes, | |
Source = "encoding_from", | |
Target = "encoding_to", | |
Value = "Oil", | |
NodeID = "name", | |
NodeGroup = "id", | |
units = "barrels", | |
fontSize = 16, | |
fontFamily = "Open Sans", | |
nodeWidth = 20) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment