seasmith · August 20, 2017 17:11
diff --git a/nd_sankey.R b/nd_sankey.R
 # Load dependencies -------------------------------------------------------

 library(httr)
 library(rvest)
 library(plyr)
 library(tibble)
 library(dplyr)
 library(stringi)
 library(igraph)
 library(ggraph)
 library(networkD3)

 # County codes and abbreviations needed for joins
 counties <- structure(list(CountyName = c("adams", "barnes", "benson", "billings", 
 "bottineau", "bowman", "burke", "burleigh", "cass", "cavalier", 
 "dickey", "divide", "dunn", "eddy", "emmons", "foster", "golden valley", 
 "grand forks", "grant", "griggs", "hettinger", "kidder", "lamoure", 
 "logan", "mchenry", "mcintosh", "mckenzie", "mclean", "mercer", 
 "morton", "mountrail", "nelson", "oliver", "pembina", "pierce", 
 "ramsey", "ransom", "renville", "richland", "rolette", "sargent", 
 "sheridan", "sioux", "slope", "stark", "steele", "stutsman", 
 "towner", "traill", "walsh", "ward", "wells", "williams"), Code = c(1L, 
 3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, 
 29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L, 51L, 53L, 
 55L, 57L, 59L, 61L, 63L, 65L, 67L, 69L, 71L, 73L, 75L, 77L, 79L, 
 81L, 83L, 85L, 87L, 89L, 91L, 93L, 95L, 97L, 99L, 101L, 103L, 
 105L), Abbreviation = c("ADA", "BAR", "BEN", "BIL", "BOT", "BOW", 
 "BUR", "BUR", "CAS", "CAV", "DIC", "DIV", "DUN", "EDD", "EMM", 
 "FOS", "GOL", "GRA", "GRA", "GRI", "HET", "KID", "LAM", "LOG", 
 "MCH", "MCI", "MCK", "MCL", "MER", "MOR", "MOU", "NEL", "OLI", 
 "PEM", "PIE", "RAM", "RAN", "REN", "RIC", "ROL", "SAR", "SHE", 
 "SIO", "SLO", "STA", "STE", "STU", "TOW", "TRA", "WAL", "WAR", 
 "WEL", "WIL"), region = c(38001L, 38003L, 38005L, 38007L, 38009L, 
 38011L, 38013L, 38015L, 38017L, 38019L, 38021L, 38023L, 38025L, 
 38027L, 38029L, 38031L, 38033L, 38035L, 38037L, 38039L, 38041L, 
 38043L, 38045L, 38047L, 38049L, 38051L, 38053L, 38055L, 38057L, 
 38059L, 38061L, 38063L, 38065L, 38067L, 38069L, 38071L, 38073L, 
 38075L, 38077L, 38079L, 38081L, 38083L, 38085L, 38087L, 38089L, 
 38091L, 38093L, 38095L, 38097L, 38099L, 38101L, 38103L, 38105L
 )), .Names = c("CountyName", "Code", "Abbreviation", "region"
 ), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
 -53L), spec = structure(list(cols = structure(list(CountyName = structure(list(), class = c("collector_character", 
 "collector")), Code = structure(list(), class = c("collector_integer", 
 "collector")), Abbreviation = structure(list(), class = c("collector_character", 
 "collector")), region = structure(list(), class = c("collector_integer", 
 "collector"))), .Names = c("CountyName", "Code", "Abbreviation", 
 "region")), default = structure(list(), class = c("collector_guess", 
 "collector"))), .Names = c("cols", "default"), class = "col_spec"))


 # Get form info -----------------------------------------------------------

 url <- "https://www.dmr.nd.gov/oilgas/bakkenwells.asp"

 form <- url %>%
  read_html() %>%
  html_nodes("select") %>%
  html_nodes("option")

 form_values <- form %>%
  html_attr("value") %>%
  .[-1]

 form_text <- form %>%
  html_text() %>%
  .[-1] %>%
  gsub("\\s|/", "_", x = .)


 # Create and read addresses -----------------------------------------------

 queries <- paste0("menu1=", form_values)
 addr    <- lapply(queries, function(x) modify_url(url = url, query = x))
 html    <- addr %>% lapply(read_html)


 # Get tables --------------------------------------------------------------

 summary_target <- "Bakken Wells data table"

 table_target <- html %>% lapply(function(x) {
  x %>%
    html_nodes("table") %>%
    html_attr("summary") %>%
    `==`(summary_target) %>%
    which()
 })

 header <- html[[1]] %>%
  html_nodes("table") %>%
  `[[`(table_target[[1]]) %>%
  html_nodes("thead") %>%
  html_nodes("th") %>%
  html_text() %>%
  gsub("\\s|/", "_", x = .)

 prods <- html %>% seq() %>% lapply(function(x) {
  y <- html[[x]]
  y <- html_nodes(y, "table")
  y <- y[[table_target[[x]]]]
  y <- html_table(y)
  names(y) <- header
  y <- mutate(y,
              Completion_Date = as.Date(Completion_Date, format = "%m/%d/%Y"),
              Last_Prod_Rpt_Date = as.Date(Last_Prod_Rpt_Date, format = "%m/%d/%Y"))
  as_tibble(y)
  })

 names(prods) <- form_text

 prod <- prods %>%
  ldply() %>%
  as_tibble()

  # API parsing function.
  # Copied and pasted from
  # my misc pkg (seasmith)
  parse_API <- function(x) {

    old_scipen <- getOption("scipen")
    options(scipen = 100)

    len <- stringi::stri_length(x)
    if (!(len == 10 | len == 12 | len == 14)) {
      stop("Must have a valid 10 or 14 digit API number.")
    }

    state     <- stringi::stri_sub(x, 1L, 2L)
    county    <- stringi::stri_sub(x, 3L, 5L)
    uid       <- stringi::stri_sub(x, 6L, 10L)
    sidetrack <- stringi::stri_sub(x, 11L, 12L)
    event     <- stringi::stri_sub(x, 13L, 14L)

    options(scipen = old_scipen)

    return(list(state     = state,
                county    = county,
                uid       = uid,
                sidetrack = sidetrack,
                event     = event))

  }
 		  
 prod <- prod$API_No %>%
  parse_API %$% county %>%
  as.numeric() %>%
  add_column(prod, County = .)

 prod <- prod %>%
  left_join(counties %>% select(CountyName, Code), by = c("County" = "Code")) %>%
  mutate(County = stri_trans_totitle(CountyName)) %>%
  select(-CountyName)
  

 # Summarize data ----------------------------------------------------------

 prod_nd <- prod %>%
  summarise(Oil = sum(Cum_Oil))

 prod_counties <- prod %>%
  group_by(County) %>%
  summarise(Oil = sum(Cum_Oil))

 prod_counties_operators <- prod %>%
  group_by(County, Operator) %>%
  summarise(Oil = sum(Cum_Oil)) %>%
  rename(from = County) %>%
  mutate(to = paste(from, Operator, sep = "_"))

 prod_counties <- prod_counties %>%
  add_column(from = "North Dakota") %>%
  select(from, everything()) %>%
  rename(to = County)

 prod_nd <- prod_nd %>%
  add_column(from = "North Dakota", to = NA) %>%
  select(from, to, Oil)
  

 # Create network tables ---------------------------------------------------

 ## -- Separate top 10 and everyting else
 # Grab the 10th largest producer
 # and use their production mark
 # as a cut point
 prod_summ <- prod %>%
  group_by(Operator) %>%
  summarize(Oil = sum(Cum_Oil))

 cut_mark <- prod_summ %>%
  arrange(desc(Oil)) %>%
  slice(10) %>%
  .$Oil

 # Tidy up the names
 pco <- prod_counties_operators %>%
  mutate(to = stri_replace(to, "", regex = ".*_"),
         to = stri_trans_totitle(to))

 # Take only `Operators` matching the top_10$Operators
 pco_top_10 <- pco %>%
  filter(Operator %in% top_10$Operator) %>%
  rename(to = Operator)

 # Take only `Operators` NOT matching the top_10$Operators
 pco_all_others <- pco %>%
  filter(!(Operator %in% top_10$Operator)) %>%
  group_by(from) %>%
  summarize(Oil = sum(Oil)) %>%
  add_column(to = "All others") %>%
  select(from, to, Oil)

 # Stack them
 pco_summ <- pco_top_10 %>%
  bind_rows(pco_all_others)

 ## -- Now create network tables
 nd <- list()

 nd$nodes <- prod_nd$from %>%
    c(prod_counties$to) %>%
    tibble(name = .) %>%
 	add_row(name = pco_summ$to %>% unique())

 nd$links <- prod_counties %>%
  bind_rows(pco_summ)

 # Create encodings to use instead of names
 nd$encoding <- nd$nodes %>%
  add_column(encoding_to = seq_len(nrow(nd$nodes)) - 1)

 # Join encodings with names in nd$links
 nd$links <- nd$links %>%
  left_join(nd$encoding, by = c("to" = "name")) %>%
  left_join(nd$encoding %>%
              rename(encoding_from = encoding_to),
 			by = c("from" = "name"))

 # Add a categorical column to control
 # the coloring of the nodes
 nd2$nodes <- nd2$nodes %>%
  add_column(id = c("state",
                    rep("county", 13),
                    rep("operator", 11)))


 # Make the graph ----------------------------------------------------------

             
 sankeyNetwork(Links = nd$links,
              Nodes = nd$nodes,
              Source = "encoding_from",
              Target = "encoding_to",
              Value = "Oil",
              NodeID = "name",
 	      NodeGroup = "id",
              units = "barrels",
              fontSize = 16,
 	      fontFamily = "Open Sans",
              nodeWidth = 20)
	# Load dependencies -------------------------------------------------------

	library(httr)
	library(rvest)
	library(plyr)
	library(tibble)
	library(dplyr)
	library(stringi)
	library(igraph)
	library(ggraph)
	library(networkD3)

	# County codes and abbreviations needed for joins
	counties <- structure(list(CountyName = c("adams", "barnes", "benson", "billings",
	"bottineau", "bowman", "burke", "burleigh", "cass", "cavalier",
	"dickey", "divide", "dunn", "eddy", "emmons", "foster", "golden valley",
	"grand forks", "grant", "griggs", "hettinger", "kidder", "lamoure",
	"logan", "mchenry", "mcintosh", "mckenzie", "mclean", "mercer",
	"morton", "mountrail", "nelson", "oliver", "pembina", "pierce",
	"ramsey", "ransom", "renville", "richland", "rolette", "sargent",
	"sheridan", "sioux", "slope", "stark", "steele", "stutsman",
	"towner", "traill", "walsh", "ward", "wells", "williams"), Code = c(1L,
	3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L,
	29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L, 51L, 53L,
	55L, 57L, 59L, 61L, 63L, 65L, 67L, 69L, 71L, 73L, 75L, 77L, 79L,
	81L, 83L, 85L, 87L, 89L, 91L, 93L, 95L, 97L, 99L, 101L, 103L,
	105L), Abbreviation = c("ADA", "BAR", "BEN", "BIL", "BOT", "BOW",
	"BUR", "BUR", "CAS", "CAV", "DIC", "DIV", "DUN", "EDD", "EMM",
	"FOS", "GOL", "GRA", "GRA", "GRI", "HET", "KID", "LAM", "LOG",
	"MCH", "MCI", "MCK", "MCL", "MER", "MOR", "MOU", "NEL", "OLI",
	"PEM", "PIE", "RAM", "RAN", "REN", "RIC", "ROL", "SAR", "SHE",
	"SIO", "SLO", "STA", "STE", "STU", "TOW", "TRA", "WAL", "WAR",
	"WEL", "WIL"), region = c(38001L, 38003L, 38005L, 38007L, 38009L,
	38011L, 38013L, 38015L, 38017L, 38019L, 38021L, 38023L, 38025L,
	38027L, 38029L, 38031L, 38033L, 38035L, 38037L, 38039L, 38041L,
	38043L, 38045L, 38047L, 38049L, 38051L, 38053L, 38055L, 38057L,
	38059L, 38061L, 38063L, 38065L, 38067L, 38069L, 38071L, 38073L,
	38075L, 38077L, 38079L, 38081L, 38083L, 38085L, 38087L, 38089L,
	38091L, 38093L, 38095L, 38097L, 38099L, 38101L, 38103L, 38105L
	)), .Names = c("CountyName", "Code", "Abbreviation", "region"
	), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
	-53L), spec = structure(list(cols = structure(list(CountyName = structure(list(), class = c("collector_character",
	"collector")), Code = structure(list(), class = c("collector_integer",
	"collector")), Abbreviation = structure(list(), class = c("collector_character",
	"collector")), region = structure(list(), class = c("collector_integer",
	"collector"))), .Names = c("CountyName", "Code", "Abbreviation",
	"region")), default = structure(list(), class = c("collector_guess",
	"collector"))), .Names = c("cols", "default"), class = "col_spec"))


	# Get form info -----------------------------------------------------------

	url <- "https://www.dmr.nd.gov/oilgas/bakkenwells.asp"

	form <- url %>%
	read_html() %>%
	html_nodes("select") %>%
	html_nodes("option")

	form_values <- form %>%
	html_attr("value") %>%
	.[-1]

	form_text <- form %>%
	html_text() %>%
	.[-1] %>%
	gsub("\\s\|/", "_", x = .)


	# Create and read addresses -----------------------------------------------

	queries <- paste0("menu1=", form_values)
	addr <- lapply(queries, function(x) modify_url(url = url, query = x))
	html <- addr %>% lapply(read_html)


	# Get tables --------------------------------------------------------------

	summary_target <- "Bakken Wells data table"

	table_target <- html %>% lapply(function(x) {
	x %>%
	html_nodes("table") %>%
	html_attr("summary") %>%
	`==`(summary_target) %>%
	which()
	})

	header <- html[[1]] %>%
	html_nodes("table") %>%
	`[[`(table_target[[1]]) %>%
	html_nodes("thead") %>%
	html_nodes("th") %>%
	html_text() %>%
	gsub("\\s\|/", "_", x = .)

	prods <- html %>% seq() %>% lapply(function(x) {
	y <- html[[x]]
	y <- html_nodes(y, "table")
	y <- y[[table_target[[x]]]]
	y <- html_table(y)
	names(y) <- header
	y <- mutate(y,
	Completion_Date = as.Date(Completion_Date, format = "%m/%d/%Y"),
	Last_Prod_Rpt_Date = as.Date(Last_Prod_Rpt_Date, format = "%m/%d/%Y"))
	as_tibble(y)
	})

	names(prods) <- form_text

	prod <- prods %>%
	ldply() %>%
	as_tibble()

	# API parsing function.
	# Copied and pasted from
	# my misc pkg (seasmith)
	parse_API <- function(x) {

	old_scipen <- getOption("scipen")
	options(scipen = 100)

	len <- stringi::stri_length(x)
	if (!(len == 10 \| len == 12 \| len == 14)) {
	stop("Must have a valid 10 or 14 digit API number.")
	}

	state <- stringi::stri_sub(x, 1L, 2L)
	county <- stringi::stri_sub(x, 3L, 5L)
	uid <- stringi::stri_sub(x, 6L, 10L)
	sidetrack <- stringi::stri_sub(x, 11L, 12L)
	event <- stringi::stri_sub(x, 13L, 14L)

	options(scipen = old_scipen)

	return(list(state = state,
	county = county,
	uid = uid,
	sidetrack = sidetrack,
	event = event))

	}

	prod <- prod$API_No %>%
	parse_API %$% county %>%
	as.numeric() %>%
	add_column(prod, County = .)

	prod <- prod %>%
	left_join(counties %>% select(CountyName, Code), by = c("County" = "Code")) %>%
	mutate(County = stri_trans_totitle(CountyName)) %>%
	select(-CountyName)


	# Summarize data ----------------------------------------------------------

	prod_nd <- prod %>%
	summarise(Oil = sum(Cum_Oil))

	prod_counties <- prod %>%
	group_by(County) %>%
	summarise(Oil = sum(Cum_Oil))

	prod_counties_operators <- prod %>%
	group_by(County, Operator) %>%
	summarise(Oil = sum(Cum_Oil)) %>%
	rename(from = County) %>%
	mutate(to = paste(from, Operator, sep = "_"))

	prod_counties <- prod_counties %>%
	add_column(from = "North Dakota") %>%
	select(from, everything()) %>%
	rename(to = County)

	prod_nd <- prod_nd %>%
	add_column(from = "North Dakota", to = NA) %>%
	select(from, to, Oil)


	# Create network tables ---------------------------------------------------

	## -- Separate top 10 and everyting else
	# Grab the 10th largest producer
	# and use their production mark
	# as a cut point
	prod_summ <- prod %>%
	group_by(Operator) %>%
	summarize(Oil = sum(Cum_Oil))

	cut_mark <- prod_summ %>%
	arrange(desc(Oil)) %>%
	slice(10) %>%
	.$Oil

	# Tidy up the names
	pco <- prod_counties_operators %>%
	mutate(to = stri_replace(to, "", regex = ".*_"),
	to = stri_trans_totitle(to))

	# Take only `Operators` matching the top_10$Operators
	pco_top_10 <- pco %>%
	filter(Operator %in% top_10$Operator) %>%
	rename(to = Operator)

	# Take only `Operators` NOT matching the top_10$Operators
	pco_all_others <- pco %>%
	filter(!(Operator %in% top_10$Operator)) %>%
	group_by(from) %>%
	summarize(Oil = sum(Oil)) %>%
	add_column(to = "All others") %>%
	select(from, to, Oil)

	# Stack them
	pco_summ <- pco_top_10 %>%
	bind_rows(pco_all_others)

	## -- Now create network tables
	nd <- list()

	nd$nodes <- prod_nd$from %>%
	c(prod_counties$to) %>%
	tibble(name = .) %>%
	add_row(name = pco_summ$to %>% unique())

	nd$links <- prod_counties %>%
	bind_rows(pco_summ)

	# Create encodings to use instead of names
	nd$encoding <- nd$nodes %>%
	add_column(encoding_to = seq_len(nrow(nd$nodes)) - 1)

	# Join encodings with names in nd$links
	nd$links <- nd$links %>%
	left_join(nd$encoding, by = c("to" = "name")) %>%
	left_join(nd$encoding %>%
	rename(encoding_from = encoding_to),
	by = c("from" = "name"))

	# Add a categorical column to control
	# the coloring of the nodes
	nd2$nodes <- nd2$nodes %>%
	add_column(id = c("state",
	rep("county", 13),
	rep("operator", 11)))


	# Make the graph ----------------------------------------------------------


	sankeyNetwork(Links = nd$links,
	Nodes = nd$nodes,
	Source = "encoding_from",
	Target = "encoding_to",
	Value = "Oil",
	NodeID = "name",
	NodeGroup = "id",
	units = "barrels",
	fontSize = 16,
	fontFamily = "Open Sans",
	nodeWidth = 20)