tilltnet · June 29, 2020 16:46
diff --git a/edge_gravity.R b/edge_gravity.R
 find_possible_paths <- function(g, k = NULL, ksp = c("baerde", "yenpathy")) {
  
  undirected <- FALSE
  
  if (!igraph::is.directed(g)) {
    undirected <- TRUE
    weight <- igraph::E(g)$weight
    e_l <- as.data.frame(igraph::as_edgelist(g))
    l_e <- as.data.frame(cbind(e_l[, 2], e_l[, 1]))
    e_l_l_e <- rbind(e_l, l_e)
    e_l_l_e$weight <- rep(weight, 2)
    g <- igraph::graph_from_data_frame(e_l_l_e)
  }
  
  if (is.null(igraph::V(g)$name)) {
    name_comb1 <- combn(igraph::V(g), 2)
  } else {
    name_comb1 <- combn(igraph::V(g)$name, 2)
  }
  
  name_comb2 <- rbind(name_comb1[2, ],
                      name_comb1[1, ])
  name_combs <- t(cbind(name_comb1, name_comb2))
  
  pair_names <- paste(name_combs[, 1],
                      name_combs[, 2])
  
  all_possible_paths <-
    if (!is.null(k) & ksp[1] == "yenpathy") {
      furrr::future_map2_dfr(name_combs[, 1],
                         name_combs[, 2],
                         ~ path_list_to_df(yenpathy::k_shortest_paths(g, .x, .y, k = k), .x, .y))
    } else if (!is.null(k) & ksp[1] == "baerde") {
      if(is.numeric(name_combs)) {
        g <- give_graph_names(g)
        name_combs <-
          cbind(paste0("v", name_combs[, 1]), 
                paste0("v", name_combs[, 2]))
      }
      furrr::future_map2_dfr(name_combs[, 1],
                         name_combs[, 2],
                         ~ path_list_to_df(k_shortest_yen(g, .x, .y, k = k), .x, .y))
    } else {
      furrr::future_map2_dfr(name_combs[, 1],
                         name_combs[, 2],
                         ~ path_list_to_df(purrr::map(igraph::all_simple_paths(g, .x, .y), as.character), .x, .y))
    }
  
  #names(all_possible_paths) <- pair_names
  
  if (!is.null(k)) {
    if (k > max(purrr::map_dbl(all_possible_paths, length)))
      message(
        "Maximum number of paths found per node pair is smaller than k, all possible paths found."
      )
    else
      message(
        "Maximum number of paths found per node pair equals k, there might be more possible paths. Try a higher number for k."
      )
  }
  
  # res <- furrr::future_map_dfr(all_possible_paths,
  #                              ~ purrr::map_dfr(.,
  #                                               function(x)
  #                                                 data.frame(from = x[-length(x)],
  #                                                            to = x[-1]), .id = "path"),
  #                              .id = "node pair")
  
  if (undirected) {
    all_possible_paths <- dplyr::mutate(all_possible_paths,
                         from_ = ifelse(from < to, from, to),
                         to_ = ifelse(from > to, from, to))
    dplyr::select(all_possible_paths, `node pair` = node.pair, path, from = from_, to = to_)
    
  } else {
    all_possible_paths
  }
  
  
 }

 count_edge_gravity <- function(g, k = NULL, ksp = c("baerde", "yenpathy")) {
    dplyr::count(find_possible_paths(g, k, ksp),
                 from, to, sort = TRUE, name = "edge gravity")
 }

 find_all_paths_node_pair <- 
  function(g, n1, n2, k) {
    a <- yenpathy::k_shortest_paths(g, n1, n2, k = k)
    path_list_to_df(a, n1, n2)
  }

 path_list_to_df <- function(l, n1, n2) {
  map_dfr(l, function(p)
    data.frame(
      `node pair` = paste(n1, n2),
      from = p[-length(p)],
      to = p[-1]
    ), .id = "path")
 }

 feed_chunks <- function(g, chunk, k) {
    map2_dfr(chunk$V1, 
             chunk$V2,
             function(x, y) find_all_paths_node_pair(g, x, y, k))
 }

 for_feed_chunks <- function(chunk, k) {
  res_tbl <- tibble()
  for(i in 1:nrow(chunk)) res_tbl <- bind_rows(find_all_paths_node_pair(g, chunk$V1[i], chunk$V2[i], k), res_tbl)
  res_tbl
  }

diff --git a/edge_gravity_examples.R b/edge_gravity_examples.R
 # libraries needed: 
 # dplyr, 
 # purrr, 
 # furrr, 
 # future (will be installed with furrr as a dependency) 
 # igraph, 
 # yenpathy (optional for limiting k)
 # ggplot2

 # In order to avoid running out of memory due to the memory leaks that are 
 # currently present in the c++ library used in yenpathy, use the
 # yenpathy fork in the install command below.

 # Using yenpathy now needs to be specified with the 'ksp' argument, see
 # example 4. yenpathy is much faster than the other solutions.

 # un-comment next two lines to install packages
 # install.packages("tidyverse", "furrr", "igraph")
 # remotes::install_github("tilltnet/yenpathy", build_vignettes = FALSE)
 # build_vignettes = TRUE fails when igraphdata not installed

 library(dplyr)
 library(ggplot2)
 library(igraph)
 library(tidyr)
 source("edge_gravity.R")

 # Example 1 --------------------------------------------------------------------
 # Graph from yenpathy documentation.
 my_graph <- data.frame(
  source = c(1, 4, 5, 1, 1, 8, 1, 2, 7, 3),
  sink = c(4, 5, 6, 6, 8, 6, 2, 7, 3, 6),
  weight = c(1, 1, 1.5, 5, 1.5, 2.5, 1.5, 0.5, 0.5, 0.5)
 )

 g1 <- graph_from_data_frame(my_graph)
 plot(g1)

 find_possible_paths(g1, 2)
 find_possible_paths(g1, 4)
 count_edge_gravity(g1)
 count_edge_gravity(g1, 4)

 # Example 2 --------------------------------------------------------------------
 # From Figure 1 in Helander, M. E., & McAllister, S. (2018). The gravity
 # of an edge. Applied Network Science, 3(1). 
 # https://doi.org/10.1007/s41109-018-0063-6

 # This is an undirected graph, that means that before the k-shortest paths
 # can be found the network needs to be transformed so that each edge is duplicated
 # as a directed connecting the adjacent nodes in both directions.
 # The find_possible_paths() function takes care of that.

 el2 <- tribble(
  ~from, ~to,
  "1", "2",
  "1", "3",
  "2", "3",
  "2", "4",
  "3", "4",
  "3", "5")

 g2 <- graph_from_data_frame(el2, directed = FALSE)
 plot(g2)

 # This reproduces data in Table 2 and Fig. 2 in Helander & McAllister 2018.

 # 58 paths
 find_possible_paths(g2, 5) %>% 
  count(`node pair`, path) %>% 
  nrow()

 # Instead of rank this uses the actual values of EB and EG.
 g2_edge_gravity <- count_edge_gravity(g2, 5)
 g2_edge_gravity <- count_edge_gravity(g2)
 el2$`edge betweeness` <- edge_betweenness(g2)
 full_join(el2, g2_edge_gravity) %>% 
  pivot_longer(c(`edge gravity`, `edge betweeness`)) %>% 
  ggplot(aes(x = reorder(paste(from, to), value), y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  xlab(label = "edge")

 # Example 3 --------------------------------------------------------------------
 # This reproduces data in Fig. 14 Helander & McAllister 2018
 # Numbering of vertices is not the same though!
 library(netrankr)
 data("florentine_m")

 plot(florentine_m)
 find_possible_paths(florentine_m, ksp = "yenpathy") %>% 
  count(`node pair`, path, sort = TRUE) %>% 
  nrow()

 count_edge_gravity(g = florentine_m)
 count_edge_gravity(florentine_m, 34)
 count_edge_gravity(florentine_m, 34, ksp = "yenpathy")


 k_shortest_paths(graph_df = florentine_m, from = "Acciaiuol", to = "Ridolfi", k = 50)
 igraph::all_shortest_paths(florentine_m, "Acciaiuol")

 # Example 4 ---------------------------------------------------------------
 # "Bigger" network data, using parallel computing for finding paths!

 library(igraphdata)
 #data(package="igraphdata")
 data(UKfaculty)

 # Vertice/Edge Count
 length(V(UKfaculty))
 length(E(UKfaculty))


 library(future)

 # This is a memory saving, but rather slow solution.
 plan(multisession, gc = TRUE)
 start_time <- Sys.time()
 res4a <- count_edge_gravity(g = UKfaculty, k = 100)
 delta_t <- Sys.time() - start_time
 delta_t
 plan(sequential)


 # With k = 500 this takes (now!) 3 minutes on a Ryzen 9 3950X with 16 Cores and 32 Threads.
 # Memory usage seems fine when using yenpatyh fork from:
 plan(multisession, gc = TRUE)
 start_time <- Sys.time()
 res4b <- count_edge_gravity(g = UKfaculty, k = 500, ksp = "yenpathy")
 delta_t <- Sys.time() - start_time
 delta_t
 plan(sequential)

diff --git a/shortest_paths_renzhentaxi_baer.R b/shortest_paths_renzhentaxi_baer.R
 # This code is taken from https://stackoverflow.com/a/51509886
 # Changes:
 # - fixed bug, that would break code, when the actual k was smaller than the specified k - 1
 # - Not deleting rootPath nodes (except for spurNode) caused this to create paths with loops
 # - when vertices are not named but rather numerically numbered paths are now returned correctly

 library(igraph)
 library(tidyverse)

 #'@return the shortest path as a list of vertices or NULL if there is no path between src and dest
 shortest_path <- function(graph, src, dest){
  path <- suppressWarnings(get.shortest.paths(graph, src, dest))
  if(is.numeric(src))
    path <- as.numeric(path$vpath[[1]])
  else
    path <- names(path$vpath[[1]])
  if (length(path)==1) NULL else path
 } 

 #'@return the sum of the weights of all the edges in the given path
 path_weight <- function(path, graph) sum(E(graph, path=path)$weight)

 #'@description sorts a list of paths based on the weight of the path
 sort_paths <- function(graph, paths) paths[sapply(paths, path_weight, graph) %>% order]

 #'@description creates a list of edges that should be deleted
 find_edges_to_delete <- function(A, i, rootPath){
  edgesToDelete <- NULL
  for (p in A){
    rootPath_p <- p[1:i]
    if (all(rootPath_p == rootPath)){
      edge <- paste(p[i], ifelse(is.na(p[i+1]),p[i],p[i+1]), sep = '|')
      edgesToDelete[length(edgesToDelete)+1] <- edge
    }
  }
  unique(edgesToDelete)
 }

 give_graph_names <- function(graph) {
  V(graph)$name <- paste0("v", 1:length(V(graph)))
  graph
 }

 # returns the k shortest path from src to dest
 # sometimes it will return less than k shortest paths. This occurs when the max possible number of paths are less than k
 k_shortest_yen <- function(graph, src, dest, k){
  if (src == dest) stop('src and dest can not be the same (currently)')
  
  # accepted paths
  A <- list(shortest_path(graph, src, dest))
  if (k == 1) return (A)
  # potential paths
  B <- list()
  
  for (k_i in 2:k){
    prev_path <- A[[k_i-1]]
    num_nodes_to_loop <- length(prev_path) - 1
    for(i in 1:num_nodes_to_loop){
      spurNode <- prev_path[i]
      rootPath <- prev_path[1:i]
      
      edgesToDelete <- find_edges_to_delete(A, i,rootPath)
      t_g <- delete.edges(graph, edgesToDelete)
      #!#
      t_g <- delete.vertices(t_g, rootPath[rootPath != spurNode])
      
      spurPath <- shortest_path(t_g, spurNode, dest)
      
      #!#
      if (length(spurPath) > 0){
        total_path <- list(c(rootPath[-i], spurPath))
        if (!total_path %in% B) B[length(B)+1] <- total_path
      }
    }
    if (length(B) == 0) break
    B <- sort_paths(graph, B)
    A[k_i] <- B[1]
    B <- B[-1]
  }
  A
 }
	find_possible_paths <- function(g, k = NULL, ksp = c("baerde", "yenpathy")) {

	undirected <- FALSE

	if (!igraph::is.directed(g)) {
	undirected <- TRUE
	weight <- igraph::E(g)$weight
	e_l <- as.data.frame(igraph::as_edgelist(g))
	l_e <- as.data.frame(cbind(e_l[, 2], e_l[, 1]))
	e_l_l_e <- rbind(e_l, l_e)
	e_l_l_e$weight <- rep(weight, 2)
	g <- igraph::graph_from_data_frame(e_l_l_e)
	}

	if (is.null(igraph::V(g)$name)) {
	name_comb1 <- combn(igraph::V(g), 2)
	} else {
	name_comb1 <- combn(igraph::V(g)$name, 2)
	}

	name_comb2 <- rbind(name_comb1[2, ],
	name_comb1[1, ])
	name_combs <- t(cbind(name_comb1, name_comb2))

	pair_names <- paste(name_combs[, 1],
	name_combs[, 2])

	all_possible_paths <-
	if (!is.null(k) & ksp[1] == "yenpathy") {
	furrr::future_map2_dfr(name_combs[, 1],
	name_combs[, 2],
	~ path_list_to_df(yenpathy::k_shortest_paths(g, .x, .y, k = k), .x, .y))
	} else if (!is.null(k) & ksp[1] == "baerde") {
	if(is.numeric(name_combs)) {
	g <- give_graph_names(g)
	name_combs <-
	cbind(paste0("v", name_combs[, 1]),
	paste0("v", name_combs[, 2]))
	}
	furrr::future_map2_dfr(name_combs[, 1],
	name_combs[, 2],
	~ path_list_to_df(k_shortest_yen(g, .x, .y, k = k), .x, .y))
	} else {
	furrr::future_map2_dfr(name_combs[, 1],
	name_combs[, 2],
	~ path_list_to_df(purrr::map(igraph::all_simple_paths(g, .x, .y), as.character), .x, .y))
	}

	#names(all_possible_paths) <- pair_names

	if (!is.null(k)) {
	if (k > max(purrr::map_dbl(all_possible_paths, length)))
	message(
	"Maximum number of paths found per node pair is smaller than k, all possible paths found."
	)
	else
	message(
	"Maximum number of paths found per node pair equals k, there might be more possible paths. Try a higher number for k."
	)
	}

	# res <- furrr::future_map_dfr(all_possible_paths,
	# ~ purrr::map_dfr(.,
	# function(x)
	# data.frame(from = x[-length(x)],
	# to = x[-1]), .id = "path"),
	# .id = "node pair")

	if (undirected) {
	all_possible_paths <- dplyr::mutate(all_possible_paths,
	from_ = ifelse(from < to, from, to),
	to_ = ifelse(from > to, from, to))
	dplyr::select(all_possible_paths, `node pair` = node.pair, path, from = from_, to = to_)

	} else {
	all_possible_paths
	}


	}

	count_edge_gravity <- function(g, k = NULL, ksp = c("baerde", "yenpathy")) {
	dplyr::count(find_possible_paths(g, k, ksp),
	from, to, sort = TRUE, name = "edge gravity")
	}

	find_all_paths_node_pair <-
	function(g, n1, n2, k) {
	a <- yenpathy::k_shortest_paths(g, n1, n2, k = k)
	path_list_to_df(a, n1, n2)
	}

	path_list_to_df <- function(l, n1, n2) {
	map_dfr(l, function(p)
	data.frame(
	`node pair` = paste(n1, n2),
	from = p[-length(p)],
	to = p[-1]
	), .id = "path")
	}

	feed_chunks <- function(g, chunk, k) {
	map2_dfr(chunk$V1,
	chunk$V2,
	function(x, y) find_all_paths_node_pair(g, x, y, k))
	}

	for_feed_chunks <- function(chunk, k) {
	res_tbl <- tibble()
	for(i in 1:nrow(chunk)) res_tbl <- bind_rows(find_all_paths_node_pair(g, chunk$V1[i], chunk$V2[i], k), res_tbl)
	res_tbl
	}
	# libraries needed:
	# dplyr,
	# purrr,
	# furrr,
	# future (will be installed with furrr as a dependency)
	# igraph,
	# yenpathy (optional for limiting k)
	# ggplot2

	# In order to avoid running out of memory due to the memory leaks that are
	# currently present in the c++ library used in yenpathy, use the
	# yenpathy fork in the install command below.

	# Using yenpathy now needs to be specified with the 'ksp' argument, see
	# example 4. yenpathy is much faster than the other solutions.

	# un-comment next two lines to install packages
	# install.packages("tidyverse", "furrr", "igraph")
	# remotes::install_github("tilltnet/yenpathy", build_vignettes = FALSE)
	# build_vignettes = TRUE fails when igraphdata not installed

	library(dplyr)
	library(ggplot2)
	library(igraph)
	library(tidyr)
	source("edge_gravity.R")

	# Example 1 --------------------------------------------------------------------
	# Graph from yenpathy documentation.
	my_graph <- data.frame(
	source = c(1, 4, 5, 1, 1, 8, 1, 2, 7, 3),
	sink = c(4, 5, 6, 6, 8, 6, 2, 7, 3, 6),
	weight = c(1, 1, 1.5, 5, 1.5, 2.5, 1.5, 0.5, 0.5, 0.5)
	)

	g1 <- graph_from_data_frame(my_graph)
	plot(g1)

	find_possible_paths(g1, 2)
	find_possible_paths(g1, 4)
	count_edge_gravity(g1)
	count_edge_gravity(g1, 4)

	# Example 2 --------------------------------------------------------------------
	# From Figure 1 in Helander, M. E., & McAllister, S. (2018). The gravity
	# of an edge. Applied Network Science, 3(1).
	# https://doi.org/10.1007/s41109-018-0063-6

	# This is an undirected graph, that means that before the k-shortest paths
	# can be found the network needs to be transformed so that each edge is duplicated
	# as a directed connecting the adjacent nodes in both directions.
	# The find_possible_paths() function takes care of that.

	el2 <- tribble(
	~from, ~to,
	"1", "2",
	"1", "3",
	"2", "3",
	"2", "4",
	"3", "4",
	"3", "5")

	g2 <- graph_from_data_frame(el2, directed = FALSE)
	plot(g2)

	# This reproduces data in Table 2 and Fig. 2 in Helander & McAllister 2018.

	# 58 paths
	find_possible_paths(g2, 5) %>%
	count(`node pair`, path) %>%
	nrow()

	# Instead of rank this uses the actual values of EB and EG.
	g2_edge_gravity <- count_edge_gravity(g2, 5)
	g2_edge_gravity <- count_edge_gravity(g2)
	el2$`edge betweeness` <- edge_betweenness(g2)
	full_join(el2, g2_edge_gravity) %>%
	pivot_longer(c(`edge gravity`, `edge betweeness`)) %>%
	ggplot(aes(x = reorder(paste(from, to), value), y = value, fill = name)) +
	geom_bar(stat = "identity", position = "dodge") +
	coord_flip() +
	xlab(label = "edge")

	# Example 3 --------------------------------------------------------------------
	# This reproduces data in Fig. 14 Helander & McAllister 2018
	# Numbering of vertices is not the same though!
	library(netrankr)
	data("florentine_m")

	plot(florentine_m)
	find_possible_paths(florentine_m, ksp = "yenpathy") %>%
	count(`node pair`, path, sort = TRUE) %>%
	nrow()

	count_edge_gravity(g = florentine_m)
	count_edge_gravity(florentine_m, 34)
	count_edge_gravity(florentine_m, 34, ksp = "yenpathy")


	k_shortest_paths(graph_df = florentine_m, from = "Acciaiuol", to = "Ridolfi", k = 50)
	igraph::all_shortest_paths(florentine_m, "Acciaiuol")

	# Example 4 ---------------------------------------------------------------
	# "Bigger" network data, using parallel computing for finding paths!

	library(igraphdata)
	#data(package="igraphdata")
	data(UKfaculty)

	# Vertice/Edge Count
	length(V(UKfaculty))
	length(E(UKfaculty))


	library(future)

	# This is a memory saving, but rather slow solution.
	plan(multisession, gc = TRUE)
	start_time <- Sys.time()
	res4a <- count_edge_gravity(g = UKfaculty, k = 100)
	delta_t <- Sys.time() - start_time
	delta_t
	plan(sequential)


	# With k = 500 this takes (now!) 3 minutes on a Ryzen 9 3950X with 16 Cores and 32 Threads.
	# Memory usage seems fine when using yenpatyh fork from:
	plan(multisession, gc = TRUE)
	start_time <- Sys.time()
	res4b <- count_edge_gravity(g = UKfaculty, k = 500, ksp = "yenpathy")
	delta_t <- Sys.time() - start_time
	delta_t
	plan(sequential)
	# This code is taken from https://stackoverflow.com/a/51509886
	# Changes:
	# - fixed bug, that would break code, when the actual k was smaller than the specified k - 1
	# - Not deleting rootPath nodes (except for spurNode) caused this to create paths with loops
	# - when vertices are not named but rather numerically numbered paths are now returned correctly

	library(igraph)
	library(tidyverse)

	#'@return the shortest path as a list of vertices or NULL if there is no path between src and dest
	shortest_path <- function(graph, src, dest){
	path <- suppressWarnings(get.shortest.paths(graph, src, dest))
	if(is.numeric(src))
	path <- as.numeric(path$vpath[[1]])
	else
	path <- names(path$vpath[[1]])
	if (length(path)==1) NULL else path
	}

	#'@return the sum of the weights of all the edges in the given path
	path_weight <- function(path, graph) sum(E(graph, path=path)$weight)

	#'@description sorts a list of paths based on the weight of the path
	sort_paths <- function(graph, paths) paths[sapply(paths, path_weight, graph) %>% order]

	#'@description creates a list of edges that should be deleted
	find_edges_to_delete <- function(A, i, rootPath){
	edgesToDelete <- NULL
	for (p in A){
	rootPath_p <- p[1:i]
	if (all(rootPath_p == rootPath)){
	edge <- paste(p[i], ifelse(is.na(p[i+1]),p[i],p[i+1]), sep = '\|')
	edgesToDelete[length(edgesToDelete)+1] <- edge
	}
	}
	unique(edgesToDelete)
	}

	give_graph_names <- function(graph) {
	V(graph)$name <- paste0("v", 1:length(V(graph)))
	graph
	}

	# returns the k shortest path from src to dest
	# sometimes it will return less than k shortest paths. This occurs when the max possible number of paths are less than k
	k_shortest_yen <- function(graph, src, dest, k){
	if (src == dest) stop('src and dest can not be the same (currently)')

	# accepted paths
	A <- list(shortest_path(graph, src, dest))
	if (k == 1) return (A)
	# potential paths
	B <- list()

	for (k_i in 2:k){
	prev_path <- A[[k_i-1]]
	num_nodes_to_loop <- length(prev_path) - 1
	for(i in 1:num_nodes_to_loop){
	spurNode <- prev_path[i]
	rootPath <- prev_path[1:i]

	edgesToDelete <- find_edges_to_delete(A, i,rootPath)
	t_g <- delete.edges(graph, edgesToDelete)
	#!#
	t_g <- delete.vertices(t_g, rootPath[rootPath != spurNode])

	spurPath <- shortest_path(t_g, spurNode, dest)

	#!#
	if (length(spurPath) > 0){
	total_path <- list(c(rootPath[-i], spurPath))
	if (!total_path %in% B) B[length(B)+1] <- total_path
	}
	}
	if (length(B) == 0) break
	B <- sort_paths(graph, B)
	A[k_i] <- B[1]
	B <- B[-1]
	}
	A
	}