cderv · May 7, 2018 23:04
diff --git a/map_flat_use_case.md b/map_flat_use_case.md
diff --git a/map_flat_use_case.R b/map_flat_use_case.R
 #' ---
 #' output:
 #'   md_document:
 #'     pandoc_args: [
 #'       '-f', 'markdown-implicit_figures',
 #'       '-t', 'commonmark',
 #'       --wrap=preserve
 #'     ]
 #' ---
 #'
 #+ setup, include = FALSE
 # Default chunk options for every chunk of the rendered document: collapse
 # source and output into one block, prefix output lines with "#>", and keep
 # rendering when a chunk errors instead of aborting.
 knitr::opts_chunk$set(collapse = TRUE, comment = "#>", error = TRUE)


 #' This document presents some usages of the `map_flat` family of functions.
 #' You need the associated feature branch to run this code.
 #'
 #' # Get purrr with `map_flat` functions
 #'
 #' ## Installing from GitHub
 #' You can use dev mode to isolate the dev version of the package you install
 #+ eval = FALSE
 # devtools is not attached anywhere in the visible script, so the call is
 # namespace-qualified like the install_github() call below
 devtools::dev_mode(on = TRUE)
 #' Install from PR 502
 #+ eval = FALSE
 devtools::install_github("tidyverse/purrr#502")
 #' You can then load purrr
 #+ eval = FALSE
 library(purrr)
 #'
 #' ## Cloning repo and loading package with devtools::load_all()
 #'
 #' If you don't have a dev purrr clone folder already, you can just clone for this script
 #' using a temporary directory.
 temp_purrr <- fs::dir_create(fs::file_temp(pattern = "purrr"))
 # the feature branch lives in cderv's fork, not in the tidyverse repository
 #+ results = 'hide'
 git2r::clone(
   url = "https://github.com/cderv/purrr.git",
   local_path = temp_purrr,
   branch = "fix-405-map_flat_type"
 )
 #' Use `devtools::load_all()` to load the dev version of the package
 #+ results = 'hide'
 devtools::load_all(path = temp_purrr)

 #' At the end use `fs::dir_delete(temp_purrr)` to delete it. Note that the temp
 #' folder will be deleted automatically too.

 #' Once you have the dev version, you can try the different use cases.
 #'
 #' # First use case
 #'
 #' Let's say I want to compare the lists of GH users that currently have issues
 #' still open in some of the repos of different organisations.
 library(gh)
 library(dplyr, warn.conflicts = FALSE)
 library(tidyr)

 #' Select some repository like tidyr and purrr in the tidyverse or httr, xml2 and usethis in r-lib
 # One row per organisation, with its repositories held in a list-column
 repos <- tibble(
   org = c("tidyverse", "r-lib"),
   repo = list(c("tidyr", "purrr"), c("httr", "xml2", "usethis"))
 )
 #' Use gh :package: to retrieve all open issues
 repos <- repos %>%
   # name the list-column explicitly instead of relying on unnest() picking up
   # every list-column
   unnest(repo) %>%
   # .limit = Inf (numeric, not the string "Inf") asks gh to page through
   # every open issue
   mutate(
     issues = map2(
       org, repo,
       ~ gh("/repos/:org/:repo/issues", org = .x, repo = .y, .limit = Inf)
     )
   )
 #' Not all repos have the same number of open issues, and there is no reason they should.
 repos <- repos %>%
   mutate(nb_issues = map_int(issues, length))
 #' This is where `map_flat` variants can be useful. If we want a result with one
 #' vector of GH logins per organisation, `map_chr()` won't work because the results
 #' have different lengths: we need to map, then flatten as character (`map() %>% flatten_chr()`)
 open_issues_by_org <- repos %>%
   nest(-org) %>%
   mutate(
     users = map(data, function(org_repos) {
       # each issue is a nested list returned by gh; keep its user$login field
       logins <- map_flat_chr(
         org_repos$issues,
         function(repo_issues) map_chr(repo_issues, c("user", "login"))
       )
       unique(logins)
     })
   )
 open_issues_by_org

 #' We can continue the analysis by unnesting the users and counting in how many
 #' organisations each of them has open issues
 count(
   unnest(open_issues_by_org, users, .drop = TRUE),
   users,
   sort = TRUE
 )


 #' # A second use case
 #'
 #' If we have a text column that we want to split into a vector of words,
 #' it can be useful, as sentences may have different numbers of words.
 # Two documents, each holding several sentences; every document ends up with
 # one flat character vector of words, which is then unnested to one row per word
 tibble(
   document_id = c(1, 2),
   text = list(
     c(
       "some text with a few word",
       "some text with not a few but word",
       "some text"
     ),
     c(
       "some other text with a few word",
       "some other text with not a few but word",
       "some other text"
     )
   )
 ) %>%
   mutate(
     words = map(text, function(sentences) {
       # sentences have different word counts, hence map + flatten
       map_flat_chr(sentences, function(sentence) strsplit(sentence, " ")[[1]])
     })
   ) %>%
   unnest(words)

 #' However, for this use case there is tidytext...
 #'

 #' # Third use case
 #'
 #' When scraping some data to _rectangle_ a website, this can be useful. For example,
 #' we want to know the categories of the movies coming next week.
 #'
 library(rvest)

 url <- "https://www.imdb.com/movies-coming-soon/?ref_=nv_mv_cs_4"

 # read the page and keep the ".list_item" nodes, i.e. the list of films coming soon
 coming_soon <- html_nodes(read_html(url), css = ".list_item")

 # Summarise all coming movies in one list. Every character field is
 # de-duplicated after flattening, i.e. across the whole listing, so a genre,
 # director or star attached to several movies is reported only once.
 about_all_coming_movies <- list(
   nb_film = length(coming_soon),
   genre = coming_soon %>%
     map_flat_chr(
       ~ html_nodes(.x, ".cert-runtime-genre span[itemprop='genre']") %>%
         html_text()
     ) %>%
     unique(),
   director = coming_soon %>%
     map_flat_chr(
       ~ html_nodes(.x, ".txt-block span[itemprop='director'] span[itemprop='name'] a") %>%
         html_text()
     ) %>%
     unique(),
   stars = coming_soon %>%
     map_flat_chr(
       ~ html_nodes(.x, ".txt-block span[itemprop='actors'] span[itemprop='name'] a") %>%
         html_text()
     ) %>%
     # unique() previously sat inside the lambda, which only removed duplicates
     # within a single movie and was inconsistent with genre and director
     unique()
 )
 about_all_coming_movies

 #' # Fourth use case
 #'
 #' Some untidy file to clean, like this one with one id per line
 #' that passes through several nodes. I want the vector of all node ids.
 #'
 # Parse the inline CSV, split every "-"-separated path into integer node ids,
 # flatten them into one vector and drop the duplicates
 readr::read_csv("
 id,path
 1,1-2-6
 2,2-3
 4,5-7-1-9
 ") %>%
   pull(path) %>%
   map_flat_int(function(node_path) as.integer(strsplit(node_path, "-")[[1]])) %>%
   unique()
	#' ---
	#' output:
	#'   md_document:
	#'     pandoc_args: [
	#'       '-f', 'markdown-implicit_figures',
	#'       '-t', 'commonmark',
	#'       --wrap=preserve
	#'     ]
	#' ---
	#'
	#+ setup, include = FALSE
	# Default chunk options for every chunk of the rendered document: collapse
	# source and output into one block, prefix output lines with "#>", and keep
	# rendering when a chunk errors instead of aborting.
	knitr::opts_chunk$set(collapse = TRUE, comment = "#>", error = TRUE)


	#' This document presents some usages of the `map_flat` family of functions.
	#' You need the associated feature branch to run this code.
	#'
	#' # Get purrr with `map_flat` functions
	#'
	#' ## Installing from GitHub
	#' You can use dev mode to isolate the dev version of the package you install
	#+ eval = FALSE
	# devtools is not attached anywhere in the visible script, so the call is
	# namespace-qualified like the install_github() call below
	devtools::dev_mode(on = TRUE)
	#' Install from PR 502
	#+ eval = FALSE
	devtools::install_github("tidyverse/purrr#502")
	#' You can then load purrr
	#+ eval = FALSE
	library(purrr)
	#'
	#' ## Cloning repo and loading package with devtools::load_all()
	#'
	#' If you don't have a dev purrr clone folder already, you can just clone for this script
	#' using a temporary directory.
	temp_purrr <- fs::dir_create(fs::file_temp(pattern = "purrr"))
	# the feature branch lives in cderv's fork, not in the tidyverse repository
	#+ results = 'hide'
	git2r::clone(
	  url = "https://github.com/cderv/purrr.git",
	  local_path = temp_purrr,
	  branch = "fix-405-map_flat_type"
	)
	#' Use `devtools::load_all()` to load the dev version of the package
	#+ results = 'hide'
	devtools::load_all(path = temp_purrr)

	#' At the end use `fs::dir_delete(temp_purrr)` to delete it. Note that the temp
	#' folder will be deleted automatically too.

	#' Once you have the dev version, you can try the different use cases.
	#'
	#' # First use case
	#'
	#' Let's say I want to compare the lists of GH users that currently have issues
	#' still open in some of the repos of different organisations.
	library(gh)
	library(dplyr, warn.conflicts = FALSE)
	library(tidyr)

	#' Select some repository like tidyr and purrr in the tidyverse or httr, xml2 and usethis in r-lib
	# One row per organisation, with its repositories held in a list-column
	repos <- tibble(
	  org = c("tidyverse", "r-lib"),
	  repo = list(c("tidyr", "purrr"), c("httr", "xml2", "usethis"))
	)
	#' Use gh :package: to retrieve all open issues
	repos <- repos %>%
	  # name the list-column explicitly instead of relying on unnest() picking up
	  # every list-column
	  unnest(repo) %>%
	  # .limit = Inf (numeric, not the string "Inf") asks gh to page through
	  # every open issue
	  mutate(
	    issues = map2(
	      org, repo,
	      ~ gh("/repos/:org/:repo/issues", org = .x, repo = .y, .limit = Inf)
	    )
	  )
	#' Not all repos have the same number of open issues, and there is no reason they should.
	repos <- repos %>%
	  mutate(nb_issues = map_int(issues, length))
	#' This is where `map_flat` variants can be useful. If we want a result with one
	#' vector of GH logins per organisation, `map_chr()` won't work because the results
	#' have different lengths: we need to map, then flatten as character (`map() %>% flatten_chr()`)
	open_issues_by_org <- repos %>%
	  nest(-org) %>%
	  mutate(
	    users = map(data, function(org_repos) {
	      # each issue is a nested list returned by gh; keep its user$login field
	      logins <- map_flat_chr(
	        org_repos$issues,
	        function(repo_issues) map_chr(repo_issues, c("user", "login"))
	      )
	      unique(logins)
	    })
	  )
	open_issues_by_org

	#' We can continue the analysis by unnesting the users and counting in how many
	#' organisations each of them has open issues
	count(
	  unnest(open_issues_by_org, users, .drop = TRUE),
	  users,
	  sort = TRUE
	)


	#' # A second use case
	#'
	#' If we have a text column that we want to split into a vector of words,
	#' it can be useful, as sentences may have different numbers of words.
	# Two documents, each holding several sentences; every document ends up with
	# one flat character vector of words, which is then unnested to one row per word
	tibble(
	  document_id = c(1, 2),
	  text = list(
	    c(
	      "some text with a few word",
	      "some text with not a few but word",
	      "some text"
	    ),
	    c(
	      "some other text with a few word",
	      "some other text with not a few but word",
	      "some other text"
	    )
	  )
	) %>%
	  mutate(
	    words = map(text, function(sentences) {
	      # sentences have different word counts, hence map + flatten
	      map_flat_chr(sentences, function(sentence) strsplit(sentence, " ")[[1]])
	    })
	  ) %>%
	  unnest(words)

	#' However, for this use case there is tidytext...
	#'

	#' # Third use case
	#'
	#' When scraping some data to _rectangle_ a website, this can be useful. For example,
	#' we want to know the categories of the movies coming next week.
	#'
	library(rvest)

	url <- "https://www.imdb.com/movies-coming-soon/?ref_=nv_mv_cs_4"

	# read the page and keep the ".list_item" nodes, i.e. the list of films coming soon
	coming_soon <- html_nodes(read_html(url), css = ".list_item")

	# Summarise all coming movies in one list. Every character field is
	# de-duplicated after flattening, i.e. across the whole listing, so a genre,
	# director or star attached to several movies is reported only once.
	about_all_coming_movies <- list(
	  nb_film = length(coming_soon),
	  genre = coming_soon %>%
	    map_flat_chr(
	      ~ html_nodes(.x, ".cert-runtime-genre span[itemprop='genre']") %>%
	        html_text()
	    ) %>%
	    unique(),
	  director = coming_soon %>%
	    map_flat_chr(
	      ~ html_nodes(.x, ".txt-block span[itemprop='director'] span[itemprop='name'] a") %>%
	        html_text()
	    ) %>%
	    unique(),
	  stars = coming_soon %>%
	    map_flat_chr(
	      ~ html_nodes(.x, ".txt-block span[itemprop='actors'] span[itemprop='name'] a") %>%
	        html_text()
	    ) %>%
	    # unique() previously sat inside the lambda, which only removed duplicates
	    # within a single movie and was inconsistent with genre and director
	    unique()
	)
	about_all_coming_movies

	#' # Fourth use case
	#'
	#' Some untidy file to clean, like this one with one id per line
	#' that passes through several nodes. I want the vector of all node ids.
	#'
	# Parse the inline CSV, split every "-"-separated path into integer node ids,
	# flatten them into one vector and drop the duplicates
	readr::read_csv("
	id,path
	1,1-2-6
	2,2-3
	4,5-7-1-9
	") %>%
	  pull(path) %>%
	  map_flat_int(function(node_path) as.integer(strsplit(node_path, "-")[[1]])) %>%
	  unique()