# luisDVA · April 7, 2023 17:44
# diff --git a/pkgcomments.R b/pkgcomments.R
 # exploring comments about loaded packages
 library(bigrquery) # CRAN v1.4.1
 library(dplyr) # CRAN v1.1.1
 library(stringr) # CRAN v1.5.0
 library(readr) # CRAN v2.1.4
 library(tidyr) # CRAN v1.3.0
 library(rlang) # CRAN v1.1.0
 library(purrr) # CRAN v1.0.1


 # Download the three match tables from BigQuery and stack them.
 # Uncomment the next line to authenticate first:
 # bq_auth()
 RlibmatchesBQtab <- bq_table(
   project = "YOURBQprojectID",
   dataset = "rscriptsonGH",
   table = "libmatches"
 )
 RmdlibmatchesBQtab <- bq_table(
   project = "YOURBQprojectID",
   dataset = "rscriptsonGH",
   table = "Rmdlibmatches"
 )
 QmdlibmatchesBQtab <- bq_table(
   project = "YOURBQprojectID",
   dataset = "rscriptsonGH",
   table = "Qmdlibmatches"
 )
 Rlibmatches <- bq_table_download(RlibmatchesBQtab)
 Rmdlibmatches <- bq_table_download(RmdlibmatchesBQtab)
 Qmdlibmatches <- bq_table_download(QmdlibmatchesBQtab)
 # one combined table holding the rows of all three downloads
 libmatches <- bind_rows(Rlibmatches, Rmdlibmatches, Qmdlibmatches)
 # optional: cache the combined table on disk
 # write_csv(libmatches,"data/libmatches.csv")


 # Read the cached matches and reduce them to one library() call per row,
 # with any trailing comment split off into its own column.
 libmatches <- read_csv("data/libmatches.csv") %>%
   # drop rows whose content starts with "expected" (inconsistent parsing)
   filter(!str_detect(content, "^expected")) %>%
   # every "library(" up to the end of its line
   mutate(libcalls = str_extract_all(content, "library\\(.+")) %>%
   # one row per extracted call
   unnest_longer(libcalls) %>%
   # text before the first "#" is the call, everything after it the comment
   separate(
     libcalls,
     into = c("call", "comment"), sep = "#", extra = "merge"
   ) %>%
   mutate(across(everything(), str_trim)) %>%
   # several calls can share a line via ";": drop a trailing ";" and split
   mutate(call = str_remove(call, ";$")) %>%
   separate_rows(call, sep = ";") %>%
   mutate(call = str_trim(call)) %>%
   # keep pieces starting with "libr"; skip loop-style loading that uses [i]
   filter(str_detect(call, "^libr"), !str_detect(call, "\\[i\\]"))

 # Remove unmatched angle brackets from a single string. Parentheses are
 # swapped for "<" and ">" before this is called and swapped back afterwards.
 # Adapted from a code golf answer:
 # https://codegolf.stackexchange.com/questions/245625/remove-unmatched-brackets
 # The `[` argument defaults to gsub() and is invoked as
 # `[`(pattern, replacement, x).
 f <- function(x, `[` = gsub) {
   # rewrite every innermost matched "<...>" pair as "{...}"
   marked <- `[`("<([^<>]*)>", "{\\1}", x)
   if (x != marked) {
     # a pair was marked, so the pair enclosing it may match on the next pass
     return(f(marked))
   }
   # any "<" or ">" still present has no partner: delete it, then turn the
   # marked pairs back into angle brackets
   chartr("{}", "<>", `[`("<|>", "", marked))
 }

 # Run f() over every call: ")" and "(" are swapped for ">" and "<" on the way
 # in, and "<" and ">" are turned back into "(" and ")" on the way out.
 libmatches$call <- chartr(
   "<>", "()",
   map_chr(libmatches$call, \(one_call) f(chartr(")(", "><", one_call)))
 )

 # Extract the package name from the text of one library() call.
 # The first argument of the call is assumed to be the package name.
 #
 # libcall: a single character string holding the call, e.g. "library(dplyr)".
 # Returns the first argument converted to a length-one character string.
 # Signals an error when the text does not parse to a call, when the call has
 # no arguments, or when the first argument does not convert to exactly one
 # string (for example library(paste0("a", "b"))). Erroring here lets a
 # possibly() wrapper supply its fallback value instead of map_chr() aborting
 # on a result that is not length one.
 parselibcalls <- function(libcall) {
   parsed <- str2lang(libcall)
   if (!is.call(parsed)) {
     stop("not a call: ", libcall, call. = FALSE)
   }
   # the first element of a call is the function, the rest are its arguments
   call_arguments <- as.list(parsed)[-1]
   if (length(call_arguments) == 0L) {
     stop("call has no arguments: ", libcall, call. = FALSE)
   }
   pkg <- as.character(call_arguments[[1]])
   if (length(pkg) != 1L) {
     stop("first argument is not a single name: ", libcall, call. = FALSE)
   }
   pkg
 }
 # Add the parsed package name just before `comment`; calls for which
 # parselibcalls() errors get the placeholder "code error".
 libmatches <- mutate(
   libmatches,
   pkgname = map_chr(call, possibly(parselibcalls, otherwise = "code error")),
   .before = comment
 )

 # Tidy the comment text: strip one leading "#" (optionally preceded by a
 # space), then trim and collapse whitespace.
 libmatches <- libmatches %>%
   mutate(comment = str_squish(str_remove(comment, "^[ ]?#")))
 # optional: save the processed table
 # write_csv(libmatches,"data/libmatches_processed.csv")
 libmatchesproc <- libmatches

 # number of distinct id values (files)
 n_distinct(libmatchesproc$id)
 # distribution of the number of rows (extracted calls) per id
 summary(count(libmatchesproc, id)$n)
 # Per-file proportion of library() calls that carry a comment.
 # hasComment flags rows whose comment is not NA.
 libmatchesproc <- libmatchesproc %>% mutate(hasComment = !is.na(comment))
 libmatchesproc %>%
   janitor::tabyl(id, hasComment) %>%
   mutate(n = `FALSE` + `TRUE`) %>%
   mutate(pctTrue = `TRUE` / n) %>%
   # summarise the proportion itself; the `TRUE` column only holds raw counts
   pull(pctTrue) %>%
   summary()

 # rows per id, largest count first
 arrange(count(libmatchesproc, id), -n)

 # The 20 most frequent comments; rows without a comment are left out
 libmatches %>%
   filter(!is.na(comment)) %>%
   count(comment) %>%
   arrange(-n) %>%
   head(20) %>%
   knitr::kable()

 # detect language
 library(cld3) # CRAN v1.5.0
 # show 20 randomly chosen commented calls
 libmatchesproc %>%
   filter(hasComment) %>%
   select(pkgname, comment) %>%
   sample_n(20) %>%
   knitr::kable()
 # keep only commented calls and add the language detected for each comment
 libcomments <- libmatchesproc %>%
   filter(hasComment) %>%
   mutate(commentLanguage = detect_language(comment))

 # Most frequent detected languages among commented calls (first six rows),
 # with numeric columns rounded to two decimals.
 libcomments %>%
   filter(!is.na(commentLanguage)) %>%
   janitor::tabyl(commentLanguage) %>%
   arrange(-n) %>%
   head() %>%
   # anonymous function rather than across(..., round, 2): passing extra
   # arguments through across()'s `...` is deprecated as of dplyr 1.1.0
   mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
   knitr::kable()

 # 17 random distinct package/comment pairs whose detected language is "es"
 libcomments %>%
   filter(commentLanguage %in% "es") %>%
   distinct(pkgname, comment) %>%
   sample_n(size = 17) %>%
   knitr::kable()


 # Unique package/comment pairs (version histories of a file and similar
 # sources repeat the same pair)
 libcommentsDdup <- distinct(libcomments, pkgname, comment)

 # Packages with the most distinct comments: top 10 by count
 libcommentsDdup %>%
   add_count(pkgname) %>%
   distinct(pkgname, n) %>%
   top_n(n = 10, wt = n) %>%
   arrange(-n) %>%
   knitr::kable()

 # 20 random comments that begin with "for ", "para " or "pour" (any case),
 # i.e. comments stating what the package is loaded for
 libcommentsDdup %>%
   filter(
     str_detect(comment, regex("^for |^para |^pour", ignore_case = TRUE))
   ) %>%
   sample_n(size = 20) %>%
   knitr::kable()

 # Possible installation notes: 20 random comments mentioning "instal",
 # "CRAN" or "github" (any case). The sample is piped straight from the
 # filter; to browse every match interactively, pipe the filtered rows into
 # View() instead of sample_n().
 libcomments %>%
   filter(str_detect(comment, regex("instal|CRAN|github", ignore_case = TRUE))) %>%
   sample_n(20) %>%
   knitr::kable()

 # Remarks about the tidyverse: 20 random comments containing "tidyverse"
 # (any case). The sample is piped straight from the filter; to browse every
 # match interactively, pipe the filtered rows into View() instead of
 # sample_n().
 libcomments %>%
   filter(str_detect(comment, regex("tidyverse", ignore_case = TRUE))) %>%
   sample_n(20) %>%
   knitr::kable()


 # viz
 library(ggraph) # CRAN v2.0.5
 library(tidygraph) # CRAN v1.2.1
 library(graphlayouts) # CRAN v0.8.0
 library(ggrepel) # CRAN v0.9.1

 # Keep comments shorter than 37 characters so plot labels stay compact
 shortcomments <- mutate(libcommentsDdup, commlength = str_length(comment)) %>%
   filter(commlength < 37)

 # Packages with more than 10 short comments, 7 random comments from each.
 # The result stays grouped by pkgname.
 shortcommentstop <- shortcomments %>%
   add_count(pkgname) %>%
   filter(n > 10) %>%
   group_by(pkgname) %>%
   sample_n(size = 7)

 # Pick five of the retained packages at random
 rndpkgs <-
   shortcommentstop %>%
   group_by(pkgname) %>%
   summarise() %>%
   sample_n(5) %>%
   pull(pkgname)

 # Edge list for the network: one row per package/comment pair of the chosen
 # packages. Grouping is dropped, and the two columns are selected by name so
 # the result does not depend on column order.
 fornetwrk <- shortcommentstop %>%
   ungroup() %>%
   filter(pkgname %in% rndpkgs) %>%
   select(pkgname, comment)

 # Build a graph from the package/comment edge list and draw it
 pkgnet <- as_tbl_graph(fornetwrk)

 ggraph(pkgnet, layout = "nicely") +
   geom_edge_link(color = "blue", alpha = 0.2) +
   geom_text_repel(
     aes(
       x, y,
       label = name,
       segment.inflect = TRUE,
       family = "Atkinson Hyperlegible"
     ),
     size = 3
   ) +
   theme_graph()

 # ggsave("pkggraph.png", width = 7, height = 5, units = "in")
	# exploring comments about loaded packages
	library(bigrquery) # CRAN v1.4.1
	library(dplyr) # CRAN v1.1.1
	library(stringr) # CRAN v1.5.0
	library(readr) # CRAN v2.1.4
	library(tidyr) # CRAN v1.3.0
	library(rlang) # CRAN v1.1.0
	library(purrr) # CRAN v1.0.1


	# Download the three match tables from BigQuery and stack them.
	# Uncomment the next line to authenticate first:
	# bq_auth()
	RlibmatchesBQtab <- bq_table(
	  project = "YOURBQprojectID",
	  dataset = "rscriptsonGH",
	  table = "libmatches"
	)
	RmdlibmatchesBQtab <- bq_table(
	  project = "YOURBQprojectID",
	  dataset = "rscriptsonGH",
	  table = "Rmdlibmatches"
	)
	QmdlibmatchesBQtab <- bq_table(
	  project = "YOURBQprojectID",
	  dataset = "rscriptsonGH",
	  table = "Qmdlibmatches"
	)
	Rlibmatches <- bq_table_download(RlibmatchesBQtab)
	Rmdlibmatches <- bq_table_download(RmdlibmatchesBQtab)
	Qmdlibmatches <- bq_table_download(QmdlibmatchesBQtab)
	# one combined table holding the rows of all three downloads
	libmatches <- bind_rows(Rlibmatches, Rmdlibmatches, Qmdlibmatches)
	# optional: cache the combined table on disk
	# write_csv(libmatches,"data/libmatches.csv")


	# Read the cached matches and reduce them to one library() call per row,
	# with any trailing comment split off into its own column.
	libmatches <- read_csv("data/libmatches.csv") %>%
	  # drop rows whose content starts with "expected" (inconsistent parsing)
	  filter(!str_detect(content, "^expected")) %>%
	  # every "library(" up to the end of its line
	  mutate(libcalls = str_extract_all(content, "library\\(.+")) %>%
	  # one row per extracted call
	  unnest_longer(libcalls) %>%
	  # text before the first "#" is the call, everything after it the comment
	  separate(
	    libcalls,
	    into = c("call", "comment"), sep = "#", extra = "merge"
	  ) %>%
	  mutate(across(everything(), str_trim)) %>%
	  # several calls can share a line via ";": drop a trailing ";" and split
	  mutate(call = str_remove(call, ";$")) %>%
	  separate_rows(call, sep = ";") %>%
	  mutate(call = str_trim(call)) %>%
	  # keep pieces starting with "libr"; skip loop-style loading that uses [i]
	  filter(str_detect(call, "^libr"), !str_detect(call, "\\[i\\]"))

	# Remove unmatched angle brackets from a single string. Parentheses are
	# swapped for "<" and ">" before this is called and swapped back afterwards.
	# Adapted from a code golf answer:
	# https://codegolf.stackexchange.com/questions/245625/remove-unmatched-brackets
	# The `[` argument defaults to gsub() and is invoked as
	# `[`(pattern, replacement, x).
	f <- function(x, `[` = gsub) {
	  # rewrite every innermost matched "<...>" pair as "{...}"
	  marked <- `[`("<([^<>]*)>", "{\\1}", x)
	  if (x != marked) {
	    # a pair was marked, so the pair enclosing it may match on the next pass
	    return(f(marked))
	  }
	  # any "<" or ">" still present has no partner: delete it, then turn the
	  # marked pairs back into angle brackets. The alternation is a plain "|";
	  # a backslash before it would be an invalid escape in an R string.
	  chartr("{}", "<>", `[`("<|>", "", marked))
	}

	# Run f() over every call: ")" and "(" are swapped for ">" and "<" on the way
	# in, and "<" and ">" are turned back into "(" and ")" on the way out.
	libmatches$call <- chartr(
	  "<>", "()",
	  map_chr(libmatches$call, \(one_call) f(chartr(")(", "><", one_call)))
	)

	# Extract the package name from the text of one library() call.
	# The first argument of the call is assumed to be the package name.
	#
	# libcall: a single character string holding the call, e.g. "library(dplyr)".
	# Returns the first argument converted to a length-one character string.
	# Signals an error when the text does not parse to a call, when the call has
	# no arguments, or when the first argument does not convert to exactly one
	# string (for example library(paste0("a", "b"))). Erroring here lets a
	# possibly() wrapper supply its fallback value instead of map_chr() aborting
	# on a result that is not length one.
	parselibcalls <- function(libcall) {
	  parsed <- str2lang(libcall)
	  if (!is.call(parsed)) {
	    stop("not a call: ", libcall, call. = FALSE)
	  }
	  # the first element of a call is the function, the rest are its arguments
	  call_arguments <- as.list(parsed)[-1]
	  if (length(call_arguments) == 0L) {
	    stop("call has no arguments: ", libcall, call. = FALSE)
	  }
	  pkg <- as.character(call_arguments[[1]])
	  if (length(pkg) != 1L) {
	    stop("first argument is not a single name: ", libcall, call. = FALSE)
	  }
	  pkg
	}
	# Add the parsed package name just before `comment`; calls for which
	# parselibcalls() errors get the placeholder "code error".
	libmatches <- mutate(
	  libmatches,
	  pkgname = map_chr(call, possibly(parselibcalls, otherwise = "code error")),
	  .before = comment
	)

	# Tidy the comment text: strip one leading "#" (optionally preceded by a
	# space), then trim and collapse whitespace.
	libmatches <- libmatches %>%
	  mutate(comment = str_squish(str_remove(comment, "^[ ]?#")))
	# optional: save the processed table
	# write_csv(libmatches,"data/libmatches_processed.csv")
	libmatchesproc <- libmatches

	# number of distinct id values (files)
	n_distinct(libmatchesproc$id)
	# distribution of the number of rows (extracted calls) per id
	summary(count(libmatchesproc, id)$n)
	# Per-file proportion of library() calls that carry a comment.
	# hasComment flags rows whose comment is not NA.
	libmatchesproc <- libmatchesproc %>% mutate(hasComment = !is.na(comment))
	libmatchesproc %>%
	  janitor::tabyl(id, hasComment) %>%
	  mutate(n = `FALSE` + `TRUE`) %>%
	  mutate(pctTrue = `TRUE` / n) %>%
	  # summarise the proportion itself; the `TRUE` column only holds raw counts
	  pull(pctTrue) %>%
	  summary()

	# rows per id, largest count first
	arrange(count(libmatchesproc, id), -n)

	# The 20 most frequent comments; rows without a comment are left out
	libmatches %>%
	  filter(!is.na(comment)) %>%
	  count(comment) %>%
	  arrange(-n) %>%
	  head(20) %>%
	  knitr::kable()

	# detect language
	library(cld3) # CRAN v1.5.0
	# show 20 randomly chosen commented calls
	libmatchesproc %>%
	  filter(hasComment) %>%
	  select(pkgname, comment) %>%
	  sample_n(20) %>%
	  knitr::kable()
	# keep only commented calls and add the language detected for each comment
	libcomments <- libmatchesproc %>%
	  filter(hasComment) %>%
	  mutate(commentLanguage = detect_language(comment))

	# Most frequent detected languages among commented calls (first six rows),
	# with numeric columns rounded to two decimals.
	libcomments %>%
	  filter(!is.na(commentLanguage)) %>%
	  janitor::tabyl(commentLanguage) %>%
	  arrange(-n) %>%
	  head() %>%
	  # anonymous function rather than across(..., round, 2): passing extra
	  # arguments through across()'s `...` is deprecated as of dplyr 1.1.0
	  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
	  knitr::kable()

	# 17 random distinct package/comment pairs whose detected language is "es"
	libcomments %>%
	  filter(commentLanguage %in% "es") %>%
	  distinct(pkgname, comment) %>%
	  sample_n(size = 17) %>%
	  knitr::kable()


	# Unique package/comment pairs (version histories of a file and similar
	# sources repeat the same pair)
	libcommentsDdup <- distinct(libcomments, pkgname, comment)

	# Packages with the most distinct comments: top 10 by count
	libcommentsDdup %>%
	  add_count(pkgname) %>%
	  distinct(pkgname, n) %>%
	  top_n(n = 10, wt = n) %>%
	  arrange(-n) %>%
	  knitr::kable()

	# 20 random comments that begin with "for ", "para " or "pour" (any case),
	# i.e. comments stating what the package is loaded for.
	# The regex alternation uses a plain "|"; a backslash before it would be an
	# invalid escape in an R string.
	libcommentsDdup %>%
	  filter(str_detect(comment, regex("^for |^para |^pour", ignore_case = TRUE))) %>%
	  sample_n(20) %>%
	  knitr::kable()

	# Possible installation notes: 20 random comments mentioning "instal",
	# "CRAN" or "github" (any case). The regex alternation uses a plain "|"; a
	# backslash before it would be an invalid escape in an R string.
	# The sample is piped straight from the filter; to browse every match
	# interactively, pipe the filtered rows into View() instead of sample_n().
	libcomments %>%
	  filter(str_detect(comment, regex("instal|CRAN|github", ignore_case = TRUE))) %>%
	  sample_n(20) %>%
	  knitr::kable()

	# Remarks about the tidyverse: 20 random comments containing "tidyverse"
	# (any case). The sample is piped straight from the filter; to browse every
	# match interactively, pipe the filtered rows into View() instead of
	# sample_n().
	libcomments %>%
	  filter(str_detect(comment, regex("tidyverse", ignore_case = TRUE))) %>%
	  sample_n(20) %>%
	  knitr::kable()


	# viz
	library(ggraph) # CRAN v2.0.5
	library(tidygraph) # CRAN v1.2.1
	library(graphlayouts) # CRAN v0.8.0
	library(ggrepel) # CRAN v0.9.1

	# Keep comments shorter than 37 characters so plot labels stay compact
	shortcomments <- mutate(libcommentsDdup, commlength = str_length(comment)) %>%
	  filter(commlength < 37)

	# Packages with more than 10 short comments, 7 random comments from each.
	# The result stays grouped by pkgname.
	shortcommentstop <- shortcomments %>%
	  add_count(pkgname) %>%
	  filter(n > 10) %>%
	  group_by(pkgname) %>%
	  sample_n(size = 7)

	# Pick five of the retained packages at random
	rndpkgs <-
	  shortcommentstop %>%
	  group_by(pkgname) %>%
	  summarise() %>%
	  sample_n(5) %>%
	  pull(pkgname)

	# Edge list for the network: one row per package/comment pair of the chosen
	# packages. Grouping is dropped, and the two columns are selected by name so
	# the result does not depend on column order.
	fornetwrk <- shortcommentstop %>%
	  ungroup() %>%
	  filter(pkgname %in% rndpkgs) %>%
	  select(pkgname, comment)

	# Build a graph from the package/comment edge list and draw it
	pkgnet <- as_tbl_graph(fornetwrk)

	ggraph(pkgnet, layout = "nicely") +
	  geom_edge_link(color = "blue", alpha = 0.2) +
	  geom_text_repel(
	    aes(
	      x, y,
	      label = name,
	      segment.inflect = TRUE,
	      family = "Atkinson Hyperlegible"
	    ),
	    size = 3
	  ) +
	  theme_graph()

	# ggsave("pkggraph.png", width = 7, height = 5, units = "in")