stephenturner · August 20, 2025 15:12
diff --git a/gh-orgs-licenses.R b/gh-orgs-licenses.R
 # ------------------------------------------------------------
 # GitHub org license audit + stacked bar chart (R / tidyverse)
 # ------------------------------------------------------------
 # Prereqs:
 # install.packages(c("gh","dplyr","purrr","tidyr","stringr","ggplot2","forcats","clipr"))
 # Auth: Sys.setenv(GITHUB_PAT = "<your fine-grained or classic PAT>")  # optional but recommended

 library(gh)
 library(dplyr)
 library(purrr)
 library(tidyr)
 library(stringr)
 library(ggplot2)
 library(forcats)

 # ---------------------------
 # 1) INPUT: org IDs to check
 # ---------------------------
 # GitHub organization logins to audit, in the order they will be fetched.
 orgs <- c(
  "sbg", "bionanogenomics", "BioSkryb", "BGIResearch", "nebiolabs",
  "GenapsysInc", "fulcrumgenomics", "ATCC-Bioinformatics", "seqeralabs",
  "dnanexus", "latchbio", "Elembio", "thermofisher", "Ultimagen",
  "Singular-Genomics", "Nanostring-Biostats", "MGI-tech-bioinformatics",
  "becls", "insightsengineering", "10xgenomics", "pacificbiosciences",
  "nanoporetech", "illumina"
 )

 # Feel free to replace with your own list.

 # ----------------------------------------------------------------
 # 2) Helper: classify a license as "open" or "restrictive/unknown"
 # ----------------------------------------------------------------
 # Arguments:
 #   spdx_id      - SPDX identifier for one repo (scalar string; NA/NULL ok)
 #   license_name - human-readable license name (scalar string; NA/NULL ok)
 # Returns one of exactly two labels, "open" or "restrictive/unknown". These
 # are the only factor levels the counting/plotting code below knows about, so
 # no other label may be returned (it would silently become NA there).
 classify_license <- function(spdx_id, license_name) {
  bucket_open <- "open"
  bucket_other <- "restrictive/unknown"

  # Missing values mean "no information": collapse them to "" before matching.
  blank_if_missing <- function(x) {
    if (is.null(x) || is.na(x)) "" else trimws(x)
  }
  spdx_id <- toupper(blank_if_missing(spdx_id))
  lname <- tolower(blank_if_missing(license_name))

  # Buckets (edit to taste). IDs are compared after toupper(), so every entry
  # must be written in upper case or it can never match.
  open_spdx <- c(
    "MIT",
    "BSD-2-CLAUSE",
    "BSD-3-CLAUSE",
    "APACHE-2.0",
    "ISC",
    "UNLICENSE",
    "GPL-2.0",
    "GPL-3.0",
    "AGPL-3.0",
    "LGPL-2.1",
    "LGPL-3.0",
    "MPL-2.0",
    "EPL-2.0",
    "ZLIB",
    "CC-BY-4.0",
    "CC-BY-SA-4.0",
    "CC0-1.0"
  )

  restrictive_spdx <- c(
    "CC-BY-NC-4.0",
    "CC-BY-NC-SA-4.0",
    "PROPRIETARY",
    "NON-SPDX LICENSE"
  )

  # Some repos use non-SPDX names; simple substring heuristics as a fallback.
  is_open_name <- function(x) {
    grepl("(mit|bsd|apache|gpl|lgpl|agpl|mpl|isc|unlicense|zlib)", x, perl = TRUE)
  }
  is_restrict_name <- function(x) {
    grepl("(non-?commercial|proprietary|no\\s*license)", x, perl = TRUE)
  }

  # 1. A recognised open SPDX ID is the strongest signal.
  if (spdx_id %in% open_spdx) {
    return(bucket_open)
  }
  # 2. An explicit restrictive marker beats the loose open-name heuristic, so
  #    e.g. "Proprietary (BSD-derived)" is not reported as open.
  if (spdx_id %in% restrictive_spdx || is_restrict_name(lname)) {
    return(bucket_other)
  }
  # 3. Fall back to the name heuristic; anything unrecognised or missing is
  #    counted as restrictive/unknown.
  if (is_open_name(lname)) bucket_open else bucket_other
 }

 # -----------------------------------------------------
 # 3) Fetch repos for an org (safe wrapper + pagination)
 # -----------------------------------------------------
 # Fetch the public repositories of one GitHub organization.
 #
 # Arguments:
 #   org - organization login (scalar string), substituted into the endpoint.
 # Returns a tibble with columns org, repo, spdx_id, license_name (one row per
 # repo). Repos without license metadata get NA in the license columns. If the
 # request fails or yields nothing, a zero-row tibble with the same columns is
 # returned; failures additionally raise a warning so they are not mistaken
 # for "this org has no repos".
 get_org_repos <- function(org) {
  # .limit = Inf asks gh() to keep paginating until all results are retrieved.
  res <- tryCatch(
    gh(
      "/orgs/{org}/repos",
      org = org,
      type = "public",
      per_page = 100,
      .limit = Inf
    ),
    error = function(e) {
      warning(
        "Could not fetch repos for org '", org, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # length(NULL) is 0, so this covers both the failure and the empty case.
  if (length(res) == 0) {
    return(tibble(
      org = org,
      repo = character(),
      spdx_id = character(),
      license_name = character()
    ))
  }

  # pluck() returns .default when a field (or the whole license object) is
  # absent or NULL, and avoids `$` partial matching on list names.
  tibble(
    org = org,
    repo = map_chr(res, \(r) pluck(r, "name", .default = NA_character_)),
    spdx_id = map_chr(
      res,
      \(r) pluck(r, "license", "spdx_id", .default = NA_character_)
    ),
    license_name = map_chr(
      res,
      \(r) pluck(r, "license", "name", .default = NA_character_)
    )
  )
 }

 # --------------------------------------
 # 4) Pull everything + classify licenses
 # --------------------------------------
 # Fetch every org and row-bind the per-org tibbles, then classify each repo.
 # map2_chr() walks the two license columns in parallel, one repo at a time.
 repo_df <- orgs |>
  map(get_org_repos) |>
  bind_rows() |>
  mutate(
    license_class = map2_chr(spdx_id, license_name, classify_license)
  )

 # Count repos per org and class. Orgs with zero public repos (or a failed
 # fetch) contribute no rows to repo_df, so they are added back explicitly via
 # complete(org = ...) and show up with n = 0 for every class.
 counts_df <- repo_df |>
  mutate(
    # Only two classes are plotted: anything that is not "open" (including a
    # bare "restrictive" label) goes to "restrictive/unknown". Without this
    # step factor() would turn unlisted labels into NA.
    license_class = if_else(
      license_class == "open", "open", "restrictive/unknown"
    ),
    license_class = factor(
      license_class,
      levels = c("restrictive/unknown", "open")
    )
  ) |>
  count(org, license_class, name = "n") |>
  complete(org = unique(orgs), license_class, fill = list(n = 0L))

 # Total repos per org, largest first.
 totals_df <- counts_df |>
  count(org, wt = n, name = "total_repos", sort = TRUE)

 # ----------------------------------------
 # 5) Plot: stacked bars by license classes
 # ----------------------------------------
 # Reorder orgs by total repos; after coord_flip() the largest org is on top.
 counts_df <- counts_df |>
  left_join(totals_df, by = "org") |>
  mutate(org = fct_reorder(org, total_repos))

 p <- ggplot(counts_df, aes(x = org, y = n, fill = license_class)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "GitHub repositories by organization and license class",
    x = "Organization",
    y = "Number of repositories",
    fill = "License class"
  ) +
  theme_minimal(base_size = 12)

 print(p)

 # Export to clipboard for datawrapper (one row per org, one column per class).
 # Skipped with a message when clipr is missing or no clipboard is reachable
 # (e.g. a non-interactive or headless session) instead of aborting the script.
 if (requireNamespace("clipr", quietly = TRUE) && clipr::clipr_available()) {
  counts_df |>
    select(-total_repos) |>
    pivot_wider(
      names_from = license_class,
      values_from = n,
      values_fill = 0
    ) |>
    clipr::write_clip()
 } else {
  message("Clipboard unavailable or 'clipr' not installed; skipping export.")
 }
	# ------------------------------------------------------------
	# GitHub org license audit + stacked bar chart (R / tidyverse)
	# ------------------------------------------------------------
	# Prereqs:
	# install.packages(c("gh","dplyr","purrr","tidyr","stringr","ggplot2","forcats","clipr"))
	# Auth: Sys.setenv(GITHUB_PAT = "<your fine-grained or classic PAT>") # optional but recommended

	library(gh)
	library(dplyr)
	library(purrr)
	library(tidyr)
	library(stringr)
	library(ggplot2)
	library(forcats)

	# ---------------------------
	# 1) INPUT: org IDs to check
	# ---------------------------
	# GitHub organization logins to audit, in the order they will be fetched.
	orgs <- c(
	  "sbg", "bionanogenomics", "BioSkryb", "BGIResearch", "nebiolabs",
	  "GenapsysInc", "fulcrumgenomics", "ATCC-Bioinformatics", "seqeralabs",
	  "dnanexus", "latchbio", "Elembio", "thermofisher", "Ultimagen",
	  "Singular-Genomics", "Nanostring-Biostats", "MGI-tech-bioinformatics",
	  "becls", "insightsengineering", "10xgenomics", "pacificbiosciences",
	  "nanoporetech", "illumina"
	)

	# Feel free to replace with your own list.

	# ----------------------------------------------------------------
	# 2) Helper: classify a license as "open" or "restrictive/unknown"
	# ----------------------------------------------------------------
	# Arguments:
	#   spdx_id      - SPDX identifier for one repo (scalar string; NA/NULL ok)
	#   license_name - human-readable license name (scalar string; NA/NULL ok)
	# Returns one of exactly two labels, "open" or "restrictive/unknown". These
	# are the only factor levels the counting/plotting code below knows about, so
	# no other label may be returned (it would silently become NA there).
	classify_license <- function(spdx_id, license_name) {
	  bucket_open <- "open"
	  bucket_other <- "restrictive/unknown"

	  # Missing values mean "no information": collapse them to "" before matching.
	  blank_if_missing <- function(x) {
	    if (is.null(x) || is.na(x)) "" else trimws(x)
	  }
	  spdx_id <- toupper(blank_if_missing(spdx_id))
	  lname <- tolower(blank_if_missing(license_name))

	  # Buckets (edit to taste). IDs are compared after toupper(), so every entry
	  # must be written in upper case or it can never match.
	  open_spdx <- c(
	    "MIT",
	    "BSD-2-CLAUSE",
	    "BSD-3-CLAUSE",
	    "APACHE-2.0",
	    "ISC",
	    "UNLICENSE",
	    "GPL-2.0",
	    "GPL-3.0",
	    "AGPL-3.0",
	    "LGPL-2.1",
	    "LGPL-3.0",
	    "MPL-2.0",
	    "EPL-2.0",
	    "ZLIB",
	    "CC-BY-4.0",
	    "CC-BY-SA-4.0",
	    "CC0-1.0"
	  )

	  restrictive_spdx <- c(
	    "CC-BY-NC-4.0",
	    "CC-BY-NC-SA-4.0",
	    "PROPRIETARY",
	    "NON-SPDX LICENSE"
	  )

	  # Some repos use non-SPDX names; simple substring heuristics as a fallback.
	  is_open_name <- function(x) {
	    grepl("(mit|bsd|apache|gpl|lgpl|agpl|mpl|isc|unlicense|zlib)", x, perl = TRUE)
	  }
	  is_restrict_name <- function(x) {
	    grepl("(non-?commercial|proprietary|no\\s*license)", x, perl = TRUE)
	  }

	  # 1. A recognised open SPDX ID is the strongest signal.
	  if (spdx_id %in% open_spdx) {
	    return(bucket_open)
	  }
	  # 2. An explicit restrictive marker beats the loose open-name heuristic, so
	  #    e.g. "Proprietary (BSD-derived)" is not reported as open.
	  if (spdx_id %in% restrictive_spdx || is_restrict_name(lname)) {
	    return(bucket_other)
	  }
	  # 3. Fall back to the name heuristic; anything unrecognised or missing is
	  #    counted as restrictive/unknown.
	  if (is_open_name(lname)) bucket_open else bucket_other
	}

	# -----------------------------------------------------
	# 3) Fetch repos for an org (safe wrapper + pagination)
	# -----------------------------------------------------
	# Fetch the public repositories of one GitHub organization.
	#
	# Arguments:
	#   org - organization login (scalar string), substituted into the endpoint.
	# Returns a tibble with columns org, repo, spdx_id, license_name (one row per
	# repo). Repos without license metadata get NA in the license columns. If the
	# request fails or yields nothing, a zero-row tibble with the same columns is
	# returned; failures additionally raise a warning so they are not mistaken
	# for "this org has no repos".
	get_org_repos <- function(org) {
	  # .limit = Inf asks gh() to keep paginating until all results are retrieved.
	  res <- tryCatch(
	    gh(
	      "/orgs/{org}/repos",
	      org = org,
	      type = "public",
	      per_page = 100,
	      .limit = Inf
	    ),
	    error = function(e) {
	      warning(
	        "Could not fetch repos for org '", org, "': ", conditionMessage(e),
	        call. = FALSE
	      )
	      NULL
	    }
	  )

	  # length(NULL) is 0, so this covers both the failure and the empty case.
	  if (length(res) == 0) {
	    return(tibble(
	      org = org,
	      repo = character(),
	      spdx_id = character(),
	      license_name = character()
	    ))
	  }

	  # pluck() returns .default when a field (or the whole license object) is
	  # absent or NULL, and avoids `$` partial matching on list names.
	  tibble(
	    org = org,
	    repo = map_chr(res, \(r) pluck(r, "name", .default = NA_character_)),
	    spdx_id = map_chr(
	      res,
	      \(r) pluck(r, "license", "spdx_id", .default = NA_character_)
	    ),
	    license_name = map_chr(
	      res,
	      \(r) pluck(r, "license", "name", .default = NA_character_)
	    )
	  )
	}

	# --------------------------------------
	# 4) Pull everything + classify licenses
	# --------------------------------------
	# Fetch every org and row-bind the per-org tibbles, then classify each repo.
	# map2_chr() walks the two license columns in parallel, one repo at a time.
	repo_df <- orgs |>
	  map(get_org_repos) |>
	  bind_rows() |>
	  mutate(
	    license_class = map2_chr(spdx_id, license_name, classify_license)
	  )

	# Count repos per org and class. Orgs with zero public repos (or a failed
	# fetch) contribute no rows to repo_df, so they are added back explicitly via
	# complete(org = ...) and show up with n = 0 for every class.
	counts_df <- repo_df |>
	  mutate(
	    # Only two classes are plotted: anything that is not "open" (including a
	    # bare "restrictive" label) goes to "restrictive/unknown". Without this
	    # step factor() would turn unlisted labels into NA.
	    license_class = if_else(
	      license_class == "open", "open", "restrictive/unknown"
	    ),
	    license_class = factor(
	      license_class,
	      levels = c("restrictive/unknown", "open")
	    )
	  ) |>
	  count(org, license_class, name = "n") |>
	  complete(org = unique(orgs), license_class, fill = list(n = 0L))

	# Total repos per org, largest first.
	totals_df <- counts_df |>
	  count(org, wt = n, name = "total_repos", sort = TRUE)

	# ----------------------------------------
	# 5) Plot: stacked bars by license classes
	# ----------------------------------------
	# Reorder orgs by total repos; after coord_flip() the largest org is on top.
	counts_df <- counts_df |>
	  left_join(totals_df, by = "org") |>
	  mutate(org = fct_reorder(org, total_repos))

	p <- ggplot(counts_df, aes(x = org, y = n, fill = license_class)) +
	  geom_col() +
	  coord_flip() +
	  labs(
	    title = "GitHub repositories by organization and license class",
	    x = "Organization",
	    y = "Number of repositories",
	    fill = "License class"
	  ) +
	  theme_minimal(base_size = 12)

	print(p)

	# Export to clipboard for datawrapper (one row per org, one column per class).
	# Skipped with a message when clipr is missing or no clipboard is reachable
	# (e.g. a non-interactive or headless session) instead of aborting the script.
	if (requireNamespace("clipr", quietly = TRUE) && clipr::clipr_available()) {
	  counts_df |>
	    select(-total_repos) |>
	    pivot_wider(
	      names_from = license_class,
	      values_from = n,
	      values_fill = 0
	    ) |>
	    clipr::write_clip()
	} else {
	  message("Clipboard unavailable or 'clipr' not installed; skipping export.")
	}