Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created August 20, 2025 15:12
Show Gist options
  • Save stephenturner/72b9fe9ca62d7e17cc57b89c1213eb78 to your computer and use it in GitHub Desktop.
Save stephenturner/72b9fe9ca62d7e17cc57b89c1213eb78 to your computer and use it in GitHub Desktop.
Sequencing-related orgs' GitHub repos and licensing
# ------------------------------------------------------------
# GitHub org license audit + stacked bar chart (R / tidyverse)
# ------------------------------------------------------------
# Prereqs:
# install.packages(c("gh","dplyr","purrr","tidyr","stringr","ggplot2","forcats","clipr"))
# Auth: Sys.setenv(GITHUB_PAT = "<your fine-grained or classic PAT>") # optional but recommended
library(gh)
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(ggplot2)
library(forcats)
# ---------------------------
# 1) INPUT: org IDs to check
# ---------------------------
# GitHub organization logins to audit, in the order they will be queried.
orgs <- c(
  "sbg", "bionanogenomics", "BioSkryb", "BGIResearch",
  "nebiolabs", "GenapsysInc", "fulcrumgenomics", "ATCC-Bioinformatics",
  "seqeralabs", "dnanexus", "latchbio", "Elembio",
  "thermofisher", "Ultimagen", "Singular-Genomics", "Nanostring-Biostats",
  "MGI-tech-bioinformatics", "becls", "insightsengineering", "10xgenomics",
  "pacificbiosciences", "nanoporetech", "illumina"
)
# Feel free to replace with your own list.
# --------------------------------------------
# 2) Helper: classify license into two buckets
# --------------------------------------------
# Classify one repository's license as "open" or "restrictive/unknown".
#
# Arguments:
#   spdx_id       Scalar SPDX identifier (may be NA / NULL / "").
#   license_name  Scalar human-readable license name (may be NA / NULL / "").
#
# Returns a length-1 character vector: "open" or "restrictive/unknown".
# Anything that is not positively identified as open (explicitly restrictive,
# missing, or unrecognized) lands in "restrictive/unknown".
#
# Scalar only: call it row-wise (e.g. via purrr::map2_chr / pmap_chr).
classify_license <- function(spdx_id, license_name) {
  # Treat NULL / empty / NA as an empty string so comparisons never see NA.
  blank_if_missing <- function(x) {
    if (is.null(x) || length(x) == 0L || is.na(x[[1L]])) {
      ""
    } else {
      as.character(x[[1L]])
    }
  }

  # Normalize: SPDX ids are compared upper-case, names lower-case.
  spdx <- toupper(trimws(blank_if_missing(spdx_id)))
  lname <- tolower(trimws(blank_if_missing(license_name)))

  # Modern SPDX ids carry "-only" / "-or-later" suffixes (e.g. GPL-3.0-only);
  # strip them so they match the base identifiers below.
  spdx <- sub("-(ONLY|OR-LATER)$", "", spdx)

  # Buckets (edit to taste). Entries MUST be upper-case because `spdx` is.
  open_spdx <- c(
    "MIT",
    "BSD-2-CLAUSE",
    "BSD-3-CLAUSE",
    "APACHE-2.0",
    "ISC",
    "UNLICENSE",
    "GPL-2.0",
    "GPL-3.0",
    "AGPL-3.0",
    "LGPL-2.1",
    "LGPL-3.0",
    "MPL-2.0",
    "EPL-2.0",
    "ZLIB",
    "CC-BY-4.0",
    "CC-BY-SA-4.0",
    "CC0-1.0"
  )
  restrictive_spdx <- c(
    "CC-BY-NC-4.0",
    "CC-BY-NC-SA-4.0",
    "PROPRIETARY",
    "NON-SPDX LICENSE"
  )

  # Some repos use non-SPDX names; simple string heuristics as a fallback.
  # Short tokens are anchored on word boundaries so that "mit" does not match
  # "limited", "isc" does not match "disclaimer", "mpl" does not match
  # "simple", and "unlicense" does not match "unlicensed".
  open_name_pattern <-
    "\\b(mit|isc|unlicense)\\b|\\b(bsd|apache|[al]?gpl|mpl|zlib)"
  restrictive_name_pattern <-
    "(non[-\\s]?commercial|proprietary|no\\s*license)"

  is_open_name <- grepl(open_name_pattern, lname, perl = TRUE)
  is_restrictive_name <- grepl(restrictive_name_pattern, lname, perl = TRUE)

  # An exact SPDX match is the most reliable signal, so it is checked first.
  if (spdx %in% open_spdx) {
    return("open")
  }
  # Explicit restrictive markers win over the looser open-name heuristic
  # (e.g. a "BSD non-commercial" style name).
  if (spdx %in% restrictive_spdx || is_restrictive_name) {
    return("restrictive/unknown")
  }
  if (is_open_name) {
    return("open")
  }
  # Missing, unknown, or ambiguous licenses.
  "restrictive/unknown"
}
# -----------------------------------------------------
# 3) Fetch repos for an org (safe wrapper + pagination)
# -----------------------------------------------------
# Fetch the public repositories of one GitHub organization.
#
# Arguments:
#   org  Organization login (length-1 character).
#
# Returns a tibble with character columns `org`, `repo`, `spdx_id`,
# `license_name`; one row per repository. Missing license fields are NA.
# If the API call fails, a warning is raised and a zero-row tibble with the
# same columns is returned, so a failure in one org does not abort the run.
get_org_repos <- function(org) {
  no_repos <- tibble(
    org = character(),
    repo = character(),
    spdx_id = character(),
    license_name = character()
  )

  # .limit = Inf asks gh() to follow pagination until all repos are retrieved.
  res <- tryCatch(
    gh(
      "/orgs/{org}/repos",
      org = org,
      .limit = Inf,
      per_page = 100,
      type = "public"
    ),
    error = function(e) {
      # Surface the failure: otherwise a typo, rate limit, or auth problem is
      # indistinguishable from an org that simply has no public repos.
      warning(
        "Could not fetch repos for org '", org, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(res) || length(res) == 0) {
    return(no_repos)
  }

  # `license` is NULL for repos without a detected license; pluck() returns
  # the default instead of failing on the missing path.
  tibble(
    org = org,
    repo = map_chr(res, \(r) pluck(r, "name", .default = NA_character_)),
    spdx_id = map_chr(
      res,
      \(r) pluck(r, "license", "spdx_id", .default = NA_character_)
    ),
    license_name = map_chr(
      res,
      \(r) pluck(r, "license", "name", .default = NA_character_)
    )
  )
}
# --------------------------------------
# 4) Pull everything + classify licenses
# --------------------------------------
# Fetch every org's repos and stack them into one long table
# (one row per repository).
repo_df <- bind_rows(map(orgs, get_org_repos))

# classify_license() is scalar, so apply it row by row.
repo_df <- repo_df |>
  mutate(
    license_class = map2_chr(spdx_id, license_name, classify_license)
  )

# Count repos per org and license class.
# - Any label other than "open" is folded into "restrictive/unknown" so that
#   no repo silently becomes an NA factor level.
# - complete(org = ...) adds rows for orgs that are absent from repo_df
#   (zero public repos or a failed fetch) so they still show up with 0s;
#   count(.drop = FALSE) alone cannot do that because `org` is not a factor.
license_levels <- c("restrictive/unknown", "open")
counts_df <- repo_df |>
  mutate(
    license_class = factor(
      if_else(license_class %in% "open", "open", "restrictive/unknown"),
      levels = license_levels
    )
  ) |>
  count(org, license_class, name = "n", .drop = FALSE) |>
  complete(org = unique(orgs), license_class, fill = list(n = 0L))

# Total repos per org, largest first (used to order the bars).
totals_df <- counts_df |>
  group_by(org) |>
  summarise(total_repos = sum(n), .groups = "drop") |>
  arrange(desc(total_repos))
# ----------------------------------------
# 5) Plot: stacked bars by license classes
# ----------------------------------------
# Attach each org's total and turn `org` into a factor ordered by that total,
# so the flipped bar chart lists the largest orgs at the top.
counts_df <- left_join(counts_df, totals_df, by = "org")
counts_df <- mutate(counts_df, org = fct_reorder(org, total_repos))

# Horizontal stacked bars: one bar per org, segments by license class.
p <- ggplot(
  data = counts_df,
  mapping = aes(x = org, y = n, fill = license_class)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "GitHub repositories by organization and license class",
    x = "Organization",
    y = "Number of repositories",
    fill = "License class"
  ) +
  theme_minimal(base_size = 12)

print(p)
# Copy a wide table (one row per org, one column per license class) to the
# system clipboard, ready to paste into Datawrapper.
clipr::write_clip(
  pivot_wider(
    select(counts_df, -total_repos),
    names_from = license_class,
    values_from = n,
    values_fill = 0
  )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment