Created
August 20, 2025 15:12
-
-
Save stephenturner/72b9fe9ca62d7e17cc57b89c1213eb78 to your computer and use it in GitHub Desktop.
Sequencing-related orgs' GitHub repos and licensing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------ | |
# GitHub org license audit + stacked bar chart (R / tidyverse) | |
# ------------------------------------------------------------ | |
# Prereqs: | |
# install.packages(c("gh","dplyr","purrr","tidyr","stringr","ggplot2","forcats","clipr"))
# Auth: Sys.setenv(GITHUB_PAT = "<your fine-grained or classic PAT>") # optional but recommended | |
library(gh) | |
library(dplyr) | |
library(purrr) | |
library(tidyr) | |
library(stringr) | |
library(ggplot2) | |
library(forcats) | |
# --------------------------- | |
# 1) INPUT: org IDs to check | |
# --------------------------- | |
# GitHub organization logins to audit, in the order they will be queried.
orgs <- c(
  "sbg", "bionanogenomics", "BioSkryb", "BGIResearch",
  "nebiolabs", "GenapsysInc", "fulcrumgenomics", "ATCC-Bioinformatics",
  "seqeralabs", "dnanexus", "latchbio", "Elembio",
  "thermofisher", "Ultimagen", "Singular-Genomics", "Nanostring-Biostats",
  "MGI-tech-bioinformatics", "becls", "insightsengineering", "10xgenomics",
  "pacificbiosciences", "nanoporetech", "illumina"
)
# Feel free to replace with your own list. | |
# ---------------------------------------------- | |
# 2) Helper: classify a license as "open" or "restrictive/unknown"
# ----------------------------------------------
#' Classify one repository's license.
#'
#' @param spdx_id SPDX identifier for the repo's license (character scalar;
#'   `NA` or `NULL` when there is none).
#' @param license_name Human-readable license name (character scalar; `NA` or
#'   `NULL` when there is none).
#' @return Either `"open"` or `"restrictive/unknown"`.
classify_license <- function(spdx_id, license_name) {
  # Collapse NULL / NA / empty input to "" and fold case so that the set
  # lookups and regexes below are case-insensitive.
  normalize <- function(x, fold) {
    if (is.null(x) || length(x) == 0L || is.na(x[[1L]])) {
      return("")
    }
    fold(trimws(as.character(x[[1L]])))
  }
  spdx <- normalize(spdx_id, toupper)
  lname <- normalize(license_name, tolower)

  # Buckets (edit to taste). Entries MUST be upper case: `spdx` is upper-cased
  # above, so a mixed-case entry such as "Zlib" could never match.
  open_spdx <- c(
    "MIT",
    "BSD-2-CLAUSE",
    "BSD-3-CLAUSE",
    "APACHE-2.0",
    "ISC",
    "UNLICENSE",
    "GPL-2.0",
    "GPL-3.0",
    "AGPL-3.0",
    "LGPL-2.1",
    "LGPL-3.0",
    "MPL-2.0",
    "EPL-2.0",
    "ZLIB",
    "CC-BY-4.0",
    "CC-BY-SA-4.0",
    "CC0-1.0"
  )
  restrictive_spdx <- c(
    "CC-BY-NC-4.0",
    "CC-BY-NC-SA-4.0",
    "PROPRIETARY",
    "NON-SPDX LICENSE"
  )

  # Some repos use non-SPDX names; simple string heuristics as a fallback.
  # Short tokens are anchored on word boundaries so that "mit" does not match
  # "limited", "isc" does not match "disclaimer", and "mpl" does not match
  # "implementation". "gpl" also covers "lgpl" and "agpl".
  open_name_rx <- "\\b(mit|isc)\\b|\\bmpl|bsd|apache|gpl|unlicense|zlib"
  restrictive_name_rx <- "non[- ]?commercial|proprietary|\\bno\\s*license"

  # Precedence: an explicit SPDX id wins over the name; among the name
  # heuristics a restrictive hit wins over an open hit.
  is_open <- if (spdx %in% open_spdx) {
    TRUE
  } else if (
    spdx %in% restrictive_spdx ||
      grepl(restrictive_name_rx, lname, perl = TRUE)
  ) {
    FALSE
  } else {
    grepl(open_name_rx, lname, perl = TRUE)
  }

  # Restrictive, missing, and unrecognized licenses share a single bucket.
  if (is_open) "open" else "restrictive/unknown"
}
# ----------------------------------------------------- | |
# 3) Fetch repos for an org (safe wrapper + pagination) | |
# ----------------------------------------------------- | |
#' Fetch the public repositories of one GitHub organization.
#'
#' @param org Organization login passed to the `/orgs/{org}/repos` endpoint.
#' @return A tibble with columns `org`, `repo`, `spdx_id` and `license_name`,
#'   one row per repository. Zero rows when the org has no public repos or
#'   the request failed (a warning is raised in the failure case).
get_org_repos <- function(org) {
  # `.limit = Inf` asks gh() to keep paginating until all results are fetched.
  res <- tryCatch(
    gh(
      "/orgs/{org}/repos",
      org = org,
      .limit = Inf,
      per_page = 100,
      type = "public"
    ),
    error = function(e) {
      # Surface the failure (bad org name, rate limit, auth, ...) instead of
      # silently reporting the org as having no repositories.
      warning(
        "Could not fetch repos for org '", org, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(res) || length(res) == 0) {
    return(tibble(
      org = org,
      repo = character(),
      spdx_id = character(),
      license_name = character()
    ))
  }

  # `license` is NULL for repos without a detected license, so every field
  # falls back to NA to keep map_chr() happy.
  tibble(
    org = org,
    repo = map_chr(res, \(repo) repo$name %||% NA_character_),
    spdx_id = map_chr(res, \(repo) repo$license$spdx_id %||% NA_character_),
    license_name = map_chr(res, \(repo) repo$license$name %||% NA_character_)
  )
}
# -------------------------------------- | |
# 4) Pull everything + classify licenses | |
# -------------------------------------- | |
# Fetch every org's repos and label each one with a license class.
repo_df <- bind_rows(map(orgs, get_org_repos))
repo_df <- repo_df |>
  mutate(
    license_class = map2_chr(spdx_id, license_name, classify_license)
  )

# Count repos per org and license class. `org` is temporarily turned into a
# factor whose levels are the full input list: with `.drop = FALSE`, count()
# then emits explicit zero rows for orgs that have no public repos or whose
# fetch failed. With a plain character column those orgs would be absent.
counts_df <- repo_df |>
  mutate(
    org = factor(org, levels = unique(orgs)),
    license_class = factor(
      license_class,
      levels = c("restrictive/unknown", "open")
    )
  ) |>
  count(org, license_class, name = "n", .drop = FALSE) |>
  # Back to character so downstream joins / reordering see the original type.
  mutate(org = as.character(org))

# Total repos per org, largest first.
totals_df <- counts_df |>
  group_by(org) |>
  summarise(total_repos = sum(n), .groups = "drop") |>
  arrange(desc(total_repos))
# ---------------------------------------- | |
# 5) Plot: stacked bars by license classes | |
# ---------------------------------------- | |
# Order orgs by total repos (descending) for readability | |
# Attach each org's total and turn `org` into a factor ordered by that total,
# so the bars are sorted by size once the axes are flipped.
counts_df <- counts_df |>
  left_join(totals_df, by = "org") |>
  mutate(org = fct_reorder(org, total_repos))

# Horizontal stacked bars: one bar per org, one segment per license class.
p <- ggplot(
  data = counts_df,
  mapping = aes(x = org, y = n, fill = license_class)
) +
  geom_col() +
  coord_flip() +
  theme_minimal(base_size = 12) +
  labs(
    title = "GitHub repositories by organization and license class",
    x = "Organization",
    y = "Number of repositories",
    fill = "License class"
  )

print(p)
# Export to clipboard for datawrapper | |
# Reshape to one row per org with one column per license class, then copy
# the table to the system clipboard.
counts_df |>
  select(!total_repos) |>
  pivot_wider(
    values_from = n,
    names_from = license_class,
    values_fill = 0
  ) |>
  clipr::write_clip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment