Created
April 21, 2016 18:41
-
-
Save jimhester/1379efd654ff36b0a412cc919a830a0a to your computer and use it in GitHub Desktop.
Programmatically determine forks of a repo with at least n commits. Does not try to disambiguate repeated force pushes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Q on @GitHub: does anyone know how to easily (i.e. automatically) list all forks
# of a repository that have more than N commits in the fork?
library(gh) # devtools::install_github("gaborcsardi/gh") | |
#' List forks whose recent pushes add up to more than `n` commits
#'
#' Reads the network event feed of `owner/repo`, keeps the `PushEvent`s that
#' were made to any repository other than `owner/repo` itself, and sums the
#' `distinct_size` of those pushes per repository. Repeated force pushes are
#' not disambiguated, so the same commits may be counted more than once.
#'
#' NOTE(review): the event feed appears to expose only recent activity, so
#' older forks may be missing from the result -- confirm against the GitHub
#' Events API documentation.
#'
#' @param owner Login of the upstream repository owner.
#' @param repo Name of the upstream repository.
#' @param n Threshold; only forks with strictly more than `n` pushed commits
#'   are returned.
#' @return A data frame with columns `repo` (fork full name) and `n` (summed
#'   `distinct_size`); zero rows when no fork push events are found.
forks_n_commits <- function(owner, repo, n = 1) {
  events <- gh("GET /networks/:owner/:repo/events",
               owner = owner, repo = repo, .limit = Inf)

  upstream <- paste0(owner, "/", repo)
  fork_pushes <- Filter(
    function(x) x$type == "PushEvent" && x$repo$name != upstream,
    events
  )

  # aggregate() refuses zero-row input, so answer the empty case directly.
  if (length(fork_pushes) == 0L) {
    return(data.frame(repo = character(0), n = integer(0),
                      stringsAsFactors = FALSE))
  }

  sizes <- vapply(fork_pushes, function(x) {
    size <- x$payload$distinct_size
    # A push event without distinct_size contributes nothing instead of
    # breaking vapply()'s integer(1) contract.
    if (is.null(size)) 0L else as.integer(size)
  }, integer(1))
  repos <- vapply(fork_pushes, function(x) x$repo$name, character(1))

  res <- setNames(aggregate(sizes, by = list(repos), FUN = sum),
                  nm = c("repo", "n"))
  res[res$n > n, ]
}
I got the following to work as I wanted:
library(gh)
# MY_GITHUB_PAC is a personal access token you generate on GitHub https://github.com/settings/tokens
# Supplying this parameter just means your daily rate limit will grow from 60 to 5000
# You can also drop that argument altogether
#' Rank the forks of a repository by commits not authored by its owner
#'
#' Lists every fork of `owner/repo`, fetches up to `max_commits` commits from
#' each fork, and counts the commits whose GitHub author login differs from
#' `owner`. Commits with no linked GitHub author are not counted. Progress is
#' printed with `cat()` while the forks are processed.
#'
#' @param owner Login of the upstream repository owner.
#' @param repo Name of the upstream repository.
#' @param token Value passed to `gh()` as `.token`. Defaults to a variable
#'   named `MY_GITHUB_PAC` when one is defined, otherwise `NULL`.
#' @param max_commits Maximum number of commits fetched per fork.
#' @return A data frame with columns `fork`, `num_new_commits` and `url`,
#'   sorted by decreasing `num_new_commits`; zero rows when there are no forks.
forks_new_commits <- function(owner = "daattali", repo = "beautiful-jekyll",
                              token = get0("MY_GITHUB_PAC", ifnotfound = NULL),
                              max_commits = 50) {
  # Get all forks of the repo
  all_forks <- gh("GET /repos/:owner/:repo/forks",
                  owner = owner,
                  repo = repo,
                  .limit = Inf,
                  .token = token)
  cat("Looks like ", owner, "/", repo, " has ", length(all_forks), " forks\n",
      sep = "")

  # Nothing to rank: return the same columns the non-empty path produces.
  if (length(all_forks) == 0L) {
    return(data.frame(fork = character(0),
                      num_new_commits = integer(0),
                      url = character(0),
                      stringsAsFactors = FALSE))
  }

  fork_rows <- lapply(all_forks, function(fork) {
    fork_owner <- fork$owner$login
    fork_name <- fork$name
    fork_full_name <- paste0(fork_owner, "/", fork_name)
    cat(fork_full_name)

    # Get the most recent commits on the fork (at most max_commits)
    fork_commits <- gh("GET /repos/:owner/:repo/commits",
                       owner = fork_owner,
                       repo = fork_name,
                       .limit = max_commits,
                       .token = token)

    # A commit without a linked GitHub account has no author login; mark it
    # NA so it is left out of the count rather than shortening the vector.
    logins <- vapply(fork_commits, function(x) {
      login <- x$author$login
      if (is.null(login)) NA_character_ else login
    }, character(1))

    # Get the number of commits not by the original owner
    num_new_commits <- sum(logins != owner, na.rm = TRUE)
    cat(" ", num_new_commits, " commits\n", sep = "")

    data.frame(fork = fork_full_name,
               num_new_commits = num_new_commits,
               url = fork$html_url,
               stringsAsFactors = FALSE)
  })

  forks_info <- do.call(rbind, fork_rows)
  # Ordering on the negated count sorts descending and keeps ties in their
  # original fork order.
  forks_info <- forks_info[order(-forks_info$num_new_commits), , drop = FALSE]
  rownames(forks_info) <- NULL
  forks_info
}
UPDATE: This can be simplified a ton by using the "compare two commits" API, which you can use to directly get the number of commits the fork is ahead of or behind the source. Example: https://api.github.com/repos/daattali/beautiful-jekyll/compare/daattali:master...alexwhan:master — look at the `ahead_by` and `behind_by` return values.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this @jimhester
I just tried this and it seems to only return a very small number of very recent results. For example
forks_n_commits("daattali", "beautiful-jekyll", 1)
Has 25 results which are all recent, but I know of many many older forks that have multiple commits. Am I using this wrong? If you're not sure, no worries, I don't actually expect you to solve this for me if you're busy :)