Prompt:
You're a senior Ruby engineer. Write a command-line Ruby script that uses bundler/inline to install and require nokogiri, terminal-table, and ruby-progressbar. The script should:
- Scrape the list of GitHub repository dependents for a given repo (e.g., mongodb/node-mongodb-native), paginating through all pages.
- Accept CLI options:
  - --repo: GitHub repo in owner/name format (required)
  - --limit: Max number of results (default: unlimited)
  - --min-stars: Minimum number of stars to include
  - --format: Output format (table or json)
  - --delay: Delay between requests (in ms)
  - --backoff: Retry delay for HTTP 429 (default 10s)
  - --verbose: Print debug messages
- Use the GitHub web UI, not the API, and extract repo name, URL, and star count per dependent.
- Track the last 5 pages scraped in a circular buffer. If a duplicate URL is detected, abort to prevent infinite loops.
- Show a progress bar unless --verbose is enabled.
- Gracefully handle CTRL+C to print results collected so far.
- Print a summary on completion (or interruption) with:
  - Total runtime (HH:MM:SS)
  - Pages scraped
  - Matching results
  - Skipped due to --min-stars
- Ensure all output is clean and readable. Sort results by stars descending and deduplicate by repo name.
- Use a refined XPath to match the Next pagination link with text "Next" and class "BtnGroup-item".
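For reference, an invocation that exercises most of these options might look like the following (the repo, thresholds, and delay are illustrative; the script below also expects a GITHUB_TOKEN environment variable):

GITHUB_TOKEN=<token> ruby github_dependents.rb --repo mongodb/node-mongodb-native --min-stars 25 --limit 100 --delay 500 --format table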
GitHub Dependents Scraper
#!/usr/bin/env ruby
# github_dependents.rb
#
# This script scrapes the "Dependents" list from a GitHub repository's network page,
# following pagination and collecting dependent repositories.
# It supports:
# - Output as table or JSON
# - Filtering by minimum stars (--min-stars)
# - Rate limit handling (429 with configurable backoff)
# - Delay between page loads (--delay)
# - Progress bar (non-verbose mode)
# - Graceful CTRL+C handling with summary
#
# Safety feature:
# - Tracks the last 5 pages visited. If a duplicate URL appears, the script aborts
#   to prevent infinite loops caused by incorrect "Next" page detection.

require 'bundler'
require 'bundler/inline'

Bundler.ui.silence do
  gemfile(true) do
    source 'https://rubygems.org'
    gem 'nokogiri'
    gem 'terminal-table'
    gem 'ruby-progressbar'
  end
end

require 'optparse'
require 'nokogiri'
require 'json'
require 'terminal-table'
require 'uri'
require 'net/http'
require 'openssl'
require 'ruby-progressbar'

options = {
  format: 'table',
  limit: nil,
  min_stars: 0,
  delay: 0,
  backoff: 10,
  verbose: false
}

OptionParser.new do |opts|
  opts.banner = "Usage: github_dependents.rb [options]"
  opts.on("-r", "--repo REPO", "Repository (owner/name)") { |v| options[:repo] = v }
  opts.on("-f", "--format FORMAT", "Output format (json or table)") { |v| options[:format] = v }
  opts.on("-l", "--limit N", Integer, "Limit results (default: unlimited)") { |v| options[:limit] = v }
  opts.on("--min-stars N", Integer, "Minimum number of stars (default: 0)") { |v| options[:min_stars] = v }
  opts.on("--delay MS", Float, "Delay between pages in milliseconds (default: 0)") { |v| options[:delay] = v }
  opts.on("--backoff SECONDS", Integer, "Initial backoff after 429 (default: 10)") { |v| options[:backoff] = v }
  opts.on("-v", "--verbose", "Enable verbose output") { options[:verbose] = true }
  opts.on("-h", "--help", "Print this help message") { puts opts; exit }
end.parse!

abort("Error: --repo is required (e.g., --repo rails/rails)") unless options[:repo]

GITHUB_TOKEN = ENV['GITHUB_TOKEN']
abort("Error: GITHUB_TOKEN environment variable is required") unless GITHUB_TOKEN

USER_AGENT = "GitHubDependentScraper/1.0"
OWNER, REPO = options[:repo].split("/")

$collected_dependents = []
$skipped_dependents = 0
$pages_scraped = 0
$start_time = Time.now
$url_history = [] # circular buffer of the last 5 page URLs

def log(msg, verbose)
  puts "[DEBUG] #{msg}" if verbose
end

def dedup_and_sort(dependents)
  dependents.uniq { |d| d[:name] }.sort_by { |d| -d[:stars] }
end

def print_summary(dependents, min_stars)
  elapsed = Time.now - $start_time
  hh, mm, ss = Time.at(elapsed).utc.strftime("%H:%M:%S").split(":")
  puts "\n[INFO] Total runtime: #{hh}:#{mm}:#{ss}"
  puts "[INFO] Pages scraped: #{$pages_scraped}"
  puts "[INFO] Matching results: #{dependents.size}"
  puts "[INFO] Skipped due to --min-stars=#{min_stars}: #{$skipped_dependents}"
end
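
# CTRL+C handler: dedupe and sort whatever has been collected so far, print it
# in the requested format, then show the summary and exit cleanly.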
Signal.trap("INT") do
  puts "\n[INTERRUPTED] Script was cancelled via CTRL+C."
  puts "[INFO] Collected #{$collected_dependents.size} dependents."
  final = dedup_and_sort($collected_dependents)
  if final.empty?
    puts "[INFO] No data to display."
    print_summary(final, options[:min_stars])
    exit(0)
  end
  if options[:format] == "json"
    puts JSON.pretty_generate(final)
  else
    rows = final.map { |d| [d[:name], d[:url], d[:stars]] }
    table = Terminal::Table.new(
      title: "Dependents collected before interruption (min #{options[:min_stars]}★)",
      headings: ['Name', 'URL', 'Stars'],
      rows: rows
    )
    puts table
  end
  print_summary(final, options[:min_stars])
  exit(0)
end
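
# Fetch a page over HTTPS, retrying on HTTP 429 with the configured backoff;
# any other non-success response raises.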
def fetch_html(url, token, verbose, base_delay_ms, backoff_sec)
  uri = URI(url)
  delay_sec = backoff_sec
  min_delay_sec = base_delay_ms ? (base_delay_ms / 1000.0) : 0
  loop do
    log("Fetching: #{url}", verbose)
    request = Net::HTTP::Get.new(uri)
    request['User-Agent'] = USER_AGENT
    request['Authorization'] = "token #{token}"
    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
      http.request(request)
    end
    if response.code == "429"
      puts "[RATE LIMIT] GitHub responded with 429 Too Many Requests"
      puts "[RATE LIMIT] Backing off for #{delay_sec} seconds..."
      sleep(delay_sec)
      # Each consecutive 429 waits one second less, down to the base page delay.
      delay_sec = [delay_sec - 1, min_delay_sec].max
      next
    end
    unless response.is_a?(Net::HTTPSuccess)
      raise "Failed to fetch page: #{response.code} #{response.message}"
    end
    return Nokogiri::HTML(response.body)
  end
end
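
# Read the total dependent-repository count from the dependents tab link;
# used to size the progress bar on the first page.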
def extract_total_dependents(doc, verbose)
  tab = doc.at_css("a[href*='dependent_type=REPOSITORY']")
  return nil unless tab
  text = tab.text.strip
  count = text[/\d[\d,]*/].to_s.delete(',').to_i
  log("Total dependents reported by GitHub: #{count}", verbose)
  count
end
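
# Locate the "Next" pagination link (BtnGroup-item) and guard against infinite
# loops by tracking the last five page URLs in a circular buffer.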
def find_next_page(doc, current_url, verbose)
  next_link = doc.at_xpath('//a[contains(@class, "BtnGroup-item") and normalize-space(text())="Next"]')
  if next_link && next_link['href']
    next_url = URI.join(current_url, next_link['href']).to_s
    # Track URL history (circular buffer of the last 5)
    $url_history << next_url
    $url_history.shift if $url_history.size > 5
    if $url_history.count(next_url) > 1
      puts "[ERROR] Duplicate page detected in history. Possible infinite loop."
      puts "[DEBUG] Recent URLs:\n" + $url_history.join("\n")
      exit(1)
    end
    log("Next page URL: #{next_url}", verbose)
    return next_url
  end
  log("No next page found", verbose)
  nil
end
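
# Main scraping loop: walk each dependents page, apply the --min-stars filter,
# honor --limit, and pause --delay milliseconds between pages.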
def scrape_dependents(repo, limit, min_stars, delay_ms, backoff_sec, token, verbose)
  results = []
  current_url = "https://github.com/#{repo}/network/dependents?dependent_type=REPOSITORY"
  total_dependents = nil
  scraped_count = 0
  progress = nil
  while current_url
    $pages_scraped += 1
    doc = fetch_html(current_url, token, verbose, delay_ms, backoff_sec)
    if total_dependents.nil?
      total_dependents = extract_total_dependents(doc, verbose)
      if !verbose && total_dependents
        progress = ProgressBar.create(
          title: "Scraping",
          total: total_dependents,
          format: "%t: |%B| %p%% (%c/%C)",
          progress_mark: '#',
          remainder_mark: '·'
        )
      end
    end
    rows = doc.css("div.Box-row")
    log("Page #{$pages_scraped}: #{rows.size} dependents found", verbose)
    rows.each do |row|
      repo_link = row.at_css("a[data-hovercard-type='repository']")
      next unless repo_link
      scraped_count += 1
      progress&.increment
      full_name = repo_link.text.strip
      href = URI.join("https://github.com", repo_link['href']).to_s
      stars_text = row.css("span").find { |s| s.text.strip.match(/^\d[\d,]*$/) }&.text
      stars = stars_text ? stars_text.strip.delete(',').to_i : 0
      if stars < min_stars
        $skipped_dependents += 1
        log("Skipping #{full_name} (#{stars}★ < #{min_stars}★)", verbose)
      else
        log("Adding #{full_name} (#{stars}★)", verbose)
        entry = { name: full_name, url: href, stars: stars }
        results << entry
        $collected_dependents << entry
      end
      return dedup_and_sort(results) if limit && results.size >= limit
    end
    current_url = find_next_page(doc, current_url, verbose)
    sleep(delay_ms / 1000.0) if current_url && delay_ms > 0
  end
  dedup_and_sort(results)
end
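
# Entry point: run the scrape, render the results as JSON or a table, then print the run summary.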
log("Options: #{options}", options[:verbose]) | |
dependents = scrape_dependents( | |
options[:repo], | |
options[:limit], | |
options[:min_stars], | |
options[:delay], | |
options[:backoff], | |
GITHUB_TOKEN, | |
options[:verbose] | |
) | |
if options[:format] == "json" | |
puts JSON.pretty_generate(dependents) | |
else | |
rows = dependents.map { |d| [d[:name], d[:url], d[:stars]] } | |
table = Terminal::Table.new( | |
title: "Dependents of #{options[:repo]} (min #{options[:min_stars]}★)", | |
headings: ['Name', 'URL', 'Stars'], | |
rows: rows | |
) | |
puts table | |
end | |
print_summary(dependents, options[:min_stars]) |