@alexbevi
Created April 25, 2025 20:02
GitHub Dependents Scraper
#!/usr/bin/env ruby
# github_dependents.rb
#
# This script scrapes the "Dependents" list from a GitHub repository's network page,
# following pagination and collecting dependent repositories.
# It supports:
# - Output as table or JSON
# - Filtering by minimum stars (--min-stars)
# - Rate limit handling (429 with configurable backoff)
# - Delay between page loads (--delay)
# - Progress bar (non-verbose mode)
# - Graceful CTRL+C handling with summary
#
# Safety feature:
# - Tracks the last 5 pages visited. If a duplicate URL appears, the script aborts
# to prevent infinite loops caused by incorrect "Next" page detection.
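#
# Example usage (flag values are illustrative; GITHUB_TOKEN must be set in the environment):
#   ruby github_dependents.rb --repo rails/rails --min-stars 50
#   ruby github_dependents.rb --repo mongodb/node-mongodb-native --format json --verbose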
require 'bundler'
require 'bundler/inline'

Bundler.ui.silence do
  gemfile(true) do
    source 'https://rubygems.org'
    gem 'nokogiri'
    gem 'terminal-table'
    gem 'ruby-progressbar'
  end
end
require 'optparse'
require 'nokogiri'
require 'json'
require 'terminal-table'
require 'uri'
require 'net/http'
require 'openssl'
require 'ruby-progressbar'
options = {
  format: 'table',
  limit: nil,
  min_stars: 0,
  delay: 0,
  backoff: 10,
  verbose: false
}

OptionParser.new do |opts|
  opts.banner = "Usage: github_dependents.rb [options]"

  opts.on("-r", "--repo REPO", "Repository (owner/name)") { |v| options[:repo] = v }
  opts.on("-f", "--format FORMAT", "Output format (json or table)") { |v| options[:format] = v }
  opts.on("-l", "--limit N", Integer, "Limit results (default: unlimited)") { |v| options[:limit] = v }
  opts.on("--min-stars N", Integer, "Minimum number of stars (default: 0)") { |v| options[:min_stars] = v }
  opts.on("--delay MS", Float, "Delay between pages in milliseconds (default: 0)") { |v| options[:delay] = v }
  opts.on("--backoff SECONDS", Integer, "Initial backoff after 429 (default: 10)") { |v| options[:backoff] = v }
  opts.on("-v", "--verbose", "Enable verbose output") { options[:verbose] = true }
  opts.on("-h", "--help", "Print this help message") { puts opts; exit }
end.parse!
abort("Error: --repo is required (e.g., --repo rails/rails)") unless options[:repo]
GITHUB_TOKEN = ENV['GITHUB_TOKEN']
abort("Error: GITHUB_TOKEN environment variable is required") unless GITHUB_TOKEN
USER_AGENT = "GitHubDependentScraper/1.0"
OWNER, REPO = options[:repo].split("/")
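# Global state shared between the scraper and the SIGINT handler below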
$collected_dependents = []
$skipped_dependents = 0
$pages_scraped = 0
$start_time = Time.now
$url_history = [] # circular buffer of the last 5 page URLs visited
def log(msg, verbose)
  puts "[DEBUG] #{msg}" if verbose
end

def dedup_and_sort(dependents)
  dependents.uniq { |d| d[:name] }.sort_by { |d| -d[:stars] }
end

def print_summary(dependents, min_stars)
  elapsed = Time.now - $start_time
  hh, mm, ss = Time.at(elapsed).utc.strftime("%H:%M:%S").split(":")
  puts "\n[INFO] Total runtime: #{hh}:#{mm}:#{ss}"
  puts "[INFO] Pages scraped: #{$pages_scraped}"
  puts "[INFO] Matching results: #{dependents.size}"
  puts "[INFO] Skipped due to --min-stars=#{min_stars}: #{$skipped_dependents}"
end
Signal.trap("INT") do
  puts "\n[INTERRUPTED] Script was cancelled via CTRL+C."
  puts "[INFO] Collected #{$collected_dependents.size} dependents."
  final = dedup_and_sort($collected_dependents)

  if final.empty?
    puts "[INFO] No data to display."
    print_summary(final, options[:min_stars])
    exit(0)
  end

  if options[:format] == "json"
    puts JSON.pretty_generate(final)
  else
    rows = final.map { |d| [d[:name], d[:url], d[:stars]] }
    table = Terminal::Table.new(
      title: "Dependents collected before interruption (min #{options[:min_stars]}★)",
      headings: ['Name', 'URL', 'Stars'],
      rows: rows
    )
    puts table
  end

  print_summary(final, options[:min_stars])
  exit(0)
end
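# Fetch and parse one HTML page, retrying with a backoff when GitHub responds with 429.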
def fetch_html(url, token, verbose, base_delay_ms, backoff_sec)
  uri = URI(url)
  delay_sec = backoff_sec
  min_delay_sec = base_delay_ms ? (base_delay_ms / 1000.0) : 0

  loop do
    log("Fetching: #{url}", verbose)
    request = Net::HTTP::Get.new(uri)
    request['User-Agent'] = USER_AGENT
    request['Authorization'] = "token #{token}"
    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
      http.request(request)
    end

    if response.code == "429"
      puts "[RATE LIMIT] GitHub responded with 429 Too Many Requests"
      puts "[RATE LIMIT] Backing off for #{delay_sec} seconds..."
      sleep(delay_sec)
      # Grow the backoff on repeated 429s, never dropping below the configured page delay
      delay_sec = [delay_sec * 2, min_delay_sec].max
      next
    end

    unless response.is_a?(Net::HTTPSuccess)
      raise "Failed to fetch page: #{response.code} #{response.message}"
    end

    return Nokogiri::HTML(response.body)
  end
end
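# Read the total number of dependent repositories reported by the page (nil if not found).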
def extract_total_dependents(doc, verbose)
  tab = doc.at_css("a[href*='dependent_type=REPOSITORY']")
  return nil unless tab
  text = tab.text.strip
  count = text[/\d[\d,]*/].to_s.delete(',').to_i
  log("Total dependents reported by GitHub: #{count}", verbose)
  count
end
def find_next_page(doc, current_url, verbose)
  next_link = doc.at_xpath('//a[contains(@class, "BtnGroup-item") and normalize-space(text())="Next"]')
  if next_link && next_link['href']
    next_url = URI.join(current_url, next_link['href']).to_s

    # Track URL history (circular buffer of the last 5 pages)
    $url_history << next_url
    $url_history.shift if $url_history.size > 5
    if $url_history.count(next_url) > 1
      puts "[ERROR] Duplicate page detected in history. Possible infinite loop."
      puts "[DEBUG] Recent URLs:\n" + $url_history.join("\n")
      exit(1)
    end

    log("Next page URL: #{next_url}", verbose)
    return next_url
  end

  log("No next page found", verbose)
  nil
end
def scrape_dependents(repo, limit, min_stars, delay_ms, backoff_sec, token, verbose)
  results = []
  current_url = "https://github.com/#{repo}/network/dependents?dependent_type=REPOSITORY"
  total_dependents = nil
  scraped_count = 0
  progress = nil

  while current_url
    $pages_scraped += 1
    doc = fetch_html(current_url, token, verbose, delay_ms, backoff_sec)

    if total_dependents.nil?
      total_dependents = extract_total_dependents(doc, verbose)
      if !verbose && total_dependents
        progress = ProgressBar.create(
          title: "Scraping",
          total: total_dependents,
          format: "%t: |%B| %p%% (%c/%C)",
          progress_mark: '#',
          remainder_mark: '·'
        )
      end
    end

    rows = doc.css("div.Box-row")
    log("Page #{$pages_scraped}: #{rows.size} dependents found", verbose)

    rows.each do |row|
      repo_link = row.at_css("a[data-hovercard-type='repository']")
      next unless repo_link
      scraped_count += 1
      progress&.increment

      full_name = repo_link.text.strip
      href = URI.join("https://github.com", repo_link['href']).to_s
      stars_text = row.css("span").find { |s| s.text.strip.match(/^\d[\d,]*$/) }&.text
      stars = stars_text ? stars_text.strip.delete(',').to_i : 0

      if stars < min_stars
        $skipped_dependents += 1
        log("Skipping #{full_name} (#{stars}★ < #{min_stars}★)", verbose)
      else
        log("Adding #{full_name} (#{stars}★)", verbose)
        entry = { name: full_name, url: href, stars: stars }
        results << entry
        $collected_dependents << entry
      end

      return dedup_and_sort(results) if limit && results.size >= limit
    end

    current_url = find_next_page(doc, current_url, verbose)
    sleep(delay_ms / 1000.0) if current_url && delay_ms > 0
  end

  dedup_and_sort(results)
end
log("Options: #{options}", options[:verbose])
dependents = scrape_dependents(
  options[:repo],
  options[:limit],
  options[:min_stars],
  options[:delay],
  options[:backoff],
  GITHUB_TOKEN,
  options[:verbose]
)

if options[:format] == "json"
  puts JSON.pretty_generate(dependents)
else
  rows = dependents.map { |d| [d[:name], d[:url], d[:stars]] }
  table = Terminal::Table.new(
    title: "Dependents of #{options[:repo]} (min #{options[:min_stars]}★)",
    headings: ['Name', 'URL', 'Stars'],
    rows: rows
  )
  puts table
end

print_summary(dependents, options[:min_stars])

Prompt:

You're a senior Ruby engineer. Write a command-line Ruby script that uses bundler/inline to install and require nokogiri, terminal-table, and ruby-progressbar.

The script should:

  1. Scrape the list of GitHub repository dependents for a given repo (e.g., mongodb/node-mongodb-native), paginating through all pages.
  2. Accept CLI options:
    • --repo: GitHub repo in owner/name format (required)
    • --limit: Max number of results (default: unlimited)
    • --min-stars: Minimum number of stars to include
    • --format: Output format (table or json)
    • --delay: Delay between requests (in ms)
    • --backoff: Retry delay for HTTP 429 (default 10s)
    • --verbose: Print debug messages
  3. Use the GitHub web UI, not the API, and extract repo name, URL, and star count per dependent.
  4. Track the last 5 pages scraped in a circular buffer. If a duplicate URL is detected, abort to prevent infinite loops.
  5. Show a progress bar unless --verbose is enabled.
  6. Gracefully handle CTRL+C to print results collected so far.
  7. Print a summary on completion (or interruption) with:
    • Total runtime (HH:MM:SS)
    • Pages scraped
    • Matching results
    • Skipped due to --min-stars
  8. Ensure all output is clean and readable. Sort results by stars descending and deduplicate by repo name.
  9. Use a refined XPath to match the Next pagination link with text "Next" and class "BtnGroup-item".
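
For reference, a typical invocation of the resulting script might look like the following; the repository, flag values, and token are illustrative placeholders, and the script is assumed to be saved as github_dependents.rb:

    export GITHUB_TOKEN=<your personal access token>
    ruby github_dependents.rb --repo mongodb/node-mongodb-native --min-stars 10 --limit 100 --format json --delay 500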