Skip to content

Instantly share code, notes, and snippets.

@keidax
Last active March 20, 2025 19:40
Show Gist options
  • Save keidax/e9a514fa5aef93e98eedf1665e923329 to your computer and use it in GitHub Desktop.
Save keidax/e9a514fa5aef93e98eedf1665e923329 to your computer and use it in GitHub Desktop.
Download source code from GitHub for any programming language
#!/usr/bin/env ruby
require "optparse"
require "pathname"
# Parses the command-line flags in ARGV (consuming them) into an options hash.
#
# Recognised flags:
#   --language / -l  language name, stored downcased under :language (required)
#   --pattern  / -p  file pattern, appended to :patterns; repeatable (at least one required)
#   --limit          integer stored under :limit (default 100)
#   --threads        integer stored under :threads (default 1)
#
# @return [Hash] the keys :language, :patterns, :limit and :threads
# @raise [RuntimeError] when no language or no pattern was supplied
def load_options
  settings = {limit: 100, patterns: [], threads: 1}

  parser = OptionParser.new
  parser.on("--language LANG", "-l", "Name of language") { |lang| settings[:language] = lang.downcase }
  parser.on("--pattern PATTERN", "-p", "Pattern for file extensions, e.g. '*.rb', may be given multiple times") { |glob| settings[:patterns] << glob }
  parser.on("--limit N", "Maximum number of repositories to load (defaults to 100)") { |count| settings[:limit] = Integer(count) }
  parser.on("--threads N", "Number of download threads (defaults to 1)") { |count| settings[:threads] = Integer(count) }
  parser.parse!

  # Validate in the same order as the flags are declared: language first, then patterns.
  raise "Missing --language option" unless settings.key?(:language)
  raise "Missing --pattern option" if settings[:patterns].empty?

  settings
end
# Returns the SSH clone URLs of GitHub repositories written in +language+.
#
# Results are cached in "<language>_urls.txt" in the current working directory.
# The GitHub search API is queried (through the `gh` CLI) only when that file is
# missing or empty.
#
# @param language [String] language name, used in the search query and the cache file name
# @return [Array<String>] one entry per line of the cache file, without trailing newlines
# @raise [RuntimeError] "API query failed" when `gh` exits unsuccessfully
# @raise [Errno::ENOENT] when the `gh` executable cannot be found
def load_urls(language)
  url_file = Pathname.new("#{language}_urls.txt")

  if !url_file.exist? || url_file.empty?
    puts "Loading #{language} repositories, this only has to happen once..."

    # GitHub search syntax wants qualifier values that contain whitespace in quotes.
    qualifier = language.match?(/\s/) ? %(language:"#{language}") : "language:#{language}"

    # The command is an argv array, so no shell parses the language name.
    # With --method GET, gh sends the field as a URL-encoded query parameter,
    # so names such as "c++" or "c#" reach the API intact.
    command = [
      "gh", "api", "--method", "GET", "/search/repositories",
      "--raw-field", "q=#{qualifier}",
      "--jq", ".items[].ssh_url",
      "--paginate"
    ]
    api_result = IO.popen(command, &:read)
    raise "API query failed" unless $?.success?

    url_file.write(api_result)
    puts "Found #{api_result.lines.size} #{language} repositories"
  end

  File.readlines(url_file, chomp: true)
end
# Wraps +str+ in the ANSI SGR sequence "1;37" (bold, white foreground),
# followed by a reset sequence.
#
# @param str [#to_s] text to emphasise
# @return [String] the text surrounded by the escape codes
def bold(str)
  format("\e[1;37m%s\e[0m", str)
end
# Wraps +str+ in the ANSI SGR sequence "32" (green foreground),
# followed by a reset sequence.
#
# @param str [#to_s] text to colour
# @return [String] the text surrounded by the escape codes
def green(str)
  format("\e[32m%s\e[0m", str)
end
# Runs one git subcommand directly, without a shell, so paths, URLs and
# patterns containing spaces or quotes are passed to git unchanged.
#
# @param subcommand [String] git subcommand name, also used in the error message
# @param args [Array<String>] remaining arguments for the subcommand
# @param repo [Pathname, String, nil] when given, git runs against it via `-C`
# @return [nil]
# @raise [RuntimeError] "git <subcommand> failed" when git fails or cannot be started
def run_git(subcommand, *args, repo: nil)
  command = ["git"]
  command.push("-C", repo.to_s) if repo
  command.push(subcommand, *args)

  # git's stdout is discarded; stderr stays attached so failures remain visible.
  raise "git #{subcommand} failed" unless system(*command, out: File::NULL)
end

# Clones +url+ into +directory+ as a shallow, blob-filtered, sparse checkout
# that only materialises files matching +patterns+.
#
# The directory is created when missing. A non-empty directory is treated as
# already cloned and left alone.
#
# @param directory [Pathname] destination of the working copy
# @param url [String] clone URL handed to `git clone`
# @param patterns [Array<String>] sparse-checkout patterns (non-cone mode)
# @return [nil]
# @raise [RuntimeError] when any of the git steps fails
def checkout_repo(directory, url, patterns)
  directory.mkpath unless directory.exist?

  unless directory.empty?
    puts "already cloned #{bold(directory)}, skipping"
    return
  end

  puts "cloning #{bold(directory)}"
  run_git("clone", "--no-checkout", "--depth=1", "--filter=blob:none", "--quiet", url, directory.to_s)
  run_git("sparse-checkout", "set", "--no-cone", *patterns, repo: directory)
  run_git("checkout", "--quiet", "--no-progress", repo: directory)
end
# Script entry point: read the options, load up to :limit repository URLs for
# the language, then sparse-checkout each one using :threads worker threads.
options = load_options
git_urls = load_urls(options[:language]).first(options[:limit])

# The queue is filled up front and closed immediately. On a closed queue `pop`
# returns nil once it is drained, which ends each worker's loop.
download_queue = Thread::Queue.new(git_urls)
download_queue.close

download_threads = options[:threads].times.map do
  Thread.new do
    while (url = download_queue.pop)
      # "git@github.com:owner/repo.git" becomes "owner/repo"
      base_path = url.delete_prefix("git@github.com:").delete_suffix(".git")
      directory = Pathname.new("#{options[:language]}/#{base_path}")
      checkout_repo(directory, url, options[:patterns])
    end
  end
end

# Wait for every worker; `join` re-raises an exception that ended a thread.
download_threads.each(&:join)
puts green("Finished downloading #{git_urls.size} repositories")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment