Run ./load_repos.rb --help
for all available options.
See https://keid.ax/github-corpus for more information.
Run ./load_repos.rb --help
for all available options.
See https://keid.ax/github-corpus for more information.
#!/usr/bin/env ruby
require "open3"
require "optparse"
require "pathname"
# Parses the command-line arguments in ARGV (consuming the recognized ones).
#
# Recognized options:
#   --language/-l LANG   language name, stored downcased (required)
#   --pattern/-p PATTERN file pattern such as '*.rb'; repeatable (at least one required)
#   --limit N            maximum number of repositories (default 100)
#   --threads N          number of download threads (default 1)
#
# @return [Hash] with keys :language, :patterns, :limit and :threads
# @raise [RuntimeError] if --language or --pattern is missing, if --limit is
#   negative, or if --threads is smaller than 1
# @raise [ArgumentError] if --limit or --threads is not an integer
def load_options
  options = { limit: 100, patterns: [], threads: 1 }

  OptionParser.new do |opt|
    opt.on("--language LANG", "-l", "Name of language") do |value|
      options[:language] = value.downcase
    end

    opt.on("--pattern PATTERN", "-p", "Pattern for file extensions, e.g. '*.rb', may be given multiple times") do |value|
      options[:patterns] << value
    end

    opt.on("--limit N", "Maximum number of repositories to load (defaults to 100)") do |value|
      options[:limit] = Integer(value)
    end

    opt.on("--threads N", "Number of download threads (defaults to 1)") do |value|
      options[:threads] = Integer(value)
    end
  end.parse!

  raise "Missing --language option" unless options.key?(:language)
  raise "Missing --pattern option" if options[:patterns].empty?
  raise "--limit must not be negative" if options[:limit].negative?
  # With zero workers nothing would be downloaded, yet the run would look successful.
  raise "--threads must be at least 1" if options[:threads] < 1

  options
end
# Returns the SSH clone URLs of GitHub repositories written in +language+.
#
# The list is cached in "<language>_urls.txt" relative to the current working
# directory. When that file is missing or empty it is filled by querying the
# GitHub repository search API through the `gh` command-line tool.
#
# @param language [String] language name used in the search query and in the
#   cache file name
# @return [Array<String>] the lines of the cache file, without line endings
# @raise [RuntimeError] if the `gh api` command exits unsuccessfully
def load_urls(language)
  url_file = Pathname.new("#{language}_urls.txt")

  if !url_file.exist? || url_file.empty?
    puts "Loading #{language} repositories, this only has to happen once..."
    # Argument-vector form: no shell is involved, so the language name can
    # never be interpreted as shell syntax.
    api_result, status = Open3.capture2(
      "gh", "api", "/search/repositories?q=language:#{language}",
      "--jq", ".items[].ssh_url", "--paginate"
    )
    raise "API query failed" unless status.success?

    url_file.write(api_result)
    puts "Found #{api_result.lines.size} #{language} repositories"
  end

  File.readlines(url_file, chomp: true)
end
# Wraps +str+ in the ANSI escape sequence "\e[1;37m" (bold, white foreground)
# followed by the reset sequence "\e[0m".
#
# @param str [#to_s] value to highlight
# @return [String] the escaped text
def bold(str)
  "\e[1;37m#{str}\e[0m"
end
# Wraps +str+ in the ANSI escape sequence "\e[32m" (green foreground)
# followed by the reset sequence "\e[0m".
#
# @param str [#to_s] value to colorize
# @return [String] the escaped text
def green(str)
  "\e[32m#{str}\e[0m"
end
# Clones +url+ into +directory+ and checks out only the files matching
# +patterns+.
#
# The directory is created when it does not exist. A non-empty directory is
# taken to be an earlier clone and is left untouched. Otherwise the repository
# is cloned with --depth=1 and --filter=blob:none without a checkout, the
# sparse-checkout patterns are set in non-cone mode, and the working tree is
# checked out.
#
# @param directory [Pathname] target directory of the clone
# @param url [String] repository URL handed to `git clone`
# @param patterns [Array<String>] sparse-checkout patterns, e.g. "*.rb"
# @return [nil]
# @raise [RuntimeError] if any of the git commands fails
def checkout_repo(directory, url, patterns)
  directory.mkpath unless directory.exist?

  unless directory.empty?
    puts "already cloned #{bold(directory)}, skipping"
    return
  end

  puts "cloning #{bold(directory)}"
  target = directory.to_s

  raise "git clone failed" unless run_git("clone", "--no-checkout", "--depth=1", "--filter=blob:none", url, target, "--quiet")
  raise "git sparse-checkout failed" unless run_git("-C", target, "sparse-checkout", "set", "--no-cone", *patterns)
  raise "git checkout failed" unless run_git("-C", target, "checkout", "--quiet", "--no-progress")
end

# Runs git with +args+ passed as an argument vector (no shell, so spaces or
# quotes in URLs, paths and patterns are passed through verbatim) and discards
# its standard output.
#
# @return [Boolean] true only if git ran and exited with status 0
def run_git(*args)
  # system returns nil when the command could not be executed at all.
  system("git", *args, out: File::NULL) == true
end
# Script entry point: parse the options, load the repository URLs (at most
# --limit of them) and clone them using --threads worker threads.
options = load_options
git_urls = load_urls(options[:language]).first(options[:limit])

# The queue is filled up front and closed right away, so `pop` returns nil
# once it is drained and every worker loop terminates.
download_queue = Thread::Queue.new(git_urls)
download_queue.close

download_threads = options[:threads].times.map do
  Thread.new do
    while (url = download_queue.pop)
      # "git@github.com:owner/repo.git" -> "owner/repo"
      base_path = url.delete_prefix("git@github.com:").delete_suffix(".git")
      directory = Pathname.new("#{options[:language]}/#{base_path}")
      checkout_repo(directory, url, options[:patterns])
    end
  end
end

# Joining re-raises an exception that terminated a worker thread.
download_threads.each(&:join)
puts green("Finished downloading #{git_urls.size} repositories")