Skip to content

Instantly share code, notes, and snippets.

@maxrp
Created July 1, 2013 19:30
Show Gist options
  • Save maxrp/5903801 to your computer and use it in GitHub Desktop.
Save maxrp/5903801 to your computer and use it in GitHub Desktop.
Check out all the git projects advertised on a cgit page.
#!/usr/bin/env python
"""
Get all git projects advertised on a cgit index as submodules for a repo.
Usage:
./scrape_cgit_repos.py http://git.example.com ./repos
"""
from bs4 import BeautifulSoup as soupy
import multiprocessing
import os.path
import requests
import subprocess
import sys
import urlparse
def cgit2giturl(url, href):
    """Build the git:// clone URL for a repository link from a cgit index.

    Args:
        url: URL of the cgit index page. Only its network location (host and
            optional port) is used.
        href: The repository link's ``href`` as found on the index page,
            e.g. ``/cgit/project.git/``.

    Returns:
        A ``git://`` URL on the same host whose path is ``href`` with a
        leading ``/cgit`` path component removed, if one was present.
    """
    domain = urlparse.urlparse(url).netloc
    prefix = '/cgit'
    # Strip only a *leading* "/cgit" path component.  A blanket
    # str.replace() would also cut the text out of repository names, turning
    # "/cgit/cgit.git" into ".git" and "/cgit-tools.git" into "-tools.git".
    if href == prefix or href.startswith(prefix + '/'):
        href = href[len(prefix):]
    # Empty strings (rather than None) for the unused params/query/fragment
    # keep every component the same type for urlunparse.
    return urlparse.urlunparse(('git', domain, href, '', '', ''))
def get_repo_links(url):
    """Fetch the cgit index page and yield one entry per advertised repo.

    Args:
        url: URL of the cgit index page.

    Yields:
        ``(git_url, name)`` tuples, where ``git_url`` is produced by
        ``cgit2giturl`` from the link's ``href`` and ``name`` is the link's
        ``title`` attribute (falling back to the link text when there is no
        title).

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            or the server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a dead server.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of scraping an error page and silently
    # yielding nothing.
    page.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    doc = soupy(page.text, 'html.parser')
    # NOTE(review): 'sublevel-repo' is what the original matched; cgit is
    # believed to use 'toplevel-repo' for repositories listed outside a
    # section -- confirm against the target cgit version.
    repo_cells = doc.find_all(
        'td', {'class': ['toplevel-repo', 'sublevel-repo']})
    for cell in repo_cells:
        anchor = cell.a
        # Skip malformed cells rather than crashing on a missing link.
        if anchor is None or not anchor.get('href'):
            continue
        name = anchor.get('title') or anchor.get_text().strip()
        yield cgit2giturl(url, anchor['href']), name
def add_submodule(url, name, target):
    """Build the ``git submodule add`` argument list for one repository.

    Args:
        url: Clone URL of the repository.
        name: Repository name; used as the directory name under ``target``.
        target: Directory the submodule is checked out beneath.

    Returns:
        The command as a list of arguments, ready to hand to ``subprocess``.
    """
    checkout_dir = os.path.join(target, name)
    command = ['git', 'submodule', 'add']
    command.extend([url, checkout_dir])
    return command
def work(cmd):
    """Run ``cmd`` (an argument list) without a shell; return its exit code.

    Exists as a module-level function so it can be handed to a
    ``multiprocessing`` pool.
    """
    exit_status = subprocess.call(cmd, shell=False)
    return exit_status
def main(url, repo_path):
    """Add every git project listed at a cgit root URL as a submodule.

    One ``git submodule add`` command is built per repository and the
    commands are run in a pool of worker processes. Afterwards each command
    is printed together with its exit status.

    Args:
        url: URL of the cgit index page.
        repo_path: Directory under which the submodules are checked out.
    """
    # Leave one core free, but never ask for fewer than one worker:
    # Pool(processes=0) raises ValueError on a single-CPU machine.
    process_count = max(1, multiprocessing.cpu_count() - 1)
    # Build the task list before starting workers so a failed fetch does not
    # leave an idle pool behind.
    tasks = [add_submodule(repo_url, name, repo_path)
             for repo_url, name in get_repo_links(url)]
    # NOTE(review): concurrent `git submodule add` runs inside the same
    # superproject may contend for git's index lock -- confirm that the
    # parallelism is safe for the intended use.
    pool = multiprocessing.Pool(processes=process_count)
    try:
        results = pool.map(work, tasks)
    finally:
        # Always release the worker processes, even if map() raised.
        pool.close()
        pool.join()
    for cmd, status in zip(tasks, results):
        print('{0}: {1}'.format(' '.join(cmd), status))
if __name__ == '__main__':
    # Both the cgit URL and the destination directory are required; without
    # them, show the module usage text and exit with status 255.
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(255)
    main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment