Created
July 1, 2013 19:30
-
-
Save maxrp/5903801 to your computer and use it in GitHub Desktop.
Check out all the git projects advertised on a cgit page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Get all git projects advertised on a cgit index as submodules for a repo. | |
Usage: | |
./scrape_cgit_repos.py http://git.example.com ./repos | |
""" | |
from bs4 import BeautifulSoup as soupy | |
import multiprocessing | |
import os.path | |
import requests | |
import subprocess | |
import sys | |
import urlparse | |
def cgit2giturl(url, href):
    """Build the git:// URL for a repo link found on a cgit index page.

    url is the address of the cgit index; only its network location
    (host[:port]) is reused. href is the repo link taken from that page.
    A single leading '/cgit' path segment is stripped from href; anything
    else in href is kept untouched, so repo names that themselves contain
    'cgit' (e.g. '/cgit/cgit.git/') survive intact.

    Returns the URL as a string, e.g. 'git://git.example.com/foo.git/'.
    """
    prefix = '/cgit'
    domain = urlparse.urlparse(url)[1]
    # Only strip the prefix when it is a whole leading path segment; a blanket
    # str.replace() would also eat '/cgit' occurring later in the path.
    if href == prefix or href.startswith(prefix + '/'):
        repo_path = href[len(prefix):]
    else:
        repo_path = href
    # Empty strings mean "no params / query / fragment" for urlunparse.
    return urlparse.urlunparse(('git', domain, repo_path, '', '', ''))
def get_repo_links(url):
    """Fetch the cgit index page at url and yield one entry per repo link.

    Each yielded item is a (git_url, title) tuple: git_url is the link's
    href converted by cgit2giturl(), title is the link's 'title' attribute.

    Raises requests.HTTPError if the server answers with an error status,
    and KeyError if a matched link lacks an 'href' or 'title' attribute.
    Being a generator, nothing is fetched until the first item is requested.
    """
    page = requests.get(url)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # yielding no repos.
    page.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed alongside BeautifulSoup.
    doc = soupy(page.text, 'html.parser')
    # NOTE(review): only cells with class 'sublevel-repo' are matched; cgit
    # appears to mark repos outside any section as 'toplevel-repo' -- confirm
    # whether those should be collected too.
    for cell in doc.find_all('td', {'class': 'sublevel-repo'}):
        link = cell.a
        yield (cgit2giturl(url, link.attrs['href']), link.attrs['title'])
def add_submodule(url, name, target):
    """Build the argument list for a `git submodule add` invocation.

    url is the repository to add, name is the repo's name and target is the
    directory under which it is checked out; the checkout path is
    os.path.join(target, name). Nothing is executed here: the command is
    returned as a list of strings.
    """
    checkout_dir = os.path.join(target, name)
    command = ['git', 'submodule', 'add']
    command.extend([url, checkout_dir])
    return command
def work(cmd):
    """Run cmd via subprocess.call without a shell and return its exit status.

    Kept as a plain module-level function; main() hands it to pool.map as the
    per-task worker.
    """
    exit_status = subprocess.call(cmd, shell=False)
    return exit_status
def main(url, repo_path):
    """Add every git project listed at a cgit root url as a submodule.

    url is the cgit index page; repo_path is the directory under which each
    submodule is checked out. One `git submodule add` command is built per
    repo, the commands are run through a multiprocessing pool, and a line of
    the form '<command>: <exit status>' is printed for each of them.
    """
    # Leave one core free, but never ask for fewer than one worker:
    # cpu_count() - 1 is 0 on a single-core host and Pool rejects that.
    process_count = max(1, multiprocessing.cpu_count() - 1)
    # Scrape before starting workers so a fetch failure leaves no pool behind.
    tasks = [add_submodule(repo_url, name, repo_path)
             for repo_url, name in get_repo_links(url)]
    # NOTE(review): the commands all target the same repository; concurrent
    # `git submodule add` runs may contend for git's lock files -- confirm.
    pool = multiprocessing.Pool(processes=process_count)
    try:
        results = pool.map(work, tasks)
    finally:
        # Always release the worker processes, even if map() raised.
        pool.close()
        pool.join()
    for cmd, status in zip(tasks, results):
        print('{0}: {1}'.format(' '.join(cmd), status))
if __name__ == '__main__':
    # Two positional arguments are required: the cgit URL and the target
    # directory. Without them, show the module docstring as usage text and
    # exit with status 255.
    if len(sys.argv) < 3:
        # Single-argument call form: same output under the Python 2 print
        # statement, and also valid Python 3 syntax.
        print(__doc__)
        sys.exit(255)
    main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment