Skip to content

Instantly share code, notes, and snippets.

@dmsimard
Created October 19, 2021 14:51
Show Gist options
  • Save dmsimard/01ca6a34b4bd8741baec94a23311a266 to your computer and use it in GitHub Desktop.
Save dmsimard/01ca6a34b4bd8741baec94a23311a266 to your computer and use it in GitHub Desktop.
List of git repositories for collections included in the Ansible package
#!/usr/bin/env python3
# Parse an ansible.in file to get the list of collections included in the Ansible
# package, then query Galaxy for the git repository of each collection.
# ansible.in files can be cloned from ansible-build-data:
# - https://github.com/ansible-community/ansible-build-data/blob/main/2.10/ansible.in
# - https://github.com/ansible-community/ansible-build-data/blob/main/3/ansible.in
import argparse
import json
import logging
import requests
# from requests.adapters import HTTPAdapter
import time
# Base URL of the Galaxy service; every endpoint below is derived from it.
GALAXY = "https://galaxy.ansible.com"
# v1 API endpoints (not referenced by the code visible in this script).
NAMESPACES = f"{GALAXY}/api/v1/namespaces"
REPOSITORIES = f"{GALAXY}/api/v1/repositories"
# v2 collections endpoint; main() appends "/<namespace>/<name>/" to it.
COLLECTIONS_API = f"{GALAXY}/api/v2/collections"
# Retry Galaxy API calls if they fail
# (query_galaxy gives up once this many attempts have failed).
RETRY_COUNT = 15
def get_args(argv=None):
    """
    Parse the command line options of the script.

    argv: optional list of argument strings to parse instead of the process
          arguments. The default (None) lets argparse read sys.argv[1:], so
          existing callers that pass nothing are unaffected.

    Returns an argparse.Namespace with:
    - infile: path to the ansible.in file (required)
    - ignore: list of git organizations to leave out, one entry per --ignore
              occurrence; empty when the option is not given
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", help="Path to an ansible.in file", required=True)
    parser.add_argument(
        "--ignore",
        help=(
            "Omit repositories belonging to specified git organizations (not collection namespaces), can be repeated.\n"
            "For example: --ignore ansible --ignore ansible-community --ignore ansible-collections\n"
        ),
        action="append",
        default=[],
        required=False,
    )
    return parser.parse_args(argv)
def query_galaxy(session, url):
    """
    Fetch a Galaxy API URL and return its decoded JSON document.

    Failed attempts are retried, two seconds apart, which can improve the odds
    of succeeding past rate-limiting or unreliability.

    session: requests.Session (or compatible object) used to send the request
    url: full URL to query

    Returns the decoded JSON payload, or None when every attempt failed or
    Galaxy answered with a client error that a retry cannot fix.
    """
    logging.info(f"Querying galaxy: {url}")
    # Always try at least once, even if RETRY_COUNT is set to zero or less.
    attempts = max(1, RETRY_COUNT)
    for attempt in range(1, attempts + 1):
        try:
            # Without a timeout a stalled connection would block forever and
            # the retry logic would never get a chance to run.
            response = session.get(url, timeout=30)
            # Error responses can carry a JSON body too; make them fail here
            # instead of handing an error document back to the caller as data.
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors raised by older versions
            # of requests; those exceptions have no "response" attribute.
            status = getattr(getattr(e, "response", None), "status_code", None)
            # 4xx answers (other than 429 rate-limiting) will not change on retry.
            permanent = status is not None and 400 <= status < 500 and status != 429
            if permanent or attempt == attempts:
                logging.error(f"giving up retrying on {url}: {e}")
                return None
            logging.warning(f"retrying call to galaxy: {url}: {e}")
            time.sleep(2)
    return None
def main():
    """
    Print the git repositories of the collections listed in an ansible.in file.

    Each non-comment line of --infile is expected to be "<namespace>.<name>".
    For every collection, Galaxy is queried for the collection and then for
    its latest version, whose metadata holds the git repository URL.
    Repositories belonging to an organization passed with --ignore are left
    out. Collections that can't be resolved are logged and skipped. The result
    is written to stdout as a JSON list.
    """
    args = get_args()
    logging.basicConfig(level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s")
    session = requests.Session()
    # session.mount(GALAXY, HTTPAdapter(max_retries=5))

    with open(args.infile, encoding="utf-8") as f:
        entries = [line.strip() for line in f]

    repositories = []
    for entry in entries:
        # Blank lines and comments do not name a collection
        if not entry or entry.startswith("#"):
            continue

        parts = entry.split(".")
        if len(parts) != 2 or not all(parts):
            logging.warning(f"skipping malformed line: {entry}")
            continue
        namespace, name = parts

        # Galaxy knows about the git repository for a collection and we can find it in the latest_version of a collection
        # ex: https://galaxy.ansible.com/api/v2/collections/community/general/
        collection = query_galaxy(session, f"{COLLECTIONS_API}/{namespace}/{name}/")
        try:
            version_url = collection["latest_version"]["href"]
        except (KeyError, TypeError):
            # TypeError: the query failed (None) or returned an unexpected document
            logging.error(f"couldn't get the latest version for {entry}")
            continue

        # Now query the latest version so we can get the git repo
        # ex: https://galaxy.ansible.com/api/v2/collections/community/general/versions/2.2.0/
        version = query_galaxy(session, version_url)
        try:
            repository = version["metadata"]["repository"]
        except (KeyError, TypeError):
            repository = None
        if not repository or not isinstance(repository, str):
            logging.error(f"couldn't get the git repository for {entry}")
            continue

        # We might want to exclude organizations or namespaces for repositories we otherwise already know about
        if any(f"/{ignored_ns}/" in repository for ignored_ns in args.ignore):
            logging.info(f"ignoring {repository}, in ignored namespace")
            continue

        repositories.append(repository)

    print(json.dumps(repositories, indent=2))
# Run the command line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment