Last active: January 10, 2022 08:31
-
-
Save lamchau/14737aec9efde1bc7e89686696952b39 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import datetime
import json
import logging
import os
import re
import sys
from types import SimpleNamespace
from typing import Dict, List

import requests

# Page size for the GitHub GraphQL search API (its maximum per request).
MAX_RESULTS = 100

# Mirror all log output to a debug file and to stdout.
file_handler = logging.FileHandler(filename='debug.log', encoding='utf-8')
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s.%(msecs)03d %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
    handlers=[file_handler, stdout_handler],
)
# logging.getLogger('urllib3').setLevel(logging.CRITICAL)
def get_query(author: str, created_at: str = None, end_cursor: str = None) -> Dict[str, str]:
    """Build the JSON payload for a GitHub GraphQL pull-request search.

    Args:
        author: target GitHub author/username whose PRs are searched.
        created_at: optional lower bound on PR creation date; when given it is
            rendered as a ``created:<date>..*`` search qualifier.
        end_cursor: opaque pagination cursor from a previous response's
            ``pageInfo.endCursor``; omitted for the first page.

    Returns:
        A dict with a single ``'query'`` key, suitable for POSTing as JSON to
        ``https://api.github.com/graphql``.
    """
    date_query = f'created:{created_at}..*' if created_at else ''
    params = {
        'query': f'author:{author} org:squareup {date_query}',
        'results': MAX_RESULTS,
        # hack: conditionally build GQL query to avoid using external packages
        'end_cursor': f', after: "{end_cursor}"' if end_cursor else '',
    }
    # https://docs.github.com/en/graphql/overview/explorer
    gql = '''
    {
      search(query: "%(query)s", type: ISSUE, first: %(results)d%(end_cursor)s) {
        pageInfo {
          hasNextPage
          endCursor
        }
        nodes {
          ... on PullRequest {
            headRefName
            title
            body
            repository {
              name
            }
            url
            createdAt
            closedAt
            merged
            additions
            deletions
          }
        }
      }
    }'''
    # Interpolate once and reuse for both the log line and the payload
    # (previously formatted twice). Lazy %-args keep logging cheap.
    query = gql % params
    logging.debug('GQL: %s', query)
    return {'query': query}
# Command-line interface; flags are declared in a table so each
# flag / required / help triple stays aligned and easy to extend.
parser = argparse.ArgumentParser(description='Download all pull requests')
required = parser.add_argument_group(title='Required')
for _flag, _required, _help in (
    ('--author', True, 'the target github author/username'),
    ('--username', True, '[auth] github: username'),
    ('--token', True, '[auth] github: personal access token'),
    ('--created-at', False, 'pull requested creation date'),
):
    required.add_argument(_flag, required=_required, help=_help)
# TODO: add name resolver https://registry.sqprod.co/api/v2/github_identities
if __name__ == '__main__':
    # With no arguments at all, print usage instead of argparse's terse error.
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # Parse args before any filesystem side effects, so a bad invocation
    # no longer creates the output directory.
    args = parser.parse_args()

    output_dir = os.path.realpath('pull-requests')
    logging.debug('Checking directory: %s', output_dir)
    # exist_ok avoids the isdir-then-makedirs race of the previous version.
    os.makedirs(output_dir, exist_ok=True)

    session = requests.Session()
    session.auth = (args.username, args.token)

    issues: List['SimpleNamespace'] = []
    author = args.author
    created_at = args.created_at
    current_date = datetime.datetime.now().strftime('%Y%m%d')
    filename = os.path.join(output_dir, f'{author}.{current_date}.json')
    logging.debug('Output file: %s', os.path.realpath(filename))

    # Matches JIRA-style ticket keys, e.g. ABC-123; compiled once, used per node.
    jira_pattern = re.compile(r'[A-Z]{2,}-\d+')
    has_next_page = True
    end_cursor = None
    while has_next_page:
        if issues:
            logging.info('Collected %d pull requests for %s', len(issues), author)
        response = session.post(
            url='https://api.github.com/graphql',
            json=get_query(author=author, created_at=created_at, end_cursor=end_cursor),
            headers={'Accept': 'application/vnd.github.v3+json'},
        )
        if response.status_code != 200:
            error_message = f'Query failed {response.status_code}'
            logging.error(error_message)
            raise Exception(error_message)

        # hack: recursive SimpleNamespace for easier retrieval of attrs.
        # Parse the raw body once instead of the old
        # json.loads(json.dumps(response.json())) double round-trip.
        data = json.loads(
            response.text,
            object_hook=lambda item: SimpleNamespace(**item),
        ).data
        has_next_page = data.search.pageInfo.hasNextPage
        end_cursor = data.search.pageInfo.endCursor
        logging.debug('Page Info: %s', data.search.pageInfo)

        # ISSUE search can return non-PR nodes; keep only ones that carry a repository.
        data.search.nodes = [node for node in data.search.nodes if hasattr(node, 'repository')]
        logging.debug('Extracting JIRA issues from %d issue(s)', len(data.search.nodes))
        for node in data.search.nodes:
            keys = set()
            keys.update(jira_pattern.findall(node.headRefName))  # branch name
            keys.update(jira_pattern.findall(node.body))  # description
            keys.update(jira_pattern.findall(node.title))
            # COVID-* strings look like JIRA keys but are not real tickets.
            node.jira = [key for key in sorted(keys) if not key.startswith('COVID')]
            node.repository = node.repository.name
            # Drop fields that were only needed for key extraction.
            del node.headRefName
            del node.body
        issues.extend(data.search.nodes)

    if issues:
        with open(filename, 'w') as f:
            json.dump(issues, f, ensure_ascii=True, indent=2, default=lambda o: o.__dict__)
        # bug fix: previously logged the literal '(unknown)' instead of the path
        logging.info('Saved %d pull requests for %s to %s', len(issues), author, filename)
    else:
        logging.info('No issues found for %s', author)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.