Last active: January 10, 2022 08:31
-
-
Save lamchau/14737aec9efde1bc7e89686696952b39 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import datetime
import json
import logging
import os
import re
import sys
from types import SimpleNamespace
from typing import Dict, List

import requests

# Page size for the GitHub GraphQL search API (its maximum per request).
MAX_RESULTS = 100

# Mirror all log output to a debug file and to stdout.
file_handler = logging.FileHandler(filename='debug.log', encoding='utf-8')
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s.%(msecs)03d %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
    handlers=[file_handler, stdout_handler],
)
# logging.getLogger('urllib3').setLevel(logging.CRITICAL)
def get_query(author: str, created_at: str = None, end_cursor: str = None) -> Dict[str, str]:
    """Build the JSON payload for a GitHub GraphQL pull-request search.

    Args:
        author: target GitHub author/username whose PRs are searched.
        created_at: optional lower bound on PR creation date; when given it is
            rendered as a ``created:<date>..*`` search qualifier.
        end_cursor: opaque pagination cursor from a previous response's
            ``pageInfo.endCursor``; omitted for the first page.

    Returns:
        A dict with a single ``'query'`` key, suitable for POSTing as JSON to
        ``https://api.github.com/graphql``.
    """
    date_query = f'created:{created_at}..*' if created_at else ''
    params = {
        'query': f'author:{author} org:squareup {date_query}',
        'results': MAX_RESULTS,
        # hack: conditionally build GQL query to avoid using external packages
        'end_cursor': f', after: "{end_cursor}"' if end_cursor else '',
    }
    # https://docs.github.com/en/graphql/overview/explorer
    gql = '''
    {
      search(query: "%(query)s", type: ISSUE, first: %(results)d%(end_cursor)s) {
        pageInfo {
          hasNextPage
          endCursor
        }
        nodes {
          ... on PullRequest {
            headRefName
            title
            body
            repository {
              name
            }
            url
            createdAt
            closedAt
            merged
            additions
            deletions
          }
        }
      }
    }'''
    # Interpolate once and reuse for both the log line and the payload
    # (previously formatted twice). Lazy %-args keep logging cheap.
    query = gql % params
    logging.debug('GQL: %s', query)
    return {'query': query}
# Command-line interface; flags are declared in a table so each
# flag / required / help triple stays aligned and easy to extend.
parser = argparse.ArgumentParser(description='Download all pull requests')
required = parser.add_argument_group(title='Required')
for _flag, _required, _help in (
    ('--author', True, 'the target github author/username'),
    ('--username', True, '[auth] github: username'),
    ('--token', True, '[auth] github: personal access token'),
    ('--created-at', False, 'pull requested creation date'),
):
    required.add_argument(_flag, required=_required, help=_help)
# TODO: add name resolver https://registry.sqprod.co/api/v2/github_identities
if __name__ == '__main__':
    # With no arguments at all, print usage instead of argparse's terse error.
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # Parse args before any filesystem side effects, so a bad invocation
    # no longer creates the output directory.
    args = parser.parse_args()

    output_dir = os.path.realpath('pull-requests')
    logging.debug('Checking directory: %s', output_dir)
    # exist_ok avoids the isdir-then-makedirs race of the previous version.
    os.makedirs(output_dir, exist_ok=True)

    session = requests.Session()
    session.auth = (args.username, args.token)

    issues: List['SimpleNamespace'] = []
    author = args.author
    created_at = args.created_at
    current_date = datetime.datetime.now().strftime('%Y%m%d')
    filename = os.path.join(output_dir, f'{author}.{current_date}.json')
    logging.debug('Output file: %s', os.path.realpath(filename))

    # Matches JIRA-style ticket keys, e.g. ABC-123; compiled once, used per node.
    jira_pattern = re.compile(r'[A-Z]{2,}-\d+')
    has_next_page = True
    end_cursor = None
    while has_next_page:
        if issues:
            logging.info('Collected %d pull requests for %s', len(issues), author)
        response = session.post(
            url='https://api.github.com/graphql',
            json=get_query(author=author, created_at=created_at, end_cursor=end_cursor),
            headers={'Accept': 'application/vnd.github.v3+json'},
        )
        if response.status_code != 200:
            error_message = f'Query failed {response.status_code}'
            logging.error(error_message)
            raise Exception(error_message)

        # hack: recursive SimpleNamespace for easier retrieval of attrs.
        # Parse the raw body once instead of the old
        # json.loads(json.dumps(response.json())) double round-trip.
        data = json.loads(
            response.text,
            object_hook=lambda item: SimpleNamespace(**item),
        ).data
        has_next_page = data.search.pageInfo.hasNextPage
        end_cursor = data.search.pageInfo.endCursor
        logging.debug('Page Info: %s', data.search.pageInfo)

        # ISSUE search can return non-PR nodes; keep only ones that carry a repository.
        data.search.nodes = [node for node in data.search.nodes if hasattr(node, 'repository')]
        logging.debug('Extracting JIRA issues from %d issue(s)', len(data.search.nodes))
        for node in data.search.nodes:
            keys = set()
            keys.update(jira_pattern.findall(node.headRefName))  # branch name
            keys.update(jira_pattern.findall(node.body))  # description
            keys.update(jira_pattern.findall(node.title))
            # COVID-* strings look like JIRA keys but are not real tickets.
            node.jira = [key for key in sorted(keys) if not key.startswith('COVID')]
            node.repository = node.repository.name
            # Drop fields that were only needed for key extraction.
            del node.headRefName
            del node.body
        issues.extend(data.search.nodes)

    if issues:
        with open(filename, 'w') as f:
            json.dump(issues, f, ensure_ascii=True, indent=2, default=lambda o: o.__dict__)
        # bug fix: previously logged the literal '(unknown)' instead of the path
        logging.info('Saved %d pull requests for %s to %s', len(issues), author, filename)
    else:
        logging.info('No issues found for %s', author)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.