Last active
October 26, 2022 21:26
-
-
Save ianmcook/0f1538ebc8268a88cd4e0a0a61445287 to your computer and use it in GitHub Desktop.
Match Apache Arrow Jira user accounts with GitHub user accounts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run this script second
#
# Reads the (jira, github) pairs from dirty.csv, counts how often each pair
# occurs, and writes clean.csv with exactly one GitHub login per Jira user.
# When a Jira user is paired with more than one GitHub login, the candidates
# are printed (most frequent first) and the operator picks the matching row.
library(dplyr)

df <- read.csv("dirty.csv")

# One row per distinct (jira, github) pair, most frequent pairs first.
agg <- df %>%
  group_by(jira, github) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

jira_users <- unique(agg$jira)
final <- data.frame(
  jira = jira_users,
  github = character(length(jira_users))
)

# readline() returns "" immediately in non-interactive sessions (e.g. Rscript),
# so in that case answers are read from the process's standard input instead.
stdin_con <- if (interactive()) NULL else file("stdin", open = "r")

# Read one line of operator input after showing `prompt`.
read_answer <- function(prompt) {
  if (interactive()) {
    return(readline(prompt = prompt))
  }
  cat(prompt)
  readLines(stdin_con, n = 1L)
}

# Ask repeatedly until the operator enters a row number in 1..n_rows.
# An unvalidated answer would otherwise index `temp` with NA or an
# out-of-range value and silently store NA as the GitHub login.
choose_row <- function(n_rows) {
  repeat {
    answer <- read_answer("Which row matches? ")
    if (length(answer) == 0L) {
      stop("Ran out of input while waiting for a row number.", call. = FALSE)
    }
    if (grepl("^\\s*[0-9]+\\s*$", answer)) {
      choice <- suppressWarnings(as.integer(answer))
      if (!is.na(choice) && choice >= 1L && choice <= n_rows) {
        return(choice)
      }
    }
    message("Please enter a whole number from 1 to ", n_rows, ".")
  }
}

for (k in seq_along(jira_users)) {
  # %in% (unlike ==) also matches a missing Jira key against itself.
  temp <- agg %>% filter(jira %in% jira_users[[k]])
  if (nrow(temp) > 1) {
    print(temp)
    i <- choose_row(nrow(temp))
  } else {
    i <- 1L
  }
  # `final` has one row per entry of `jira_users`, in the same order.
  final$github[k] <- temp$github[i]
}

if (!is.null(stdin_con)) {
  close(stdin_con)
}

final %>% write.csv("clean.csv")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run this script first
#
# For every resolved ARROW Jira issue that has an assignee, find the most
# recent "Issue resolved by pull request N" comment, look up the author of
# GitHub issue/PR N in apache/arrow, and record the (jira assignee, github
# login) pair. All pairs are written to dirty.csv.
import os
import re

import pandas as pd
from github import Github
from jira import JIRA

jira = JIRA('https://issues.apache.org/jira')
# Prefer a token from the environment so it does not have to be saved in this
# file; the placeholder remains as a fallback for editing in place.
github = Github(os.environ.get("GITHUB_TOKEN", "ENTER_YOUR_GITHUB_PAT_HERE"))
repo = github.get_repo("apache/arrow")

# Compiled once; the comment body must consist of the resolution notice
# followed by the link to the same pull request number.
RESOLVED_BY_PR = re.compile(
    r"^Issue resolved by pull request ([0-9]{1,5})\n\[https://github.com/apache/arrow/pull/\1]$"
)

# Page through all matching issues.
jira_issues = []
start_at = 0
chunk_size = 200
while True:
    chunk = jira.search_issues(
        'project = ARROW AND status = Resolved ORDER BY key',
        startAt=start_at,
        maxResults=chunk_size,
        fields='assignee,comment',
    )
    page = chunk.iterable
    jira_issues += page
    # Advance by the number of issues actually returned: a server may cap
    # maxResults below chunk_size, and stepping by chunk_size would then skip
    # issues. An empty page also ends the loop so it cannot spin forever.
    start_at += len(page)
    if not page or start_at >= chunk.total:
        break

jira_users = []
gh_users = []
for jira_issue in jira_issues:
    print(jira_issue.key)
    assignee = jira_issue.fields.assignee
    if assignee is None:
        continue
    # Walk the comments newest-first and stop at the first resolution notice.
    for comment in reversed(jira_issue.fields.comment.comments):
        match = RESOLVED_BY_PR.search(comment.body)
        if not match:
            continue
        gh_issue = repo.get_issue(number=int(match.group(1)))
        # Use assignee.name instead of assignee.key if needed for your
        # Jira instance.
        print('\t' + assignee.key)
        jira_users.append(assignee.key)
        print('\t' + gh_issue.user.login)
        gh_users.append(gh_issue.user.login)
        break

df = pd.DataFrame({'jira': jira_users, 'github': gh_users})
df.to_csv('dirty.csv')
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was created to help with apache/arrow#14510.
Replace `assignee.key` with `assignee.name` if needed.