Skip to content

Instantly share code, notes, and snippets.

@ianmcook
Last active October 26, 2022 21:26
Show Gist options
  • Select an option

  • Save ianmcook/0f1538ebc8268a88cd4e0a0a61445287 to your computer and use it in GitHub Desktop.

Select an option

Save ianmcook/0f1538ebc8268a88cd4e0a0a61445287 to your computer and use it in GitHub Desktop.
Match Apache Arrow Jira user accounts with GitHub user accounts
# Run this script second: it reads dirty.csv (written by the Python script) and writes clean.csv.
library(dplyr)
# Collapse the raw (jira, github) observations in dirty.csv into one GitHub
# login per Jira user and write the result to clean.csv.
#
# Jira users seen with a single GitHub login are resolved automatically. Users
# seen with several logins are printed (most frequent pairing first) and the
# operator is asked to pick the matching row.
df <- read.csv("dirty.csv")

# Count how often each (jira, github) pairing occurs, most frequent first.
agg <- df %>%
  group_by(jira, github) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

jira_users <- unique(agg$jira)
final <- data.frame(
  jira = jira_users,
  github = character(length(jira_users))
)

for (jira_user in jira_users) {
  temp <- agg %>% filter(jira == jira_user)
  i <- 1L
  if (nrow(temp) > 1) {
    print(temp)
    if (interactive()) {
      # Keep asking until the answer is a valid row number; an unchecked
      # answer would silently store NA as the GitHub login.
      repeat {
        answer <- readline(prompt = "Which row matches? ")
        i <- suppressWarnings(as.integer(answer))
        if (!is.na(i) && i >= 1L && i <= nrow(temp)) {
          break
        }
        message("Please enter a row number between 1 and ", nrow(temp), ".")
      }
    } else {
      # readline() returns "" immediately when R is not interactive, so there
      # is nobody to ask: leave the login as NA and say so.
      warning(
        "Cannot prompt for '", jira_user,
        "' in a non-interactive session; leaving github as NA.",
        call. = FALSE
      )
      i <- NA_integer_
    }
  }
  final[final$jira == jira_user, "github"] <- temp[i, "github", drop = TRUE]
}

final %>% write.csv("clean.csv")
# Run this script first: it writes dirty.csv, which the R script then reads.
from jira import JIRA
from github import Github
import re
import pandas as pd
# Build dirty.csv: one row per resolved ARROW Jira issue whose comments contain
# the "Issue resolved by pull request N" message, pairing the Jira assignee
# with the GitHub login of the user who opened pull request N.

# No credentials are passed to Jira; GitHub needs a personal access token.
jira = JIRA('https://issues.apache.org/jira')
github = Github("ENTER_YOUR_GITHUB_PAT_HERE")
repo = github.get_repo("apache/arrow")

# Fetch all resolved ARROW issues page by page.
jira_issues = []
i = 0
chunk_size = 200
while True:
    chunk = jira.search_issues(
        'project = ARROW AND status = Resolved ORDER BY key',
        startAt=i,
        maxResults=chunk_size,
        fields='assignee,comment',
    )
    if len(chunk) == 0:
        # Nothing more came back; stop rather than loop forever.
        break
    jira_issues += chunk.iterable
    # Advance by the number of issues actually returned: the server may cap a
    # page below chunk_size, and stepping by chunk_size would then skip issues.
    i += len(chunk)
    if i >= chunk.total:
        break

# The whole comment body must be the resolution message, and the PR number in
# the text must equal the one in the link (back-reference \1).
resolved_by_pr = re.compile(
    r"^Issue resolved by pull request ([0-9]{1,5})\n\[https://github.com/apache/arrow/pull/\1]$"
)

jira_users = []
gh_users = []
for jira_issue in jira_issues:
    print(jira_issue.key)
    if jira_issue.fields.assignee is None:
        continue
    # Walk the comments from last to first and stop at the first match.
    for comment in reversed(jira_issue.fields.comment.comments):
        s = resolved_by_pr.search(comment.body)
        if s:
            gh_issue_id = int(s.group(1))
            gh_issue = repo.get_issue(number=gh_issue_id)
            print('\t' + jira_issue.fields.assignee.key)
            jira_users.append(jira_issue.fields.assignee.key)
            print('\t' + gh_issue.user.login)
            gh_users.append(gh_issue.user.login)
            break

df = pd.DataFrame({'jira': jira_users, 'github': gh_users})
df.to_csv('dirty.csv')
@ianmcook
Copy link
Author

ianmcook commented Oct 26, 2022

This was created to help with apache/arrow#14510.

Replace assignee.key with assignee.name if needed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment