Created
November 5, 2013 08:45
-
-
Save mook/7315774 to your computer and use it in GitHub Desktop.
Script to figure out how many people got commit access to mozilla.org-related repositories, whether they are MoCo employees/interns, and the "highest" level of access granted.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This was the command used to grab the JSON dump. To re-fetch, please
# enter valid userid / cookie; see
# https://wiki.mozilla.org/Bugzilla:REST_API#Authentication for instructions
#import requests
#
#r = requests.get("https://api-dev.bugzilla.mozilla.org/latest/bug",
#                 headers={"Content-Type": "application/json",
#                          "Accept": "application/json"},
#                 params={"component": "Repository Account Requests",
#                         "changed_after": "2013-06-01",
#                         "include_fields": "id,summary,comments",
#                         "resolution": "FIXED",
#                         "userid": "0",
#                         "cookie": "NONONO",
#                         },
#                 stream=True)
#
#with open("/dev/shm/bugs.json", "w") as f:
#    for chunk in r.iter_content(4096):
#        f.write(chunk)
#
#print(r.status_code)
import collections | |
import json | |
import logging | |
import re | |
import sys | |
import time | |
# Emit bare messages on stdout; the root logger is used directly and lets
# everything from DEBUG upwards through.
logging.basicConfig(format="%(message)s", stream=sys.stdout)
log = logging.getLogger()
log.setLevel(logging.DEBUG)

# Load the previously saved JSON dump of bugs.
with open("/dev/shm/bugs.json", "r") as bugs_file:
    d = json.load(bugs_file)

# Result buckets, filled in while walking the bugs.
moco = []    # (email, bug) for people that are MoCo employees
other = []   # (email, bug) for everybody else
levels = {}  # email -> set(access levels)

# Ordering used for commit access, lowest first; this isn't quite correct
# (it's not actually linear) but close enough.
level_ordering = [
    None,
    "cvs",
    "bzr",
    "hg",
    "l10n",
    "svn",
    "level 1",
    "level 2",
    "level 3",
]
# Something address-like: a run of non-whitespace containing "@", delimited
# by whitespace or by either end of the text.  The negative lookarounds
# (rather than a "(?<=\s)" lookbehind) let an address at the very start of
# the text match as well.
mail_re = re.compile(r"(?<!\S)\S+@\S+(?!\S)")


def search_for_mail(text):
    """Search a given piece of text for something email-address like and
    return it.

    Surrounding punctuation such as brackets, quotes, backticks and a
    trailing period is stripped from the result.  Returns None if nothing
    address-like is found, or if |text| is empty or None.
    """
    if not text:
        return None
    match = mail_re.search(text)
    if match is None:
        return None
    return match.group(0).strip('()"<>.`')
# "<anything> (<nickname>)": group 1 is the text before the last
# parenthesised part, group 2 is the text inside the parentheses.
paren_re = re.compile(r"(.*)\((.*?)\)")


def search_for_target_user(b):
    """Look through bug |b| for an email address that looks like the user being
    granted access.

    |b| is a bug dict with a "summary" string and a "comments" list; each
    comment has a "text" (possibly None) and a "creator" dict with "name"
    (the commenter's address) and "real_name".

    The heuristics are tried in order:
      1. an email address in the summary;
      2. a name following "for" in the summary, matched against the
         commenters' real names / addresses, or against addresses mentioned
         in comments;
      3. an address following "my email" in the first comment;
      4. the author of the first comment, if it talks about "my ... key".

    Returns the address as a string, or None (after logging a warning) if
    nothing matched.
    """
    strip_chars = '()"<>.`'
    summary = b["summary"]
    comments = b["comments"]

    # Any email address in the summary probably matches the person being
    # granted access.
    target = search_for_mail(summary)
    if target:
        return target

    # If the bug summary looks like "XXX for YYY", "YYY" is likely to be the
    # person's name (i.e. "Commit Access (Level 1) for John Doe").
    match = re.search(r"\b[Ff]or\b", summary)
    user_name = summary[match.end():] if match else ""
    user_name = user_name.split(":", 1)[0].split(" - ", 1)[0].split(" to ", 1)[0]
    paren_match = paren_re.search(user_name)
    user_name = user_name.strip().strip(strip_chars).lower()
    user_name = user_name.replace("-", "")

    if len(user_name) > 5:  # real people names have some length...
        if paren_match:
            # Possibly a nickname; look for a commenter whose address is
            # "<nickname>@..."
            # i.e. "Commit Access (Level 1) for John Doe (jdoe)"
            nick_prefix = paren_match.group(2) + "@"
            for c in comments:
                if c["creator"]["name"].startswith(nick_prefix):
                    return c["creator"]["name"]
            user_name = paren_match.group(1).strip().strip(strip_chars).lower()

        # The name may be empty here (e.g. the summary only had a
        # parenthesised nickname).  Without this guard all() over no words
        # is vacuously true and the first commenter would be returned.
        name_parts = user_name.split()
        if name_parts:
            # Look for a user with every part of the name in the real name
            for c in comments:
                creator = c["creator"]
                real_name = (creator.get("real_name") or "").lower()
                real_name = real_name.replace("-", "")
                if all(word in real_name for word in name_parts):
                    return creator["name"]
                # Some people use their name in their email (mostly for
                # Chinese people with very short last names)
                mail_user_name = creator["name"].split("@", 1)[0].lower()
                if all(word in mail_user_name for word in name_parts):
                    return creator["name"]

            # Build a surname of at least 5 characters (to avoid false
            # positives) by gluing trailing name parts together.
            remaining = list(name_parts)
            surname = remaining.pop()
            while len(surname) < 5 and remaining:
                surname = remaining.pop() + surname
            if len(surname) >= 5:
                # Look for a user with an email with the surname in it
                for c in comments:
                    if surname in c["creator"]["name"].lower():
                        return c["creator"]["name"]
                # Look for a mentioned email address with the surname in it
                for c in comments:
                    email = search_for_mail(c["text"] or "")
                    if email and surname in email.lower():
                        return email

    # The remaining heuristics only look at the first comment, with runs of
    # whitespace collapsed to single spaces.
    first_text = (comments[0]["text"] or "") if comments else ""
    comment = re.sub(r"\s+", " ", first_text)

    # Look for a "my email" in a comment; vouchers don't need to provide that.
    # Match case-insensitively on the comment itself so the match offset is
    # valid for slicing it.
    match = re.search(r"my e-?mail", comment, re.I)
    if match:
        email = search_for_mail(comment[match.start():])
        if email:
            return email

    # Look for "my key" in a comment; vouchers don't need to attach their keys
    if comments and re.search(r"(?:\b|^)my(?: \S+){,3} \S*key", comment, re.I):
        return comments[0]["creator"]["name"]

    # For the period tested, 2013-06-01 to 2013-11-04, this was enough to
    # narrow things down to the point where manual checks are feasible
    log.warning("Couldn't find user [%s]", user_name)
    return None
def is_moco_address(email, b):
    """Guess whether |email| belongs to a MoCo employee or intern.

    Returns True if the address (compared case-insensitively) ends in
    "@mozilla.com" or "@m.c", or if any comment on bug |b| contains the
    word "intern"; returns False otherwise.

    |b| is a bug dict with a "comments" list; a comment's "text" may be
    missing or None, in which case it is treated as empty.
    """
    if email.lower().endswith(("@mozilla.com", "@m.c")):
        return True
    for c in b["comments"]:
        # "text" can be present but None, so .get()'s default is not enough.
        words = (c.get("text") or "").lower().split()
        if "intern" in words:
            # interns don't always have moco addresses; normal people wouldn't
            # mention interns otherwise.
            return True
    return False
# Cutoff time; ignore bugs last commented on before this date.
# Sometimes people randomly CC themselves to old bugs...?
timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
cutoff = time.strptime("2013-06-01T00:00:00Z", timestamp_format)

# First pass: classify every bug's target user as MoCo / other and collect
# the access levels mentioned in the bug summaries.
for b in d["bugs"]:
    summary = b["summary"].lower()
    if summary.startswith("rust committer agreement for "):
        # Rust only does committer agreements; presumably they checkin into
        # GitHub instead.
        continue
    if not b["comments"]:
        # Nothing to date the bug by, and nothing to search for addresses.
        log.warning("Skipping bug %s, it has no comments: %s",
                    b["id"], b["summary"])
        continue
    last_comment = b["comments"][-1]
    comment_time = time.strptime(last_comment["creation_time"],
                                 timestamp_format)
    if comment_time < cutoff:
        # Last comment was long ago, this wasn't resolved recently
        continue
    if "upgrade" in summary.split():
        continue  # skip account upgrade requests
    if not any(expected_text in summary
               for expected_text in ("access", "ldap/svn", "account")):
        log.warning("Skipping bug %s, does not look like an account request: %s",
                    b["id"], b["summary"])
        continue
    target_mail = search_for_target_user(b)
    if not target_mail:
        log.warning("bug %s: failed to find mail: %s", b["id"], b["summary"])
        continue
    # At this point, we have a good idea of the user's email address and can
    # add it to one of our lists. But first, let's guess the commit level for
    # fun...
    bag = levels.setdefault(target_mail, set([None]))
    for level in reversed(level_ordering):
        # None is the "unknown" placeholder (already in the bag); testing
        # "None in summary" would raise TypeError.
        if level is not None and level in summary:
            bag.add(level)
            break
    if is_moco_address(target_mail, b):
        moco.append((target_mail, b))
    else:
        other.append((target_mail, b))

# Manually checked the range involved; turns out that if the person getting
# access mentions a mozilla.com email address, it's either their own or the
# person handling the account requests. Do the substitution here.
# This may not be true for different search results.
for email, b in other:
    for i, c in enumerate(b["comments"]):
        if c["creator"]["name"] != email:
            continue
        text = c["text"] or ""
        # Shannon handles account requests, ignore her email showing up
        # NOTE(review): the literal below looks like a redacted placeholder
        # rather than a real address -- confirm and restore the intended
        # address, otherwise this replace() is a no-op.
        text = text.replace("[email protected]", "")
        if "@mozilla.com" in text:
            log.warning("Marking %s as MoCo due to bug %s comment %s",
                        email, b["id"], i)
            log.debug("\n".join(" " + t for t in text.splitlines()))
            moco.append((email, b))

# Squash duplicates by converting everything to dicts and back
moco = dict(moco)
other = dict(other)
# If people are in both buckets, mark as MoCo
for email in moco:
    other.pop(email, None)
moco = list(moco.items())
other = list(other.items())

# Sort on the address only; the bug dicts themselves are not orderable.
log.info("MoCo addresses: %s", len(moco))
for mail, b in sorted(moco, key=lambda item: item[0]):
    log.info(" %s (bug %s: %s)", mail, b["id"], b["summary"])
log.info("Other addresses: %s", len(other))
for mail, b in sorted(other, key=lambda item: item[0]):
    log.info(" %s (bug %s: %s)", mail, b["id"], b["summary"])

# Squash the access levels to the "highest" (I'm not sure if the order is right)
for k in list(levels):
    levels[k] = max(levels[k], key=level_ordering.index)
    if levels[k] is None:
        log.warning("Can't tell access level for %s", k)

# Count people per highest level.  The level keys mix None and strings, so
# sort with None first and the named levels alphabetically after it.
moco_levels = collections.defaultdict(int)
for email, _ in moco:
    moco_levels[levels[email]] += 1
log.info("MoCo levels: %s",
         ", ".join("%s: %s" % (k, v)
                   for k, v in sorted(moco_levels.items(),
                                      key=lambda kv: (kv[0] is not None,
                                                      kv[0] or ""))))
other_levels = collections.defaultdict(int)
for email, _ in other:
    other_levels[levels[email]] += 1
log.info("Other levels: %s",
         ", ".join("%s: %s" % (k, v)
                   for k, v in sorted(other_levels.items(),
                                      key=lambda kv: (kv[0] is not None,
                                                      kv[0] or ""))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment