Skip to content

Instantly share code, notes, and snippets.

@mook
Created November 5, 2013 08:45
Show Gist options
  • Save mook/7315774 to your computer and use it in GitHub Desktop.
Save mook/7315774 to your computer and use it in GitHub Desktop.
Script to figure out how many people got commit access to mozilla.org-related repositories, whether they are MoCo employees/interns, and the "highest" level of access granted.
# This was the command used to grab the JSON dump. To re-fetch, please
# enter a valid userid / cookie; see
# https://wiki.mozilla.org/Bugzilla:REST_API#Authentication for instructions
#import requests
#
#r = requests.get("https://api-dev.bugzilla.mozilla.org/latest/bug",
# headers={"Content-Type": "application/json",
# "Accept": "application/json"},
# params={"component": "Repository Account Requests",
# "changed_after": "2013-06-01",
# "include_fields": "id,summary,comments",
# "resolution": "FIXED",
# "userid": "0",
# "cookie": "NONONO",
# },
# stream=True)
#
#with open("/dev/shm/bugs.json", "w") as f:
# for chunk in r.iter_content(4096):
# f.write(chunk)
#
#print(r.status_code)
import collections
import json
import logging
import re
import sys
import time
# Route every message (DEBUG and above) to stdout, printing only the message
# text itself.
logging.basicConfig(stream=sys.stdout, format="%(message)s")
log = logging.root
log.setLevel(logging.DEBUG)

# Load the previously fetched Bugzilla JSON dump; the rest of the script reads
# the list of bugs from d["bugs"].
with open("/dev/shm/bugs.json", "r") as dump_file:
    d = json.load(dump_file)
# Result buckets filled in while walking the bugs.
# (email, bug) pairs for people that are MoCo employees
moco = []
# (email, bug) pairs for everybody else
other = []
# email -> set(access levels)
levels = {}

# Ordering used for commit access, lowest first; this isn't quite correct
# (it's not actually linear) but close enough.
level_ordering = [
    None,
    "cvs",
    "bzr",
    "hg",
    "l10n",
    "svn",
    "level 1",
    "level 2",
    "level 3",
]
# An email-like token: a whitespace-delimited run of characters containing
# "@".  The lookarounds are phrased as "not preceded / not followed by a
# non-whitespace character" so that an address at the very start or very end
# of the text is matched too (a plain (?<=\s) lookbehind misses the former).
mail_re = re.compile(r"(?<!\S)\S+@\S+(?!\S)")


def search_for_mail(text):
    """Search a given piece of text for something email-address like and
    return it.  Returns None if not found.

    Only the first email-like token in |text| is considered.  Surrounding
    punctuation (parentheses, double quotes, angle brackets, periods and
    backticks) is stripped from the token before it is returned.  Empty or
    None input yields None.
    """
    if not text:
        return None
    match = mail_re.search(text)
    if match is None:
        return None
    return match.group(0).strip('()"<>.`')
# Splits e.g. "John Doe (jdoe)" into the text before the last "(" that has a
# closing ")" (group 1) and the text inside those parentheses (group 2).
paren_re = re.compile(r"(.*)\((.*?)\)")


def search_for_target_user(b):
    """Look through bug |b| for an email address that looks like the user being
    granted access.

    |b| is a bug dict with a "summary" string and a "comments" list.  Each
    comment carries "text" (may be None) and a "creator" dict with "name"
    (the commenter's login, an email address) and "real_name".

    The heuristics are tried in this order:
      1. an email address in the summary;
      2. a name taken from "... for <name>" in the summary, matched against
         the commenters' nicknames, real names and logins, and then (by
         surname) against logins and addresses mentioned in comments;
      3. an address following "my email" in the first comment;
      4. the author of the first comment, if that comment mentions "my ... key".

    Returns the email address, or None (after logging a warning) if nothing
    matched.
    """
    summary = b["summary"]
    comments = b["comments"]

    target = search_for_mail(summary)
    if target:
        # Any email addresses in the summary probably matches the person being
        # granted access
        return target

    # If the bug summary looks like "XXX for YYY", "YYY" is likely to be the
    # person's name (i.e. "Commit Access (Level 1) for John Doe").
    match = re.search(r"\b[Ff]or\b", summary)
    user_name = summary[match.end():] if match else ""
    # Cut off anything after a ":", " - " or " to " separator.
    user_name = user_name.split(":", 1)[0].split(" - ", 1)[0].split(" to ", 1)[0]
    paren_match = paren_re.search(user_name)
    user_name = user_name.strip().strip('()"<>.`').lower()
    # Hyphens are dropped so hyphenated names compare equal to joined ones.
    user_name = "".join(user_name.split("-"))

    if len(user_name) > 5:  # real people names have some length...
        if paren_match:
            # Possibly a nickname; look for a commenter whose login starts
            # with "<nickname>@",
            # i.e. "Commit Access (Level 1) for John Doe (jdoe)"
            nick_prefix = paren_match.group(2) + "@"
            for c in comments:
                if c["creator"]["name"].startswith(nick_prefix):
                    return c["creator"]["name"]
            user_name = paren_match.group(1).strip().strip('()"<>.`').lower()

        words = user_name.split()
        # With no words left (e.g. the summary only had "(nickname)"), all()
        # below would be vacuously true and pick the first commenter, and the
        # surname logic would pop from an empty list; skip name matching.
        if words:
            # Look for a user with every part of the name in the real name
            for c in comments:
                creator = c["creator"]
                real_name = (creator["real_name"] or "").lower()
                real_name = "".join(real_name.split("-"))
                if all(word in real_name for word in words):
                    return creator["name"]
                # Some people use their name in their email (mostly for
                # Chinese people with very short last names)
                mail_user_name = creator["name"].split("@", 1)[0].lower()
                if all(word in mail_user_name for word in words):
                    return creator["name"]

            # Look for a user with email that's this surname, at least 5
            # characters long to avoid false positives; short surnames are
            # extended with the preceding name parts.
            name_parts = list(words)
            surname = name_parts.pop()
            while len(surname) < 5 and name_parts:
                surname = name_parts.pop() + surname
            if len(surname) >= 5:
                # Look for a user with an email with the surname in it
                # (case-insensitively; the surname is already lower case)
                for c in comments:
                    if surname in c["creator"]["name"].lower():
                        return c["creator"]["name"]
                # Look for a mentioned email address with the surname in it
                for c in comments:
                    email = search_for_mail(c["text"] or "")
                    if email and surname in email.lower():
                        return email

    if comments:
        first = comments[0]
        # Collapse all whitespace runs so the patterns below can use " ".
        comment = " ".join(re.split(r"\s+", first["text"] or ""))
        # Look for a "my email" in a comment; vouchers don't need to provide
        # that.  Searching the original text case-insensitively keeps the
        # match offset valid for slicing.
        match = re.search("my e-?mail", comment, re.I)
        if match:
            email = search_for_mail(comment[match.start():])
            if email:
                return email
        # Look for "my key" in a comment; vouchers don't need to attach their
        # keys
        if re.search(r"(?:\b|^)my(?: \S+){,3} \S*key", comment, re.I):
            return first["creator"]["name"]

    # For the period tested, 2013-06-01 to 2013-11-04, this was enough to
    # narrow things down to the point where manual checks are feasible
    log.warning("Couldn't find user [%s]", user_name)
    return None
def is_moco_address(email, b):
    """Guess whether |email| belongs to a MoCo employee or intern.

    Returns True if |email| (compared case-insensitively) ends in
    "@mozilla.com" or "@m.c", or if any comment on bug |b| contains the
    stand-alone word "intern"; otherwise returns False.

    |b| is a bug dict with a "comments" list; a comment's "text" may be
    missing or None, in which case it is treated as empty.
    """
    if email.lower().endswith(("@mozilla.com", "@m.c")):
        return True
    for c in b["comments"]:
        # .get("text", "") would still return None for an explicit null, so
        # fall back with "or" like the rest of the script does.
        text = c.get("text") or ""
        if "intern" in text.lower().split():
            # interns don't always have moco addresses; normal people wouldn't
            # mention interns otherwise.
            return True
    return False
# Cutoff time; ignore bugs last commented on before this date.
# Sometimes people randomly CC themselves to old bugs...?
cutoff = time.strptime("2013-06-01T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")

# Walk every bug in the dump, work out who was granted access and at which
# level, and file the (email, bug) pair under either |moco| or |other|.
for b in d["bugs"]:
    summary = b["summary"].lower()
    if summary.startswith("rust committer agreement for "):
        # Rust only does committer agreements; presumably they checkin into
        # GitHub instead.
        continue

    if not b["comments"]:
        # Nothing to date the bug by or to identify the user from.
        log.warning("Skipping bug %s, it has no comments: %s",
                    b["id"], b["summary"])
        continue
    last_comment = b["comments"][-1]
    comment_time = time.strptime(last_comment["creation_time"],
                                 "%Y-%m-%dT%H:%M:%SZ")
    if comment_time < cutoff:
        # Last comment was long ago, this wasn't resolved recently
        continue

    if "upgrade" in summary.split():
        continue  # skip account upgrade requests

    if not any(expected_text in summary
               for expected_text in ("access", "ldap/svn", "account")):
        log.warning("Skipping bug %s, does not look like an account request: %s",
                    b["id"], b["summary"])
        continue

    target_mail = search_for_target_user(b)
    if not target_mail:
        log.warning("bug %s: failed to find mail: %s", b["id"], b["summary"])
        continue

    # At this point, we have a good idea of the user's email address and can
    # add it to one of our lists. But first, let's guess the commit level for
    # fun...
    bag = levels.setdefault(target_mail, {None})
    # Try the highest level first and stop at the first one mentioned.  The
    # None placeholder in level_ordering must be skipped: "None in <str>"
    # raises TypeError.
    for level in reversed(level_ordering):
        if level is not None and level in summary:
            bag.add(level)
            break

    if is_moco_address(target_mail, b):
        moco.append((target_mail, b))
    else:
        other.append((target_mail, b))
# Manually checked the range involved; turns out that if the person getting
# access mentions a mozilla.com email address, it's either their own or the
# person handling the account requests. Do the substitution here.
# This may not be true for different search results.
for email, b in other:
    for i, c in enumerate(b["comments"]):
        # Only comments written by the person getting access count.
        if c["creator"]["name"] != email:
            continue
        text = c["text"] or ""
        # Shannon handles account requests, ignore her email showing up
        # NOTE(review): the literal below looks like a placeholder left by
        # email redaction rather than a real address - confirm and restore
        # the intended address; as written only that exact text is removed.
        text = text.replace("[email protected]", "")
        if "@mozilla.com" not in text:
            continue
        log.warning("Marking %s as MoCo due to bug %s comment %s",
                    email, b["id"], i)
        log.debug("\n".join(" " + t for t in text.splitlines()))
        # Duplicates (several matching comments) are squashed later when the
        # list is converted to a dict.
        moco.append((email, b))
# Squash duplicates by converting everything to dicts and back; for an email
# that appears more than once, the last bug seen wins.
moco = dict(moco)
other = dict(other)
# If people are in both buckets, mark as MoCo
for email in moco:
    other.pop(email, None)
# Materialise as lists so the pairs can be measured, sorted and re-iterated
# regardless of whether items() returns a list or a view.
moco = list(moco.items())
other = list(other.items())

# Sort on the email only; emails are unique here, and comparing the bug dicts
# themselves is not supported everywhere.
log.info("MoCo addresses: %s", len(moco))
for mail, b in sorted(moco, key=lambda pair: pair[0]):
    log.info(" %s (bug %s: %s)", mail, b["id"], b["summary"])
log.info("Other addresses: %s", len(other))
for mail, b in sorted(other, key=lambda pair: pair[0]):
    log.info(" %s (bug %s: %s)", mail, b["id"], b["summary"])

# Squash the access levels to the "highest" (I'm not sure if the order is right)
for k in list(levels):
    levels[k] = max(levels[k], key=level_ordering.index)
    if levels[k] is None:
        log.warning("Can't tell access level for %s", k)


def _level_sort_key(item):
    """Order (level, count) pairs with the unknown level (None) first and the
    named levels alphabetically after it; None and str cannot be compared
    directly on Python 3."""
    level = item[0]
    return (level is not None, level or "")


# Count how many people ended up at each level, per bucket.
moco_levels = collections.Counter(levels[email] for email, _ in moco)
log.info("MoCo levels: %s",
         ", ".join("%s: %s" % (k, v)
                   for k, v in sorted(moco_levels.items(), key=_level_sort_key)))
other_levels = collections.Counter(levels[email] for email, _ in other)
log.info("Other levels: %s",
         ", ".join("%s: %s" % (k, v)
                   for k, v in sorted(other_levels.items(), key=_level_sort_key)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment