mook · November 5, 2013 08:45
diff --git a/moz-committers.py b/moz-committers.py
 # This was the command used to grab the JSON dump.  To re-fetch, please
 # enter valid userid / cookie; see
 # https://wiki.mozilla.org/Bugzilla:REST_API#Authentication for instructions

 #import requests
 #
 #r = requests.get("https://api-dev.bugzilla.mozilla.org/latest/bug",
 #                 headers={"Content-Type": "application/json",
 #                          "Accept": "application/json"},
 #                 params={"component": "Repository Account Requests",
 #                         "changed_after": "2013-06-01",
 #                         "include_fields": "id,summary,comments",
 #                         "resolution": "FIXED",
 #                         "userid": "0",
 #                         "cookie": "NONONO",
 #                         },
 #                 stream=True)
 #
 #with open("/dev/shm/bugs.json", "w") as f:
 #    for chunk in r.iter_content(4096):
 #        f.write(chunk)
 #
 #print(r.status_code)

 import collections
 import json
 import logging
 import re
 import sys
 import time

 # Emit bare messages on stdout and let everything down to DEBUG through.
 logging.basicConfig(stream=sys.stdout, format="%(message)s")
 log = logging.getLogger()  # no name given, so this is the root logger
 log.setLevel(logging.DEBUG)

 # Load the previously saved JSON bug dump.
 with open("/dev/shm/bugs.json", "r") as f:
     d = json.load(f)

 # Accumulators filled in while walking the bugs.
 moco, other = [], []  # (email, bug) pairs: MoCo employees / everybody else
 levels = {}  # email -> set(access levels)

 # Ordering used for commit access; this isn't quite correct (it's not actually
 # linear) but close enough.
 level_ordering = [
     None,
     "cvs", "bzr", "hg", "l10n", "svn",
     "level 1", "level 2", "level 3",
 ]

 # An address-like token: a run of non-whitespace characters with an "@" inside
 # it, delimited by whitespace or by either end of the text.
 mail_re = re.compile(r"(?<!\S)\S+@\S+(?!\S)")


 def search_for_mail(text):
     """Return the first email-address-like token in |text|, or None.

     A token is a whitespace-delimited run of characters with an "@" inside
     it; it may also sit at the very start or end of |text|.  Surrounding
     parentheses, double quotes, angle brackets, periods and backticks are
     stripped from the result.  An empty or None |text| yields None.
     """
     if not text:
         return None
     match = mail_re.search(text)
     if match is None:
         return None
     return match.group(0).strip('()"<>.`')

 # Splits "<text> (<inner>)": group 1 is what precedes a parenthesised part,
 # group 2 is what is inside the parentheses.
 paren_re = re.compile(r"(.*)\((.*?)\)")


 def _clean_name(text):
     """Normalize a name fragment taken from a bug summary for fuzzy matching."""
     text = text.strip().strip('()"<>.`').lower()
     # Hyphens are dropped so "Jean-Luc" and "JeanLuc" compare equal.
     return text.replace("-", "")


 def search_for_target_user(b):
     """Guess the email address of the user being granted access in bug |b|.

     |b| is a bug dict with a "summary" string and a "comments" list.  Each
     comment has a "text" (possibly None) and a "creator" dict whose "name" is
     treated as the commenter's email address and whose "real_name" is the
     display name.

     Heuristics, tried in order:
       1. an email address in the summary;
       2. for a summary like "... for John Doe (jdoe)", a commenter whose
          address starts with "jdoe@";
       3. a commenter whose real name, or the local part of whose address,
          contains every word of the name found after "for";
       4. a commenter address, then any address mentioned in a comment, that
          contains the surname (at least 5 characters);
       5. an address following "my email" in the first comment;
       6. the author of the first comment if it mentions "my ... key".

     Returns the address as a string, or None (after logging a warning) when
     nothing matches.
     """
     summary = b["summary"]
     comments = b["comments"]

     target = search_for_mail(summary)
     if target:
         # Any email addresses in the summary probably matches the person being
         # granted access
         return target

     # If the bug summary looks like "XXX for YYY", "YYY" is likely to be the
     # person's name (i.e. "Commit Access (Level 1) for John Doe").
     match = re.search(r"\b[Ff]or\b", summary)
     raw_name = summary[match.end():] if match else ""
     # Drop trailing detail introduced by ":", " - " or " to ".
     raw_name = raw_name.split(":", 1)[0].split(" - ", 1)[0].split(" to ", 1)[0]
     paren_match = paren_re.search(raw_name)
     user_name = _clean_name(raw_name)

     if len(user_name) > 5:  # real people names have some length...
         if paren_match:
             # Possibly a nickname; look for a commenter whose address is
             # "<nickname>@...", i.e. "Commit Access (Level 1) for John Doe (jdoe)"
             nickname = paren_match.group(2).strip().lower()
             if nickname:
                 for c in comments:
                     if c["creator"]["name"].lower().startswith(nickname + "@"):
                         return c["creator"]["name"]
             user_name = _clean_name(paren_match.group(1))

         words = user_name.split()
         # With no words left, all() below would be vacuously true and pick
         # the first commenter, so skip the name-based heuristics entirely.
         if words:
             # Look for a user with every part of the name in the real name
             for c in comments:
                 creator = c["creator"]
                 real_name = (creator.get("real_name") or "").lower()
                 real_name = real_name.replace("-", "")
                 if all(word in real_name for word in words):
                     return creator["name"]
                 # Some people use their name in their email (mostly for
                 # Chinese people with very short last names)
                 mail_user_name = creator["name"].split("@", 1)[0].lower()
                 if all(word in mail_user_name for word in words):
                     return creator["name"]

             # Look for a user with email that's this surname, at least 5
             # characters long to avoid false positives; short surnames are
             # extended with the preceding name parts.
             surname = words.pop()
             while len(surname) < 5 and words:
                 surname = words.pop() + surname
             if len(surname) >= 5:
                 # Look for a user with an email with the surname in it
                 for c in comments:
                     if surname in c["creator"]["name"].lower():
                         return c["creator"]["name"]
                 # Look for a mentioned email address with the surname in it
                 for c in comments:
                     email = search_for_mail(c["text"] or "")
                     if email and surname in email.lower():
                         return email

     if comments:
         first = comments[0]
         # Collapse every run of whitespace into a single space.
         comment = " ".join(re.split(r"\s+", first["text"] or ""))
         # Look for a "my email" in a comment; vouchers don't need to provide
         # that.  Matching case-insensitively on the original text keeps the
         # match offset valid for slicing.
         match = re.search(r"my e-?mail", comment, re.I)
         if match:
             email = search_for_mail(comment[match.start():])
             if email:
                 return email
         # Look for "my key" in a comment; vouchers don't need to attach their
         # keys
         if re.search(r"(?:\b|^)my(?: \S+){,3} \S*key", comment, re.I):
             return first["creator"]["name"]

     # For the period tested, 2013-06-01 to 2013-11-04, this was enough to
     # narrow things down to the point where manual checks are feasible
     log.warning("Couldn't find user [%s]", user_name)
     return None

 def is_moco_address(email, b):
     """Return True if |email| or bug |b| suggests a MoCo employee.

     That is the case when |email| ends in "@mozilla.com" or "@m.c"
     (case-insensitively), or when any comment in |b| contains the
     stand-alone word "intern".  A comment whose "text" is missing or None
     is treated as empty.
     """
     if email.lower().endswith(("@mozilla.com", "@m.c")):
         return True
     for c in b["comments"]:
         # .get("text", "") would still hand back None for an explicit null.
         if "intern" in (c.get("text") or "").lower().split():
             # interns don't always have moco addresses; normal people wouldn't
             # mention interns otherwise.
             return True
     return False

 # Cutoff time; ignore bugs last commented on before this date.
 # Sometimes people randomly CC themselves to old bugs...?
 cutoff = time.strptime("2013-06-01T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")

 # Walk every bug in the dump, work out whose account it is about, and file
 # that address under |moco| or |other| while recording the access level
 # mentioned in the summary.
 for b in d["bugs"]:
     summary = b["summary"].lower()
     if summary.startswith("rust committer agreement for "):
         # Rust only does committer agreements; presumably they checkin into
         # GitHub instead.
         continue

     last_comment = b["comments"][-1]
     comment_time = time.strptime(last_comment["creation_time"],
                                  "%Y-%m-%dT%H:%M:%SZ")
     if comment_time < cutoff:
         # Last comment was long ago, this wasn't resolved recently
         continue

     if "upgrade" in summary.split():
         continue  # skip account upgrade requests

     if not any(text in summary for text in ("access", "ldap/svn", "account")):
         log.warning("Skipping bug %s, does not look like an account request: %s",
                     b["id"], b["summary"])
         continue

     target_mail = search_for_target_user(b)
     if not target_mail:
         log.warning("bug %s: failed to find mail: %s", b["id"], b["summary"])
         continue

     # At this point, we have a good idea of the user's email address and can
     # add it to one of our lists.  But first, let's guess the commit level for
     # fun...
     bag = levels.setdefault(target_mail, {None})
     for level in reversed(level_ordering):
         # None is the "unknown" placeholder already in the bag; testing
         # `None in summary` would raise TypeError, so skip it.
         if level is not None and level in summary:
             bag.add(level)
             break

     if is_moco_address(target_mail, b):
         moco.append((target_mail, b))
     else:
         other.append((target_mail, b))

 # Manually checked the range involved; turns out that if the person getting
 # access mentions a mozilla.com email address, it's either their own or the
 # person handling the account requests.  Do the substitution here.
 # This may not be true for different search results.
 for email, b in other:
     for i, c in enumerate(b["comments"]):
         if c["creator"]["name"] != email:
             continue
         text = c["text"] or ""
         # Shannon handles account requests, ignore her email showing up
         # NOTE(review): the literal below looks like an obfuscated
         # placeholder rather than a real address, in which case this replace
         # never matches -- confirm the intended address.
         text = text.replace("[email protected]", "")
         if "@mozilla.com" not in text:
             continue
         log.warning("Marking %s as MoCo due to bug %s comment %s",
                     email, b["id"], i)
         log.debug("\n".join("    " + t for t in text.splitlines()))
         moco.append((email, b))

 # Squash duplicates by converting everything to dicts and back
 moco = dict(moco)
 other = dict(other)
 # If people are in both buckets, mark as MoCo
 for email in moco:
     other.pop(email, None)
 # Materialise as lists so the result does not depend on dict-view semantics.
 moco = list(moco.items())
 other = list(other.items())

 # Report both buckets sorted by address; addresses are unique by now, so the
 # bug dicts never need to be compared.
 log.info("MoCo addresses: %s", len(moco))
 for mail, b in sorted(moco, key=lambda item: item[0]):
     log.info("    %s (bug %s: %s)", mail, b["id"], b["summary"])
 log.info("Other addresses: %s", len(other))
 for mail, b in sorted(other, key=lambda item: item[0]):
     log.info("    %s (bug %s: %s)", mail, b["id"], b["summary"])


 # Squash the access levels to the "highest" (I'm not sure if the order is right)
 for k in list(levels):
     # Each value is a set of levels; keep the one latest in level_ordering.
     levels[k] = max(levels[k], key=level_ordering.index)
     if levels[k] is None:
         log.warning("Can't tell access level for %s", k)


 def _level_sort_key(item):
     """Sort (level, count) pairs with None first, then by level name.

     None and str cannot be compared directly on Python 3.
     """
     level = item[0]
     return (level is not None, level or "")


 # Count how many addresses in each bucket ended up at each level.
 moco_levels = collections.defaultdict(int)
 for email, _ in moco:
     moco_levels[levels[email]] += 1
 log.info("MoCo levels: %s",
          ", ".join("%s: %s" % (k, v)
                    for k, v in sorted(moco_levels.items(), key=_level_sort_key)))
 other_levels = collections.defaultdict(int)
 for email, _ in other:
     other_levels[levels[email]] += 1
 log.info("Other levels: %s",
          ", ".join("%s: %s" % (k, v)
                    for k, v in sorted(other_levels.items(), key=_level_sort_key)))
	# This was the command used to grab the JSON dump. To re-fetch, please
	# enter valid userid / cookie; see
	# https://wiki.mozilla.org/Bugzilla:REST_API#Authentication for instructions

	#import requests
	#
	#r = requests.get("https://api-dev.bugzilla.mozilla.org/latest/bug",
	# headers={"Content-Type": "application/json",
	# "Accept": "application/json"},
	# params={"component": "Repository Account Requests",
	# "changed_after": "2013-06-01",
	# "include_fields": "id,summary,comments",
	# "resolution": "FIXED",
	# "userid": "0",
	# "cookie": "NONONO",
	# },
	# stream=True)
	#
	#with open("/dev/shm/bugs.json", "w") as f:
	# for chunk in r.iter_content(4096):
	# f.write(chunk)
	#
	#print(r.status_code)

	import collections
	import json
	import logging
	import re
	import sys
	import time

	# Emit bare messages on stdout and let everything down to DEBUG through.
	logging.basicConfig(format="%(message)s", stream=sys.stdout)
	log = logging.root
	log.setLevel(logging.DEBUG)

	# Load the previously saved JSON bug dump.
	with open("/dev/shm/bugs.json", "r") as f:
	    d = json.load(f)

	moco = []  # (email, bug) for people that are MoCo employees
	other = []  # (email, bug) for everybody else
	levels = {}  # email -> set(access levels)

	# Ordering used for commit access; this isn't quite correct (it's not actually
	# linear) but close enough.
	level_ordering = [None, "cvs", "bzr", "hg", "l10n", "svn",
	                  "level 1", "level 2", "level 3"]

	# An address-like token: a run of non-whitespace characters with an "@" inside
	# it, delimited by whitespace or by either end of the text.
	mail_re = re.compile(r"(?<!\S)\S+@\S+(?!\S)")


	def search_for_mail(text):
	    """Return the first email-address-like token in |text|, or None.

	    A token is a whitespace-delimited run of characters with an "@" inside
	    it; it may also sit at the very start or end of |text|.  Surrounding
	    parentheses, double quotes, angle brackets, periods and backticks are
	    stripped from the result.  An empty or None |text| yields None.
	    """
	    if not text:
	        return None
	    match = mail_re.search(text)
	    if match is None:
	        return None
	    return match.group(0).strip('()"<>.`')

	# Splits "<text> (<inner>)": group 1 is what precedes a parenthesised part,
	# group 2 is what is inside the parentheses.
	paren_re = re.compile(r"(.*)\((.*?)\)")


	def _clean_name(text):
	    """Normalize a name fragment taken from a bug summary for fuzzy matching."""
	    text = text.strip().strip('()"<>.`').lower()
	    # Hyphens are dropped so "Jean-Luc" and "JeanLuc" compare equal.
	    return text.replace("-", "")


	def search_for_target_user(b):
	    """Guess the email address of the user being granted access in bug |b|.

	    |b| is a bug dict with a "summary" string and a "comments" list.  Each
	    comment has a "text" (possibly None) and a "creator" dict whose "name" is
	    treated as the commenter's email address and whose "real_name" is the
	    display name.

	    Heuristics, tried in order:
	      1. an email address in the summary;
	      2. for a summary like "... for John Doe (jdoe)", a commenter whose
	         address starts with "jdoe@";
	      3. a commenter whose real name, or the local part of whose address,
	         contains every word of the name found after "for";
	      4. a commenter address, then any address mentioned in a comment, that
	         contains the surname (at least 5 characters);
	      5. an address following "my email" in the first comment;
	      6. the author of the first comment if it mentions "my ... key".

	    Returns the address as a string, or None (after logging a warning) when
	    nothing matches.
	    """
	    summary = b["summary"]
	    comments = b["comments"]

	    target = search_for_mail(summary)
	    if target:
	        # Any email addresses in the summary probably matches the person being
	        # granted access
	        return target

	    # If the bug summary looks like "XXX for YYY", "YYY" is likely to be the
	    # person's name (i.e. "Commit Access (Level 1) for John Doe").
	    match = re.search(r"\b[Ff]or\b", summary)
	    raw_name = summary[match.end():] if match else ""
	    # Drop trailing detail introduced by ":", " - " or " to ".
	    raw_name = raw_name.split(":", 1)[0].split(" - ", 1)[0].split(" to ", 1)[0]
	    paren_match = paren_re.search(raw_name)
	    user_name = _clean_name(raw_name)

	    if len(user_name) > 5:  # real people names have some length...
	        if paren_match:
	            # Possibly a nickname; look for a commenter whose address is
	            # "<nickname>@...", i.e. "Commit Access (Level 1) for John Doe (jdoe)"
	            nickname = paren_match.group(2).strip().lower()
	            if nickname:
	                for c in comments:
	                    if c["creator"]["name"].lower().startswith(nickname + "@"):
	                        return c["creator"]["name"]
	            user_name = _clean_name(paren_match.group(1))

	        words = user_name.split()
	        # With no words left, all() below would be vacuously true and pick
	        # the first commenter, so skip the name-based heuristics entirely.
	        if words:
	            # Look for a user with every part of the name in the real name
	            for c in comments:
	                creator = c["creator"]
	                real_name = (creator.get("real_name") or "").lower()
	                real_name = real_name.replace("-", "")
	                if all(word in real_name for word in words):
	                    return creator["name"]
	                # Some people use their name in their email (mostly for
	                # Chinese people with very short last names)
	                mail_user_name = creator["name"].split("@", 1)[0].lower()
	                if all(word in mail_user_name for word in words):
	                    return creator["name"]

	            # Look for a user with email that's this surname, at least 5
	            # characters long to avoid false positives; short surnames are
	            # extended with the preceding name parts.
	            surname = words.pop()
	            while len(surname) < 5 and words:
	                surname = words.pop() + surname
	            if len(surname) >= 5:
	                # Look for a user with an email with the surname in it
	                for c in comments:
	                    if surname in c["creator"]["name"].lower():
	                        return c["creator"]["name"]
	                # Look for a mentioned email address with the surname in it
	                for c in comments:
	                    email = search_for_mail(c["text"] or "")
	                    if email and surname in email.lower():
	                        return email

	    if comments:
	        first = comments[0]
	        # Collapse every run of whitespace into a single space.
	        comment = " ".join(re.split(r"\s+", first["text"] or ""))
	        # Look for a "my email" in a comment; vouchers don't need to provide
	        # that.  Matching case-insensitively on the original text keeps the
	        # match offset valid for slicing.
	        match = re.search(r"my e-?mail", comment, re.I)
	        if match:
	            email = search_for_mail(comment[match.start():])
	            if email:
	                return email
	        # Look for "my key" in a comment; vouchers don't need to attach their
	        # keys
	        if re.search(r"(?:\b|^)my(?: \S+){,3} \S*key", comment, re.I):
	            return first["creator"]["name"]

	    # For the period tested, 2013-06-01 to 2013-11-04, this was enough to
	    # narrow things down to the point where manual checks are feasible
	    log.warning("Couldn't find user [%s]", user_name)
	    return None

	def is_moco_address(email, b):
	    """Return True if |email| or bug |b| suggests a MoCo employee.

	    That is the case when |email| ends in "@mozilla.com" or "@m.c"
	    (case-insensitively), or when any comment in |b| contains the
	    stand-alone word "intern".  A comment whose "text" is missing or None
	    is treated as empty.
	    """
	    if email.lower().endswith(("@mozilla.com", "@m.c")):
	        return True
	    for c in b["comments"]:
	        # .get("text", "") would still hand back None for an explicit null.
	        if "intern" in (c.get("text") or "").lower().split():
	            # interns don't always have moco addresses; normal people wouldn't
	            # mention interns otherwise.
	            return True
	    return False

	# Cutoff time; ignore bugs last commented on before this date.
	# Sometimes people randomly CC themselves to old bugs...?
	cutoff = time.strptime("2013-06-01T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")

	# Walk every bug in the dump, work out whose account it is about, and file
	# that address under |moco| or |other| while recording the access level
	# mentioned in the summary.
	for b in d["bugs"]:
	    summary = b["summary"].lower()
	    if summary.startswith("rust committer agreement for "):
	        # Rust only does committer agreements; presumably they checkin into
	        # GitHub instead.
	        continue

	    last_comment = b["comments"][-1]
	    comment_time = time.strptime(last_comment["creation_time"],
	                                 "%Y-%m-%dT%H:%M:%SZ")
	    if comment_time < cutoff:
	        # Last comment was long ago, this wasn't resolved recently
	        continue

	    if "upgrade" in summary.split():
	        continue  # skip account upgrade requests

	    if not any(text in summary for text in ("access", "ldap/svn", "account")):
	        log.warning("Skipping bug %s, does not look like an account request: %s",
	                    b["id"], b["summary"])
	        continue

	    target_mail = search_for_target_user(b)
	    if not target_mail:
	        log.warning("bug %s: failed to find mail: %s", b["id"], b["summary"])
	        continue

	    # At this point, we have a good idea of the user's email address and can
	    # add it to one of our lists. But first, let's guess the commit level for
	    # fun...
	    bag = levels.setdefault(target_mail, {None})
	    for level in reversed(level_ordering):
	        # None is the "unknown" placeholder already in the bag; testing
	        # `None in summary` would raise TypeError, so skip it.
	        if level is not None and level in summary:
	            bag.add(level)
	            break

	    if is_moco_address(target_mail, b):
	        moco.append((target_mail, b))
	    else:
	        other.append((target_mail, b))

	# Manually checked the range involved; turns out that if the person getting
	# access mentions a mozilla.com email address, it's either their own or the
	# person handling the account requests. Do the substitution here.
	# This may not be true for different search results.
	for email, b in other:
	    for i, c in enumerate(b["comments"]):
	        if c["creator"]["name"] != email:
	            continue
	        text = c["text"] or ""
	        # Shannon handles account requests, ignore her email showing up
	        # NOTE(review): the literal below looks like an obfuscated
	        # placeholder rather than a real address, in which case this replace
	        # never matches -- confirm the intended address.
	        text = text.replace("[email protected]", "")
	        if "@mozilla.com" not in text:
	            continue
	        log.warning("Marking %s as MoCo due to bug %s comment %s",
	                    email, b["id"], i)
	        # Quote the comment text, indented by four spaces.
	        log.debug("\n".join("    " + t for t in text.splitlines()))
	        moco.append((email, b))

	# Squash duplicates by converting everything to dicts and back
	moco = dict(moco)
	other = dict(other)
	# If people are in both buckets, mark as MoCo
	for email in moco:
	    other.pop(email, None)
	# Materialise as lists so the result does not depend on dict-view semantics.
	moco = list(moco.items())
	other = list(other.items())

	# Report both buckets sorted by address; addresses are unique by now, so the
	# bug dicts never need to be compared.
	log.info("MoCo addresses: %s", len(moco))
	for mail, b in sorted(moco, key=lambda item: item[0]):
	    log.info("    %s (bug %s: %s)", mail, b["id"], b["summary"])
	log.info("Other addresses: %s", len(other))
	for mail, b in sorted(other, key=lambda item: item[0]):
	    log.info("    %s (bug %s: %s)", mail, b["id"], b["summary"])


	# Squash the access levels to the "highest" (I'm not sure if the order is right)
	for k in list(levels):
	    # Each value is a set of levels; keep the one latest in level_ordering.
	    levels[k] = max(levels[k], key=level_ordering.index)
	    if levels[k] is None:
	        log.warning("Can't tell access level for %s", k)


	def _level_sort_key(item):
	    """Sort (level, count) pairs with None first, then by level name.

	    None and str cannot be compared directly on Python 3.
	    """
	    level = item[0]
	    return (level is not None, level or "")


	# Count how many addresses in each bucket ended up at each level.
	moco_levels = collections.defaultdict(int)
	for email, _ in moco:
	    moco_levels[levels[email]] += 1
	log.info("MoCo levels: %s",
	         ", ".join("%s: %s" % (k, v)
	                   for k, v in sorted(moco_levels.items(), key=_level_sort_key)))
	other_levels = collections.defaultdict(int)
	for email, _ in other:
	    other_levels[levels[email]] += 1
	log.info("Other levels: %s",
	         ", ".join("%s: %s" % (k, v)
	                   for k, v in sorted(other_levels.items(), key=_level_sort_key)))