badzil · February 16, 2011 23:50
diff --git a/ADS_affiliation_parser.py b/ADS_affiliation_parser.py
 import re

 # Patterns are raw strings so that backslashes reach the regex engine
 # untouched instead of being read as (invalid) string escape sequences.

 # A run of two or more whitespace characters.
 _RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
 # A label of two or more capital letters followed by '('; group 1 is the label.
 _RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
 # ')' plus an optional ',' or ';', one space, then the next label and its '('.
 _RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
 # An <EMAIL>...</EMAIL> element with optional surrounding ';' and single
 # whitespace characters; group 1 is the address.
 _RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')

 def clean_affiliations_string(affiliations_string):
     """
     Normalise the whitespace of a raw affiliations string.

     Leading and trailing whitespace is removed first, then every run
     matched by _RE_MULTIPLE_SPACES is replaced by one space character.
     """
     trimmed = affiliations_string.strip()
     return _RE_MULTIPLE_SPACES.sub(' ', trimmed)

 def get_affiliations(affiliations_string):
     """
     Parse a labelled affiliations string such as 'AA(aff1), AB(aff2)'.

     Returns a tuple (affiliations, emails) of two dictionaries keyed by the
     integer index computed from each label. An entry is added to emails
     only for an affiliation that contains an <EMAIL> tag, and that email is
     removed from the affiliation text stored in affiliations.

     Raises Exception when two affiliations resolve to the same index.
     """
     # Whitespace is normalised up front so the extraction helper can work
     # on a cleaned string.
     remaining = clean_affiliations_string(affiliations_string)
     affiliations, emails = {}, {}

     # Consume one affiliation per iteration until nothing is left.
     while remaining:
         remaining, index, text = _extract_first_affiliation(remaining)
         if index in affiliations:
             raise Exception('Double label.')

         if '<EMAIL>' in text:
             text, emails[index] = _extract_email_from_affiliation(text)
         affiliations[index] = text

     return (affiliations, emails)

 def _extract_email_from_affiliation(affiliation):
     """
     Separate an affiliation from the email address embedded in it.

     Returns a tuple:
     * the affiliation with every occurrence of the matched <EMAIL> element
       text removed.
     * the email, i.e. the content of the first <EMAIL>...</EMAIL> element.

     Raises Exception when _RE_EMAIL finds no element in the affiliation.
     """
     found = _RE_EMAIL.search(affiliation)
     if found is None:
         raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)

     address = found.group(1)
     without_email = affiliation.replace(found.group(0), '')
     return (without_email, address)

 def _extract_first_affiliation(affiliations_string):
     """
     Extract the first affiliation from the affiliations string.

     The string must start with a label (two or more capital letters)
     immediately followed by '('. The affiliation runs up to the parenthesis
     that closes this one; nested parentheses inside it are allowed.

     Returns a tuple of:
     * the affiliations string without the first affiliation.
     * the index of the first affiliation.
     * the first affiliation, stripped of surrounding whitespace.

     Raises Exception when the string does not start with a label, when the
     opening parenthesis is never closed, or when the closing parenthesis is
     followed by anything other than the end of the string or a separator
     and another label.
     """
     match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
     if match is None:
         raise Exception('Prefix not found: %s' % affiliations_string)

     index = get_index_from_label(match.group(1))

     # Drop the label and its opening parenthesis; the match is anchored at
     # the start of the string, so slicing at its end is sufficient.
     affiliations_string = affiliations_string[match.end():]

     # Walk the string counting parentheses. The one already consumed with
     # the label counts as open, so the affiliation ends where the depth
     # drops back to zero.
     depth = 1
     closing = None
     for position, char in enumerate(affiliations_string):
         if char == '(':
             depth += 1
         elif char == ')':
             depth -= 1
             if depth == 0:
                 closing = position
                 break

     if closing is None:
         raise Exception('Problem of affiliation with unbalanced parenthesis.')

     # Split around the closing parenthesis, which stays at the head of the
     # remainder so the checks below can see it.
     affiliation = affiliations_string[:closing].strip()
     affiliations_string = affiliations_string[closing:].strip()

     if affiliations_string == ')':
         # This was the last affiliation.
         affiliations_string = ''
     elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
         # Another affiliation follows: remove the ')' and the separator so
         # the remainder starts with the next label.
         affiliations_string = re.sub(r'\)[;,]? ', '', affiliations_string, count=1)
     else:
         # The parentheses are balanced here; what follows them is neither
         # the end of the string nor another labelled affiliation.
         raise Exception('Unexpected text after affiliation: %s' % affiliations_string)

     return (affiliations_string, index, affiliation)

 # Dictionary used to cache the results of the computation for the labels.
 _LABEL_INDEX = {}

 def get_index_from_label(label):
     """
     Returns an integer index for an affiliation label, ie:
         AA -> 1
         AB -> 2
         BA -> 27
         AAA -> 677

     Each letter is read as a base-26 digit worth ord(letter) - 64, so 'A'
     is 1 and 'Z' is 26, with the leftmost letter the most significant. 26
     is then subtracted so that 'AA' maps to 1.

     Computed indexes are stored in _LABEL_INDEX and reused on later calls
     with the same label.
     """
     index = _LABEL_INDEX.get(label)
     if index is None:
         # Accumulate the digits from the most significant letter down.
         index = 0
         for char in label:
             index = index * 26 + (ord(char) - 64)

         # Because we consider 'A' as 1 and not 0, we need to offset by 26.
         index -= 26

         # Remember the result; without this the cache would stay empty.
         _LABEL_INDEX[label] = index

     return index

 # Each case is (input string, (expected affiliations, expected emails)).
 TESTS = [
     # One affiliation, first label.
     ('AA(aff1)', ({1: 'aff1'}, {})),
     # One affiliation whose label is not the first one.
     ('AB(aff1)', ({2: 'aff1'}, {})),
     # Two affiliations separated by ', ', '; ' or a single space.
     ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
     ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
     ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
     # Three affiliations.
     (
         'AA(aff1), AB(aff2), AC(aff3)',
         ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
     ),
     # Spaces inside the affiliation text.
     ('AA(aff with space)', ({1: 'aff with space'}, {})),
     # Commas and nested parentheses inside the affiliation text.
     (
         'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
         ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge) USA'}, {}),
     ),
     # Labels given out of order.
     (
         'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
         ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
     ),
     # Email element inside the first affiliation.
     (
         'AA(CERN, Switzerland <EMAIL>[email protected]</EMAIL>), AB(CfA (Cambridge))',
         ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
     ),
     # Same, with a doubled space and a space after an opening parenthesis.
     (
         'AA(CERN,  Switzerland <EMAIL>[email protected]</EMAIL>), AB( CfA (Cambridge))',
         ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
     ),
     # Padding around the text and the email, plus a ';' after the email.
     (
         'AA(  CERN  Geneva  ), AB( Another affiliation    <EMAIL>[email protected]  </EMAIL>;)',
         ({1: 'CERN Geneva', 2: 'Another affiliation'}, {2: '[email protected]'}),
     ),
     # Labels longer than two letters.
     (
         'AA(aff1), AAA(aff2), AAAA(aff3)',
         ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
     ),
 ]

 def test_get_affiliations():
     """
     Run get_affiliations over every case in TESTS and report mismatches.

     Prints one 'Test failed' message, showing the input and the expected
     output, for each case whose result differs from the expectation, then
     a summary line with the number of cases. A mismatch is only printed;
     this function does not raise for it.
     """
     for aff_string, output in TESTS:
         if get_affiliations(aff_string) != output:
             # print with one parenthesised argument behaves the same as a
             # Python 2 print statement and as a Python 3 function call.
             print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
     print('All %d tests finished.' % len(TESTS))
	import re

	# Patterns are raw strings so that backslashes reach the regex engine
	# untouched instead of being read as (invalid) string escape sequences.

	# A run of two or more whitespace characters.
	_RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
	# A label of two or more capital letters followed by '('; group 1 is the label.
	_RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
	# ')' plus an optional ',' or ';', one space, then the next label and its '('.
	_RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
	# An <EMAIL>...</EMAIL> element with optional surrounding ';' and single
	# whitespace characters; group 1 is the address.
	_RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')

	def clean_affiliations_string(affiliations_string):
	    """
	    Normalise the whitespace of a raw affiliations string.

	    Leading and trailing whitespace is removed first, then every run
	    matched by _RE_MULTIPLE_SPACES is replaced by one space character.
	    """
	    trimmed = affiliations_string.strip()
	    return _RE_MULTIPLE_SPACES.sub(' ', trimmed)

	def get_affiliations(affiliations_string):
	    """
	    Parse a labelled affiliations string such as 'AA(aff1), AB(aff2)'.

	    Returns a tuple (affiliations, emails) of two dictionaries keyed by the
	    integer index computed from each label. An entry is added to emails
	    only for an affiliation that contains an <EMAIL> tag, and that email is
	    removed from the affiliation text stored in affiliations.

	    Raises Exception when two affiliations resolve to the same index.
	    """
	    # Whitespace is normalised up front so the extraction helper can work
	    # on a cleaned string.
	    remaining = clean_affiliations_string(affiliations_string)
	    affiliations, emails = {}, {}

	    # Consume one affiliation per iteration until nothing is left.
	    while remaining:
	        remaining, index, text = _extract_first_affiliation(remaining)
	        if index in affiliations:
	            raise Exception('Double label.')

	        if '<EMAIL>' in text:
	            text, emails[index] = _extract_email_from_affiliation(text)
	        affiliations[index] = text

	    return (affiliations, emails)

	def _extract_email_from_affiliation(affiliation):
	    """
	    Separate an affiliation from the email address embedded in it.

	    Returns a tuple:
	    * the affiliation with every occurrence of the matched <EMAIL> element
	      text removed.
	    * the email, i.e. the content of the first <EMAIL>...</EMAIL> element.

	    Raises Exception when _RE_EMAIL finds no element in the affiliation.
	    """
	    found = _RE_EMAIL.search(affiliation)
	    if found is None:
	        raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)

	    address = found.group(1)
	    without_email = affiliation.replace(found.group(0), '')
	    return (without_email, address)

	def _extract_first_affiliation(affiliations_string):
	    """
	    Extract the first affiliation from the affiliations string.

	    The string must start with a label (two or more capital letters)
	    immediately followed by '('. The affiliation runs up to the parenthesis
	    that closes this one; nested parentheses inside it are allowed.

	    Returns a tuple of:
	    * the affiliations string without the first affiliation.
	    * the index of the first affiliation.
	    * the first affiliation, stripped of surrounding whitespace.

	    Raises Exception when the string does not start with a label, when the
	    opening parenthesis is never closed, or when the closing parenthesis is
	    followed by anything other than the end of the string or a separator
	    and another label.
	    """
	    match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
	    if match is None:
	        raise Exception('Prefix not found: %s' % affiliations_string)

	    index = get_index_from_label(match.group(1))

	    # Drop the label and its opening parenthesis; the match is anchored at
	    # the start of the string, so slicing at its end is sufficient.
	    affiliations_string = affiliations_string[match.end():]

	    # Walk the string counting parentheses. The one already consumed with
	    # the label counts as open, so the affiliation ends where the depth
	    # drops back to zero.
	    depth = 1
	    closing = None
	    for position, char in enumerate(affiliations_string):
	        if char == '(':
	            depth += 1
	        elif char == ')':
	            depth -= 1
	            if depth == 0:
	                closing = position
	                break

	    if closing is None:
	        raise Exception('Problem of affiliation with unbalanced parenthesis.')

	    # Split around the closing parenthesis, which stays at the head of the
	    # remainder so the checks below can see it.
	    affiliation = affiliations_string[:closing].strip()
	    affiliations_string = affiliations_string[closing:].strip()

	    if affiliations_string == ')':
	        # This was the last affiliation.
	        affiliations_string = ''
	    elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
	        # Another affiliation follows: remove the ')' and the separator so
	        # the remainder starts with the next label.
	        affiliations_string = re.sub(r'\)[;,]? ', '', affiliations_string, count=1)
	    else:
	        # The parentheses are balanced here; what follows them is neither
	        # the end of the string nor another labelled affiliation.
	        raise Exception('Unexpected text after affiliation: %s' % affiliations_string)

	    return (affiliations_string, index, affiliation)

	# Dictionary used to cache the results of the computation for the labels.
	_LABEL_INDEX = {}

	def get_index_from_label(label):
	    """
	    Returns an integer index for an affiliation label, ie:
	        AA -> 1
	        AB -> 2
	        BA -> 27
	        AAA -> 677

	    Each letter is read as a base-26 digit worth ord(letter) - 64, so 'A'
	    is 1 and 'Z' is 26, with the leftmost letter the most significant. 26
	    is then subtracted so that 'AA' maps to 1.

	    Computed indexes are stored in _LABEL_INDEX and reused on later calls
	    with the same label.
	    """
	    index = _LABEL_INDEX.get(label)
	    if index is None:
	        # Accumulate the digits from the most significant letter down.
	        index = 0
	        for char in label:
	            index = index * 26 + (ord(char) - 64)

	        # Because we consider 'A' as 1 and not 0, we need to offset by 26.
	        index -= 26

	        # Remember the result; without this the cache would stay empty.
	        _LABEL_INDEX[label] = index

	    return index

	# Each case is (input string, (expected affiliations, expected emails)).
	TESTS = [
	    # One affiliation, first label.
	    ('AA(aff1)', ({1: 'aff1'}, {})),
	    # One affiliation whose label is not the first one.
	    ('AB(aff1)', ({2: 'aff1'}, {})),
	    # Two affiliations separated by ', ', '; ' or a single space.
	    ('AA(aff1), AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
	    ('AA(aff1); AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
	    ('AA(aff1) AB(aff2)', ({1: 'aff1', 2: 'aff2'}, {})),
	    # Three affiliations.
	    (
	        'AA(aff1), AB(aff2), AC(aff3)',
	        ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
	    ),
	    # Spaces inside the affiliation text.
	    ('AA(aff with space)', ({1: 'aff with space'}, {})),
	    # Commas and nested parentheses inside the affiliation text.
	    (
	        'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
	        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge) USA'}, {}),
	    ),
	    # Labels given out of order.
	    (
	        'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
	        ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
	    ),
	    # Email element inside the first affiliation.
	    (
	        'AA(CERN, Switzerland <EMAIL>[email protected]</EMAIL>), AB(CfA (Cambridge))',
	        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
	    ),
	    # Same, with a space after the second opening parenthesis.
	    (
	        'AA(CERN, Switzerland <EMAIL>[email protected]</EMAIL>), AB( CfA (Cambridge))',
	        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
	    ),
	    # Padding around the text and the email, plus a ';' after the email.
	    (
	        'AA( CERN Geneva ), AB( Another affiliation <EMAIL>[email protected] </EMAIL>;)',
	        ({1: 'CERN Geneva', 2: 'Another affiliation'}, {2: '[email protected]'}),
	    ),
	    # Labels longer than two letters.
	    (
	        'AA(aff1), AAA(aff2), AAAA(aff3)',
	        ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
	    ),
	]

	def test_get_affiliations():
	    """
	    Run get_affiliations over every case in TESTS and report mismatches.

	    Prints one 'Test failed' message, showing the input and the expected
	    output, for each case whose result differs from the expectation, then
	    a summary line with the number of cases. A mismatch is only printed;
	    this function does not raise for it.
	    """
	    for aff_string, output in TESTS:
	        if get_affiliations(aff_string) != output:
	            # print with one parenthesised argument behaves the same as a
	            # Python 2 print statement and as a Python 3 function call.
	            print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
	    print('All %d tests finished.' % len(TESTS))