Created
February 16, 2011 23:50
-
-
Save badzil/830586 to your computer and use it in GitHub Desktop.
ads_affiliations_splitter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# NOTE: every pattern is a raw string so that regex escapes such as "\s" and
# "\(" reach the regex engine untouched instead of being (invalid) Python
# string escapes.

# A run of two or more whitespace characters.
_RE_MULTIPLE_SPACES = re.compile(r'\s\s+')
# An affiliation label (two or more capital letters) followed by "(".
# Group 1 captures the label.
_RE_AFFILIATION_PREFIX = re.compile(r'([A-Z][A-Z]+)\(')
# The ")" closing an affiliation, an optional "," or ";", one space and the
# prefix of the following affiliation.
_RE_AFFILIATION_SUFFIX = re.compile(r'\)[,;]? [A-Z][A-Z]+\(')
# An <EMAIL>...</EMAIL> element together with an optional surrounding ";" and
# single whitespace characters. Group 1 captures the text between the tags.
_RE_EMAIL = re.compile(r';?\s?<EMAIL>\s?(.*?)\s?</EMAIL>;?')
def clean_affiliations_string(affiliations_string):
    """
    Normalizes the whitespace of an affiliations string.

    Leading and trailing whitespace is removed first, then every run of two
    or more whitespace characters is replaced by a single space.
    """
    trimmed = affiliations_string.strip()
    return _RE_MULTIPLE_SPACES.sub(' ', trimmed)
def get_affiliations(affiliations_string):
    """
    Parses a labelled affiliations string.

    Returns a tuple of two dictionaries, both keyed by the integer index
    computed from each affiliation label:
        * the affiliation texts (with the matched <EMAIL> element removed).
        * the emails, only for the affiliations that contained one.

    Raises an Exception when the same index is found twice.
    """
    affiliations_by_index = {}
    emails_by_index = {}

    # Clean up the spaces first, then consume the string one affiliation at a
    # time until nothing is left.
    remaining = clean_affiliations_string(affiliations_string)
    while remaining:
        remaining, index, text = _extract_first_affiliation(remaining)
        if index in affiliations_by_index:
            raise Exception('Double label.')
        if '<EMAIL>' in text:
            text, email = _extract_email_from_affiliation(text)
            emails_by_index[index] = email
        affiliations_by_index[index] = text

    return (affiliations_by_index, emails_by_index)
def _extract_email_from_affiliation(affiliation):
    """
    Separates the <EMAIL> element from an affiliation.

    Returns a tuple:
        * the affiliation with every occurrence of the matched email markup
          removed.
        * the text captured between the <EMAIL> and </EMAIL> tags.

    Raises an Exception when no <EMAIL>...</EMAIL> element can be matched.
    """
    found = _RE_EMAIL.search(affiliation)
    if found is None:
        raise Exception('Affiliation contains <EMAIL> but could not be parsed: %s' % affiliation)

    # group(0) is the whole markup, including the optional surrounding ";"
    # and whitespace, so removing it leaves only the affiliation text.
    email_markup = found.group(0)
    without_email = affiliation.replace(email_markup, '')
    return (without_email, found.group(1))
def _extract_first_affiliation(affiliations_string):
    """
    Extracts the first affiliation from the affiliations string and returns a
    tuple of:
        * the affiliations string without the first affiliation.
        * the index of the first affiliation.
        * the first affiliation.

    Raises an Exception when the string does not start with a label prefix,
    when the parentheses of the first affiliation are not balanced, or when
    what follows the affiliation is neither the end of the string nor the
    prefix of another affiliation.
    """
    match = _RE_AFFILIATION_PREFIX.match(affiliations_string)
    if match is None:
        raise Exception('Prefix not found: %s' % affiliations_string)
    index = get_index_from_label(match.group(1))

    # The prefix was matched at the very start of the string, so slicing at
    # the end of the match drops exactly "LABEL(".
    affiliations_string = affiliations_string[match.end():]

    # Now we count the parentheses (the one from the prefix is already open)
    # and when they are balanced, we consider that we got the full
    # affiliation string. idx is pre-set so that it is defined even when the
    # remaining string is empty.
    opened_parenthesis = 1
    idx = 0
    for idx, char in enumerate(affiliations_string):
        if char == '(':
            opened_parenthesis += 1
        elif char == ')':
            opened_parenthesis -= 1
        if opened_parenthesis == 0:
            break
    if opened_parenthesis > 0:
        raise Exception('Problem of affiliation with unbalanced parenthesis.')

    # OK. idx points at the closing parenthesis: everything before it is the
    # affiliation, everything from it onwards is the rest of the string.
    affiliation = affiliations_string[:idx].strip()
    affiliations_string = affiliations_string[idx:].strip()

    # Finally we check that the rest starts with an affiliation suffix and we
    # clean it.
    if affiliations_string == ')':
        # OK. This was the last affiliation.
        affiliations_string = ''
    elif _RE_AFFILIATION_SUFFIX.match(affiliations_string) is not None:
        # OK. There is an affiliation following: drop the closing
        # parenthesis and the separator so that the rest starts with the
        # next label. The pattern is a raw string so that "\)" is a regex
        # escape and not an invalid Python string escape.
        affiliations_string = re.sub(r'\)[;,]? ', '', affiliations_string, count=1)
    else:
        # OK. Something went wrong.
        raise Exception('Problem of affiliation with unbalanced parenthesis.')

    return (affiliations_string, index, affiliation)
# Dictionary used to cache the results of the computation for the labels.
_LABEL_INDEX = {}


def get_index_from_label(label):
    """
    Returns an integer index for an affiliation label, ie:
        AA -> 1
        AB -> 2
        BA -> 27
        AAA -> 677

    The result is stored in _LABEL_INDEX so that a label is only computed
    once.
    """
    index = _LABEL_INDEX.get(label)
    if index is None:
        # The label is a base-26 representation of the index where 'A' is 1
        # (ord('A') == 65). Walking the label reversed gives the least
        # significant letter first, so its position is its power of 26.
        index = 0
        for position, char in enumerate(reversed(label)):
            index += (ord(char) - 64) * (26 ** position)
        # Because we consider 'A' as 1 and not 0, we need to offset by 26.
        index -= 26
        # Remember the result under the label as it was passed in.
        _LABEL_INDEX[label] = index
    return index
# Test table: each entry pairs an input affiliations string with the
# (affiliations, emails) tuple that get_affiliations() must return for it.
TESTS = [
    # Simplest case
    (
        'AA(aff1)',
        ({1: 'aff1'}, {}),
    ),
    # Index does not start at AA.
    (
        'AB(aff1)',
        ({2: 'aff1'}, {}),
    ),
    # 2 affiliations - ordered, separated by ",", ";" or a single space.
    (
        'AA(aff1), AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    (
        'AA(aff1); AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    (
        'AA(aff1) AB(aff2)',
        ({1: 'aff1', 2: 'aff2'}, {}),
    ),
    # 3 affiliations - ordered
    (
        'AA(aff1), AB(aff2), AC(aff3)',
        ({1: 'aff1', 2: 'aff2', 3: 'aff3'}, {}),
    ),
    # Spaces inside the affiliation.
    (
        'AA(aff with space)',
        ({1: 'aff with space'}, {}),
    ),
    # Nested parentheses, labels in order and out of order.
    (
        'AA(CERN, Switzerland), AB(CfA (Cambridge) USA)',
        ({2: 'CfA (Cambridge) USA', 1: 'CERN, Switzerland'}, {}),
    ),
    (
        'AB(CERN, Switzerland), AA(CfA (Cambridge) USA)',
        ({1: 'CfA (Cambridge) USA', 2: 'CERN, Switzerland'}, {}),
    ),
    # Affiliations carrying an <EMAIL> element.
    (
        'AA(CERN, Switzerland <EMAIL>[email protected]</EMAIL>), AB(CfA (Cambridge))',
        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
    ),
    (
        'AA(CERN, Switzerland <EMAIL>[email protected]</EMAIL>), AB( CfA (Cambridge))',
        ({1: 'CERN, Switzerland', 2: 'CfA (Cambridge)'}, {1: '[email protected]'}),
    ),
    (
        'AA( CERN Geneva ), AB( Another affiliation <EMAIL>[email protected] </EMAIL>;)',
        ({1: 'CERN Geneva', 2: 'Another affiliation'}, {2: '[email protected]'}),
    ),
    # Labels longer than two letters.
    (
        'AA(aff1), AAA(aff2), AAAA(aff3)',
        ({1: 'aff1', 677: 'aff2', 18253: 'aff3'}, {}),
    ),
]
def test_get_affiliations():
    """
    Runs get_affiliations() on every entry of TESTS.

    For each entry whose result differs from the expected output, the input
    string and the expected output are printed. A summary line with the
    number of tests is printed at the end. Nothing is returned.
    """
    for aff_string, output in TESTS:
        if get_affiliations(aff_string) != output:
            # print is called with one parenthesised argument so that the
            # line behaves the same as a Python 2 print statement and as the
            # Python 3 print function.
            print('Test failed:\n\t%s\n\t%s' % (aff_string, output))
    print('All %d tests finished.' % len(TESTS))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment