#!/usr/bin/env python
'''Link abbreviations to their full names

Based on
A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
A. Schwartz and M. Hearst
Biocomputing, 2003, pp 451-462.

# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''

__date__ = 'July 2012'
__author__ = 'Vincent Van Asch'
__version__ = '1.2.1'

# If you want to use the script that produces the results reported in __doc__,
# you have to set the global __REPRODUCE__ to True. It is assumed that the
# extra checks that are active when __REPRODUCE__ is False produce better
# results.
__REPRODUCE__ = False
__doc__ = '''
This script takes tokenized sentences and prints the abbreviations and their definitions to STDOUT.

USAGE
    python abbreviations.py [-v] inputfile > abbreviations

FORMATS
    inputfile: a tokenized sentence on every line (tokens separated by a single space)
    abbreviations:
        sentenceid1 characteroffset_begin_a1 characteroffset_end_a1 abbreviation1
        sentenceid1 characteroffset_begin_d1 characteroffset_end_d1 definition1

        sentenceid2 characteroffset_begin_a2 characteroffset_end_a2 abbreviation2
        sentenceid2 characteroffset_begin_d2 characteroffset_end_d2 definition2
        ...
    (abbreviation/definition pairs are separated by an empty line)
    sentenceid: the zero-based line number of the sentence in the inputfile

OPTIONS
    -v: print more information to STDERR
REMARKS
    The algorithm only detects a link between an abbreviation and its definition
    when they appear in the format <definition> (<abbreviation>). The reverse,
    <abbreviation> (<definition>), is not detected.

    The algorithm can only find definitions that contain all characters of the
    abbreviation.

    It will also make errors in cases like "tyrosine transcription factor I (TTFI)":
    the extracted definition will be the incorrect "transcription factor I", because
    that shorter span already contains all characters of the abbreviation.

    On the labeled yeast corpus (http://biotext.berkeley.edu/data.html) version 1.0 of this
    script reaches:

        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26

    (The corpus had to be corrected manually in order to be usable.)

ACKNOWLEDGEMENTS
    Based on:
    A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
    A. Schwartz and M. Hearst
    Biocomputing, 2003, pp 451-462.

%s (version %s)''' % (__date__, __version__)
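
# Illustrative example (not from the paper; offsets follow this script's
# conventions). Given this tokenized sentence on line 0 of the inputfile:
#
#     We measured the heat shock protein ( HSP ) levels .
#
# the script prints the abbreviation and its definition with their
# character offsets:
#
#     0 37 40 HSP
#     0 16 34 heat shock protein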
import os, sys, re, getopt

encoding = 'UTF8'

def fread(fname, encoding=encoding):
    f = open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line = l.strip()
            if line:
                yield line.decode(encoding)
    finally:
        f.close()

def fread2(fname, encoding=encoding):
    f = open(os.path.abspath(os.path.expanduser(fname)))
    try:
        for l in f:
            line = l.strip().decode(encoding)
            if line:
                yield line.split()
            else:
                yield None
    finally:
        f.close()
class Candidate(unicode):
    def __new__(cls, start, stop, str):
        return unicode.__new__(cls, str)
    def __init__(self, start, stop, str):
        self._start = start
        self._stop = stop
    def __getslice__(self, i, j):
        start = self.start + i
        stop = self.start + j
        str = unicode.__getslice__(self, i, j)
        return Candidate(start, stop, str)
    @property
    def start(self):
        '''The start index'''
        return self._start
    @property
    def stop(self):
        '''The stop index'''
        return self._stop
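
# A Candidate behaves like a unicode string that also remembers its
# character offsets in the original sentence; slicing with explicit
# bounds preserves those offsets. Illustrative (Python 2) usage:
#
#     c = Candidate(16, 34, u'heat shock protein')
#     c[0:4]    ->  Candidate(16, 20, u'heat')
#     c.start   ->  16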
def getcandidates(sentence):
    '''Yields Candidates'''
    if '(' in sentence:
        # Check some things first
        if sentence.count('(') != sentence.count(')'):
            raise ValueError('Unbalanced parentheses: %s' % sentence)
        if sentence.find('(') > sentence.find(')'):
            raise ValueError('First parenthesis is a closing one: %s' % sentence)

        closeindex = -1
        while 1:
            # Look for the next opening parenthesis
            openindex = sentence.find('(', closeindex + 1)
            if openindex == -1: break

            # Look for the matching closing parenthesis
            closeindex = openindex + 1
            open = 1
            skip = False
            while open:
                try:
                    char = sentence[closeindex]
                except IndexError:
                    # We found an opening bracket but no associated closing bracket
                    # Skip the opening bracket
                    skip = True
                    break
                if char == '(':
                    open += 1
                elif char == ')':
                    open -= 1
                closeindex += 1

            if skip:
                closeindex = openindex + 1
                continue

            # Output if conditions are met
            start = openindex + 1
            stop = closeindex - 1
            str = sentence[start:stop]

            # Take into account whitespace that should be removed
            start = start + len(str) - len(str.lstrip())
            stop = stop - len(str) + len(str.rstrip())
            str = sentence[start:stop]

            if conditions(str):
                yield Candidate(start, stop, str)
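
# Illustrative run: in the tokenized sentence
# "We measured the heat shock protein ( HSP ) levels ." the only
# parenthesised span is " HSP "; after trimming the surrounding
# whitespace, getcandidates() yields Candidate(37, 40, u'HSP').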
def conditions(str):
    '''Based on Schwartz & Hearst:

    2 <= len(str) <= 10
    len(tokens) <= 2
    re.search('[A-Za-z]', str)
    str[0].isalnum()

    and an extra rule:
    if str matches ([A-Za-z]\. ?){2,} it is a good candidate.
    '''
    if not __REPRODUCE__ and re.match('([A-Za-z]\. ?){2,}', str.lstrip()):
        return True
    if len(str) < 2 or len(str) > 10:
        return False
    if len(str.split()) > 2:
        return False
    if not re.search('[A-Za-z]', str):
        return False
    if not str[0].isalnum():
        return False
    return True
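
# Illustrative behaviour of the rules above:
#
#     conditions(u'HSP')       ->  True   (short, one token, letters, alnum start)
#     conditions(u'5-HT')      ->  True   (a digit is a valid first character)
#     conditions(u'+/-')       ->  False  (contains no letter)
#     conditions(u'E. N. V.')  ->  True via the extra rule; False when
#                                  __REPRODUCE__ is True (three tokens)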
def getdefinition(candidate, sentence):
    '''Takes a candidate and a sentence and returns the definition candidate.

    The definition candidate is the set of tokens (in front of the candidate)
    that starts with a token starting with the first character of the candidate'''
    # Take the tokens in front of the candidate
    tokens = sentence[:candidate.start - 2].lower().split()

    # the char that we are looking for
    key = candidate[0].lower()

    # Count the number of tokens that start with the same character as the candidate
    firstchars = [t[0] for t in tokens]

    definitionfreq = firstchars.count(key)
    candidatefreq = candidate.lower().count(key)

    # Look for the list of tokens in front of candidate that
    # have a sufficient number of tokens starting with key
    if candidatefreq <= definitionfreq:
        # we should at least have a good number of starts
        count = 0
        start = 0
        startindex = len(firstchars) - 1
        while count < candidatefreq:
            if abs(start) > len(firstchars):
                raise ValueError('not found')
            start -= 1
            # Look up key in the definition
            try:
                startindex = firstchars.index(key, len(firstchars) + start)
            except ValueError:
                pass
            # Count the number of keys in the definition
            count = firstchars[startindex:].count(key)

        # We found enough keys in the definition so return the definition as a
        # definition candidate
        start = len(' '.join(tokens[:startindex]))
        stop = candidate.start - 2
        str = sentence[start:stop]

        # Remove whitespace
        start = start + len(str) - len(str.lstrip())
        stop = stop - len(str) + len(str.rstrip())
        str = sentence[start:stop]

        return Candidate(start, stop, str)
    else:
        #print 'S', sentence
        #print >>sys.stderr, 'KEY', key
        #print >>sys.stderr, 'TOKENS', tokens
        #print >>sys.stderr, 'ABBREV', candidate
        raise ValueError('There are fewer keys in the tokens in front of the candidate than in the candidate itself')
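
# Illustrative run, continuing the example sentence
# "We measured the heat shock protein ( HSP ) levels ." with the
# candidate HSP (start=37): the tokens in front of the candidate are
# scanned right-to-left for ones starting with 'h'; the window opens at
# "heat" and the function returns Candidate(16, 34, u'heat shock protein').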
def definitionselection(definition, abbrev):
    '''Takes a definition candidate and an abbreviation candidate and returns
    the trimmed definition if the characters of the abbreviation occur in it;
    raises ValueError otherwise.

    Based on
    A simple algorithm for identifying abbreviation definitions in biomedical text, Schwartz & Hearst'''
    if len(definition) < len(abbrev):
        raise ValueError('Abbreviation is longer than definition')

    if abbrev in definition.split():
        raise ValueError('Abbreviation is full word of definition')

    sindex = -1
    lindex = -1

    while 1:
        try:
            longchar = definition[lindex].lower()
        except IndexError:
            #print definition, '||', abbrev
            raise

        shortchar = abbrev[sindex].lower()

        if not shortchar.isalnum():
            sindex -= 1

        if sindex == -1 * len(abbrev):
            if shortchar == longchar:
                if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum():
                    break
                else:
                    lindex -= 1
            else:
                lindex -= 1
                if lindex == -1 * (len(definition) + 1):
                    raise ValueError('definition of "%s" not found in "%s"' % (abbrev, definition))
        else:
            if shortchar == longchar:
                sindex -= 1
                lindex -= 1
            else:
                lindex -= 1

    definition = definition[lindex:len(definition)]

    tokens = len(definition.split())
    length = len(abbrev)

    if tokens > min([length + 5, length * 2]):
        raise ValueError('did not meet min(|A|+5, |A|*2) constraint')

    # Do not return definitions that contain unbalanced parentheses
    if not __REPRODUCE__:
        #print 'ccc', abbrev, definition, definition.count('('), definition.count(')')
        if definition.count('(') != definition.count(')'):
            raise ValueError('Unbalanced parentheses not allowed in a definition')

    return definition
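
# Illustrative match, traced by hand through the loop above:
# definitionselection(u'heat shock protein', u'HSP') matches P, S and H
# right-to-left against "...protein", "shock ..." and "heat ...", checks
# that the final 'h' match sits at a word start, and returns the full
# span 'heat shock protein'.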
def main(fname, verbose=False, encoding=encoding):
    '''Prints all abbreviations and their definitions to STDOUT in the format

        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition

        sentenceindex startindex stopindex abbrev
        sentenceindex startindex stopindex definition
        ...

    fname: single-whitespaced tokenized sentences; every sentence on a new line

    Evaluation
        abbreviations.score('abbrev.txt.gold', 'abbrev.txt.raw.abb')

        TP: 673
        FP: 94
        FN: 280
        P : 87.74
        R : 70.62
        F1: 78.26

        abbrev.txt.gold: all gold links in yeast_abbrev_labeled.txt (but as corrected in abbrev.txt)
        abbrev.txt.raw.abb: all links based on abbrev.txt.raw and extracted with main()
        abbrev.txt: the sentences from yeast_abbrev_labeled.txt with at least one (; tokenized with tokenizer.py and corrected.
    '''
    #name = fname+'.abb'
    #o = open(name, 'w')

    omit = 0
    written = 0
    try:
        for i, sentence in enumerate(fread(fname)):
            try:
                for candidate in getcandidates(sentence):
                    try:
                        definition = getdefinition(candidate, sentence)
                    except ValueError, e:
                        #print >>sys.stderr, i, sentence
                        if verbose:
                            print >>sys.stderr, i, 'Omitting candidate', candidate.encode(encoding)
                            print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                        omit += 1
                    else:
                        try:
                            definition = definitionselection(definition, candidate)
                        except IndexError:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding), '\n'
                            omit += 1
                        except ValueError, e:
                            if verbose:
                                #print >>sys.stderr, i, sentence
                                print >>sys.stderr, i, 'Omitting candidate', definition.encode(encoding), '||', candidate.encode(encoding)
                                print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
                            omit += 1
                        else:
                            #o.write('%d %d %d %s\n' %(i, candidate.start, candidate.stop, candidate))
                            #o.write('%d %d %d %s\n' %(i, definition.start, definition.stop, definition))
                            #o.write('\n')
                            cline = '%d %d %d %s' %(i, candidate.start, candidate.stop, candidate)
                            dline = '%d %d %d %s' %(i, definition.start, definition.stop, definition)
                            print cline.encode(encoding)
                            print dline.encode(encoding)
                            # An empty line closes the pair (score() relies on it)
                            print
                            written += 1
            except ValueError, e:
                if verbose:
                    #print >>sys.stderr, 'SENTENCE %d:' %i, sentence.encode(encoding)
                    #print >>sys.stderr, 'ERROR:', e.args
                    print >>sys.stderr, 'Reason:', e.args[0].encode(encoding), '\n'
    finally:
        #o.close()
        pass

    print >>sys.stderr, 'INFO: %d abbreviations detected and kept (%d omitted)' %(written, omit)
    #print >>sys.stderr, 'Written', name
def score(gold, pred):
    '''
    gold: a file as created with extract.write()
    pred: a file as created with main()

    Prints the results'''
    golds = {}
    s = []
    for line in fread2(gold):
        if line:
            s.append(' '.join(line[1:]))
            id = line[0]
        elif s:
            golds[id] = golds.get(id, []) + [tuple(s)]
            s = []

    preds = {}
    s = []
    for line in fread2(pred):
        if line:
            s.append(' '.join(line[3:]))
            id = line[0]
        elif s:
            preds[id] = preds.get(id, []) + [tuple(s)]
            s = []

    tp = 0; fp = 0; fn = 0
    for id, gvalues in golds.items():
        for pair in gvalues:
            try:
                pvalues = preds[id]
            except KeyError:
                fn += 1
            else:
                if pair in pvalues:
                    tp += 1
                    preds[id].remove(pair)
                else:
                    print 'false negative', id, pair
                    fn += 1

    for id, pvalues in preds.items():
        fp += len(pvalues)

    # The number of gold abbreviations
    total = 0
    for v in golds.values():
        total += len(v)

    print '''
TP: %d
FP: %d
FN: %d
P : %5.2f
R : %5.2f
F1: %5.2f

In total there were %d gold definition/abbreviation pairs.
''' %(tp, fp, fn, precision(tp, fp), recall(tp, fn), fscore(tp, fp, fn), total)
def recall(TP, FN):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FN)

def precision(TP, FP):
    if TP == 0: return 0
    return 100*float(TP)/(TP+FP)

def fscore(TP, FP, FN):
    R = recall(TP, FN)
    P = precision(TP, FP)
    if R == 0 or P == 0:
        return 0
    return 2*R*P/(R+P)
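
# Sanity check against the figures reported in __doc__ (TP=673, FP=94, FN=280):
#
#     precision(673, 94)    ->  100*673/767  ~  87.74
#     recall(673, 280)      ->  100*673/953  ~  70.62
#     fscore(673, 94, 280)  ->  2*87.74*70.62/(87.74+70.62)  ~  78.26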
def _usage():
    print >>sys.stderr, __doc__

if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hv', ['help'])
    except getopt.GetoptError:
        # Print help information and exit
        _usage()
        sys.exit(2)

    verbose = False
    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        if o == '-v':
            verbose = True

    if len(args) != 1:
        _usage()
        sys.exit(1)

    main(args[0], verbose=verbose)