Created
July 26, 2019 15:12
-
-
Save jacquesfize/1bc6ba1f9c8ecfcf758e9493bc556c8e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Natural Language Toolkit: Interface to the TreeTagger POS-tagger | |
# | |
# Copyright (C) Mirko Otto | |
# Author: Mirko Otto <[email protected]> | |
# Modified by: Jacques Fize | |
""" | |
A Python module for interfacing with the TreeTagger part-of-speech tagger by Helmut Schmid.
""" | |
import os | |
from subprocess import Popen, PIPE | |
from sys import platform as _platform | |
from nltk.internals import find_binary | |
# TreeTagger project page; passed to find_binary() as its ``url`` argument.
_treetagger_url = 'http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/'

# Language names accepted by TreeTagger(language=...). The chosen name is
# appended to the launcher prefix to form the executable name.
_treetagger_languages = [
    'bulgarian',
    'dutch',
    'english',
    'estonian',
    'finnish',
    'french',
    'galician',
    'german',
    'italian',
    'polish',
    'russian',
    'slovak',
    'slovak2',
    'spanish',
]
# NOTE(review): the base class ``Tagger`` is not imported anywhere in the
# visible file -- confirm which module provides it and import it explicitly.
class TreeTagger(Tagger):
    r"""
    Part-of-speech tagger that delegates to an external TreeTagger executable.

    The executable is selected from the language name
    (``tree-tagger-<language>``, or ``tag-<language>`` on win32) and is run
    once per call to :meth:`tag`. Text is sent to its standard input encoded
    as UTF-8 and the tab-separated result is read back from its standard
    output.

    Example:

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='english')
        >>> tt.tag('What is the airspeed of an unladen swallow ?')
        [['What', 'WP', 'What'],
         ['is', 'VBZ', 'be'],
         ['the', 'DT', 'the'],
         ['airspeed', 'NN', 'airspeed'],
         ['of', 'IN', 'of'],
         ['an', 'DT', 'an'],
         ['unladen', 'JJ', '<unknown>'],
         ['swallow', 'NN', 'swallow'],
         ['?', 'SENT', '?']]

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='german')
        >>> tt.tag('Das Haus hat einen großen hübschen Garten.')
        [['Das', 'ART', 'die'],
         ['Haus', 'NN', 'Haus'],
         ['hat', 'VAFIN', 'haben'],
         ['einen', 'ART', 'eine'],
         ['großen', 'ADJA', 'groß'],
         ['hübschen', 'ADJA', 'hübsch'],
         ['Garten', 'NN', 'Garten'],
         ['.', '$.', '.']]
    """

    def __init__(self, language='french', path_to_home=None,
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param language: Name of the language to tag; must be one of
            ``_treetagger_languages``. The default is ``'french'``.
        :param path_to_home: Optional location handed to ``find_binary`` as
            the explicit place to look for the executable. When given, it is
            also exported as the ``TREETAGGER_HOME`` environment variable.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Optional value passed to the executable
            after a ``-a`` flag on every call to :meth:`tag`.
        :raises LookupError: If ``language`` is not a supported language.

        If the executable cannot be located, a message is printed and the
        instance is still created; :meth:`tag` then raises ``LookupError``.
        """
        # NOTE(review): the meaning of the positional ``True`` depends on the
        # Tagger base class, which is not visible here -- confirm.
        Tagger.__init__(self, True)

        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')

        # The launcher has a different name prefix on Windows.
        if _platform == "win32":
            treetagger_bin_name = 'tag-' + language
        else:
            treetagger_bin_name = 'tree-tagger-' + language

        treetagger_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                            '/Applications/bin', '~/bin', '~/Applications/bin',
                            '~/work/tmp/treetagger/cmd', '~/tree-tagger/cmd']
        treetagger_paths = [os.path.expanduser(p) for p in treetagger_paths]

        self._abbr_list = abbreviation_list

        # os.environ only accepts str values; assigning None raises TypeError,
        # so the variable is exported only when a path was actually supplied.
        if path_to_home is not None:
            os.environ["TREETAGGER_HOME"] = path_to_home

        # Always define the attribute so tag() can report a missing binary
        # clearly instead of failing with AttributeError.
        self._treetagger_bin = None
        try:
            self._treetagger_bin = find_binary(
                treetagger_bin_name, path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            print('Can\'t find the TreeTagger bin!')

    def tag(self, text):
        """
        Tag a single sentence with TreeTagger.

        :param text: Either a string, sent to TreeTagger as-is, or a
            list/tuple of string tokens, sent one per line. Tokens should not
            contain any newline characters.
        :return: A list with one entry per non-empty output line, each entry
            being the list of that line's tab-separated fields (``[]`` when
            TreeTagger produces no output).
        :raises LookupError: If the executable was not found at construction.
        :raises OSError: If the TreeTagger process exits with a non-zero code.
        """
        treetagger_bin = getattr(self, '_treetagger_bin', None)
        if treetagger_bin is None:
            raise LookupError('Can\'t find the TreeTagger bin!')

        # A sequence of tokens is sent one token per line.
        if isinstance(text, (list, tuple)):
            _input = '\n'.join(text)
        else:
            _input = text

        # Build the command line; the abbreviation list is optional.
        command = [treetagger_bin]
        if self._abbr_list is not None:
            command.extend(["-a", self._abbr_list])

        # Run the tagger and get the output.
        p = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(str(_input).encode('utf-8'))

        # Check the return code.
        if p.returncode != 0:
            # Decode so the diagnostics are readable rather than a bytes repr.
            print(stderr.decode('utf-8', errors='replace'))
            raise OSError('TreeTagger command failed!')

        treetagger_output = stdout.decode('UTF-8')

        # One output line per token, fields separated by tabs. Drop a trailing
        # carriage return (CRLF output) and skip empty lines so that empty
        # output yields [] instead of [['']].
        tagged_sentences = []
        for tagged_word in treetagger_output.strip().split('\n'):
            tagged_word = tagged_word.rstrip('\r')
            if tagged_word:
                tagged_sentences.append(tagged_word.split('\t'))
        return tagged_sentences
if __name__ == "__main__":
    # When executed as a script, run the doctests found in this module.
    import doctest

    # NORMALIZE_WHITESPACE makes runs of whitespace compare equal, so the
    # expected lists in the docstrings may be wrapped over several lines.
    _doctest_flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(optionflags=_doctest_flags)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment