NER - Named Entity Extraction
import os
import pickle
import string
from collections.abc import Iterable

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import ClassifierBasedTagger

# Path to the extracted GMB corpus -- change this to your local copy
corpus_root = 'gmb-2.2.0'
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        word, tag, ner = annotated_token
        if ner != 'O':
            if idx == 0:
                # First token of the sentence always begins an entity
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                # Same entity type as the previous token: inside the entity
                ner = "I-" + ner
            else:
                # Different entity type: a new entity begins
                ner = "B-" + ner
        proper_iob_tokens.append((word, tag, ner))
    return proper_iob_tokens
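# Illustrative sketch (hypothetical input, not from the original gist):
#   to_conll_iob([('John', 'NNP', 'per'), ('Smith', 'NNP', 'per'), ('left', 'VBD', 'O')])
#   -> [('John', 'NNP', 'B-per'), ('Smith', 'NNP', 'I-per'), ('left', 'VBD', 'O')]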
def read_gmb(corpus_root):
    """Yield corpus sentences as [((w1, t1), iob1), ...] lists."""
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                        standard_form_tokens = []
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
                            if ner != 'O':
                                # Keep only the top-level entity class, e.g. "geo-nam" -> "geo"
                                ner = ner.split('-')[0]
                            if tag in ('LQU', 'RQU'):  # Make the quote tags NLTK compatible
                                tag = "``"
                            standard_form_tokens.append((word, tag, ner))
                        conll_tokens = to_conll_iob(standard_form_tokens)
                        # Make it NLTK classifier compatible:
                        # [(w1, t1, iob1), ...] -> [((w1, t1), iob1), ...]
                        # because the classifier expects (input, class) pairs
                        yield [((w, t), iob) for w, t, iob in conll_tokens]
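# Shape sketch (assumed, for orientation only): each yielded sentence is a list like
#   [(('Thousands', 'NNS'), 'O'), (('of', 'IN'), 'O'), (('London', 'NNP'), 'B-geo'), ...]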
def features(tokens, index, history):
    """
    `tokens` = a POS-tagged sentence [(w1, t1), ...]
    `index` = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')
    # Pad the sequence with placeholders so window features work at the edges
    tokens = ([('[START2]', '[START2]'), ('[START1]', '[START1]')]
              + list(tokens)
              + [('[END1]', '[END1]'), ('[END2]', '[END2]')])
    history = ['[START2]', '[START1]'] + list(history)
    # Shift the index by 2 to accommodate the padding
    index += 2
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all(c in string.ascii_lowercase for c in word)
    allcaps = word == word.upper()
    capitalized = word[0] in string.ascii_uppercase
    prevallcaps = prevword == prevword.upper()
    prevcapitalized = prevword[0] in string.ascii_uppercase
    nextallcaps = nextword == nextword.upper()
    nextcapitalized = nextword[0] in string.ascii_uppercase
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
        'prev-iob': previob,
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
        'all-caps': allcaps,
        'capitalized': capitalized,
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
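# Illustrative sketch (hypothetical call, not part of the original gist):
#   features([('Mark', 'NNP'), ('works', 'VBZ')], 0, [])
#   -> {'word': 'Mark', 'pos': 'NNP', 'prev-word': '[START1]',
#       'prev-iob': '[START1]', 'next-word': 'works', ...}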
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list-of-triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
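# Usage sketch (assumed flow, mirroring the steps below): train on a slice of the
# corpus, then parse a POS-tagged sentence into an nltk.Tree of entity chunks:
#   chunker = NamedEntityChunker(training_samples[:2000])
#   chunker.parse(pos_tag(word_tokenize("Mark Pedersen works at Google.")))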
def save_object(obj, filename):
    """Pickle `obj` to `filename` so the trained chunker can be reused."""
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
reader = read_gmb(corpus_root)
data = list(reader)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]
print("#training samples = %s" % len(training_samples))  # training samples = 55809
print("#test samples = %s" % len(test_samples))          # test samples = 6201
# Either train a fresh chunker and save it ...
# chunker = NamedEntityChunker(training_samples[:2000])
# save_object(chunker, 'chunker.pkl')
# ... or load a previously trained and pickled one:
with open('chunker.pkl', 'rb') as pickle_file:
    chunker = pickle.load(pickle_file)
# print(chunker.parse(pos_tag(word_tokenize("My name is Ashish"))))
with open('input.txt', 'r') as f:
    print(chunker.parse(pos_tag(word_tokenize(f.read()))))
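# Note: chunker.parse returns an nltk.Tree; entities appear as labeled subtrees,
# e.g. (S (per Ashish/NNP) ...) -- the exact labels depend on the trained model.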