Train an NP Vectorizer script
import random
from collections.abc import Iterable

from nltk.corpus import conll2000
from nltk import ChunkParserI, ClassifierBasedTagger
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk import Tree
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
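
# NOTE: the NLTK resources used below must be available locally; if they are
# not, download them first with:
#   import nltk
#   nltk.download('conll2000')                    # chunking corpus
#   nltk.download('punkt')                        # sentence/word tokenizers
#   nltk.download('averaged_perceptron_tagger')   # tagger used by pos_tag
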
#################################################
#
# STEP 1: Build an NP-Chunker
#
#################################################

# STEP 1.1: Create a feature extractor
def features(tokens, index, history):
    """
    `tokens` = a POS-tagged sentence [(w1, t1), ...]
    `index` = the index of the token we want to extract features for
    `history` = the previously predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('__START2__', '__START2__'), ('__START1__', '__START1__')] + list(tokens) + [('__END1__', '__END1__'), ('__END2__', '__END2__')]
    history = ['__START2__', '__START1__'] + list(history)

    # shift the index by 2 to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,

        'next-word': nextword,
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,

        'prev-word': prevword,
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
    }
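
# For illustration (not in the original gist): for the tagged sentence
# [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')] and index=1, the extractor
# returns 'word': 'cat', 'pos': 'NN', 'prev-word': 'the', 'next-word': 'sat',
# 'prev-prev-word': '__START1__', 'next-next-word': '__END1__', and so on.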

# STEP 1.2: Create the Chunker class
class ClassifierChunkParser(ChunkParserI):
    def __init__(self, chunked_sents, **kwargs):
        assert isinstance(chunked_sents, Iterable)

        # Transform the trees into IOB-annotated sentences: [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]

        # Transform the triplets into pairs, to match the tagger interface: [((word, pos), chunk), ...]
        def triplets2tagged_pairs(iob_sent):
            return [((word, pos), chunk) for word, pos, chunk in iob_sent]
        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

shuffled_conll_sents = list(conll2000.chunked_sents(chunk_types=['NP']))
random.shuffle(shuffled_conll_sents)
train_sents = shuffled_conll_sents[:int(len(shuffled_conll_sents) * 0.9)]
test_sents = shuffled_conll_sents[int(len(shuffled_conll_sents) * 0.9):]

# STEP 1.3: Train and evaluate the chunker
classifier_chunker = ClassifierChunkParser(train_sents)
print(classifier_chunker.evaluate(test_sents))

# You should get something around these values:
# ChunkParse score:
#     IOB Accuracy: 94.8%
#     Precision:    85.2%
#     Recall:       89.7%
#     F-Measure:    87.4%

#################################################
#
# STEP 2: Build an NP-Chunk Vectorizer
#
#################################################
def np_tokenizer(text):
    # split into sentences
    sentences = sent_tokenize(text)
    # split into words
    sentences = [word_tokenize(sent) for sent in sentences]
    # POS tag
    sentences = [pos_tag(sent) for sent in sentences]
    # extract NP-chunks
    sentences = [classifier_chunker.parse(sent) for sent in sentences]

    # return the NP-chunks
    noun_phrases = []
    for sent in sentences:
        for token in sent:
            if isinstance(token, Tree) and token.label() == 'NP':
                noun_phrases.append(' '.join([t[0] for t in token]))
    return noun_phrases

test_sentence = "This is a complex sentence that I have just made up to test NP chunking."
print(np_tokenizer(test_sentence))  # ['This', 'a complex sentence', 'I', 'NP chunking']
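
# Because np_tokenizer returns whole noun phrases, the vectorizer's vocabulary
# consists of NP chunks (e.g. 'a complex sentence') rather than individual words.
# Note that lowercase=True is applied to the raw text before np_tokenizer runs.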
NP_vectorizer = TfidfVectorizer(lowercase=True, tokenizer=np_tokenizer)

# Train the vectorizer on the raw CoNLL-2000 sentences
NP_vectorizer.fit(' '.join(sent) for sent in conll2000.sents())

X = NP_vectorizer.transform(' '.join(sent) for sent in conll2000.sents()[:100])
for row, col in zip(*X.nonzero()):
    print("Document: %s contains feature='%s', weight=%s" % (row, NP_vectorizer.get_feature_names_out()[col], X[row, col]))
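
# A minimal usage sketch (not part of the original gist): vectorize a new,
# made-up document with the trained NP vectorizer. Only chunks that already
# appear in the fitted vocabulary receive a non-zero weight.
new_doc = "The quick brown fox jumped over the lazy dog near the old barn."
new_vec = NP_vectorizer.transform([new_doc])
print(new_vec.shape)  # (1, vocabulary size)
print(new_vec.nnz)    # number of extracted chunks found in the vocabulary (may be 0)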