@bogsio
Created March 13, 2017 13:32
Train an NP vectorizer script
import random
from collections.abc import Iterable
from nltk.corpus import conll2000
from nltk import ChunkParserI, ClassifierBasedTagger
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk import Tree
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
#################################################
#
# STEP 1: Build a NP-Chunker
#
#################################################
# STEP 1.1: Create a feature extractor
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previously predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('__START2__', '__START2__'), ('__START1__', '__START1__')] + list(tokens) + [('__END1__', '__END1__'), ('__END2__', '__END2__')]
    history = ['__START2__', '__START1__'] + list(history)

    # shift the index by 2 to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,

        'next-word': nextword,
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,

        'prev-word': prevword,
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
    }
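# Quick illustration (added here, not part of the original gist): the feature
# dict produced for the middle token of a tiny POS-tagged sentence, given one
# previously predicted IOB tag in the history.
example_features = features([('the', 'DT'), ('quick', 'JJ'), ('fox', 'NN')], 1, ['B-NP'])
# e.g. {'word': 'quick', 'lemma': 'quick', 'pos': 'JJ', 'next-word': 'fox', ...}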
# STEP 1.2: Create the Chunker class
class ClassifierChunkParser(ChunkParserI):
    def __init__(self, chunked_sents, **kwargs):
        assert isinstance(chunked_sents, Iterable)

        # Transform the trees into IOB-annotated sentences [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]

        # Transform the triplets into pairs, to make them compatible with
        # the tagger interface [((word, pos), chunk), ...]
        def triplets2tagged_pairs(iob_sent):
            return [((word, pos), chunk) for word, pos, chunk in iob_sent]
        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
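# Sketch added for clarity (not in the original gist): tree2conlltags turns a
# chunk tree into (word, pos, iob) triplets, which triplets2tagged_pairs above
# reshapes into ((word, pos), iob) pairs for the tagger interface.
sample_iob = tree2conlltags(conll2000.chunked_sents(chunk_types=['NP'])[0])
# each element is a (word, pos_tag, iob_label) triple,
# with iob_label one of 'B-NP', 'I-NP', 'O'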
shuffled_conll_sents = list(conll2000.chunked_sents(chunk_types=['NP']))
random.shuffle(shuffled_conll_sents)
train_sents = shuffled_conll_sents[:int(len(shuffled_conll_sents) * 0.9)]
test_sents = shuffled_conll_sents[int(len(shuffled_conll_sents) * 0.9):]
# STEP 1.3: Train and evaluate the chunker
classifier_chunker = ClassifierChunkParser(train_sents)
print(classifier_chunker.evaluate(test_sents))
# You should get something around these values:
# ChunkParse score:
#     IOB Accuracy: 94.8%
#     Precision:    85.2%
#     Recall:       89.7%
#     F-Measure:    87.4%
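# Quick sanity check (added for illustration, not part of the original gist):
# parse one freshly POS-tagged sentence and print the resulting chunk tree;
# the words of each noun phrase should end up grouped under an NP subtree.
print(classifier_chunker.parse(pos_tag(word_tokenize(
    "The little yellow dog barked at the cat."))))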
#################################################
#
# STEP 2: Build a NP-Chunk Vectorizer
#
#################################################
def np_tokenizer(text):
    # split into sentences
    sentences = sent_tokenize(text)

    # split into words
    sentences = [word_tokenize(sent) for sent in sentences]

    # POS tag
    sentences = [pos_tag(sent) for sent in sentences]

    # extract NP-chunks
    sentences = [classifier_chunker.parse(sent) for sent in sentences]

    # return the NP-chunks
    noun_phrases = []
    for sent in sentences:
        for token in sent:
            if isinstance(token, Tree) and token.label() == 'NP':
                noun_phrases.append(' '.join([t[0] for t in token]))
    return noun_phrases
test_sentence = "This is a complex sentence that I have just made up to test NP chunking."
print(np_tokenizer(test_sentence))  # ['This', 'a complex sentence', 'I', 'NP chunking']
NP_vectorizer = TfidfVectorizer(lowercase=True, tokenizer=np_tokenizer)
# Train the vectorizer
NP_vectorizer.fit((' '.join(sent) for sent in conll2000.sents()))
X = NP_vectorizer.transform((' '.join(sent) for sent in conll2000.sents()[:100]))
for row, col in zip(*X.nonzero()):
    # note: older scikit-learn versions expose this as get_feature_names()
    print("Document: %s contains feature='%s', weight=%s" % (
        row, NP_vectorizer.get_feature_names_out()[col], X[row, col]))
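# Possible follow-up (added sketch, not part of the original gist): compare two
# documents by the cosine similarity of their NP-based TF-IDF vectors. Noun
# phrases that never occurred in the conll2000 fitting text simply get a zero
# weight here, so the score only reflects shared, known phrases.
from sklearn.metrics.pairwise import cosine_similarity

doc_a = "The stock market posted strong gains in early trading."
doc_b = "Early trading saw strong gains across the stock market."
vecs = NP_vectorizer.transform([doc_a, doc_b])
print(cosine_similarity(vecs[0], vecs[1]))  # a 1x1 matrix with the similarity score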