Train an NP Vectorizer script
import random
from collections.abc import Iterable

from nltk.corpus import conll2000
from nltk import ChunkParserI, ClassifierBasedTagger
from nltk.stem.snowball import SnowballStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk import Tree
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
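
# NOTE: the NLTK resources used below must be available locally; if they are
# not, download them first with:
#   import nltk
#   nltk.download('conll2000')                    # chunking corpus
#   nltk.download('punkt')                        # sentence/word tokenizers
#   nltk.download('averaged_perceptron_tagger')   # tagger used by pos_tag
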
#################################################
#
# STEP 1: Build an NP-Chunker
#
#################################################

# STEP 1.1: Create a feature extractor
def features(tokens, index, history):
    """
    `tokens` = a POS-tagged sentence [(w1, t1), ...]
    `index` = the index of the token we want to extract features for
    `history` = the previously predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('__START2__', '__START2__'), ('__START1__', '__START1__')] + list(tokens) + [('__END1__', '__END1__'), ('__END2__', '__END2__')]
    history = ['__START2__', '__START1__'] + list(history)

    # shift the index by 2 to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,

        'next-word': nextword,
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,

        'prev-word': prevword,
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
    }
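
# For illustration (not in the original gist): for the tagged sentence
# [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')] and index=1, the extractor
# returns 'word': 'cat', 'pos': 'NN', 'prev-word': 'the', 'next-word': 'sat',
# 'prev-prev-word': '__START1__', 'next-next-word': '__END1__', and so on.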

# STEP 1.2: Create the Chunker class
class ClassifierChunkParser(ChunkParserI):
    def __init__(self, chunked_sents, **kwargs):
        assert isinstance(chunked_sents, Iterable)

        # Transform the trees into IOB-annotated sentences: [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]

        # Transform the triplets into pairs, to match the tagger interface: [((word, pos), chunk), ...]
        def triplets2tagged_pairs(iob_sent):
            return [((word, pos), chunk) for word, pos, chunk in iob_sent]
        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

shuffled_conll_sents = list(conll2000.chunked_sents(chunk_types=['NP']))
random.shuffle(shuffled_conll_sents)
train_sents = shuffled_conll_sents[:int(len(shuffled_conll_sents) * 0.9)]
test_sents = shuffled_conll_sents[int(len(shuffled_conll_sents) * 0.9):]

# STEP 1.3: Train and evaluate the chunker
classifier_chunker = ClassifierChunkParser(train_sents)
print(classifier_chunker.evaluate(test_sents))

# You should get something around these values:
# ChunkParse score:
#     IOB Accuracy: 94.8%
#     Precision:    85.2%
#     Recall:       89.7%
#     F-Measure:    87.4%

#################################################
#
# STEP 2: Build an NP-Chunk Vectorizer
#
#################################################
def np_tokenizer(text):
    # split into sentences
    sentences = sent_tokenize(text)
    # split into words
    sentences = [word_tokenize(sent) for sent in sentences]
    # POS tag
    sentences = [pos_tag(sent) for sent in sentences]
    # extract NP-chunks
    sentences = [classifier_chunker.parse(sent) for sent in sentences]

    # return the NP-chunks
    noun_phrases = []
    for sent in sentences:
        for token in sent:
            if isinstance(token, Tree) and token.label() == 'NP':
                noun_phrases.append(' '.join([t[0] for t in token]))
    return noun_phrases

test_sentence = "This is a complex sentence that I have just made up to test NP chunking."
print(np_tokenizer(test_sentence))  # ['This', 'a complex sentence', 'I', 'NP chunking']
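
# Because np_tokenizer returns whole noun phrases, the vectorizer's vocabulary
# consists of NP chunks (e.g. 'a complex sentence') rather than individual words.
# Note that lowercase=True is applied to the raw text before np_tokenizer runs.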
NP_vectorizer = TfidfVectorizer(lowercase=True, tokenizer=np_tokenizer)

# Train the vectorizer on the raw CoNLL-2000 sentences
NP_vectorizer.fit(' '.join(sent) for sent in conll2000.sents())

X = NP_vectorizer.transform(' '.join(sent) for sent in conll2000.sents()[:100])
for row, col in zip(*X.nonzero()):
    print("Document: %s contains feature='%s', weight=%s" % (row, NP_vectorizer.get_feature_names_out()[col], X[row, col]))
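
# A minimal usage sketch (not part of the original gist): vectorize a new,
# made-up document with the trained NP vectorizer. Only chunks that already
# appear in the fitted vocabulary receive a non-zero weight.
new_doc = "The quick brown fox jumped over the lazy dog near the old barn."
new_vec = NP_vectorizer.transform([new_doc])
print(new_vec.shape)  # (1, vocabulary size)
print(new_vec.nnz)    # number of extracted chunks found in the vocabulary (may be 0)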