pip freeze:
- nltk==3.2.1
- numpy==1.11.1
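Besides the pip packages above, the script needs NLTK's bundled data packages, which pip does not install; a one-time download along these lines is assumed (resource names as shipped with nltk 3.2.1):

import nltk
# Sentence/word tokenizer models, the POS tagger, the NE chunker, and the
# corpora used by nltk.pos_tag, nltk.ne_chunk_sents, and stopwords.words below.
for resource in ('punkt', 'averaged_perceptron_tagger',
                 'maxent_ne_chunker', 'words', 'stopwords'):
    nltk.download(resource)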
# -*- coding: utf-8 -*-
'''
@author: Ricardo Cosme
'''
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
text = """
Two explosions hit the finish line of the Boston Marathon on Monday, injuring spectators at America's oldest and most prestigious marathon. It was not immediately clear what caused the explosion or the number and extent of injuries. Bloody spectators were carried to the medical tent that had been set up to care for fatigued runners. Neither race officials nor public officials could immediately estimate the number or degree of injuries.
A Boston police officer was wheeled from the course with a leg injury that was bleeding. "There are a lot of people down," said one man, whose bib identified him as Frank Deruyter of North Carolina. He was not injured, but marathon workers were carrying one woman, who did not appear to be a runner, to the medical area as blood gushed from her leg.
About three hours after the winners crossed the finish line, there was a loud explosion on the north side of Boylston Street."""
def named_entities(text, stopwordslang):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Build the stop word set once instead of re-reading the corpus per token.
    stop_words = set(stopwords.words(stopwordslang))
    tokenizer = RegexpTokenizer(r'\w+')
    clean_sentences = []
    for sentence in tokenized_sentences:
        # Drop punctuation tokens, purely numeric tokens, stop words, and
        # tokens without at least one word character.
        no_punct = [word for word in sentence if word not in ',-.?!:;"']
        no_numbers = [word for word in no_punct if not word.isdigit()]
        no_stop_words = [word for word in no_numbers if word.lower() not in stop_words]
        clean_sentences.append([word for word in no_stop_words if tokenizer.tokenize(word)])
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in clean_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        # With binary=True every entity subtree is labelled 'NE'; join all of
        # its leaves so multi-word entities are not truncated to one token.
        entity_names = []
        for child in t:
            if not isinstance(child, tuple) and child.label() == 'NE':
                entity_names.append(' '.join(leaf[0] for leaf in child).lower())
        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return list(set(entity_names))

print(named_entities(text, 'english'))
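For comparison, dropping binary=True makes ne_chunk_sents label each entity subtree with a type (e.g. PERSON, GPE, ORGANIZATION) instead of a flat 'NE'. A minimal sketch of that variant, run on the raw tokens (the helper name typed_entities is mine, not part of the original script):

def typed_entities(text):
    sentences = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]
    tagged = [nltk.pos_tag(s) for s in sentences]
    results = []
    # binary defaults to False, so subtree labels carry the entity type.
    for tree in nltk.ne_chunk_sents(tagged):
        for child in tree:
            if not isinstance(child, tuple):
                results.append((child.label(), ' '.join(leaf[0] for leaf in child)))
    return results

print(typed_entities(text))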