Skip to content

Instantly share code, notes, and snippets.

@ricjcosme
Last active September 10, 2016 00:20
Show Gist options
  • Save ricjcosme/8f65546e1e073110d57a01815abfe821 to your computer and use it in GitHub Desktop.
Named Entity extraction in Python

pip freeze:

  • nltk==3.2.1
  • numpy==1.11.1
# -*- coding: utf-8 -*-
'''
@author: Ricardo Cosme
'''

import nltk.data, nltk.tag
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Sample news text (report on explosions at the Boston Marathon finish line)
# used as the demo input for named_entities() below.
text = """
Two explosions hit the finish line of the Boston Marathon on Monday, injuring spectators at America's oldest and most prestigious marathon. It was not immediately clear what caused the explosion or the number and extent of injuries. Bloody spectators were carried to the medical tent that had been set up to care for fatigued runners. Neither race officials nor public officials could immediately estimate the number or degree of injuries.
A Boston police officer was wheeled from the course with a leg injury that was bleeding. "There are a lot of people down," said one man, whose bib identified him as Frank Deruyter of North Carolina. He was not injured, but marathon workers were carrying one woman, who did not appear to be a runner, to the medical area as blood gushed from her leg. 
About three hours after the winners crossed the finish line, there was a loud explosion on the north side of Boylston Street."""

def named_entities(text, stopwordslang):
    """Return the unique, lower-cased named entities found in *text*.

    The text is sentence- and word-tokenized, stripped of punctuation,
    pure-number tokens and stopwords, then POS-tagged and chunked with
    NLTK's binary named-entity chunker.

    Parameters:
        text (str): raw text to analyse.
        stopwordslang (str): NLTK stopwords corpus language, e.g. 'english'.

    Returns:
        list: de-duplicated entity names in arbitrary order.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tokenizer = RegexpTokenizer(r'\w+')
    # Hoist the stopword list into a set once: the original called
    # stopwords.words() for every token, re-reading the corpus each time.
    stop_words = set(word.lower() for word in stopwords.words(stopwordslang))
    clean_sentences = []
    for sentence in tokenized_sentences:
        # Drop single-character punctuation tokens.
        no_punct = [w for w in sentence if w not in ',-.?!:;"']
        # BUGFIX: the original tested x.isdigit() inside a per-character
        # comprehension whose loop variable (i) was never used; test the
        # whole token directly to drop pure-number tokens.
        no_numbers = [w for w in no_punct if not w.isdigit()]
        removed_stop_words = [w for w in no_numbers if w.lower() not in stop_words]
        # Keep only tokens that still contain at least one word character.
        clean_sentences.append([w for w in removed_stop_words if tokenizer.tokenize(w)])
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in clean_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(tree):
        """Collect the NE-labelled chunks from one chunked sentence."""
        names = []
        for child in tree:
            # With binary=True, entity subtrees are labelled 'NE';
            # plain (word, tag) tokens are tuples, not subtrees.
            if not isinstance(child, tuple) and child.label().startswith('NE'):
                # BUGFIX: join every leaf of the chunk so multi-word
                # entities such as "Boylston Street" are no longer
                # truncated to their first word.
                names.append(' '.join(leaf[0] for leaf in child.leaves()).lower())
        return names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # Deduplicate across sentences.
    return list(set(entity_names))

# Run the demo only when executed as a script, not on import.
# Parenthesized print is valid in both Python 2 and Python 3.
if __name__ == '__main__':
    print(named_entities(text, 'english'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment