-
-
Save KavithaShanmugam/72ab36cad2ac5b228cd9970d591d74c0 to your computer and use it in GitHub Desktop.
Python coding task: "Named Entity Recognition" @http://codepad.org/LLRQAd9q
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a simple implementation of Named Entity Recognition (NER) on a given text.
Here I have implemented a method to extract the names using NLTK (Natural Language Toolkit) instead of using regular expressions.
This is a very effective method to extract the names from the given text.
Steps: | |
****** | |
1) Install the nltk package. Here is the instruction link | |
http://www.nltk.org/install.html | |
2) Download and install the following packages in the Python IDE using | |
>>> import nltk | |
>>> nltk.download() | |
Select the following packages in NLTK downloader | |
1) --> Models --> Averaged_Perceptron_Tagger, maxent_ne_chunker, punkt | |
2) --> Corpora --> words | |
""" | |
import unittest | |
from nltk import ne_chunk, pos_tag, word_tokenize | |
from nltk.tree import Tree | |
# Function that extracts named entities from a text
def _store_entity(parts, entities):
    """Join ``parts`` into one name and append it to ``entities`` if it is new."""
    if not parts:
        return
    named_entity = " ".join(parts)
    if named_entity not in entities:
        entities.append(named_entity)


def ne_extraction(text):
    """Return the distinct named entities found in ``text``.

    The text is tokenized, POS-tagged and chunked with NLTK. Every subtree
    among the children of the resulting chunk tree is treated as a named
    entity; subtrees that directly follow one another are merged into a
    single space-separated name.

    Args:
        text: The text to analyse.

    Returns:
        A list of entity strings, without duplicates, in order of first
        appearance.
    """
    # Uses nltk tokenizer, tagger and chunker to construct the sentence tree
    chunks = ne_chunk(pos_tag(word_tokenize(text)))
    current_name = []      # pieces of the entity currently being assembled
    continuous_name = []   # finished entities, in first-seen order
    for node in chunks:
        if isinstance(node, Tree):
            # Extract the name from the subtree's (token, pos) leaves.
            current_name.append(" ".join(token for token, _pos in node.leaves()))
            continue
        # A plain (token, pos) pair ends any run of adjacent entity subtrees.
        _store_entity(current_name, continuous_name)
        current_name = []
    # The text may end on an entity with no following token to trigger the
    # store above; flush it here so the last entity is not lost.
    _store_entity(current_name, continuous_name)
    return continuous_name
class NamedEntityTestCase(unittest.TestCase):
    """Checks ne_extraction against a sentence containing two known entities."""

    def test_ner_extraction(self):
        text = "When we went to Los Angeles last year we visited the Hollywood Sign."
        entities = ne_extraction(text)
        # Call form of print: valid on Python 3 and prints the same single
        # value on Python 2 (the bare `print entities` statement is a
        # SyntaxError on Python 3).
        print(entities)
        self.assertEqual(["Los Angeles", "Hollywood Sign"], entities)
# Run the test case(s) above when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment