-
-
Save matthewcornell/8f7f036b87961e9f1acd to your computer and use it in GitHub Desktop.
Basic example of using NLTK for named entity extraction.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# forked for my own reference | |
import urllib.request | |
import urllib.parse | |
from bs4 import BeautifulSoup as bs | |
import nltk | |
def extract_named_ents_from_url(url):
    """Download the page at *url* and return the named entities in its text.

    The page bytes are parsed with BeautifulSoup (lxml parser); the extracted
    text is split into sentences, word-tokenized, POS-tagged and then chunked
    with NLTK's named-entity chunker.

    Args:
        url: Address of the page to analyse.

    Returns:
        A list of entity-name strings collected from every sentence, in the
        order they are encountered. Duplicates are kept.
    """
    url_bytes = get_bytes_for_url(url)
    soup = bs(url_bytes, "lxml")
    sentences = nltk.sent_tokenize(soup.text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # binary=True asks the chunker for a single generic entity label; the
    # tree walker in entity_names_for_tree looks for the label 'NE'.
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    named_entities = []
    for chunked_sentence in chunked_sentences:
        # Accumulate across all sentences. Returning from inside the loop
        # would stop after the first sentence and drop everything else.
        named_entities.extend(entity_names_for_tree(chunked_sentence))
    return named_entities
def get_bytes_for_url(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser User-Agent header.

    Args:
        url: Address to download.

    Returns:
        The response body as undecoded ``bytes``.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(req) as http_response:
        return http_response.read()  # todo encoding?
def entity_names_for_tree(t):
    """Collect the names of all 'NE'-labelled subtrees found under *t*.

    Args:
        t: A chunk-tree node or a leaf. Anything without a truthy ``label``
            attribute is treated as a leaf and contributes nothing.

    Returns:
        A list of strings. For each node whose ``label()`` is ``'NE'``, the
        first item of each of its children is joined with single spaces.
        Nodes with any other label are searched recursively.
    """
    label_attr = getattr(t, 'label', None)
    if not label_attr:
        # Leaf (e.g. a (word, tag) pair): nothing to report.
        return []

    if label_attr() == 'NE':
        # An entity node: its children are the tokens that make up the name.
        return [' '.join(child[0] for child in t)]

    # Any other node: gather entities from each child in order.
    found = []
    for subtree in t:
        found += entity_names_for_tree(subtree)
    return found
# Demo run: extract the named entities from a sample page and print them.
named_entities = extract_named_ents_from_url(
    'http://www.omdurman.org/columns/obama12.html'
)
print(named_entities)
# Output recorded by the author for the URL above (the page may have changed since):
# ['Widespread Consensus', 'Obama Promotes', 'Racism', 'Hatred', 'Israel', 'Western Civilization Home', 'Obama', 'Racism', 'Hatred', 'Israel', 'Richard Cohen', 'Brigitte Gabriel', 'Sharon Hughes', 'Charles Krauthammer', 'Kenneth Blackwell', 'Naomi Ragen', 'Debbie Schlussel', 'Ed Lasky', 'William Levinson', 'Barack', 'Catholics', 'Israel', 'United States']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment