Created
June 23, 2016 17:17
-
-
Save johnDorian/872a70a1ce1f79f27ad383214aecfa10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Programming task | |
================ | |
The following is an implementation of a simple Named Entity Recognition (NER). | |
NER is concerned with identifying place names, people names or other special | |
identifiers in text. | |
Here we make a very simple definition of a named entity: A sequence of | |
at least two consecutive capitalized words. E.g. "Los Angeles" is a named | |
entity, "our hotel" is not. | |
While the implementation passes the Unit test, it suffers from bad structure and | |
readability. It is your task to rework *both* the implementation and the Unit | |
test. You are expected to come up with a better interface than the one presented | |
here. | |
Your code will be evaluated on: | |
- Readability: Is naming intuitive? Are there comments where necessary? | |
- Structure: Is functionality grouped into functions or classes in a way that | |
enables reusability? | |
- Testability: Is it easy to test individual components of your algorithm? This | |
is a good indicator of good interface design. | |
- Bonus: Functional programming. Demonstrate how you have applied principles of | |
functional programming to improve this code. | |
If you want, explain reasons for changes you've made in comments. | |
Note that you don't have to improve the actual Named Entity Recognition | |
algorithm itself - the focus is on code quality. | |
""" | |
import re | |
import unittest | |
# COMMENT: I removed the buffer, as I did not think keeping it in the global
# environment was needed.
# COMMENT: I also removed the other two functions, renaming them and moving their
# logic into new functions. I did this to make the functions, and what they do,
# easier to edit, more portable, and reproducible.
# One leading word (letters only), the whitespace that follows it, and then the
# rest of the text. DOTALL lets the remainder span line breaks; without it a
# newline further along the text makes the whole match fail.
_FIRST_WORD_RE = re.compile(r"([a-z]+)\s*(.*)$", re.I | re.S)


def split_word_set(text):
    """
    Split the leading word off ``text``.

    Parameters
    ----------
    text : str
        The text to split. It may contain line breaks.

    Returns
    ----------
    Tuple (first_word, remaining_text)
        ``first_word`` is the run of letters at the very start of ``text`` and
        ``remaining_text`` is everything after the whitespace that follows it
        (an empty string when nothing is left). Both are ``None`` when
        ``text`` does not start with a letter, which includes the empty
        string and text that starts with whitespace or punctuation.
    """
    match = _FIRST_WORD_RE.match(text)
    if match is None:
        # Nothing word-like at the start of the text.
        return None, None
    return match.group(1), match.group(2)
def is_name_long_enough(word_list, desired_length=2):
    """
    Tell whether ``word_list`` holds at least ``desired_length`` items.

    Parameters
    ----------
    word_list : list
        The candidate words. Only the length is inspected; the items
        themselves are never looked at.
    desired_length : int
        The minimum number of consecutive capitalised words that counts as a
        named entity.

    Returns
    ----------
    bool
        True when ``len(word_list)`` is greater than or equal to
        ``desired_length``, False otherwise.
    """
    return len(word_list) >= desired_length
def is_capitalised(word):
    """
    Tell whether ``word`` is written as a capitalised word.

    A word qualifies when it is one uppercase letter (A-Z) followed only by
    lowercase letters (a-z), so "Los" and "L" qualify while "los", "LA" and
    "McDonald" do not.

    Parameters
    ----------
    word : str
        The word to inspect.

    Returns
    ----------
    bool
        True when the word matches the capitalised pattern, False otherwise.
    """
    return re.match(r"[A-Z][a-z]*$", word) is not None
def find_all_name_entities(text, desired_length=2):
    """
    Collect every named entity found in ``text``.

    A named entity is a run of consecutive capitalised words that is at least
    ``desired_length`` words long. A run is ended by a lowercase word or by
    anything that is not a letter (punctuation, digits, ...), and scanning
    then carries on with the rest of the text.

    Parameters
    ----------
    text : str
        The text to search.
    desired_length : int
        The minimum number of consecutive capitalised words that counts as a
        named entity.

    Returns
    ----------
    set
        The named entities, each one a string of its words joined by single
        spaces. The set is empty when nothing qualifies; an empty run is
        never reported, whatever ``desired_length`` is.
    """
    # Whatever sits between words when split_word_set cannot find a word.
    non_letters = re.compile(r"[^A-Za-z]+")

    capitalised_runs = []
    current_run = []
    remaining = text
    while remaining:
        word, rest = split_word_set(remaining)
        if word is None:
            separator = non_letters.match(remaining)
            if separator is None:
                # The text starts with a letter yet could not be split; stop
                # here rather than loop forever on the same input.
                break
            # Step over the separator so the text after it is still scanned.
            rest = remaining[separator.end():]

        if word is not None and is_capitalised(word):
            current_run.append(word)
        else:
            # A lowercase word or a separator ends the current run.
            if current_run:
                capitalised_runs.append(current_run)
            current_run = []
        remaining = rest

    # The text may end in the middle of a run.
    if current_run:
        capitalised_runs.append(current_run)

    return {
        " ".join(run)
        for run in capitalised_runs
        if is_name_long_enough(run, desired_length)
    }
class NamedEntityTestCase(unittest.TestCase):
    """Checks the named entity extraction through its public entry point."""

    def test_ner_extraction(self):
        # Two runs of two capitalised words, one of them at the very end of
        # the sentence, plus a lone capitalised word ("When") that must not
        # be reported.
        sentence = "When we went to Los Angeles last year we visited the Hollywood Sign"
        expected = {"Los Angeles", "Hollywood Sign"}
        found = find_all_name_entities(sentence, desired_length=2)
        self.assertEqual(expected, found)
if __name__ == "__main__":
    # Run the unit tests only when this file is executed as a script, not
    # when it is imported as a module.
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment