Submission for the task "Named Entity Recognition" (http://codepad.org/LLRQAd9q).
""" | |
Programming task | |
================ | |
The following is an implementation of a simple Named Entity Recognition (NER). | |
NER is concerned with identifying place names, people names or other special | |
identifiers in text. | |
Here we make a very simple definition of a named entity: A sequence of | |
at least two consecutive capitalized words. E.g. "Los Angeles" is a named | |
entity, "our hotel" is not. | |
While the implementation passes the Unit test, it suffers from bad structure and | |
readability. It is your task to rework *both* the implementation and the Unit | |
test. You are expected to come up with a better interface than the one presented | |
here. | |
Your code will be evaluated on: | |
- Readability: Is naming intuitive? Are there comments where necessary? | |
- Structure: Is functionality grouped into functions or classes in a way that | |
enables reusability? | |
- Testability: Is it easy to test individual components of your algorithm? This | |
is a good indicator of good interface design. | |
- Bonus: Functional programming. Demonstrate how you have applied principles of | |
functional programming to improve this code. | |
If you want, explain reasons for changes you've made in comments. | |
Note that you don't have to improve the actual Named Entity Recognition | |
algorithm itself - the focus is on code quality. | |
""" | |

import re
import unittest

# Buffer to store the current named entity
# word_buffer = []  # MG: not needed anymore

# Regular expression for matching a token at the beginning of a sentence
# token_re = re.compile(r"([a-z]+)\s*(.*)$", re.I)  # MG: not needed anymore

# Regular expression to recognize an uppercase token
uppercase_re = re.compile(r"[A-Z][a-z]*$")

# Regular expression to separate a string at whitespace characters [\t\n\r\f\v]
space_re = re.compile(r"\s+")
""" Remarks : | |
1) Start with function get_named_entities() and rest is easy to follow (at least I think so :)) | |
2) The new version of the code allows much more re usability by introducing Pure Functions for separate tasks | |
which means minimal dependency on other methods, variables. So, functions can work independently - even in multithreaded environment | |
3) In the previous version, function pop_token(text) always receives full text as an argument and returns the remaining string. | |
This incurs a lot of overhead since string is immutable type | |
4) The new function get_ne_from_buffer(word_buffer, entity_set = None) has an optional parameter entity_set to ensure re usability | |
5) Since we have pure functions, testing each function is easy. | |
6) global variable word_buffer is removed since it's not needed here and it incurs memory overhead | |
""" | |

def tokenize(text):
    """
    Tokenizes the text by splitting it at whitespace characters
    (spaces, tabs, newlines, etc.). Returns a list of tokens.
    """
    return space_re.split(text)
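
# A small usage sketch (not part of the required tests): tokenize() splits on
# any run of whitespace, so tabs and repeated spaces collapse into single
# token boundaries.
assert tokenize("Los Angeles\tis  sunny") == ["Los", "Angeles", "is", "sunny"]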

def is_token_valid(token):
    """
    Returns True if the token is a capitalized word (an uppercase letter
    followed only by lowercase letters) - we may have a named entity on our hands!
    """
    return bool(uppercase_re.match(token))
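
# A small usage sketch: only Title-case tokens pass, so lowercase words and
# all-caps tokens such as "USA" are rejected by this simple rule.
assert is_token_valid("Angeles") and not is_token_valid("angeles")
assert not is_token_valid("USA")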

def get_ne_from_buffer(word_buffer, entity_set=None):
    """
    Returns a named entity if one can be assembled from the buffer (i.e. the
    buffer holds at least two words), adding it to entity_set when a set is
    provided. Returns None otherwise.
    """
    if len(word_buffer) >= 2:
        named_entity = " ".join(word_buffer)
        if entity_set is not None:  # if an entity_set was provided, add the named entity to it
            entity_set.add(named_entity)
        return named_entity
    return None
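
# A small usage sketch of the two calling styles from remark 4: standalone
# (the joined entity is returned) or with a caller-owned set that is updated
# in place.
assert get_ne_from_buffer(["Los", "Angeles"]) == "Los Angeles"
assert get_ne_from_buffer(["Hollywood"]) is None  # fewer than two words
_example_set = set()
get_ne_from_buffer(["New", "York", "City"], _example_set)
assert _example_set == {"New York City"}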

def get_named_entities(text):
    """
    Returns the set of all named entities found in the text.
    """
    entity_set = set()
    for line in text.splitlines():
        word_buffer = []
        for token in tokenize(line):
            if is_token_valid(token):
                word_buffer.append(token)
            else:
                get_ne_from_buffer(word_buffer, entity_set)
                word_buffer = []  # empty the buffer
        # flush the buffer at the end of the line - if there is a named entity
        # right at the end, we don't want to miss it!
        get_ne_from_buffer(word_buffer, entity_set)
    return entity_set
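
# A small end-to-end sketch: single capitalized words ("We") are dropped, while
# runs of two or more consecutive capitalized words are collected into the set.
assert get_named_entities("We visited Los Angeles and New York City") == \
    {"Los Angeles", "New York City"}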

class NamedEntityTestCase(unittest.TestCase):

    def test_ner_extraction(self):
        """
        Unit test for get_named_entities()
        """
        text = 'When we went to Los Angeles last year we visited the Hollywood Sign'
        entities = get_named_entities(text)
        self.assertEqual(set(["Los Angeles", "Hollywood Sign"]), entities)

    def test_named_entity(self):
        """
        Unit test for get_ne_from_buffer()
        """
        buffer = ['Southern', 'California']
        named_entity = get_ne_from_buffer(buffer)
        self.assertEqual(named_entity, 'Southern California')

    def test_token_validity(self):
        """
        Unit test for is_token_valid()
        """
        token = 'Southern'
        self.assertTrue(is_token_valid(token))

    def test_tokenizer(self):
        """
        Unit test for tokenize()
        """
        text = 'Southern California is much like Arizona'
        tokens = tokenize(text)
        self.assertEqual(['Southern', 'California', 'is', 'much', 'like', 'Arizona'], tokens)

if __name__ == "__main__":
    unittest.main()