johnDorian · June 23, 2016 17:17
diff --git a/ner.py b/ner.py
 """
 Programming task
 ================

 The following is an implementation of a simple Named Entity Recognition (NER).
 NER is concerned with identifying place names, people names or other special
 identifiers in text.

 Here we make a very simple definition of a named entity: A sequence of
 at least two consecutive capitalized words. E.g. "Los Angeles" is a named
 entity, "our hotel" is not.

 While the implementation passes the Unit test, it suffers from bad structure and
 readability. It is your task to rework *both* the implementation and the Unit
 test. You are expected to come up with a better interface than the one presented
 here.

 Your code will be evaluated on:
 - Readability: Is naming intuitive? Are there comments where necessary?
 - Structure: Is functionality grouped into functions or classes in a way that
 enables reusability?
 - Testability: Is it easy to test individual components of your algorithm? This
 is a good indicator of good interface design.
 - Bonus: Functional programming. Demonstrate how you have applied principles of
 functional programming to improve this code.

 If you want, explain reasons for changes you've made in comments.

 Note that you don't have to improve the actual Named Entity Recognition
 algorithm itself - the focus is on code quality.
 """

 import re
 import unittest


 # COMMENT: I removed the buffer as I did not think having the buffer within the 
 # global environment was needed.

 # COMMENT: I also removed the other two functions, renamed them, and placed
 # them within new functions. I did this to make editing the functions and the
 # actions of the functions easier, more portable, and reproducible.




 def split_word_set(text):
     """
     Split the leading word off ``text``.

     A word is a run of the letters a-z (matched case-insensitively) at the
     very start of ``text``. Whitespace directly after the word is dropped.

     Parameters
     ----------
     text : str
         The text to split. It may span several lines.

     Returns
     -------
     tuple
         ``(first_word, remaining_text)``. Both items are ``None`` when
         ``text`` does not start with a letter, which includes the empty
         string.
     """
     # re.S lets "." cross line breaks. Without it, text containing a line
     # break further in failed to match at all and was reported as having
     # no words.
     leading_word = re.match(r"([a-z]+)\s*(.*)$", text, re.I | re.S)
     if leading_word is None:
         return None, None
     return leading_word.group(1), leading_word.group(2)


 def is_name_long_enough(word_list, desired_length=2):
     """
     Tell whether ``word_list`` holds at least ``desired_length`` items.

     Parameters
     ----------
     word_list : list
         The candidate words. Only the length is inspected; the items
         themselves are not checked.
     desired_length : int
         Minimum number of items required.

     Returns
     -------
     bool
         ``True`` when ``len(word_list)`` is greater than or equal to
         ``desired_length``, otherwise ``False``.
     """
     word_count = len(word_list)
     return word_count >= desired_length


 def is_capitalised(word):
     """
     Tell whether ``word`` is a capitalised token.

     A token counts as capitalised when it is one uppercase letter A-Z
     followed only by lowercase letters a-z (possibly none).

     Parameters
     ----------
     word : str
         The token to inspect.

     Returns
     -------
     bool
         ``True`` for e.g. ``"Los"`` or ``"A"``; ``False`` for e.g.
         ``"los"``, ``"USA"``, ``"McDonald"`` or the empty string.
     """
     return re.match(r"[A-Z][a-z]*$", word) is not None


 def find_all_name_entities(text, desired_length=2):
     """
     Collect every named entity found in ``text``.

     A named entity is a run of consecutive capitalised words (as judged by
     ``is_capitalised``) that is at least ``desired_length`` words long. The
     text is consumed word by word with ``split_word_set``; scanning ends as
     soon as that function reports no further word.

     Parameters
     ----------
     text : str
         The text to scan.
     desired_length : int
         Minimum number of consecutive capitalised words that make up a
         named entity.

     Returns
     -------
     set
         The entities found, each as its words joined by a single space.
     """
     name_entities = set()

     def record(run):
         # Single place that decides whether a finished run is an entity.
         # The emptiness guard keeps "" out of the result when
         # desired_length is 0 or negative.
         if run and is_name_long_enough(run, desired_length):
             name_entities.add(" ".join(run))

     capitalised_run = []
     first_word, remaining_words = split_word_set(text)
     while first_word is not None:
         if is_capitalised(first_word):
             capitalised_run.append(first_word)
         else:
             # A non-capitalised word ends the current run.
             record(capitalised_run)
             capitalised_run = []
         first_word, remaining_words = split_word_set(remaining_words)

     # The text may end in the middle of a run, so flush once more.
     record(capitalised_run)
     return name_entities


 class NamedEntityTestCase(unittest.TestCase):
     """Unit tests for ``find_all_name_entities``."""

     def test_ner_extraction(self):
         """Runs of two capitalised words are reported; a lone one is not."""
         sentence = "When we went to Los Angeles last year we visited the Hollywood Sign"
         expected = {"Los Angeles", "Hollywood Sign"}

         found = find_all_name_entities(sentence, desired_length=2)

         self.assertEqual(expected, found)


 # Run the unit tests above when this file is executed as a script.
 if __name__ == "__main__":
    unittest.main()
	"""
	Programming task
	================

	The following is an implementation of a simple Named Entity Recognition (NER).
	NER is concerned with identifying place names, people names or other special
	identifiers in text.

	Here we make a very simple definition of a named entity: A sequence of
	at least two consecutive capitalized words. E.g. "Los Angeles" is a named
	entity, "our hotel" is not.

	While the implementation passes the Unit test, it suffers from bad structure and
	readability. It is your task to rework both the implementation and the Unit
	test. You are expected to come up with a better interface than the one presented
	here.

	Your code will be evaluated on:
	- Readability: Is naming intuitive? Are there comments where necessary?
	- Structure: Is functionality grouped into functions or classes in a way that
	enables reusability?
	- Testability: Is it easy to test individual components of your algorithm? This
	is a good indicator of good interface design.
	- Bonus: Functional programming. Demonstrate how you have applied principles of
	functional programming to improve this code.

	If you want, explain reasons for changes you've made in comments.

	Note that you don't have to improve the actual Named Entity Recognition
	algorithm itself - the focus is on code quality.
	"""

	import re
	import unittest


	# COMMENT: I removed the buffer as I did not think having the buffer within the
	# global environment was needed.

	# COMMENT: I also removed the other two functions, renamed them, and placed
	# them within new functions. I did this to make editing the functions and the
	# actions of the functions easier, more portable, and reproducible.




	def split_word_set(text):
	    """
	    Split the leading word off ``text``.

	    A word is a run of the letters a-z (matched case-insensitively) at the
	    very start of ``text``. Whitespace directly after the word is dropped.

	    Parameters
	    ----------
	    text : str
	        The text to split. It may span several lines.

	    Returns
	    -------
	    tuple
	        ``(first_word, remaining_text)``. Both items are ``None`` when
	        ``text`` does not start with a letter, which includes the empty
	        string.
	    """
	    # The quantifiers matter: "\s*" allows any amount of whitespace after
	    # the word and "(.*)" captures the whole remainder. Without them only
	    # "word + one whitespace + one character" would match.
	    # re.S lets "." cross line breaks so multi-line text is handled too.
	    leading_word = re.match(r"([a-z]+)\s*(.*)$", text, re.I | re.S)
	    if leading_word is None:
	        return None, None
	    return leading_word.group(1), leading_word.group(2)


	def is_name_long_enough(word_list, desired_length=2):
	    """
	    Tell whether ``word_list`` holds at least ``desired_length`` items.

	    Parameters
	    ----------
	    word_list : list
	        The candidate words. Only the length is inspected; the items
	        themselves are not checked.
	    desired_length : int
	        Minimum number of items required.

	    Returns
	    -------
	    bool
	        ``True`` when ``len(word_list)`` is greater than or equal to
	        ``desired_length``, otherwise ``False``.
	    """
	    return len(word_list) >= desired_length


	def is_capitalised(word):
	    """
	    Tell whether ``word`` is a capitalised token.

	    A token counts as capitalised when it is one uppercase letter A-Z
	    followed only by lowercase letters a-z (possibly none).

	    Parameters
	    ----------
	    word : str
	        The token to inspect.

	    Returns
	    -------
	    bool
	        ``True`` for e.g. ``"Los"`` or ``"A"``; ``False`` for e.g.
	        ``"los"``, ``"USA"``, ``"McDonald"`` or the empty string.
	    """
	    return bool(re.match(r"[A-Z][a-z]*$", word))


	def find_all_name_entities(text, desired_length=2):
	    """
	    Collect every named entity found in ``text``.

	    A named entity is a run of consecutive capitalised words (as judged by
	    ``is_capitalised``) that is at least ``desired_length`` words long. The
	    text is consumed word by word with ``split_word_set``; scanning ends as
	    soon as that function reports no further word.

	    Parameters
	    ----------
	    text : str
	        The text to scan.
	    desired_length : int
	        Minimum number of consecutive capitalised words that make up a
	        named entity.

	    Returns
	    -------
	    set
	        The entities found, each as its words joined by a single space.
	    """
	    name_entities = set()

	    def record(run):
	        # Single place that decides whether a finished run is an entity.
	        # The emptiness guard keeps "" out of the result when
	        # desired_length is 0 or negative.
	        if run and is_name_long_enough(run, desired_length):
	            name_entities.add(" ".join(run))

	    capitalised_run = []
	    first_word, remaining_words = split_word_set(text)
	    while first_word is not None:
	        if is_capitalised(first_word):
	            capitalised_run.append(first_word)
	        else:
	            # A non-capitalised word ends the current run.
	            record(capitalised_run)
	            capitalised_run = []
	        first_word, remaining_words = split_word_set(remaining_words)

	    # The text may end in the middle of a run, so flush once more.
	    record(capitalised_run)
	    return name_entities


	class NamedEntityTestCase(unittest.TestCase):
	    """Unit tests for ``find_all_name_entities``."""

	    def test_ner_extraction(self):
	        """Runs of two capitalised words are reported; a lone one is not."""
	        text = "When we went to Los Angeles last year we visited the Hollywood Sign"

	        entities = find_all_name_entities(text, desired_length=2)

	        self.assertEqual(set(["Los Angeles", "Hollywood Sign"]), entities)


	# Run the unit tests above when this file is executed as a script.
	if __name__ == "__main__":
	    unittest.main()