dmod · December 18, 2019 13:31
diff --git a/README.md b/README.md
diff --git a/business_card_parser.py b/business_card_parser.py
 ## business_card_parser.py
 #
 # The following module contains the necessary functionality to parse and
 # represent the output of an OCR Business Card Reader. Two classes are contained,
 # one that will represent the model of the contact info parsed from the business
 # card, including: Name, Phone Number, and Email Address. The other class
 # contains the necessary methods to parse each field from the string of business
 # card text.
 import re

 # Class IContactInfo is an immutable representation of the contact information parsed
 # from the Business Card.
 class IContactInfo:
     """Immutable record of the contact details read from a business card.

     The three values are fixed at construction time and exposed read-only,
     both as attributes (``name``, ``phoneNumber``, ``emailAddress``) and
     through the matching ``get*`` accessors. Values are stored exactly as
     given; the parser in this file passes None for a field it could not find.
     Two instances compare equal when all three fields are equal.
     """

     def __init__(self, name, phoneNumber, emailAddress):
         """Store the name, phone number, and email address as given."""
         # Kept in private attributes so the public names can be read-only
         # properties; assigning to e.g. ``info.name`` raises AttributeError.
         self._name = name
         self._phoneNumber = phoneNumber
         self._emailAddress = emailAddress

     @property
     def name(self):
         """Full name of the individual (eg. John Smith, Susan Malick)."""
         return self._name

     @property
     def phoneNumber(self):
         """Phone number as passed to the constructor."""
         return self._phoneNumber

     @property
     def emailAddress(self):
         """Email address of the individual."""
         return self._emailAddress

     def getName(self):
         """Return the full name of the individual (eg. John Smith, Susan Malick)."""
         return self._name

     def getPhoneNumber(self):
         """Return the phone number as passed to the constructor."""
         return self._phoneNumber

     def getEmailAddress(self):
         """Return the email address of the individual."""
         return self._emailAddress

     def _fields(self):
         """Return the three values as a tuple, used for equality, hash, and repr."""
         return (self._name, self._phoneNumber, self._emailAddress)

     def __eq__(self, other):
         if not isinstance(other, IContactInfo):
             return NotImplemented
         return self._fields() == other._fields()

     def __hash__(self):
         # Defined together with __eq__ so equal records hash alike.
         return hash(self._fields())

     def __repr__(self):
         return "IContactInfo(name={!r}, phoneNumber={!r}, emailAddress={!r})".format(
             *self._fields())

 # Class IBusinessCardParser contains the necessary functionality to parse the
 # relevant fields from a string of business card text.
 class IBusinessCardParser:
     """Parse the name, phone number, and email address out of business card text.

     Every method is static, so the class acts as a namespace and is never
     instantiated. Each ``parse*`` method returns the first acceptable match
     in the document, or None when nothing acceptable is found.
     """

     # Upper-cased words that mark a two-word match as a job title or company
     # rather than a person's name. A candidate is rejected only when one of
     # its words equals an entry exactly, so a surname such as "Companys" is
     # still accepted.
     # TODO: Put this list in a config file so that it can be easily edited
     # without program modification.
     nameDisqualifiers = ("ENGINEER", "DEVELOPER", "LTD", "INC", "TECHNOLOGIES", "COMPANY")

     # Upper-cased markers of a line whose number is not the normal phone
     # number. These are matched as substrings of the whole upper-cased line.
     # TODO: Put this list in a config file so that it can be easily edited
     # without program modification.
     phoneDisqualifiers = ("FAX", "PAGER", "CELL")

     # The patterns below are raw strings so that "\w", "\d", "\(" and similar
     # reach the regex engine untouched instead of depending on Python passing
     # unknown string escapes through. They are compiled once, when the class
     # body runs.

     # Two words (can contain "-" or ".") of 2 characters or more, separated
     # by a single space.
     _personsNameRegex = re.compile(r"[\w\-.]{2,} [\w\-.]{2,}")

     # Optional "+" country code, a 3-digit area code (optionally in
     # parentheses), then groups of 3 and 4 digits. Groups may be separated by
     # whitespace, "-" or ".".
     _phoneNumberRegex = re.compile(r"(?:\+\d{1,3})?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}")

     # The local part may contain "_", ".", "%", "+" and "-". The domain is a
     # run of dot-separated labels and never ends on a dot, so a sentence
     # period following an address is not captured.
     _emailRegex = re.compile(r"[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")

     # Anything that is not an ASCII digit; used to strip phone punctuation.
     _nonDigitRegex = re.compile(r"[^0-9]")

     @staticmethod
     def parseName(document):
         """Return the person's name found in the business card text, or None.

         Candidates are taken in document order; the first one that contains
         no word from ``nameDisqualifiers`` is returned, on the assumption
         that it is most likely the real name.
         """
         for name in IBusinessCardParser._personsNameRegex.findall(document):
             # Compare upper-cased whole words against the disqualifier list.
             nameParts = name.upper().split(" ")
             if set(nameParts).isdisjoint(IBusinessCardParser.nameDisqualifiers):
                 return name
         return None

     @staticmethod
     def parsePhoneNumber(document):
         """Return the phone number as a string of digits only, or None.

         The document is scanned line by line. A line whose upper-cased text
         contains any entry of ``phoneDisqualifiers`` is skipped entirely;
         the first remaining line with a phone-number match wins.
         """
         for line in document.splitlines():
             match = IBusinessCardParser._phoneNumberRegex.search(line)
             if match is None:
                 continue

             upperLine = line.upper()
             if any(marker in upperLine for marker in IBusinessCardParser.phoneDisqualifiers):
                 continue

             # Strip the match of every non-numeric character (including "+").
             return IBusinessCardParser._nonDigitRegex.sub("", match.group())
         return None

     @staticmethod
     def parseEmailAddress(document):
         """Return the first email address found in the text, or None."""
         match = IBusinessCardParser._emailRegex.search(document)
         if match is None:
             return None
         return match.group()

     @staticmethod
     def getContactInfo(document):
         """Return an IContactInfo built from the parsed name, phone, and email."""
         return IContactInfo(IBusinessCardParser.parseName(document),
                             IBusinessCardParser.parsePhoneNumber(document),
                             IBusinessCardParser.parseEmailAddress(document))
diff --git a/test_business_card_parser.py b/test_business_card_parser.py
 ## test_business_card_parser.py
 #
 import unittest
 from business_card_parser import *

 # Sample business cards used as parser input by the test cases in this file.
 # NOTE(review): every email line below reads "[email protected]" and contains
 # no "@"; this looks like a redacted placeholder rather than the real
 # fixture text -- confirm against the original file.

 # Company line first, then name, job title, phone, and email on separate lines.
 test1 = \
 """ASYMMETRIK LTD
 Mike Smith
 Senior Software Engineer
 (410)555-1234
 [email protected]"""

 # Name on the third line, after the company and job title; the card has both
 # a "Phone:" line and a "Fax:" line.
 test2 = \
 """Foobar Technologies
 Analytic Developer
 Lisa Haung
 1234 Sentry Road
 Columbia, MD 12345
 Phone: 410-555-1234
 Fax: 410-555-4321
 [email protected]"""

 # Name on the first line; the numbers carry a "+1" country code, with the
 # "Tel:" line listed before the "Fax:" line.
 test3 = \
 """Arthur Wilson
 Software Engineer
 Decision & Security Technologies
 ABC Technologies
 123 North 11th Street
 Suite 229
 Arlington, VA 22209
 Tel: +1 (703) 555-1259
 Fax: +1 (703) 555-1200
 [email protected]"""

 # The surname "Companys" contains the word "Company" without being equal to it.
 test_company_as_part_of_valid_name = \
 """Mark Companys
 Tel: (410)867-5309
 Email: [email protected]"""

 # The name starts with the abbreviation "Mr."; the second line spells
 # "cOMpany" in mixed case.
 test_period_in_name = \
 """Mr. Jones
 Maybe cOMpany Developer
 Los Angeles, CA, USA
 Tel: (410)867-5309 Email: [email protected]"""

 # Phone number and email share the last line.
 test_email_phone_same_line = \
 """Tom Hanks
 Actor
 Some Movie Company
 Los Angeles, CA, USA
 Tel: (410)867-5309 Email: [email protected]"""

 # The name is followed by a comma and the occupation on the same line; the
 # email comes before the phone number on the next line.
 test_occupation_name_same_line = \
 """Tom Hanks, Actor
 [email protected] - (410)867-5309
 'Life is like a box of chocolates'"""

 # Name, phone number, and email all sit on a single line.
 test_everything_same_line = \
 """Name: Tom Hanks Tel:(410)867-5309 Email: [email protected]"""

 class TestStringMethods(unittest.TestCase):
     """Runs the parser over each sample card and checks all three parsed fields."""

     def _assertContact(self, card, name, phoneNumber, emailAddress):
         """Parse ``card`` and compare name, phone number, and email, in that order."""
         parsed = IBusinessCardParser.getContactInfo(card)
         self.assertEqual(parsed.getName(), name)
         self.assertEqual(parsed.getPhoneNumber(), phoneNumber)
         self.assertEqual(parsed.getEmailAddress(), emailAddress)

     def test_1(self):
         self._assertContact(test1, "Mike Smith", "4105551234", "[email protected]")

     def test_2(self):
         self._assertContact(test2, "Lisa Haung", "4105551234", "[email protected]")

     def test_3(self):
         self._assertContact(test3, "Arthur Wilson", "17035551259", "[email protected]")

     def test_test_company_as_part_of_valid_name(self):
         self._assertContact(test_company_as_part_of_valid_name,
                             "Mark Companys", "4108675309", "[email protected]")

     def test_test_period_in_name(self):
         self._assertContact(test_period_in_name,
                             "Mr. Jones", "4108675309", "[email protected]")

     def test_test_email_phone_same_line(self):
         self._assertContact(test_email_phone_same_line,
                             "Tom Hanks", "4108675309", "[email protected]")

     def test_test_occupation_name_same_line(self):
         self._assertContact(test_occupation_name_same_line,
                             "Tom Hanks", "4108675309", "[email protected]")

     def test_test_everything_same_line(self):
         self._assertContact(test_everything_same_line,
                             "Tom Hanks", "4108675309", "[email protected]")

 if __name__ == "__main__":
     # Run every test case in this module when the file is executed directly.
     unittest.main()
	## business_card_parser.py
	#
	# The following module contains the necessary functionality to parse and
	# represent the output of an OCR Business Card Reader. Two classes are contained,
	# one that will represent the model of the contact info parsed from the business
	# card, including: Name, Phone Number, and Email Address. The other class
	# contains the necessary methods to parse each field from the string of business
	# card text.
	import re

	# Class IContactInfo is an immutable representation of the contact information parsed
	# from the Business Card.
	class IContactInfo:
		"""Immutable record of the contact details read from a business card.

		The three values are fixed at construction time and exposed read-only,
		both as attributes (``name``, ``phoneNumber``, ``emailAddress``) and
		through the matching ``get*`` accessors. Values are stored exactly as
		given; the parser in this file passes None for a field it could not find.
		Two instances compare equal when all three fields are equal.
		"""

		def __init__(self, name, phoneNumber, emailAddress):
			"""Store the name, phone number, and email address as given."""
			# Kept in private attributes so the public names can be read-only
			# properties; assigning to e.g. ``info.name`` raises AttributeError.
			self._name = name
			self._phoneNumber = phoneNumber
			self._emailAddress = emailAddress

		@property
		def name(self):
			"""Full name of the individual (eg. John Smith, Susan Malick)."""
			return self._name

		@property
		def phoneNumber(self):
			"""Phone number as passed to the constructor."""
			return self._phoneNumber

		@property
		def emailAddress(self):
			"""Email address of the individual."""
			return self._emailAddress

		def getName(self):
			"""Return the full name of the individual (eg. John Smith, Susan Malick)."""
			return self._name

		def getPhoneNumber(self):
			"""Return the phone number as passed to the constructor."""
			return self._phoneNumber

		def getEmailAddress(self):
			"""Return the email address of the individual."""
			return self._emailAddress

		def _fields(self):
			"""Return the three values as a tuple, used for equality, hash, and repr."""
			return (self._name, self._phoneNumber, self._emailAddress)

		def __eq__(self, other):
			if not isinstance(other, IContactInfo):
				return NotImplemented
			return self._fields() == other._fields()

		def __hash__(self):
			# Defined together with __eq__ so equal records hash alike.
			return hash(self._fields())

		def __repr__(self):
			return "IContactInfo(name={!r}, phoneNumber={!r}, emailAddress={!r})".format(
				*self._fields())

	# Class IBusinessCardParser contains the necessary functionality to parse the
	# relevant fields from a string of business card text.
	class IBusinessCardParser:
		"""Parse the name, phone number, and email address out of business card text.

		Every method is static, so the class acts as a namespace and is never
		instantiated. Each ``parse*`` method returns the first acceptable match
		in the document, or None when nothing acceptable is found.
		"""

		# Upper-cased words that mark a two-word match as a job title or company
		# rather than a person's name. A candidate is rejected only when one of
		# its words equals an entry exactly, so a surname such as "Companys" is
		# still accepted.
		# TODO: Put this list in a config file so that it can be easily edited
		# without program modification.
		nameDisqualifiers = ("ENGINEER", "DEVELOPER", "LTD", "INC", "TECHNOLOGIES", "COMPANY")

		# Upper-cased markers of a line whose number is not the normal phone
		# number. These are matched as substrings of the whole upper-cased line.
		# TODO: Put this list in a config file so that it can be easily edited
		# without program modification.
		phoneDisqualifiers = ("FAX", "PAGER", "CELL")

		# The patterns below are raw strings so that "\w", "\d", "\(" and similar
		# reach the regex engine untouched instead of depending on Python passing
		# unknown string escapes through. They are compiled once, when the class
		# body runs.

		# Two words (can contain "-" or ".") of 2 characters or more, separated
		# by a single space.
		_personsNameRegex = re.compile(r"[\w\-.]{2,} [\w\-.]{2,}")

		# Optional "+" country code, a 3-digit area code (optionally in
		# parentheses), then groups of 3 and 4 digits. Groups may be separated by
		# whitespace, "-" or ".".
		_phoneNumberRegex = re.compile(r"(?:\+\d{1,3})?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}")

		# The local part may contain "_", ".", "%", "+" and "-". The domain is a
		# run of dot-separated labels and never ends on a dot, so a sentence
		# period following an address is not captured.
		_emailRegex = re.compile(r"[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")

		# Anything that is not an ASCII digit; used to strip phone punctuation.
		_nonDigitRegex = re.compile(r"[^0-9]")

		@staticmethod
		def parseName(document):
			"""Return the person's name found in the business card text, or None.

			Candidates are taken in document order; the first one that contains
			no word from ``nameDisqualifiers`` is returned, on the assumption
			that it is most likely the real name.
			"""
			for name in IBusinessCardParser._personsNameRegex.findall(document):
				# Compare upper-cased whole words against the disqualifier list.
				nameParts = name.upper().split(" ")
				if set(nameParts).isdisjoint(IBusinessCardParser.nameDisqualifiers):
					return name
			return None

		@staticmethod
		def parsePhoneNumber(document):
			"""Return the phone number as a string of digits only, or None.

			The document is scanned line by line. A line whose upper-cased text
			contains any entry of ``phoneDisqualifiers`` is skipped entirely;
			the first remaining line with a phone-number match wins.
			"""
			for line in document.splitlines():
				match = IBusinessCardParser._phoneNumberRegex.search(line)
				if match is None:
					continue

				upperLine = line.upper()
				if any(marker in upperLine for marker in IBusinessCardParser.phoneDisqualifiers):
					continue

				# Strip the match of every non-numeric character (including "+").
				return IBusinessCardParser._nonDigitRegex.sub("", match.group())
			return None

		@staticmethod
		def parseEmailAddress(document):
			"""Return the first email address found in the text, or None."""
			match = IBusinessCardParser._emailRegex.search(document)
			if match is None:
				return None
			return match.group()

		@staticmethod
		def getContactInfo(document):
			"""Return an IContactInfo built from the parsed name, phone, and email."""
			return IContactInfo(IBusinessCardParser.parseName(document),
								IBusinessCardParser.parsePhoneNumber(document),
								IBusinessCardParser.parseEmailAddress(document))
	## test_business_card_parser.py
	#
	import unittest
	from business_card_parser import *

	# Sample business cards used as parser input by the test cases in this file.
	# NOTE(review): every email line below reads "[email protected]" and contains
	# no "@"; this looks like a redacted placeholder rather than the real
	# fixture text -- confirm against the original file.

	# Company line first, then name, job title, phone, and email on separate lines.
	test1 = \
	"""ASYMMETRIK LTD
	Mike Smith
	Senior Software Engineer
	(410)555-1234
	[email protected]"""

	# Name on the third line, after the company and job title; the card has both
	# a "Phone:" line and a "Fax:" line.
	test2 = \
	"""Foobar Technologies
	Analytic Developer
	Lisa Haung
	1234 Sentry Road
	Columbia, MD 12345
	Phone: 410-555-1234
	Fax: 410-555-4321
	[email protected]"""

	# Name on the first line; the numbers carry a "+1" country code, with the
	# "Tel:" line listed before the "Fax:" line.
	test3 = \
	"""Arthur Wilson
	Software Engineer
	Decision & Security Technologies
	ABC Technologies
	123 North 11th Street
	Suite 229
	Arlington, VA 22209
	Tel: +1 (703) 555-1259
	Fax: +1 (703) 555-1200
	[email protected]"""

	# The surname "Companys" contains the word "Company" without being equal to it.
	test_company_as_part_of_valid_name = \
	"""Mark Companys
	Tel: (410)867-5309
	Email: [email protected]"""

	# The name starts with the abbreviation "Mr."; the second line spells
	# "cOMpany" in mixed case.
	test_period_in_name = \
	"""Mr. Jones
	Maybe cOMpany Developer
	Los Angeles, CA, USA
	Tel: (410)867-5309 Email: [email protected]"""

	# Phone number and email share the last line.
	test_email_phone_same_line = \
	"""Tom Hanks
	Actor
	Some Movie Company
	Los Angeles, CA, USA
	Tel: (410)867-5309 Email: [email protected]"""

	# The name is followed by a comma and the occupation on the same line; the
	# email comes before the phone number on the next line.
	test_occupation_name_same_line = \
	"""Tom Hanks, Actor
	[email protected] - (410)867-5309
	'Life is like a box of chocolates'"""

	# Name, phone number, and email all sit on a single line.
	test_everything_same_line = \
	"""Name: Tom Hanks Tel:(410)867-5309 Email: [email protected]"""

	class TestStringMethods(unittest.TestCase):
		"""Runs the parser over each sample card and checks all three parsed fields."""

		def _assertContact(self, card, name, phoneNumber, emailAddress):
			"""Parse ``card`` and compare name, phone number, and email, in that order."""
			parsed = IBusinessCardParser.getContactInfo(card)
			self.assertEqual(parsed.getName(), name)
			self.assertEqual(parsed.getPhoneNumber(), phoneNumber)
			self.assertEqual(parsed.getEmailAddress(), emailAddress)

		def test_1(self):
			self._assertContact(test1, "Mike Smith", "4105551234", "[email protected]")

		def test_2(self):
			self._assertContact(test2, "Lisa Haung", "4105551234", "[email protected]")

		def test_3(self):
			self._assertContact(test3, "Arthur Wilson", "17035551259", "[email protected]")

		def test_test_company_as_part_of_valid_name(self):
			self._assertContact(test_company_as_part_of_valid_name,
								"Mark Companys", "4108675309", "[email protected]")

		def test_test_period_in_name(self):
			self._assertContact(test_period_in_name,
								"Mr. Jones", "4108675309", "[email protected]")

		def test_test_email_phone_same_line(self):
			self._assertContact(test_email_phone_same_line,
								"Tom Hanks", "4108675309", "[email protected]")

		def test_test_occupation_name_same_line(self):
			self._assertContact(test_occupation_name_same_line,
								"Tom Hanks", "4108675309", "[email protected]")

		def test_test_everything_same_line(self):
			self._assertContact(test_everything_same_line,
								"Tom Hanks", "4108675309", "[email protected]")

	if __name__ == '__main__':
		# Run every test case in this module when the file is executed directly.
		unittest.main()