Skip to content

Instantly share code, notes, and snippets.

@dmod
Last active December 18, 2019 13:31
Show Gist options
  • Save dmod/5497fbcce9441cbd445d750651bd4043 to your computer and use it in GitHub Desktop.
Save dmod/5497fbcce9441cbd445d750651bd4043 to your computer and use it in GitHub Desktop.
Business Card Parser

Business Card Parser

The Business Card Parser contains the necessary functionality to parse and represent the output of an OCR Business Card Reader. Two classes are contained, one that will represent the model of the contact info parsed from the business card, including: Name, Phone Number, and Email Address. The other class contains the necessary methods to parse each field from the string of business card text.

The module can be directly incorporated into an existing Python application:

import business_card_parser

contactInfo = business_card_parser.IBusinessCardParser.getContactInfo(userText)

name = contactInfo.getName()
phoneNumber = contactInfo.getPhoneNumber()
emailAddress = contactInfo.getEmailAddress()

Tests

All tests are located in test_business_card_parser.py. New tests can be added by defining the business card text as a string at the top of that file and writing a new test function that asserts the expected values.

python test_business_card_parser.py

## business_card_parser.py
#
# The following module contains the necessary functionality to parse and
# represent the output of an OCR Business Card Reader. Two classes are contained,
# one that will represent the model of the contact info parsed from the business
# card, including: Name, Phone Number, and Email Address. The other class
# contains the necessary methods to parse each field from the string of business
# card text.
import re
# Class IContactInfo is an immutable representation of the contact information parsed
# from the Business Card.
class IContactInfo:
    """Immutable record of the contact details parsed from a business card.

    The three fields are stored exactly as passed to the constructor and can
    be read either through the getter methods or as plain attributes
    (``name``, ``phoneNumber``, ``emailAddress``).  Rebinding or deleting an
    attribute after construction raises ``AttributeError`` so that the
    documented immutability is actually enforced.
    """

    def __init__(self, name, phoneNumber, emailAddress):
        """Store the given name, phone number and email address."""
        # Go through object.__setattr__ because this class's own __setattr__
        # rejects every write; the constructor is the only place fields are set.
        object.__setattr__(self, "name", name)
        object.__setattr__(self, "phoneNumber", phoneNumber)
        object.__setattr__(self, "emailAddress", emailAddress)

    def __setattr__(self, attribute, value):
        """Reject any attempt to rebind an attribute after construction."""
        raise AttributeError(
            "IContactInfo is immutable; cannot set %r" % (attribute,))

    def __delattr__(self, attribute):
        """Reject any attempt to delete an attribute."""
        raise AttributeError(
            "IContactInfo is immutable; cannot delete %r" % (attribute,))

    def __repr__(self):
        """Return a debugging representation showing all three fields."""
        return "IContactInfo(name=%r, phoneNumber=%r, emailAddress=%r)" % (
            self.name, self.phoneNumber, self.emailAddress)

    # Returns the full name of the individual (eg. John Smith, Susan Malick)
    def getName(self):
        return self.name

    # Returns the phone number exactly as it was given to the constructor
    def getPhoneNumber(self):
        return self.phoneNumber

    # Returns the email address of the individual
    def getEmailAddress(self):
        return self.emailAddress
# Class IBusinessCardParser contains the necessary functionality to parse the
# relevant fields from a string of business card text.
class IBusinessCardParser:
    """Static helpers that pull a name, phone number and email out of card text.

    Each ``parse*`` method takes the whole business card text as one string
    and returns the first acceptable match, or ``None`` when nothing suitable
    is found.  ``getContactInfo`` runs all three and bundles the results.
    """

    # Upper-case words that mark a two-word match as a job title or company
    # rather than a person's name. If either word of a candidate equals one of
    # these (ignoring case and surrounding "." / "-"), the candidate is dropped.
    # TODO: Put this list in a config file so that it can be easily edited
    # without program modification.
    nameDisqualifiers = ("ENGINEER", "DEVELOPER", "LTD", "INC", "TECHNOLOGIES", "COMPANY")

    # Upper-case substrings that mark a line as holding something other than
    # the main phone number. If any of them occurs anywhere on a line, the
    # phone number found on that line is ignored.
    # TODO: Put this list in a config file so that it can be easily edited
    # without program modification.
    phoneDisqualifiers = ("FAX", "PAGER", "CELL")

    # The patterns are raw strings so that "\w", "\d", "\s" ... reach the regex
    # engine untouched instead of being treated as (invalid) string escapes.
    # They are compiled once here rather than on every call.

    # Two words (can contain "-" or ".") of 2 characters or more, separated by
    # a single space.
    _personsNameRegex = re.compile(r"[\w\-.]{2,} [\w\-.]{2,}")

    # Optional "+" country code, then 3-3-4 digits with optional parentheses
    # around the area code and optional whitespace/hyphen separators.
    _phoneNumberRegex = re.compile(
        r"(?:\+\d{1,3})?\s?\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}")

    # The local part accepts "+" and "-" so addresses such as "john-doe@..."
    # are not truncated. The domain is one label followed by one or more
    # ".label" groups, so a sentence-ending "." is not swallowed.
    _emailRegex = re.compile(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")

    @staticmethod
    def parseName(document):
        """Return the person's name found in the card text, or None.

        The first two-word candidate that contains no disqualifying word is
        returned, on the assumption that it is most likely the real name.
        """
        disqualifiers = set(IBusinessCardParser.nameDisqualifiers)
        for name in IBusinessCardParser._personsNameRegex.findall(document):
            # Compare case-insensitively and ignore leading/trailing "." and
            # "-" so that "Inc." or "Ltd." is recognised as a disqualifier.
            words = {word.strip(".-") for word in name.upper().split(" ")}
            if words.isdisjoint(disqualifiers):
                return name
        return None

    @staticmethod
    def parsePhoneNumber(document):
        """Return the phone number as a string of digits only, or None.

        Lines are scanned top to bottom; the first line that contains a phone
        number and none of the phoneDisqualifiers supplies the result.
        """
        for line in document.splitlines():
            match = IBusinessCardParser._phoneNumberRegex.search(line)
            if not match:
                continue
            upperLine = line.upper()
            if any(label in upperLine for label in IBusinessCardParser.phoneDisqualifiers):
                # Fax / pager / cell line: keep looking for the main number.
                continue
            # Strip punctuation and whitespace, leaving only the digits.
            return re.sub(r"[^0-9]", "", match.group())
        return None

    @staticmethod
    def parseEmailAddress(document):
        """Return the first email address found in the card text, or None."""
        match = IBusinessCardParser._emailRegex.search(document)
        if match:
            return match.group()
        return None

    @staticmethod
    def getContactInfo(document):
        """Parse all three fields and return them wrapped in an IContactInfo."""
        return IContactInfo(IBusinessCardParser.parseName(document),
                            IBusinessCardParser.parsePhoneNumber(document),
                            IBusinessCardParser.parseEmailAddress(document))
## test_business_card_parser.py
#
import unittest
from business_card_parser import *
# Sample business card texts used by the test cases.
# NOTE(review): every email line below reads "[email protected]", which looks
# like an address-obfuscation placeholder rather than a real address --
# TODO confirm and restore the intended addresses.

# Company first, then name, title, phone and email on separate lines.
test1 = """ASYMMETRIK LTD
Mike Smith
Senior Software Engineer
(410)555-1234
[email protected]"""

# Company and title precede the name; labelled phone and fax lines.
test2 = """Foobar Technologies
Analytic Developer
Lisa Haung
1234 Sentry Road
Columbia, MD 12345
Phone: 410-555-1234
Fax: 410-555-4321
[email protected]"""

# Name first; phone and fax carry a "+1" country code.
test3 = """Arthur Wilson
Software Engineer
Decision & Security Technologies
ABC Technologies
123 North 11th Street
Suite 229
Arlington, VA 22209
Tel: +1 (703) 555-1259
Fax: +1 (703) 555-1200
[email protected]"""

# Surname merely contains a disqualifier word as a prefix.
test_company_as_part_of_valid_name = """Mark Companys
Tel: (410)867-5309
Email: [email protected]"""

# Name contains a period; title line uses mixed case.
test_period_in_name = """Mr. Jones
Maybe cOMpany Developer
Los Angeles, CA, USA
Tel: (410)867-5309 Email: [email protected]"""

# Phone and email share one line.
test_email_phone_same_line = """Tom Hanks
Actor
Some Movie Company
Los Angeles, CA, USA
Tel: (410)867-5309 Email: [email protected]"""

# Name and occupation share one line; email precedes the phone number.
test_occupation_name_same_line = """Tom Hanks, Actor
[email protected] - (410)867-5309
'Life is like a box of chocolates'"""

# The whole card is a single line.
test_everything_same_line = """Name: Tom Hanks Tel:(410)867-5309 Email: [email protected]"""
class TestStringMethods(unittest.TestCase):
    """End-to-end checks of IBusinessCardParser.getContactInfo on sample cards.

    Every test parses one module-level sample card and compares the parsed
    name, phone number and email address, in that order, with expected values.
    """

    # NOTE(review): the expected email values are the literal
    # "[email protected]" text found in the sample cards; that text contains
    # no "@", so confirm these expectations against the intended addresses.

    def _assertContact(self, cardText, expectedName, expectedPhone, expectedEmail):
        """Parse cardText and assert each of the three fields in turn."""
        parsed = IBusinessCardParser.getContactInfo(cardText)
        self.assertEqual(parsed.getName(), expectedName)
        self.assertEqual(parsed.getPhoneNumber(), expectedPhone)
        self.assertEqual(parsed.getEmailAddress(), expectedEmail)

    def test_1(self):
        self._assertContact(test1, "Mike Smith", "4105551234", "[email protected]")

    def test_2(self):
        self._assertContact(test2, "Lisa Haung", "4105551234", "[email protected]")

    def test_3(self):
        self._assertContact(test3, "Arthur Wilson", "17035551259", "[email protected]")

    def test_test_company_as_part_of_valid_name(self):
        self._assertContact(test_company_as_part_of_valid_name,
                            "Mark Companys", "4108675309", "[email protected]")

    def test_test_period_in_name(self):
        self._assertContact(test_period_in_name,
                            "Mr. Jones", "4108675309", "[email protected]")

    def test_test_email_phone_same_line(self):
        self._assertContact(test_email_phone_same_line,
                            "Tom Hanks", "4108675309", "[email protected]")

    def test_test_occupation_name_same_line(self):
        self._assertContact(test_occupation_name_same_line,
                            "Tom Hanks", "4108675309", "[email protected]")

    def test_test_everything_same_line(self):
        self._assertContact(test_everything_same_line,
                            "Tom Hanks", "4108675309", "[email protected]")
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment