|
## business_card_parser.py |
|
# |
|
# The following module contains the necessary functionality to parse and |
|
# represent the output of an OCR Business Card Reader. Two classes are contained, |
|
# one that will represent the model of the contact info parsed from the business |
|
# card, including: Name, Phone Number, and Email Address. The other class |
|
# contains the necessary methods to parse each field from the string of business |
|
# card text. |
|
import re |
|
|
|
# Class IContactInfo is an immutable representation of the contact information parsed |
|
# from the Business Card. |
|
class IContactInfo:
    """Immutable record of the contact details parsed from a business card.

    The three values are stored exactly as passed to the constructor and can
    be read either as attributes or through the getter methods. Any attempt
    to assign or delete an attribute after construction raises
    ``AttributeError``.

    Attributes:
        name: Full name of the individual (e.g. "John Smith", "Susan Malick").
        phoneNumber: Phone number of the individual.
        emailAddress: Email address of the individual.
    """

    def __init__(self, name, phoneNumber, emailAddress):
        # Assign through object.__setattr__ because this class overrides
        # __setattr__ to reject every assignment (that is what makes the
        # instance immutable once construction has finished).
        object.__setattr__(self, "name", name)
        object.__setattr__(self, "phoneNumber", phoneNumber)
        object.__setattr__(self, "emailAddress", emailAddress)

    def __setattr__(self, attribute, value):
        """Reject attribute assignment; instances are immutable."""
        raise AttributeError(
            "{} is immutable; cannot set {!r}".format(type(self).__name__, attribute)
        )

    def __delattr__(self, attribute):
        """Reject attribute deletion; instances are immutable."""
        raise AttributeError(
            "{} is immutable; cannot delete {!r}".format(type(self).__name__, attribute)
        )

    def __repr__(self):
        return "{}(name={!r}, phoneNumber={!r}, emailAddress={!r})".format(
            type(self).__name__, self.name, self.phoneNumber, self.emailAddress
        )

    def getName(self):
        """Return the full name of the individual (e.g. John Smith, Susan Malick)."""
        return self.name

    def getPhoneNumber(self):
        """Return the phone number that was supplied at construction."""
        return self.phoneNumber

    def getEmailAddress(self):
        """Return the email address of the individual."""
        return self.emailAddress
|
|
|
# Class IBusinessCardParser contains the necessary functionality to parse the |

# relevant fields from the text of a business card. |
|
class IBusinessCardParser:
    """Extract a person's name, phone number, and email from business card text.

    All methods are static and take the full text of a business card as a
    single string. Each ``parse*`` method returns the first acceptable match,
    or ``None`` when the text is empty/``None`` or contains no acceptable match.
    """

    # Words that mark a two-word candidate as a job title or company name
    # rather than a person's name. Entries must be upper case; candidates are
    # upper-cased before comparison.
    # TODO: Put this list in a config file so that it can be easily edited
    # without program modification.
    nameDisqualifiers = ("ENGINEER", "DEVELOPER", "LTD", "INC", "TECHNOLOGIES", "COMPANY")

    # Substrings that mark a line holding a phone number as not the primary
    # number. Entries must be upper case; lines are upper-cased before the
    # substring check.
    # TODO: Put this list in a config file so that it can be easily edited
    # without program modification.
    phoneDisqualifiers = ("FAX", "PAGER", "CELL")

    # Two words (may contain "-" or ".") of 2+ characters, separated by one space.
    _NAME_PATTERN = re.compile(r"[\w\-.]{2,} [\w\-.]{2,}")

    # Optional "+<country code>", then 3-3-4 digits; the area code may be
    # parenthesised and groups may be separated by whitespace, "." or "-".
    _PHONE_PATTERN = re.compile(r"(?:\+\d{1,3})?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}")

    # Local part may contain "+" and "-" in addition to letters, digits, "_"
    # and "."; each domain label must be followed by at least one character,
    # so a sentence-ending period is not swallowed into the address.
    _EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")

    _NON_DIGIT_PATTERN = re.compile(r"[^0-9]")

    @staticmethod
    def parseName(document):
        """Return the person's name found in the business card text.

        Candidates are two-word matches taken in document order. A candidate
        is rejected when it contains a digit (phone or address fragments) or
        when either word, ignoring surrounding "." and "-", is listed in
        ``nameDisqualifiers``. The first surviving candidate is returned as
        it appears in the text.

        Returns None if ``document`` is empty/None or no candidate qualifies.
        """
        if not document:
            return None

        for candidate in IBusinessCardParser._NAME_PATTERN.findall(document):
            # \w also matches digits, so "410 555" would otherwise pass as a name.
            if any(character.isdigit() for character in candidate):
                continue

            # Strip surrounding punctuation so "Inc." is compared as "INC".
            words = {word.strip(".-") for word in candidate.upper().split(" ")}

            # The first candidate sharing no word with the disqualifiers is
            # most likely the real name.
            if words.isdisjoint(IBusinessCardParser.nameDisqualifiers):
                return candidate

        return None

    @staticmethod
    def parsePhoneNumber(document):
        """Return the phone number found in the business card text, digits only.

        The text is examined line by line. The first line that contains a
        phone number and none of the ``phoneDisqualifiers`` (case-insensitive
        substring check) wins; its first matching number is returned with
        every non-digit character removed.

        Returns None if ``document`` is empty/None or no line qualifies.
        """
        if not document:
            return None

        for line in document.splitlines():
            match = IBusinessCardParser._PHONE_PATTERN.search(line)
            if match is None:
                continue

            # Upper-case once per line rather than once per disqualifier.
            upperLine = line.upper()
            if any(marker in upperLine for marker in IBusinessCardParser.phoneDisqualifiers):
                continue

            return IBusinessCardParser._NON_DIGIT_PATTERN.sub("", match.group())

        return None

    @staticmethod
    def parseEmailAddress(document):
        """Return the first email address found in the business card text.

        Returns None if ``document`` is empty/None or contains no address.
        """
        if not document:
            return None

        # If more than one address is present, only the first is returned.
        match = IBusinessCardParser._EMAIL_PATTERN.search(document)
        if match is None:
            return None
        return match.group()

    @staticmethod
    def getContactInfo(document):
        """Parse all three fields and return them wrapped in an IContactInfo."""
        return IContactInfo(IBusinessCardParser.parseName(document),
                            IBusinessCardParser.parsePhoneNumber(document),
                            IBusinessCardParser.parseEmailAddress(document))