Created
January 27, 2019 22:54
-
-
Save reverie/de28f63fd1dae92cb3f87307ffdcf1e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_first_email(s):
    """
    Split the first address off a string of addresses.

    Assumes `s` is a list of email addresses separated by (mixed) spaces
    or commas. An address may be preceded by a double-quoted display name,
    which is allowed to contain separators and even an '@'.

    Returns (first_email, rest) where first_email + rest == s,
    or (None, None) if we don't think this is an email address:
    no '@' at all, an unterminated quoted display name, or no '@'
    following a quoted display name.
    """
    # Imported locally so this function does not rely on a module-level
    # `import re` being present in the file.
    import re

    first_quote = s.find('"')
    first_at = s.find('@')
    if first_at == -1:
        return (None, None)

    if first_quote != -1 and first_quote < first_at:
        # The '@' we found may sit inside the quoted display name; skip past
        # the closing quote and look for the address's own '@' from there.
        second_quote = s.find('"', first_quote + 1)
        if second_quote == -1:
            return (None, None)
        first_at = s.find('@', second_quote)
        if first_at == -1:
            # Only the quoted part contained an '@'; without this guard the
            # -1 would be used as a slice index and produce a bogus split.
            return (None, None)

    # The address ends at the first separator after its '@'.
    next_separator = re.search(r'[ ,]', s[first_at:])
    if not next_separator:
        return (s, '')
    first_email_end = first_at + next_separator.start()
    return (s[:first_email_end], s[first_email_end:])
def split_email_line_by_spaces_or_commas(s):
    """
    Break one line of text into individual address candidates.

    Returns a pair of ([maybe_valid_emails], [invalid_emails]) in s.
    Addresses are peeled off the front one at a time; as soon as the
    remaining text cannot be split further, that whole remainder is
    recorded as a single invalid entry and processing stops.
    """
    separators = ' ,'
    candidates, rejected = [], []
    remaining = s.strip(separators)
    while remaining:
        head, tail = extract_first_email(remaining)
        if head is not None:
            candidates.append(head.strip(separators))
            remaining = tail.strip(separators)
            continue
        # Nothing recognisable at the front: give up on the rest of the line.
        rejected.append(remaining)
        break
    return candidates, rejected
def lenient_email_extractor(text):
    """
    Extract email addresses from free-form, multi-line text.

    Returns a pair (address_pairs, invalid_addresses),
    where address_pairs is a list of (name, address) tuples of the kind
    returned by email.utils.parseaddr, and invalid_addresses is a list of
    the text fragments that could not be split or parsed.

    Test case (addresses are placeholders):
    '''
    a@example.com, "Andrew, Esq." <andrew@example.com>,
    "Mr. Bob Ross" <bob@example.com> c@example.com,d@example.com,

    e@example.com,f@example.com "Full Name with quotes and <g@example.com>" <h@example.com>
    i@example.com
    j@example.com k@example.com l@example.com
    '''
    (note: spaces, commas, blank lines)
    """
    from email.utils import parseaddr

    address_pairs = []
    invalid_addresses = []
    for line in text.strip().splitlines():
        emails, invalid = split_email_line_by_spaces_or_commas(line)
        invalid_addresses.extend(invalid)
        for candidate in emails:
            name, real = parseaddr(candidate)
            # parseaddr signals failure by returning a pair of empty strings.
            if name == real == '':
                invalid_addresses.append(candidate)
            else:
                address_pairs.append((name, real))
    # The documented contract is a pair; previously nothing was returned.
    return address_pairs, invalid_addresses
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment