Skip to content

Instantly share code, notes, and snippets.

@hernamesbarbara
Created November 19, 2024 15:51
Show Gist options
  • Save hernamesbarbara/d00cda51ef3c31730d438cb9428e8b71 to your computer and use it in GitHub Desktop.
Save hernamesbarbara/d00cda51ef3c31730d438cb9428e8b71 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import pandas as pd
import re
# Sample input: a comma-separated address list mixing "Display Name <address>"
# entries with bare addresses.
# NOTE(review): the "[email protected]" placeholders look like address
# obfuscation introduced when this file was copied from a web page -- confirm
# and restore real sample addresses before relying on the output.
email_str = """
Foo Bar <[email protected]>, [email protected], Alice B. Charlie <[email protected]>, Pete <[email protected]>, Sam Fred Webster Clarke <[email protected]>, [email protected]
"""
# Generational suffixes (with and without the trailing period) that are
# dropped from the tokens following the first name.
SUFFIXES = {"Jr.", "Sr.", "Jr", "Sr"}
# Matches a whole token that is only initials: one uppercase ASCII letter with
# an optional period ("B", "B.") or two dotted initials ("A.B."). Such tokens
# are dropped as well.
INITIALS_REGEX = re.compile(r"^(?:[A-Z]\.|[A-Z]\.[A-Z]\.|[A-Z])$")
# Helper function to parse individual email entry
def parse_email(entry):
    """Parse one address-list entry into first name, last name and email.

    Two shapes are recognised:

    * ``Display Name <address>`` -- the names are derived from the display
      name via ``extract_names``.
    * anything else -- the whole stripped entry is taken as the address and
      the names are guessed from it via ``guess_name_from_email``.

    Args:
        entry: A single entry from a comma-separated address list.

    Returns:
        A dict with the keys ``"first_name"``, ``"last_name"`` and
        ``"email"``.
    """
    canonical_match = re.match(r"(.*?)\s*<([^>]+)>", entry)
    if canonical_match is None:
        # No angle-bracketed address: treat the entire entry as the address.
        email = entry.strip()
        first_name, last_name = guess_name_from_email(email)
    else:
        email = canonical_match.group(2).strip()
        # Display names may be written as a quoted string ('"Foo Bar" <...>');
        # drop the surrounding double quotes so they do not end up inside the
        # extracted names.
        display_name = canonical_match.group(1).strip().strip('"').strip()
        if display_name:
            first_name, last_name = extract_names(display_name)
        else:
            # A bare "<address>" carries no display name at all, so fall back
            # to guessing from the address instead of handing an empty string
            # to extract_names.
            first_name, last_name = guess_name_from_email(email)
    return {"first_name": first_name, "last_name": last_name, "email": email}
# Helper function to extract names from a display name
def extract_names(display_name):
    """Split a display name into a first name and a last name.

    The first whitespace-separated token is the first name. Of the remaining
    tokens, those listed in ``SUFFIXES`` or matching ``INITIALS_REGEX`` are
    dropped; whatever is left is joined with single spaces to form the last
    name.

    Args:
        display_name: The human-readable part of a mailbox entry.

    Returns:
        A ``(first_name, last_name)`` tuple. ``"UNK"`` stands in for any part
        that cannot be determined, including both parts when *display_name*
        is empty or contains only whitespace.
    """
    tokens = display_name.split()
    if not tokens:
        # Nothing to work with; without this guard the tokens[0] access below
        # would raise IndexError.
        return "UNK", "UNK"
    first_name = tokens[0]
    surname_tokens = [
        token
        for token in tokens[1:]
        if token not in SUFFIXES and not INITIALS_REGEX.match(token)
    ]
    # A lone token, or a name whose trailing tokens were all filtered out,
    # leaves no last name to report.
    last_name = " ".join(surname_tokens) if surname_tokens else "UNK"
    return first_name, last_name
# Helper function to guess name from email
def guess_name_from_email(email):
    """Guess a ``(first_name, last_name)`` pair from a bare email address.

    Only the text before the first ``@`` is inspected. If it consists solely
    of alphabetic characters it is returned, capitalized, as the first name;
    otherwise the first name is ``"UNK"``. The last name is always ``"UNK"``.
    """
    local_part, _, _ = email.partition("@")
    guessed_first = local_part.capitalize() if local_part.isalpha() else "UNK"
    return guessed_first, "UNK"
# Split the input string into individual entries. Blank entries (from a
# trailing comma, doubled commas or an empty input) are dropped so they do not
# become rows with an empty email.
entries = [
    entry
    for entry in (raw.strip() for raw in email_str.split(","))
    if entry
]
# Parse all entries
parsed_data = [parse_email(entry) for entry in entries]
# Convert to DataFrame. The columns are pinned so the CSV keeps its header row
# (in the same order as before) even when there are no entries.
df = pd.DataFrame(parsed_data, columns=["first_name", "last_name", "email"])
# Write to CSV
output_file = "parsed_emails.csv"
df.to_csv(output_file, index=False)
print(f"Results written to {output_file}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment