Created
November 19, 2024 15:51
-
-
Save hernamesbarbara/d00cda51ef3c31730d438cb9428e8b71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import pandas as pd | |
import re | |
# Sample input: one comma-separated line of mailboxes, some written as
# "Display Name <address>" and some as a bare address
email_str = """
Foo Bar <[email protected]>, [email protected], Alice B. Charlie <[email protected]>, Pete <[email protected]>, Sam Fred Webster Clarke <[email protected]>, [email protected]
"""

# Tokens that are dropped when a last name is assembled from a display name:
# generational suffixes (with or without the trailing period) ...
SUFFIXES = {"Jr", "Jr.", "Sr", "Sr."}
# ... and upper-case initials of the form "A", "A." or "A.B."
INITIALS_REGEX = re.compile(r"^(?:[A-Z]\.|[A-Z]\.[A-Z]\.|[A-Z])$")
# Helper function to parse individual email entry
def parse_email(entry):
    """Parse one mailbox entry into first name, last name and email.

    Two forms are recognised:

    * ``Name <email>`` -- the text before ``<`` is the display name and is
      split into names by ``extract_names``.
    * anything else -- the whole stripped entry is taken as the email and
      the name is guessed from it by ``guess_name_from_email``.

    Double quotes surrounding the display name (``"Foo Bar" <...>``) are
    removed before the name is split. When the display name is empty
    (``<user@host>``), the name is guessed from the address instead.

    Args:
        entry: A single mailbox string, with or without a display name.

    Returns:
        A dict with the keys ``"first_name"``, ``"last_name"`` and
        ``"email"``.
    """
    entry = entry.strip()
    # Match canonical mailbox format: Name <email>
    canonical_match = re.match(r"(.*?)\s*<([^>]+)>", entry)
    if canonical_match:
        email = canonical_match.group(2).strip()
        # Drop the quotes of a quoted display name so they do not end up
        # inside the first/last name
        display_name = canonical_match.group(1).strip().strip('"').strip()
        if display_name:
            first_name, last_name = extract_names(display_name)
        else:
            # "<user@host>" has no display name to split; fall back to the
            # address rather than handing an empty string to extract_names
            first_name, last_name = guess_name_from_email(email)
    else:
        # If no display name, extract email only
        email = entry
        first_name, last_name = guess_name_from_email(email)
    return {"first_name": first_name, "last_name": last_name, "email": email}
# Helper function to extract names from a display name
def extract_names(display_name):
    """Split a display name into a ``(first_name, last_name)`` tuple.

    The first whitespace-separated token is the first name. The remaining
    tokens, minus any that are in ``SUFFIXES`` or match ``INITIALS_REGEX``,
    are joined with single spaces to form the last name.

    Args:
        display_name: The display-name part of a mailbox, e.g.
            ``"Alice B. Charlie"``.

    Returns:
        ``(first_name, last_name)``. ``"UNK"`` is used for the last name
        when no usable token remains, and for both names when
        ``display_name`` is empty or only whitespace.
    """
    tokens = display_name.split()
    if not tokens:
        # str.split() yields [] for empty/blank input; indexing tokens[0]
        # would raise IndexError
        return "UNK", "UNK"

    first_name, *rest = tokens
    # Suffixes and initials are not part of the last name
    kept = [
        token
        for token in rest
        if token not in SUFFIXES and not INITIALS_REGEX.match(token)
    ]
    last_name = " ".join(kept) if kept else "UNK"
    return first_name, last_name
# Helper function to guess name from email
def guess_name_from_email(email):
    """Guess a ``(first_name, last_name)`` tuple from a bare email address.

    The part before the first ``@`` is used as the first name, capitalized,
    but only when it consists solely of alphabetic characters; otherwise
    the first name is ``"UNK"``. The last name is always ``"UNK"``.
    """
    local_part = email.partition("@")[0]
    guessed_first = local_part.capitalize() if local_part.isalpha() else "UNK"
    return guessed_first, "UNK"
# Split the input string into individual entries. Blank entries (from a
# trailing or doubled comma, or an empty input string) are skipped so they
# do not turn into rows with an empty email address.
entries = [e.strip() for e in email_str.split(",") if e.strip()]

# Parse all entries into {"first_name", "last_name", "email"} records
parsed_data = [parse_email(entry) for entry in entries]

# Convert to DataFrame; the columns are named explicitly so the CSV header
# is still written when there are no entries at all
df = pd.DataFrame(parsed_data, columns=["first_name", "last_name", "email"])

# Write to CSV (no index column)
output_file = "parsed_emails.csv"
df.to_csv(output_file, index=False)
print(f"Results written to {output_file}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment