Last active
August 27, 2021 00:01
-
-
Save cy-xu/0030e4030c15dcd7df0ce267ec79bd24 to your computer and use it in GitHub Desktop.
Email addresses clean up - remove nonsense from a long list of emails
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Remove nonsense from a long list of emails | |
[email protected] | |
""" | |
# Input: one raw entry per line. Output: one cleaned address per line.
raw_emails_file = './email_addresses.txt'
clean_email_file = './clean_email_addresses.txt'


def clean_address(raw_email):
    """Return the cleaned address found in one raw line, or None.

    Steps:
      * a line without '@' is rejected;
      * if the line contains '<', only the text between the first '<' and
        the following '>' is kept (e.g. ``Name <user@host>``); when the
        closing '>' is missing, everything after '<' is kept;
      * all whitespace (spaces, tabs, CR/LF) and commas are removed and the
        result is lower-cased;
      * if the remaining text no longer contains '@', it is rejected.

    Args:
        raw_email: one line of raw text, with or without a trailing newline.

    Returns:
        The cleaned address as a string without trailing newline, or None
        when the line holds no usable address.
    """
    # skip empty/invalid lines
    if '@' not in raw_email:
        return None

    # if < > found then save the valid part
    left_pos = raw_email.find('<')
    if left_pos != -1:
        right_pos = raw_email.find('>', left_pos + 1)
        if right_pos == -1:
            # No closing bracket: slicing with -1 would silently drop the
            # last character, so keep everything up to the end instead.
            right_pos = len(raw_email)
        raw_email = raw_email[left_pos + 1:right_pos]

    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces removes spaces, tabs and line endings in one pass.
    valid_address = ''.join(raw_email.split())
    valid_address = valid_address.replace(',', '').lower()

    # The bracketed part may not have been the piece that contained '@'.
    if '@' not in valid_address:
        return None
    return valid_address


def main():
    """Read the raw list, write every cleaned address, and report the count."""
    valid_counter = 0
    # The input is opened first, so a missing input file fails before the
    # output file is created or truncated. Both files are closed on exit,
    # including when an exception is raised mid-loop.
    with open(raw_emails_file, 'r') as raw_emails, \
            open(clean_email_file, 'w') as clean_emails:
        for raw_email in raw_emails:
            valid_address = clean_address(raw_email)
            if valid_address is None:
                continue
            # write valid address to new line
            clean_emails.write(valid_address + '\n')
            print(valid_address)
            valid_counter += 1
    print(f'a total of {valid_counter} valid emails saved to {clean_email_file}')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment