Created
January 16, 2023 01:02
-
-
Save robot00f/eb3cb45696769304c2ced1379885a560 to your computer and use it in GitHub Desktop.
Example: Extract emails from a text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from collections import OrderedDict

from fuzzywuzzy import fuzz
from validate_email import validate_email

# Script: read input.txt, pull out every e-mail-looking string, keep the ones
# that validate_email() accepts and that are not near-duplicates of an address
# already kept, then write the survivors to output.txt, one per line, in order
# of first appearance.

# Candidate-address pattern. The TLD class is [A-Za-z]: the previous
# [A-Z|a-z] also accepted a literal "|", so pipe-delimited text such as
# "a@x.com|b@y.com" was matched as a single bogus address and the second
# address was never seen.
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# A candidate scoring above this against an already-kept address is treated
# as a duplicate of it and dropped.
SIMILARITY_THRESHOLD = 90

# Read the whole input file into memory.
with open("input.txt", "r") as input_file:
    data = input_file.read()

emails = re.findall(email_regex, data)

# Keys are the kept addresses (insertion order = order of first appearance);
# the values are unused.
email_dict = OrderedDict()

for email in emails:
    # An exact repeat of a kept address is a duplicate by definition; skip it
    # before paying for validation and the fuzzy comparisons.
    if email in email_dict:
        continue

    if not validate_email(email):
        continue

    # NOTE(review): token_set_ratio compares sets of tokens, so addresses that
    # share most of their tokens (e.g. same domain, overlapping local part)
    # may be collapsed into one -- confirm this is the intended de-duplication.
    is_near_duplicate = any(
        fuzz.token_set_ratio(email, kept) > SIMILARITY_THRESHOLD
        for kept in email_dict
    )
    if not is_near_duplicate:
        email_dict[email] = None

# Write the kept addresses, one per line.
with open("output.txt", "w") as output_file:
    output_file.writelines(email + "\n" for email in email_dict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment