Skip to content

Instantly share code, notes, and snippets.

@robot00f
Created January 16, 2023 01:02
Show Gist options
  • Save robot00f/eb3cb45696769304c2ced1379885a560 to your computer and use it in GitHub Desktop.
Save robot00f/eb3cb45696769304c2ced1379885a560 to your computer and use it in GitHub Desktop.
Example: Extract emails from a text file
import re
from validate_email import validate_email
from collections import OrderedDict
from fuzzywuzzy import fuzz
# Extract e-mail addresses from input.txt, drop the ones that fail validation
# or that closely resemble an address already kept, and write the survivors
# to output.txt, one per line, in order of first appearance.

# Read the whole input file. Undecodable bytes are replaced instead of
# raising, because only the ASCII address-like runs are matched below.
with open("input.txt", "r", encoding="utf-8", errors="replace") as input_file:
    data = input_file.read()

# Candidate addresses: local part, "@", domain, then an alphabetic TLD of at
# least two letters. Inside a character class "|" is a literal pipe, not
# alternation, so the TLD class is [A-Za-z] rather than [A-Z|a-z].
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_regex, data)

# Ordered mapping used as an insertion-ordered set of accepted addresses;
# the values are never read.
email_dict = OrderedDict()

for email in emails:
    # Discard candidates that validate_email rejects.
    if not validate_email(email):
        continue

    # Discard candidates that score above 90 against any address already
    # kept. With an empty mapping nothing matches, so the first valid
    # address is always accepted.
    # NOTE(review): token_set_ratio presumably compares token sets, so
    # distinct addresses sharing a long domain may exceed the threshold
    # and be dropped -- confirm this is the intended de-duplication.
    if any(fuzz.token_set_ratio(email, kept) > 90 for kept in email_dict):
        continue

    email_dict[email] = None

# Write the accepted addresses, one per line, in insertion order.
with open("output.txt", "w", encoding="utf-8") as output_file:
    output_file.writelines(kept + "\n" for kept in email_dict)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment