Skip to content

Instantly share code, notes, and snippets.

@jonathands
Created May 14, 2020 00:40
Show Gist options
  • Select an option

  • Save jonathands/c81d7851cd28f7a98eda79c17e8da4d7 to your computer and use it in GitHub Desktop.

Select an option

Save jonathands/c81d7851cd28f7a98eda79c17e8da4d7 to your computer and use it in GitHub Desktop.
trying to extract emails on
import os, re, sys
# Email-like tokens: a local part of word characters, dots and hyphens, "@",
# one word-character domain label, a dot, then word characters and dots.
# NOTE: hyphenated domain labels (user@my-host.com) are not matched.
_EMAIL_RE = re.compile(r'[\w.-]+@\w+\.[\w.]+')

# Addresses containing any of these fragments (case-insensitive) are dropped.
# Dots are escaped so that e.g. "mx." only matches a literal "mx." and not
# "mx" followed by an arbitrary character.
_BLOCKED_RE = re.compile(
    r'postmaster|sendgrid|graficabu|businessin|mx\.google|smtp|mx\.'
    r'|Mailer-Daemon|mail\.protection|srvd\.|thunder\.|\.webserver\.',
    flags=re.IGNORECASE,
)


def _unique_emails(text):
    """Return the email-like tokens found in *text*, de-duplicated in order."""
    return list(dict.fromkeys(_EMAIL_RE.findall(text)))


def _collect_emails(msgs_dir):
    """Return one list of unique addresses per readable file in *msgs_dir*.

    Entries that are not regular files are skipped.  A file that cannot be
    decoded with the default text encoding is reported on stdout and skipped,
    so a single bad message no longer aborts the whole run.
    """
    per_file = []
    for name in os.listdir(msgs_dir):
        path = os.path.join(msgs_dir, name)
        if not os.path.isfile(path):
            continue
        try:
            with open(path) as message:
                text = message.read()
        except UnicodeDecodeError as err:
            print("file " + name + " raised exception")
            print(err)
            continue
        per_file.append(_unique_emails(text))
    return per_file


def main(msgs_dir="msgs", output_path="emails.txt"):
    """Extract email addresses from every file in *msgs_dir*.

    For each readable file, one line is appended to *output_path*: the
    file's unique addresses that do not match the block list, each followed
    by a space, terminated by CRLF.  A file with no surviving addresses
    still produces an empty line.

    Raises whatever ``os.listdir`` / ``open`` raise for a missing directory
    or an unwritable output file.
    """
    per_file = _collect_emails(msgs_dir)

    # newline="" turns off newline translation, so the explicit "\r\n" is
    # written as-is instead of becoming "\r\r\n" on Windows.
    with open(output_path, "a", newline="") as out:
        for addresses in per_file:
            kept = [addr for addr in addresses if not _BLOCKED_RE.search(addr)]
            out.write("".join(addr + " " for addr in kept) + "\r\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment