Created
May 14, 2020 00:40
-
-
Save jonathands/c81d7851cd28f7a98eda79c17e8da4d7 to your computer and use it in GitHub Desktop.
trying to extract emails on
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Collect e-mail addresses from every file in ``msgs/`` into ``emails.txt``.

Each message file yields one output line holding its distinct addresses,
minus any address that matches the blocklist of automated/infrastructure
senders.
"""

import os
import re
import sys  # not used by this script; retained from the original import line

# An address is a local part, "@", and two or more dot-separated host labels.
# Labels may contain hyphens ("mail-server.com.br"). Punctuation that merely
# follows the address in running text ("|", a sentence-ending ".") is not
# part of the match.
EMAIL_PATTERN = re.compile(r'[\w.-]+@[\w-]+(?:\.[\w-]+)+')

# Substrings that mark an address as one to discard. Dots are escaped so that
# "mx." only matches a literal "mx." and not "mx" followed by any character.
BLOCKED_PATTERN = re.compile(
    r'postmaster|sendgrid|graficabu|businessin|mx\.google|smtp|mx\.'
    r'|Mailer-Daemon|mail\.protection|srvd\.|thunder\.|\.webserver\.',
    flags=re.IGNORECASE,
)


def extract_emails(text):
    """Return the distinct e-mail addresses in *text*, in first-seen order."""
    # dict.fromkeys() de-duplicates while keeping insertion order.
    return list(dict.fromkeys(EMAIL_PATTERN.findall(text)))


def is_blocked(address):
    """Return True when *address* contains any blocklisted substring."""
    return BLOCKED_PATTERN.search(address) is not None


def main(source_dir="msgs", output_path="emails.txt"):
    """Scan every file in *source_dir* and append the results to *output_path*.

    One line is appended per readable file: each kept address followed by a
    single space, then ``\\r\\n``. A file with no kept addresses still
    produces an (empty) line. Files that cannot be decoded with the default
    text encoding are reported on stdout and skipped; the remaining files
    are still processed.
    """
    addresses_per_file = []
    for name in os.listdir(source_dir):
        path = os.path.join(source_dir, name)
        if not os.path.isfile(path):
            # open() on a sub-directory would raise; only scan regular files.
            continue
        try:
            with open(path) as message:
                text = message.read()
        except UnicodeDecodeError as exc:
            print("file " + name + " raised exception")
            print(exc)
            continue
        addresses_per_file.append(extract_emails(text))

    # newline='' disables newline translation so the explicit "\r\n" below is
    # written as-is on every platform (no "\r\r\n" on Windows).
    with open(output_path, 'a', newline='') as output:
        for addresses in addresses_per_file:
            kept = [addr for addr in addresses if not is_blocked(addr)]
            output.write("".join(addr + " " for addr in kept))
            output.write('\r\n')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment