Created
September 20, 2017 07:38
-
-
Save vyraun/e84c67899fb3aca50bab38017e93dc10 to your computer and use it in GitHub Desktop.
Drop this file into a log directory and run it: it extracts all the email addresses found in the files there and writes them to a file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Slightly modified version of https://gist.github.com/dideler/5219706
from optparse import OptionParser | |
import os.path | |
import re | |
import os | |
from sets import Set | |
# Matches email addresses in lowercased text, including lightly obfuscated
# forms that spell the separators out ("user at example dot com").
# Group 1 is the whole address; group 2 is the "@" / " at " separator and
# group 3 the last "." / " dot " separator, so findall() returns 3-tuples.
# Raw string literals are used so the regex escapes (\/, \., \s) reach the
# regex engine untouched instead of being treated as (invalid) Python string
# escapes, which newer interpreters warn about.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
def file_to_str(filename):
    """Read the file at *filename* and return its full text, lowercased."""
    with open(filename) as source:
        text = source.read()
    # The email pattern only knows lowercase letters, so normalise the case
    # here rather than risk missing addresses written in mixed case.
    return text.lower()
def get_emails(s):
    """Return a generator over the email addresses matched in string s."""
    # Each findall() result is a tuple of the pattern's groups; the first
    # element is the complete address.
    matches = regex.findall(s)
    # Matches beginning with '//' are dropped: for input such as
    # 'http://user@example.com' the pattern mistakenly picks up
    # '//user@example.com'.
    return (groups[0] for groups in matches if groups[0][:2] != '//')
if __name__ == '__main__':
    # Recursively scan the current working directory, print every email
    # address found, and write the unique addresses to unique_emails.txt
    # (one per line, sorted) in that same directory.
    top_dir = os.getcwd()
    output_path = os.path.join(top_dir, 'unique_emails.txt')
    # Builtin set and print(...) with a single argument behave the same on
    # Python 2 and Python 3.
    unique_emails = set()
    for root, _dirs, files in os.walk(top_dir):
        for name in files:
            # os.walk yields bare file names; they must be joined with the
            # directory being visited, otherwise files in subdirectories are
            # looked up relative to the working directory and missed.
            path = os.path.join(root, name)
            # Skip this script itself and the results file of a previous run
            # (re-reading the latter would keep stale addresses forever).
            skip = name == "get_email.py" or path == output_path
            if os.path.isfile(path) and not skip:
                for email in get_emails(file_to_str(path)):
                    unique_emails.add(email)
                    print(email)
            else:
                print('"{}" is not a file to parse for emails!!'.format(name))
    # The context manager guarantees the file is closed even if a write
    # fails; sorting makes the output deterministic between runs.
    with open(output_path, 'w') as f:
        for item in sorted(unique_emails):
            f.write("%s\n" % item)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment