-
-
Save antoniotrento/7b3eb8aa2c43221803a096107e316807 to your computer and use it in GitHub Desktop.
A Python script for extracting email addresses from text files.
You can pass it multiple files. It prints the email addresses to stdout, one address per line. For ease of use, remove the .py extension and place it in your $PATH (e.g. /usr/local/bin/) to run it like a built-in command.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Extracts email addresses from one or more plain text files.
#
# Notes:
# - Does not save to file (pipe the output to a file if you want it saved).
# - Does not check for duplicates (which can easily be done in the terminal).
#
# (c) 2013 Dennis Ideler <[email protected]>
import io
import os.path
import re
import sys
from optparse import OptionParser
# Pattern for e-mail addresses in lowercased text. It accepts the usual
# "user@example.com" spelling as well as the obfuscated
# "user at example dot com" one.
#
# Raw string literals are used so the backslashes reach the regex engine
# verbatim; spelling \/, \. and \s inside ordinary string literals relies on
# invalid escape sequences, which newer interpreters warn about.
#
# Capture groups:
#   1 - the whole address
#   2 - the "@" or " at " separator
#   3 - the last "." or " dot " separator inside the domain
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*"
    r"(@|\sat\s)"
    r"(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+"
    r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
def file_to_str(filename):
    """Return the lowercased text content of ``filename``.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of raising, so a stray byte in an otherwise readable
    file does not abort the run.

    Args:
        filename: Path of the file to read.

    Returns:
        The whole file as a single string, lowercased.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # io.open accepts encoding/errors on both Python 2 and Python 3.
    with io.open(filename, encoding="utf-8", errors="replace") as f:
        # Case is lowered because the address pattern only matches lowercase.
        return f.read().lower()
def get_emails(s):
    """Return an iterator over the e-mail addresses found in string ``s``.

    ``s`` is expected to be lowercased already, because the module-level
    ``regex`` only contains lowercase character classes.

    Args:
        s: Text to scan.

    Returns:
        A generator yielding each matched address as a string, in order of
        appearance.
    """
    # The local part of the pattern allows '/', so the user part of a URL such
    # as 'http://user@example.com' is matched as '//user@example.com'. Matches
    # that start with '//' are therefore dropped.
    return (
        match.group(1)
        for match in regex.finditer(s)
        if not match.group(1).startswith('//')
    )
# Command-line entry point: print every address found in the given files to
# stdout, one per line.
if __name__ == '__main__':
    parser = OptionParser(usage="Usage: python %prog [FILE]...")
    # No options added yet. Add them here if you ever need them.
    options, args = parser.parse_args()

    # Without at least one file there is nothing to scan.
    if not args:
        parser.print_usage()
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(1)

    for arg in args:
        if os.path.isfile(arg):
            for email in get_emails(file_to_str(arg)):
                # The call form of print works on Python 2 and Python 3 alike.
                print(email)
        else:
            # Report the bad argument and keep going with the remaining ones.
            print('"{}" is not a file.'.format(arg))
            parser.print_usage()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment