Created
July 14, 2018 21:58
-
-
Save jmb/424e8e113f2a546349ff60d07f4eab3a to your computer and use it in GitHub Desktop.
Script to extract all email addresses from all emails in all folders of an IMAP account.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create a connection to an IMAP server and find ALL email addresses | |
Original script by abought: https://gist.github.com/abought/15a1e08705b121c1b7bd
References:
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
and
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
""" | |
__author__ = 'jmb' | |
import email | |
import imaplib | |
import getpass | |
import sys | |
import re | |
import os | |
# EDIT these as required:
FILENAME = 'out.txt'  # output file the script appends the addresses to
DEFAULT_MAIL_SERVER = 'imap.server'  # IMAP host used when connect() gets no server

# No user parameters below this line

# ADDR_PATTERN = re.compile('<(.*?)>')  # Finds email as <user@example.com>

# Find ALL email addresses in all fields.
# The pattern is deliberately NOT anchored with ^...$: an address header
# usually carries display names and several comma-separated addresses
# ("Bob <bob@example.com>, alice@example.org"), and an anchored pattern only
# matches a value that consists of exactly one bare address.  Each domain
# label must be followed by another dot-separated label, so trailing
# punctuation such as a sentence-ending "." is not captured.
ADDR_PATTERN = re.compile(
    r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)"
)
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Connect to [the specified] mail server. Return an open connection.

    Opens an SSL IMAP connection to ``server`` and logs in with ``user`` and
    ``pwd``.  If the server rejects the login, a message is printed and the
    process exits with status 1, so callers only ever get back a connection
    that has logged in.
    """
    conn = imaplib.IMAP4_SSL(server)
    try:
        conn.login(user, pwd)
    except imaplib.IMAP4.error:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Failed to login")
        sys.exit(1)
    return conn
def print_folders(conn):
    """Print the mailbox folders reported by the server, one per line.

    ``conn.list()`` returns a ``(status, entries)`` pair; only the entries
    are folder descriptions, so the status is checked and not printed.
    Each entry is printed indented by two spaces.
    """
    status, entries = conn.list()
    if status != 'OK':
        print("Could not list folders")
        return
    for entry in entries:
        if entry is None:
            # imaplib reports "no data" as a list holding a single None
            continue
        if isinstance(entry, bytes) and not isinstance(entry, str):
            # Python 3 hands back bytes; decode so the line prints cleanly
            entry = entry.decode('utf-8', 'replace')
        print("  %s" % (entry,))
def get_folder(conn, folder_name):
    """Select a specific folder (or label) on the server.

    Any folder that is currently selected is closed first.  ``folder_name``
    may be given with or without surrounding double quotes; unquoted text
    names are quoted (with ``\\`` and ``"`` escaped) before being sent, so
    names containing spaces are transmitted as a single argument.

    If the server refuses the selection, a message and the list of known
    folders are printed.  The connection is returned in either case, so the
    caller must check ``conn.state`` before searching.
    """
    if conn.state == "SELECTED":
        # Explicitly close any previously opened folders; may not be necessary
        conn.close()

    mailbox = folder_name
    if isinstance(folder_name, str):
        already_quoted = (
            len(folder_name) >= 2
            and folder_name[0] == '"'
            and folder_name[-1] == '"'
        )
        if not already_quoted:
            escaped = folder_name.replace('\\', '\\\\').replace('"', '\\"')
            mailbox = '"%s"' % escaped

    rv, data = conn.select(mailbox)
    if rv != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
    return conn
def get_email_ids(conn, query='ALL'):
    """Get the UIDs of all emails in the selected folder that match ``query``.

    Returns a list of UID tokens exactly as the server sent them, or an
    empty list when the search fails or yields no data.

    Raises ``imaplib.IMAP4.error`` if no folder is currently selected.
    """
    if conn.state != "SELECTED":
        raise imaplib.IMAP4.error("Cannot search without selecting a folder")

    rv, data = conn.uid('search', None, query)
    if rv != 'OK':
        print("Could not fetch email ids")  # for some reason...
        return []
    if not data or data[0] is None:
        # imaplib reports "no data" as [None]; treat it as no matches
        return []
    return data[0].split()
def fetch_message(conn, msg_uid):
    """
    Fetch a specific message uid (not sequential id!) from the selected
    folder and return the parsed message (a dict-like object).

    Returns an empty dict when the fetch fails or the server sends no data
    for that uid, so callers can still use ``.get()`` on the result.
    """
    # TODO: Could we fetch just the envelope of the response to save bandwidth?
    rv, data = conn.uid('fetch', msg_uid, "(RFC822)")
    if rv != 'OK':
        label = msg_uid
        if isinstance(label, bytes) and not isinstance(label, str):
            # Python 3 UIDs arrive as bytes; decode for a readable message
            label = label.decode('ascii', 'replace')
        print("ERROR fetching message # %s" % (label,))
        return {}

    # A successful fetch carries a (envelope, payload) tuple first; a uid
    # that does not exist yields [None] instead.
    if not data or not isinstance(data[0], tuple):
        return {}

    raw = data[0][1]
    if isinstance(raw, str):
        return email.message_from_string(raw)
    # Python 3: imaplib returns the payload as bytes
    return email.message_from_bytes(raw)
def get_recipients(msg_parsed):
    """Given a parsed message, extract and return the list of addresses.

    Looks at the From, To, Cc and Bcc fields of ``msg_parsed`` (any object
    with a dict-style ``get(name, default)``) and returns every match of
    ADDR_PATTERN, in field order.  Duplicates are kept.
    """
    recipients = []
    for field in ('From', 'To', 'Cc', 'Bcc'):
        value = msg_parsed.get(field, "")  # Empty string if field not present
        try:
            found = re.findall(ADDR_PATTERN, value)
        except TypeError:
            # The email package may hand back a Header object instead of a
            # string (e.g. for headers with undecodable bytes); search its
            # text form rather than aborting the whole scan.
            found = re.findall(ADDR_PATTERN, str(value))
        recipients.extend(found)
    return recipients
# Splits one LIST response line into flags, hierarchy delimiter and name, e.g.
#   (\HasNoChildren \Sent) "/" "Sent Items"
_LIST_ENTRY = re.compile(
    r'\((?P<flags>[^)]*)\)\s+(?P<delim>"(?:[^"\\]|\\.)*"|NIL)\s+(?P<name>.+)$'
)


def _parse_list_entry(entry):
    """Return (flags, mailbox) for one LIST response entry, or None.

    ``mailbox`` is kept exactly as the server sent it (quotes included) so
    it can be handed back to the server unchanged.
    """
    if entry is None or isinstance(entry, tuple):
        return None
    if not isinstance(entry, str):
        entry = entry.decode('utf-8', 'replace')  # Python 3 yields bytes
    match = _LIST_ENTRY.match(entry.strip())
    if match is None:
        return None
    return match.group('flags').split(), match.group('name').strip()


def main():
    """Prompt for credentials, scan every folder and append addresses to FILENAME.

    For each selectable folder the unique addresses found in its messages
    are appended to the output file, which is flushed to disk after every
    folder.  The connection is logged out even if the scan fails part-way.
    """
    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3
    username = prompt("Full email address: ")
    password = getpass.getpass()

    # Connect
    mail_conn = connect(username, password)
    try:
        status, entries = mail_conn.list()
        if status != 'OK':
            entries = []

        # Open output file
        with open(FILENAME, "a") as out_file:
            # Go through each folder
            for entry in entries:
                parsed = _parse_list_entry(entry)
                if parsed is None:
                    continue
                flags, mailbox = parsed
                folder = mailbox.strip('"')
                if folder == ".":
                    continue
                if any(flag.lower() == '\\noselect' for flag in flags):
                    # Pure hierarchy nodes cannot be opened
                    continue

                mail_conn = get_folder(mail_conn, mailbox)
                if mail_conn.state != "SELECTED":
                    # Selection failed; get_email_ids() would raise
                    continue
                msg_uid_list = get_email_ids(mail_conn)
                print("Scanning folder:  %s  with  %d  messages"
                      % (folder, len(msg_uid_list)))

                # Fetch the recipients of every message in this folder
                unique = set()
                for msg_uid in msg_uid_list:
                    msg = fetch_message(mail_conn, msg_uid)
                    unique.update(get_recipients(msg))

                print("Writing %d email addresses to file %s"
                      % (len(unique), out_file.name))
                out_file.write(
                    "".join(address + "\n" for address in sorted(unique))
                )
                # Push each folder's results to disk before moving on
                out_file.flush()
                os.fsync(out_file.fileno())

        print("\nWritten to file: " + FILENAME)
    finally:
        try:
            if mail_conn.state == "SELECTED":
                mail_conn.close()  # Close currently selected folder (if any)
        finally:
            mail_conn.logout()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment