Last active
September 15, 2024 20:50
-
-
Save abought/15a1e08705b121c1b7bd to your computer and use it in GitHub Desktop.
Extract all email addresses in from/to/cc fields of every msg in one Gmail folder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create a connection to Gmail and do something with the results | |
References: | |
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/ | |
and | |
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/ | |
""" | |
__author__ = 'abought' | |
import email | |
import imaplib | |
import getpass | |
import sys | |
import re | |
from pprint import pprint as pp | |
# User may want to change these parameters if running script as-is
# Gmail system folders live under the "[Gmail]/" prefix (note the slash).
SEARCH_FOLDER = '[Gmail]/Trash'  # TODO: A user will want to change this
# Other folders: "INBOX", "[Gmail]/All Mail"
DEFAULT_MAIL_SERVER = 'imap.gmail.com'
# No user parameters below this line
# Finds an address written in angle brackets, e.g. <user@example.com>
ADDR_PATTERN = re.compile(r'<(.*?)>')
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Open an SSL IMAP session on `server` and log in as `user`.

    Returns the logged-in connection. If the server rejects the login,
    prints "Failed to login" and exits the process with status 1.
    """
    session = imaplib.IMAP4_SSL(server)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    else:
        return session
def print_folders(conn):
    """Print the mailbox folders (labels) reported by the server, one per line.

    `conn.list()` returns a `(status, entries)` pair, so only the entries are
    printed; iterating the pair itself would also print the status string as
    if it were a folder.
    """
    status, entries = conn.list()
    if status != 'OK':
        print("Could not list folders")
        return
    for entry in entries:
        if entry is None:  # nothing to show for this slot
            continue
        if isinstance(entry, bytes) and not isinstance(entry, str):
            # Python 3 imaplib hands back bytes; decode for readable output
            entry = entry.decode('utf-8', 'replace')
        print("\t%s" % (entry,))
def get_folder(conn, folder_name):
    """Select a folder (or Gmail label) on the server.

    Returns the same connection object so calls can be chained. If the
    folder cannot be selected, a message and the list of known folders are
    printed and the connection is returned with no folder selected.
    """
    # No explicit conn.close() beforehand: SELECT already deselects any
    # previously selected mailbox, whereas CLOSE would additionally expunge
    # (permanently remove) every message flagged \Deleted in it.
    rv, data = conn.select(folder_name)
    if rv != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
    return conn
def get_email_ids(conn, query='ALL'):
    """Return the UIDs of the messages in the selected folder matching `query`.

    Raises imaplib.IMAP4.error when no folder is selected. If the server
    does not answer the search with 'OK', a message is printed and an empty
    list is returned.
    """
    if conn.state != "SELECTED":
        raise imaplib.IMAP4.error("Cannot search without selecting a folder")
    status, payload = conn.uid('search', None, query)
    if status == 'OK':
        # The ids come back as one whitespace-separated string
        return payload[0].split()
    print("Could not fetch email ids")  # for some reason...
    return []
def fetch_message(conn, msg_uid, message_parts="(BODY.PEEK[])"):
    """
    Fetch a specific message uid (not sequential id!) from the selected
    folder and return the parsed message (a dict-like object).

    `message_parts` is the IMAP fetch specification. The default
    "(BODY.PEEK[])" retrieves the complete message without marking it as
    read; pass e.g. "(BODY.PEEK[HEADER])" to download only the headers and
    save bandwidth.

    If the fetch fails, or the uid does not exist in the folder, a message
    is printed and an empty dict is returned.
    """
    rv, data = conn.uid('fetch', msg_uid, message_parts)
    raw = None
    if rv == 'OK':
        # The message text arrives as the second item of a
        # (response header, raw message) tuple; a uid that does not exist
        # yields no such tuple (e.g. data == [None]).
        raw = next((part[1] for part in data if isinstance(part, tuple)), None)
    if raw is None:
        print("ERROR fetching message # %s" % (msg_uid,))
        return {}
    if isinstance(raw, str):
        return email.message_from_string(raw)
    # Python 3: imaplib returns the message as bytes
    return email.message_from_bytes(raw)
def get_recipients(msg_parsed):
    """Given a parsed message, extract and return the list of email addresses
    found in its From, To, Cc and Bcc headers.

    Addresses are returned in header order, duplicates included. Both the
    "Name <user@host>" and the bare "user@host" forms are recognised, and
    every occurrence of a repeated header is read. A plain dict (such as the
    empty dict returned for a failed fetch) is accepted as well.
    """
    from email.utils import getaddresses  # local: keeps the block self-contained

    recipients = []
    for field in ('From', 'To', 'Cc', 'Bcc'):
        if hasattr(msg_parsed, 'get_all'):
            values = msg_parsed.get_all(field) or []
        else:
            # Plain dict: at most one value per field
            values = [msg_parsed[field]] if field in msg_parsed else []
        for value in values:
            text = str(value)
            # Parse one header value at a time so that a malformed value
            # cannot hide the addresses of the well-formed ones.
            found = [addr for _name, addr in getaddresses([text])
                     if '@' in addr]
            if not found and '<' in text:
                # The parser gave up on this value; fall back to the simple
                # angle-bracket pattern.
                found = ADDR_PATTERN.findall(text)
            recipients.extend(found)
    return recipients
if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    username = read_line("Full email address: ")
    password = getpass.getpass()

    # Connect
    mail_conn = connect(username, password)

    all_recipients = []
    try:
        # Open a specific folder and get list of email message uids
        mail_conn = get_folder(mail_conn, SEARCH_FOLDER)
        for msg_uid in get_email_ids(mail_conn):
            msg = fetch_message(mail_conn, msg_uid)
            all_recipients.extend(get_recipients(msg))
    finally:
        # Always end the session, even if something above failed
        try:
            if mail_conn.state == "SELECTED":
                mail_conn.close()  # Close currently selected folder (if any)
        finally:
            mail_conn.logout()

    # Very unsophisticated way of showing the recipient list
    print("List of all recipients:")
    print("------------")
    pp(all_recipients)

    print("\n\n List of all UNIQUE recipients:")
    print("-------------------------------")
    pp(set(all_recipients))
Hello @abought,
Thanks for this little script! I updated it to work with Python 3:
"""Create a connection to Gmail and do something with the results
References:
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
and
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
"""
__author__ = 'abought'
import email.parser
import imaplib
import getpass
import sys
import re
import ssl
from pprint import pprint as pp
# User may want to change these parameters if running script as-is
# Search folders, multiple directories can be given
# Gmail system folders live under the "[Gmail]/" prefix (note the slash).
# TODO: A user will want to change this
SEARCH_FOLDER = ['"[Gmail]/Trash"', '"[Gmail]/All Mail"', '"INBOX"']
DEFAULT_MAIL_SERVER = 'imap.gmail.com'
# No user parameters below this line
# Finds each address written in angle brackets, e.g. <user@example.com>.
# The character class stops at the first '>' so that several addresses in
# one header are matched separately instead of as one greedy span.
ADDR_PATTERN = re.compile(r"<([^<>]+)>")
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Log in to `server` over IMAP/SSL and return the open connection.

    The TLS context comes from ssl.create_default_context(). A rejected
    login prints "Failed to login" and exits the process with status 1.
    """
    tls_context = ssl.create_default_context()
    session = imaplib.IMAP4_SSL(host=server, ssl_context=tls_context)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    return session
def print_folders(conn):
    """Print the mailbox folders (labels) reported by the server, one per line.

    `conn.list()` returns a `(typ, entries)` pair; only the entries are
    printed. Iterating the pair and then each of its members would also
    print the characters of the status string as if they were folders.
    """
    typ, entries = conn.list()
    if typ != 'OK':
        print("Could not list folders")
        return
    for entry in entries:
        if entry is None:  # nothing to show for this slot
            continue
        if isinstance(entry, bytes):
            # imaplib hands back bytes; decode for readable output
            entry = entry.decode('utf-8', errors='replace')
        print("\t", entry)
def get_mails_from_folder(conn, folder_name):
    """Select `folder_name` read-only and return the ids of all its messages.

    The ids are the values produced by `conn.search` and are meant to be
    passed to `conn.fetch`. If the folder cannot be selected or searched, a
    message is printed and an empty list is returned, so the result can
    always be iterated.
    """
    typ, data = conn.select(mailbox=folder_name, readonly=True)
    if typ != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
        return []
    typ, data = conn.search(None, 'ALL')
    if typ != 'OK':
        print("Could not get mail list of folder: ", folder_name)
        return []
    # The ids come back as one whitespace-separated bytes string
    return (data[0] or b'').split()
def fetch_message(conn, msg_uid, message_parts='(RFC822)'):
    """
    Fetch one message from the currently selected folder and return it with
    its headers parsed.

    Despite the parameter name, `msg_uid` is handed to `conn.fetch`, so it
    must be a message id as returned by `conn.search` (a sequence number),
    not an IMAP UID.

    `message_parts` is the IMAP fetch specification. Only the headers are
    parsed, so callers that want to save bandwidth can pass
    '(BODY.PEEK[HEADER])' instead of the default full message.

    Returns None (after printing a message) if the fetch fails or the
    message does not exist.
    """
    typ, data = conn.fetch(msg_uid, message_parts)
    raw = None
    if typ == 'OK':
        # The message text arrives as the second item of a
        # (response header, raw message) tuple; a message that does not
        # exist yields no such tuple (e.g. data == [None]).
        raw = next((part[1] for part in data if isinstance(part, tuple)), None)
    if raw is None:
        print("ERROR fetching message #", msg_uid)
        return None
    return email.parser.BytesParser().parsebytes(raw, headersonly=True)
def get_recipients(msg):
    """Given a parsed message, extract and return the list of email addresses
    found in its From, To, Cc, Bcc, Reply-To and Sender headers.

    Addresses are returned in header order, duplicates included. Both the
    "Name <user@host>" and the bare "user@host" forms are recognised, and
    every occurrence of a repeated header is read. `None` (no message) gives
    an empty list.
    """
    from email.utils import getaddresses  # local: keeps the block self-contained

    recipients = []
    if msg is None:
        return recipients
    for field in ('From', 'To', 'Cc', 'Bcc', 'Reply-To', 'Sender'):
        for value in msg.get_all(field) or []:
            # str conversion is needed for non-ascii chars
            text = str(value)
            # Parse one header value at a time so that a malformed value
            # cannot hide the addresses of the well-formed ones.
            found = [addr for _name, addr in getaddresses([text])
                     if '@' in addr]
            if not found and '<' in text:
                # The parser gave up on this value; fall back to the simple
                # angle-bracket pattern.
                found = ADDR_PATTERN.findall(text)
            recipients.extend(found)
    return recipients
if __name__ == "__main__":
    username = input("Enter username: ")
    # getpass keeps the password from being echoed to the terminal
    password = getpass.getpass("Enter password: ")

    # Connect
    mail_conn = connect(username, password)

    # To show the folders of the mail account: print_folders(mail_conn)

    # Open folders and collect the recipients of every message in them
    all_recipients = []
    try:
        for folder in SEARCH_FOLDER:
            # A folder that could not be opened yields nothing to iterate
            for mail_id in get_mails_from_folder(mail_conn, folder) or []:
                data = fetch_message(mail_conn, mail_id)
                if data is None:  # fetch failed; already reported
                    continue
                all_recipients.extend(get_recipients(data))
    finally:
        # Always end the session, even if something above failed. CLOSE is
        # only valid while a folder is selected.
        try:
            if mail_conn.state == 'SELECTED':
                mail_conn.close()
        finally:
            mail_conn.logout()

    # Very unsophisticated way of showing the recipient list
    print("List of all recipients:")
    print("------------")
    pp(all_recipients)

    print("\n\n List of all UNIQUE recipients:")
    print("-------------------------------")
    pp(set(all_recipients))
Best
Thanks! That's what I was looking for. Is there a way of limiting by date?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fetches all email messages from one Gmail folder and returns a list of email addresses in the FROM, TO, CC, and BCC fields. Written for Python 2.7; should not require any external dependencies.
Usage
Can be run directly as is (once you specify the desired SEARCH_FOLDER). Will prompt for username and password:
python gistfile1.py
By default it just outputs results to console. The variables SEARCH_FOLDER and DEFAULT_MAIL_SERVER at the top of the file can be changed as appropriate; the default value (Trash) was chosen for testing purposes (my trash folder had just a few messages). Other common sample folder/ label names used by Gmail are provided. This file also exposes functions that can be called from other scripts.
Caveats
By default this is a bit brute-force and has no safeguards for bandwidth usage etc. This was tuned towards the output of Gmail responses, and may or may not work perfectly with other IMAP providers.