Created
November 18, 2022 04:08
-
-
Save RobCranfill/c2d840071fd7df120352944df5895580 to your computer and use it in GitHub Desktop.
Extract headers from online email and create a CSV file to import all found email addresses as contacts into GMail.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import email
import email.utils
import imaplib
import sys
""" | |
Extract headers from online email and create a CSV file
to import all found email addresses as contacts into GMail.
(c)2022 [email protected]
based on https://qr.ae/pvQWFi
Command-line args: {HOST} {USERNAME} {PASSWORD}
""" | |
# Running count of address entries ignored because they carried no display
# name; incremented by accumulate_goodness() and reported at the end of the run.
skipped = 0
def split_addrs(s):
    """
    Split an address-list header value into a list of (name, address) tuples.

    The value is cut at every comma that is not inside a double-quoted
    section, so a display name such as "Doe, John" stays in one piece.  Each
    piece is then parsed with email.utils.parseaddr().

    s -- the header value, or None / empty when the header is missing.

    Returns a list of (name, address) tuples; an empty list when there is
    nothing to parse.  Pieces that are empty or whitespace-only (e.g. after a
    trailing comma) are ignored instead of being reported as ('', '').
    """
    if not s:
        return []

    segments = []
    in_quotes = False
    start = 0
    for pos, ch in enumerate(s):
        if ch == '"':
            in_quotes = not in_quotes
        elif ch == ',' and not in_quotes:
            segments.append(s[start:pos])
            start = pos + 1
    # Whatever follows the last separator (or the whole string if none).
    segments.append(s[start:])

    return [email.utils.parseaddr(seg) for seg in segments if seg.strip()]
def accumulate_goodness(biggus_dictus, list_of_name_and_address):
    """
    Merge a list of (name, address) tuples into the name -> addresses map.

    biggus_dictus            -- dict mapping a display name to the set of
                                email addresses seen for it; updated in place.
    list_of_name_and_address -- list of (name, address) tuples, as produced
                                by split_addrs().

    Entries with an empty display name are not stored; each one is counted in
    the module-level 'skipped' counter and the remaining entries of the list
    are still processed.  Progress messages are printed to stdout.
    """
    global skipped

    if len(list_of_name_and_address) > 1:
        print(f" To/From/CC: {list_of_name_and_address}")
        print(" ^^ Multiple-address message")

    for email_name, email_addr in list_of_name_and_address:
        if not email_name:
            print(" skipping empty name")
            skipped += 1
            # Only this entry is unusable; keep going with the others.
            continue

        if email_name not in biggus_dictus:
            print(f" new name '{email_name}'")
        addresses = biggus_dictus.setdefault(email_name, set())

        if email_addr in addresses:
            continue
        addresses.add(email_addr)
        # Report once, at the moment the set grows to four addresses.
        if len(addresses) == 4:
            print(f" **** MAX NAMES HIT FOR {email_name}")
# Number of "E-mail N - Value" columns available in the CSV layout below.
_MAX_EMAILS_PER_CONTACT = 4

# Column layout of the contacts CSV; the header row written from this list is
# what lets the importer map each value to the right field.
_GOOGLE_CSV_FIELDNAMES = [
    "Name", "Given Name", "Additional Name", "Family Name",
    "Yomi Name", "Given Name Yomi", "Additional Name Yomi",
    "Family Name Yomi", "Name Prefix", "Name Suffix",
    "Initials", "Nickname", "Short Name", "Maiden Name",
    "Birthday", "Gender", "Location", "Billing Information",
    "Directory Server", "Mileage", "Occupation", "Hobby",
    "Sensitivity", "Priority", "Subject", "Notes",
    "Language", "Photo", "Group Membership",
    "E-mail 1 - Type", "E-mail 1 - Value",
    "E-mail 2 - Type", "E-mail 2 - Value",
    "E-mail 3 - Type", "E-mail 3 - Value",
    "E-mail 4 - Type", "E-mail 4 - Value",
    "Phone 1 - Type", "Phone 1 - Value",
    "Phone 2 - Type", "Phone 2 - Value",
    "Phone 3 - Type", "Phone 3 - Value",
    "Phone 4 - Type", "Phone 4 - Value",
    "Address 1 - Type", "Address 1 - Formatted",
    "Address 1 - Street", "Address 1 - City",
    "Address 1 - PO Box", "Address 1 - Region",
    "Address 1 - Postal Code", "Address 1 - Country",
    "Address 1 - Extended Address",
    "Address 2 - Type", "Address 2 - Formatted",
    "Address 2 - Street", "Address 2 - City",
    "Address 2 - PO Box", "Address 2 - Region",
    "Address 2 - Postal Code", "Address 2 - Country",
    "Address 2 - Extended Address", "Organization 1 - Type",
    "Organization 1 - Name", "Organization 1 - Yomi Name",
    "Organization 1 - Title", "Organization 1 - Department",
    "Organization 1 - Symbol", "Organization 1 - Location",
    "Organization 1 - Job Description", "Website 1 - Type",
    "Website 1 - Value",
]


def _fetch_header_blobs(host, user, password):
    """Log in to the IMAP server and return the raw header bytes of INBOX messages."""
    # The context manager logs out of the server even if a command fails.
    with imaplib.IMAP4_SSL(host) as mail:
        mail.login(user, password)
        mail.select("INBOX")
        _status, data = mail.search(None, "ALL")
        ids = data[0].split()

        # FOR DEBUG
        print("SHORTENING DATA....")
        ids = ids[0:100]

        print(f"fetching {len(ids)} ids....")
        if not ids:
            # An empty message set is not a valid FETCH argument.
            return []

        # search() yields bytes ids, fetch() wants a comma-separated str.
        message_set = ','.join(msg_id.decode('UTF-8') for msg_id in ids)
        _status, fetched = mail.fetch(message_set, '(BODY.PEEK[HEADER])')

    # Message data arrives as (envelope, payload) tuples; the bare bytes
    # items between them are just closing delimiters.
    return [item[1] for item in fetched if isinstance(item, tuple)]


def _build_csv_rows(names_and_addresses):
    """Turn the name -> set-of-addresses map into row dicts for csv.DictWriter."""
    rows = []
    for name, addresses in names_and_addresses.items():
        # Sorted so the output is reproducible from run to run.
        ordered = sorted(addresses)
        extra = len(ordered) - _MAX_EMAILS_PER_CONTACT
        if extra > 0:
            # The layout has no column for these; DictWriter would raise on them.
            print(f" dropping {extra} extra address(es) for '{name}'")
        row = {"Name": name}
        for n, addr in enumerate(ordered[:_MAX_EMAILS_PER_CONTACT], start=1):
            row[f"E-mail {n} - Value"] = addr
        rows.append(row)
    return rows


def _write_contacts_csv(path, rows):
    """Write the row dicts to PATH, preceded by the header row."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_GOOGLE_CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """
    Read message headers from an IMAP INBOX and write output.csv.

    Command-line args: HOST USERNAME PASSWORD.  Collects the To, From and Cc
    addresses of the fetched messages, groups them by display name and writes
    one CSV row per name to 'output.csv' in the current directory.
    """
    if len(sys.argv) != 4:
        print(f"{sys.argv[0]} HOST USERNAME PASSWORD")
        sys.exit(1)
    host, user, password = sys.argv[1:4]

    header_blobs = _fetch_header_blobs(host, user, password)
    print(f"fetched {len(header_blobs)} messages; iterating....")

    # intermediate result: dictionary of
    #   key: Person (or company) name
    #   value: set of email addresses associated with that person
    names_and_addresses = {}
    for raw in header_blobs:
        # Headers are not guaranteed to be valid UTF-8; replace undecodable
        # bytes rather than aborting the whole run on one message.
        msgobj = email.message_from_string(raw.decode('UTF-8', errors='replace'))
        for header_name in ('to', 'from', 'cc'):
            accumulate_goodness(names_and_addresses,
                                split_addrs(msgobj[header_name]))

    print("------------------------------------------------")
    print(f"Found {len(names_and_addresses)} distinct names")
    print(f" skipped: {skipped}")

    # Write the CSV file!
    _write_contacts_csv('output.csv', _build_csv_rows(names_and_addresses))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment