Created
February 10, 2015 15:18
-
-
Save tyndyll/69ac76f1e3838ee6a2b4 to your computer and use it in GitHub Desktop.
Extract Mail from GMail and Import into Neo4J
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import argparse | |
import email | |
import extract | |
import getpass | |
import imaplib | |
import os | |
import sys | |
import uuid | |
from py2neo import Graph, Node, Relationship | |
class GmailExtractor:
    """Copy messages from an IMAP mailbox into a Neo4j graph.

    Every imported message becomes an ``Email`` node.  Each address found in
    the message's ``to``, ``cc`` and ``from`` fields becomes a ``Person`` node
    (one per distinct e-mail address for the lifetime of this object) linked
    from the message by a ``TO``, ``CC`` or ``FROM`` relationship.

    The public ``get_*`` methods take a single ``args`` parameter because
    they are used as argparse sub-command callbacks; they do not read it.
    """

    # Keys of the extracted message dict that carry address lists.
    __ADDRESS_FIELDS = ("to", "cc", "from")

    def __init__(self, imap_server, neo4j_server):
        """Store the collaborators.

        imap_server  -- an already logged-in ``imaplib`` connection.
        neo4j_server -- the py2neo ``Graph`` that receives the data.
        """
        self.__graph = neo4j_server
        self.__imap = imap_server
        # e-mail address -> Person node, so each person is created only once.
        self.__persons_known = {}

    @staticmethod
    def _split_addresses(header_value):
        """Return ``[(display_name, email_address), ...]`` for a header value.

        Uses the RFC 2822 aware parser instead of splitting on ``", "`` so
        that quoted display names containing commas ("Doe, John") stay in one
        piece.  Entries without a parseable e-mail address are dropped.
        """
        # Imported here so the method does not depend on ``email.utils``
        # having been loaded as a side effect of some other import.
        from email.utils import getaddresses

        return [
            (name, address)
            for name, address in getaddresses([header_value])
            if address
        ]

    def __get_folder_mail(self, folder):
        """Fetch every message in ``folder`` and store it in the graph.

        Exits the process with the IMAP status if the folder cannot be
        selected.  A failed search or an empty folder imports nothing;
        messages whose fetch fails or returns no payload are skipped.
        """
        status, _ = self.__imap.select(folder)
        if status != "OK":
            sys.exit(status)

        status, found = self.__imap.uid('search', None, "ALL")
        if status != "OK" or not found or not found[0]:
            return

        for msg_id in found[0].split():
            status, fetched = self.__imap.uid('fetch', msg_id, "RFC822")
            # A successful fetch yields (envelope, raw_message) as the first
            # item; anything else (e.g. None) carries no message to parse.
            if status != "OK" or not fetched or not isinstance(fetched[0], tuple):
                continue
            msg = email.message_from_string(fetched[0][1])
            self.__add_message_to_db(msg)

    def __add_message_to_db(self, msg):
        """Create the Email node for ``msg`` and link it to its people.

        Nodes are only written as part of a relationship, so a message with
        no usable address in any of the address fields adds nothing to the
        graph.
        """
        msg_obj, _ = extract.extract_msg(msg)
        msg_node = Node.cast(msg_obj)
        msg_node.labels.add("Email")

        for addr_type in self.__ADDRESS_FIELDS:
            if addr_type not in msg_obj:
                continue
            for name, email_addr in self._split_addresses(msg_obj[addr_type]):
                if email_addr not in self.__persons_known:
                    self.__persons_known[email_addr] = Node(
                        "Person", name=name, email=email_addr)
                person = self.__persons_known[email_addr]
                # Write through the injected graph, not a module-level global.
                self.__graph.create(
                    Relationship(msg_node, addr_type.upper(), person))

    def __list(self):
        """Return the folder names reported by the IMAP ``LIST`` command.

        Each response line is expected to look like
        ``(<flags>) "/" "<name>"``: the name starts four characters after the
        first ``/`` (skipping ``/" "``) and the closing quote is dropped.
        """
        folders = self.__imap.list()
        return [line[line.find("/") + 4:-1] for line in folders[1]]

    def get_folder_list(self, args):
        """Print every folder name, one per line."""
        for folder in self.__list():
            print(folder)

    def get_all_mail(self, args):
        """Import the Gmail "All Mail" and "Trash" folders."""
        self.__get_folder_mail('[Gmail]/All Mail')
        self.__get_folder_mail('[Gmail]/Trash')

    def get_inbox(self, args):
        """Import the INBOX folder."""
        self.__get_folder_mail('INBOX')
def setup_help(gmc):
    """Build the command-line parser for the extractor.

    Registers the sub-commands ``inbox``, ``all`` and ``list``.  Each one
    stores the matching bound method of ``gmc`` as ``func`` on the parsed
    namespace, so the caller can dispatch with ``args.func(args)``.

    gmc -- object providing ``get_inbox``, ``get_all_mail`` and
           ``get_folder_list``.

    Returns the configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser()
    commands = cli.add_subparsers()

    # (sub-command name, attribute on gmc that handles it), in registration order.
    dispatch = (
        ('inbox', 'get_inbox'),
        ('all', 'get_all_mail'),
        ('list', 'get_folder_list'),
    )
    for command, handler_name in dispatch:
        sub = commands.add_parser(command)
        sub.set_defaults(func=getattr(gmc, handler_name))

    return cli
if __name__ == "__main__":
    # Script entry point: connect to Neo4j and Gmail, then run the
    # sub-command chosen on the command line.
    #
    # Environment:
    #   NEO4J_ADDRESS -- base URL of the Neo4j server (required).
    #   GMAIL_USER    -- Gmail login; prompted for when unset.
    #   GMAIL_PASS    -- Gmail password; prompted for (hidden) when unset.
    neo4j_address = os.environ.get("NEO4J_ADDRESS")
    if not neo4j_address:
        sys.exit("NEO4J_ADDRESS environment variable must be set")
    # The name ``graph`` is kept at module level for code that reads it there.
    graph = Graph(neo4j_address + "/db/data")

    if "GMAIL_USER" in os.environ:
        user = os.environ["GMAIL_USER"]
    else:
        user = raw_input("GMail User Name: ")
    if "GMAIL_PASS" in os.environ:
        password = os.environ["GMAIL_PASS"]
    else:
        password = getpass.getpass("GMail Password: ")

    gm = imaplib.IMAP4_SSL('imap.gmail.com')
    try:
        print(gm.login(user.strip(), password.strip()))
    except imaplib.IMAP4.error as exc:
        # Rejected credentials: report the server's message and stop.
        sys.exit(exc)

    try:
        gmc = GmailExtractor(gm, graph)
        parser = setup_help(gmc)
        args = parser.parse_args()
        args.func(args)
    finally:
        # Always end the IMAP session; a failure while closing must not hide
        # whatever error (if any) ended the run.
        try:
            gm.logout()
        except imaplib.IMAP4.error:
            pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment