Skip to content

Instantly share code, notes, and snippets.

@kadin2048
Last active November 3, 2021 05:51
Show Gist options
  • Save kadin2048/8db8767686dfe93fe045 to your computer and use it in GitHub Desktop.
Save kadin2048/8db8767686dfe93fe045 to your computer and use it in GitHub Desktop.
Convert Adium logs to .eml files

Adium to EML Log Converter

Convert Adium logs to RFC822-compliant '.eml' files that can be imported into a mail program, uploaded to Gmail, etc.

Usage: $ ./adiumToEml.py logfile [outputdir]

In most cases, you would probably want to call this from a script, e.g. with find and xargs in order to run it on a bunch of logfiles.

For old-style (.AdiumHTMLLog) files, working directory must contain a header and footer file, "header.htmlpart" and "footer.htmlpart", which are prepended and appended to the log in order to make it into a complete HTML document.

For modern XML logs, working directory must contain an XSL file used to convert from XML to HTML, which should be named "chatlog_transform.xsl".

Requires Python 2 (version 2.5 or later; the script uses Python 2 print statements and does not run unmodified on Python 3) and both the "lxml" and "dateutil" packages, available through pip.

Released under the GPL v2 or later.

#!/usr/bin/env python
"""Convert Adium logs to RFC822-compliant '.eml' files that can be imported
into a mail program, uploaded to Gmail, etc.
Usage:
$ ./adiumToEml.py logfile [outputdir]
In most cases, you would probably want to call this from a script, e.g.
with `find` and `xargs` in order to run it on a bunch of logfiles.
For old-style (.AdiumHTMLLog) files, working directory must contain a header
and footer file, "header.htmlpart" and "footer.htmlpart", which are prepended and
appended to the log in order to make it into a complete HTML document.
For modern XML logs, working directory must contain an XSL file used to convert
from XML to HTML, which should be named "chatlog_transform.xsl".
Requires Python 2 (version 2.5 or later) and both the "lxml" and "dateutil" packages, available through pip.
Released under the GPL v2 or later.
"""
# These libs are pretty critical
import sys
import datetime
import os.path
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
# These are needed for new XML-formatted ".chatlog" but not old ".AdiumHTMLLog"
import lxml.etree # NOT STANDARD LIBRARY, MUST INSTALL
import dateutil.parser # NOT STANDARD LIBRARY, MUST INSTALL
# These are for Message-ID functionality and can be easily disabled
import uuid
import socket
# RUNTIME FLAGS
# ---------------------------------------------------------------------------
# RUNTIME FLAGS
# ---------------------------------------------------------------------------
# When true, very verbose progress output is produced.
debug = True
# When true, all stdout output is suppressed.
silent = False
# When true, an already-existing output file is treated as an error.
haltOnDuplicates = False

# ---------------------------------------------------------------------------
# CONFIG OPTIONS
# ---------------------------------------------------------------------------
# Directory holding the header/footer/XSL helper files; the empty string
# means the current working directory.
configFilePath = ''
# Host part of generated Message-IDs; leave empty to have main() look up the
# system FQDN with socket.getfqdn().
myhostname = ''
# Used as the email 'To' header when it cannot be determined from the log.
defaultToAddr = "Tester John Doe <[email protected]>"
# Skip any log dated earlier than this; format %Y-%m-%dT%H:%M:%S%z,
# e.g. 2006-07-14T12:42:01-0500. Empty string disables the filter.
skipIfBeforeDate = ''
def _readConfigFile(name):
    """Return the entire text of the file `name` located in configFilePath.

    Raises IOError if the file cannot be opened. The handle is always closed.
    """
    handle = open(os.path.join(configFilePath, name), 'r')
    try:
        return handle.read()
    finally:
        handle.close()


def _loadXMLLog(filename, fi, msg_base):
    """Transform an XML log to HTML and set its headers on msg_base.

    Returns a (datetime, html) tuple, or None if the XSL file could not be
    opened (an error message has then already been written to stderr).
    """
    xslpath = os.path.join(configFilePath, 'chatlog_transform.xsl')
    try:
        xslt = lxml.etree.parse(xslpath)
    except IOError:
        sys.stderr.write("I/O Error while attempting to open " + xslpath + '\n')
        return None
    # Use lxml to transform the XML log into HTML.
    # See <http://stackoverflow.com/questions/16698935/how-to-transform-an-xml-file-using-xslt-in-python>
    dom = lxml.etree.parse(fi)
    transform = lxml.etree.XSLT(xslt)
    html_dom = transform(dom)
    ht = lxml.etree.tostring(html_dom, pretty_print=True)  # at this point we have well-formed HTML
    try:
        d = determineAdiumXMLHeaders( filename, fi, dom, msg_base )
    except Exception:
        sys.stderr.write("Error while determining headers of " + filename + "\n")
        raise
    return d, ht


def _loadHTMLLog(filename, fi, msg_base):
    """Wrap an old-style fragmentary HTML log in the static header/footer
    and set its headers (Date, From, To, Subject) on msg_base.

    Returns a (datetime, html) tuple, or None if the header or footer file
    could not be opened (an error message has then already been written).
    """
    try:
        header = _readConfigFile('header.htmlpart')
        footer = _readConfigFile('footer.htmlpart')
    except IOError:
        sys.stderr.write('I/O Error while attempting to open header or footer file.\nBe sure both header.htmlpart and footer.htmlpart exist in the working dir.\n')
        return None
    # Parse the first line of the input file
    fi.seek(0)  # make sure we're really at the first line
    try:
        d = determineAdiumHTMLDateTime( fi.readline(), msg_base, filename )  # Only date is set
    except Exception:
        sys.stderr.write('Error while determining date/time of ' + filename + '\n')
        raise
    try:
        determineAdiumHTMLToFrom( fi, msg_base )  # To and From headers
    except Exception:
        sys.stderr.write('Error while determining to/from of ' + filename + '\n')
        raise
    msg_base['Subject'] = 'Chat with ' + msg_base['From'] + ' on ' + filename[ filename.find("(")+1 : filename.find(")") ]
    doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n'
    fi.seek(0)  # make sure we don't leave off the first line...
    # construct message using static header/footer files
    ht = doctype + header + fi.read() + footer + '\n'
    return d, ht


def main():
    """Convert the log named by sys.argv[1] into an '.eml' file.

    sys.argv[2], if present, is the output directory; otherwise the working
    directory is used. Returns 0 on success (including when the log is
    skipped by the skipIfBeforeDate filter) and 1 on any usage or I/O error.
    Exceptions raised while parsing the log are reported on stderr and then
    re-raised.

    The output file is only opened once the message is fully built, so a
    skipped or failed conversion does not leave an empty '.eml' behind.
    """
    global myhostname
    # filename is published as a module global because
    # determineAdiumHTMLToFrom reads it from there.
    global filename

    # Use the first argument as the input file; an empty argument counts as missing
    try:
        filename = sys.argv[1]
    except IndexError:
        filename = ''
    if not filename:
        sys.stderr.write("No input file specified. Terminating.\n")
        return 1
    if not os.path.isfile(filename):
        sys.stderr.write("Input file (" + filename + ") appears invalid. Terminating.\n")
        return 1

    # Second arg, if present, is the output dir; default is the working directory
    if len(sys.argv) > 2:
        outputdir = sys.argv[2]
        if debug:
            sys.stderr.write("output dir is " + outputdir + '\n')
        if not os.path.isdir(outputdir):
            sys.stderr.write("Output dir (" + outputdir + ") specified but not a directory. Terminating.\n")
            return 1
    else:
        outputdir = ''

    # output file name is based on input; we take the input basename, append .eml, then prefix the output dir
    outfilename = os.path.basename(filename) + '.eml'
    outpath = os.path.join(outputdir, outfilename)
    if debug:
        print("-- Reading from " + filename)
        print("-- Writing to " + outpath)

    # Test to see if the output file already exists (prevents duplicate processing)
    # In some cases this may be undesirable/annoying so we can disable with flag haltOnDuplicates
    if os.path.isfile(outpath) and haltOnDuplicates:
        sys.stderr.write("Output file " + outpath + " already exists. Terminating.\n")
        return 1

    # Test to make sure the file suffix is one of the types we can process
    suffix = filename.split('.')[-1]
    if suffix not in ['xml', 'AdiumHTMLLog']:
        sys.stderr.write("Input file suffix not one of supported types. Terminating.\n")
        return 1

    try:
        fi = open(filename, 'r')  # fi is a file object
    except IOError:
        sys.stderr.write("I/O Error while opening input: " + filename + "\n")
        return 1

    # Create a message object and fill in headers + HTML body from the log
    msg_base = MIMEMultipart('mixed')
    try:
        if suffix == 'xml':
            # New XML-based Adium logs (have to be run through an XML transform)
            # Ref. <https://trac.adium.im/wiki/XMLLogFormat>
            loaded = _loadXMLLog(filename, fi, msg_base)
        else:
            # Old style fragmentary-HTML Adium logs
            loaded = _loadHTMLLog(filename, fi, msg_base)
    finally:
        fi.close()
    if loaded is None:
        return 1
    d, ht = loaded

    # See if we should process the message based on date filter
    if skipIfBeforeDate:
        if debug:
            print("Date filtering enabled, cutoff date is " + skipIfBeforeDate)
        try:
            dcutoff = dateutil.parser.parse(skipIfBeforeDate)
        except Exception:
            sys.stderr.write("Error while parsing cutoff date configuration parameter.\n")
            raise
        # NOTE: comparing a timezone-naive log time with a timezone-aware
        # cutoff (or vice versa) raises TypeError; that is left unchanged.
        if d < dcutoff:
            if not silent:
                sys.stderr.write("File " + filename + " skipped due to skipIfBeforeDate constraint.\n")
            return 0  # skip the file and stop processing with normal exit

    # Create the message body...
    msghtml = MIMEText(ht, 'html')

    # determine system FQDN, but only if not specified in the config
    if not myhostname:
        myhostname = socket.getfqdn()
    # Create message ID
    msguuid = uuid.uuid1()  # note this uses machine's MAC addr. uuid.uuid4() is totally random if it matters
    msgid = str(msguuid) + '@' + myhostname  # this is per RFC822 and RFC2822
    msg_base['Message-ID'] = msgid

    # Set additional headers (comment out if not desired)
    from email.utils import formatdate  # local import: only needed here
    msg_base['X-Converted-By'] = os.path.basename(sys.argv[0])
    # formatdate(localtime=True) emits the real local UTC offset
    msg_base['X-Converted-On'] = formatdate(localtime=True)

    if debug:
        print("-- Headers are...")
        for key, value in msg_base.items():
            print(key + ": " + value)

    # Attach the HTML to the root
    msg_base.attach(msghtml)
    if debug:
        print("-- Ready to flatten and write message...")

    # Write out the message; the output is opened only now, see docstring
    try:
        fo = open(outpath, 'w')
    except IOError:
        sys.stderr.write("I/O Error while opening output: " + outpath + "\n")
        return 1
    try:
        fo.write( msg_base.as_string() )
    finally:
        fo.close()

    if debug:
        print("-- Complete.")
    if not silent:
        # If not in silent mode, write input filename and Message-ID to stdout
        # When run inside a wrapper script, we can use this to make a nice logfile
        print(filename + '\t' + msgid)
    return 0
def determineAdiumXMLHeaders( filename, fi, dom, msg_base ):
    """Determine email headers for a new-style Adium XML .chatlog.

    Sets the 'Date', 'From' and 'To' headers on msg_base (a MIMEMultipart
    or any object supporting item assignment).

    Parameters:
      filename -- path of the log; its basename up to the first space is
                  used as the 'To' address (the remote end is not recorded
                  in the log itself).
      fi       -- open file object of the log, only read if dom is None.
      dom      -- parsed lxml tree of the log, or None to parse fi here.
      msg_base -- message object receiving the headers.

    Returns the datetime of the first parseable '@time' attribute.
    Raises ValueError if the log has no timestamps, no parseable timestamp,
    or no 'account' attribute.
    Requires lxml.etree and dateutil.
    """
    # Parse only when no DOM was handed in; an explicit None test avoids
    # depending on the truthiness of a parsed tree object.
    if dom is None:
        dom = lxml.etree.parse(fi)

    # Determine log time
    times = dom.xpath('//@time')  # should return a list
    if not times:
        raise ValueError("Log does not appear to contain any timestamps!")
    # TODO fall back to filename?
    d = None
    for stamp in times:
        try:
            # We can't use datetime.datetime.strptime() here due to timezone, have to use dateutil
            d = dateutil.parser.parse(stamp)
        except ValueError:
            continue  # malformed time, try the next one in the log
        break
    if d is None:
        raise ValueError("Log does not appear to contain a parseable timestamp!")

    # Then write it out to RFC822 format. Use the offset recorded in the log
    # when there is one; only fall back to the historic EST assumption for
    # timezone-naive timestamps.
    if d.utcoffset() is not None:
        msg_base['Date'] = d.strftime("%a, %d %b %Y %H:%M:%S %z")
    else:
        msg_base['Date'] = d.strftime("%a, %d %b %Y %H:%M:%S" + " -0500 (EST)")

    # Determine IM account used for 'From' field (local end of log)
    accounts = dom.xpath('//@account')
    if not accounts:
        raise ValueError("Log does not appear to contain an 'account' element!")
    msg_base['From'] = accounts[0]

    # Determine IM account used for 'To' field (remote end of log)
    # This doesn't seem to be recorded in the logs, have to use filename
    msg_base['To'] = os.path.basename(filename).split(' ')[0]
    return d
def determineAdiumHTMLDateTime(line, msg_base, filename):
    """Determine the date and time of an old-style Adium log and set the
    'Date' header on msg_base.

    Parameters:
      line     -- a single log line (typically the first) holding either a
                  '<span class="timestamp">' or a '<div class="status">'.
      msg_base -- message object receiving the 'Date' header.
      filename -- path of the log; the date is taken from the text between
                  the parentheses in its basename.

    Returns the resulting (timezone-naive) datetime.
    Raises ValueError if the line has no timestamp or the date/time matches
    none of the known formats.
    The global variable 'debug' enables debugging output, if True.
    """
    # Date we can determine from the log file's name. Only the basename is
    # inspected so parentheses in a directory name cannot confuse the slice.
    logname = os.path.basename(filename)
    logdate = logname[ logname.find("(")+1 : logname.find(")") ]
    if debug:
        print("-- Log date appears to be " + logdate)

    # We must determine time from inside the log
    logtime = None
    if '<span class="timestamp">' in line:
        logtime = line[ line.find('<span class="timestamp">')+24 : line.find('</span>') ]
    if '<div class="status">' in line:
        logtime = line[ line.find(' (')+2 : line.find(')</div>') ]
    if logtime is None:
        raise ValueError("No timestamp found in log line: " + line.strip())
    if debug:
        print("-- Logtime appears to be " + logtime)

    # Turn it into a datetime object, trying the known formats in order:
    # most common (12-hour with AM/PM), pipe-separated date, 24-hour.
    stamp = logdate + ' ' + logtime
    d = None
    for fmt in ('%Y-%m-%d %I:%M:%S %p', '%Y|%m|%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'):
        try:
            d = datetime.datetime.strptime(stamp, fmt)
        except ValueError:
            continue
        break
    if d is None:
        raise ValueError("Unrecognized log date/time: " + stamp)

    # Then write it out to RFC822 format
    # TODO: We assume we are always in EST, which is dumb.
    msg_base['Date'] = d.strftime("%a, %d %b %Y %H:%M:%S" + " -0500 (EST)")
    return d
def _extractSender(line):
    """Return the text between '<span class="sender">' and ': </span>' in
    line, or None when either marker is missing."""
    marker = '<span class="sender">'
    start = line.find(marker)
    end = line.find(': </span>')
    if start == -1 or end == -1:
        return None
    return line[start + len(marker):end]


def determineAdiumHTMLToFrom( fi, msg_base, filename=None ):
    """Determine the To and From fields for an old-style Adium HTML log and
    set the 'From' and 'To' headers on msg_base.

    Parameters:
      fi       -- the log as a file object; it is rewound and read line by
                  line until both names are found or EOF is reached.
      msg_base -- message object receiving the headers.
      filename -- optional path of the log, used as fallback for 'From'
                  (basename up to the first space). When omitted, the
                  module-level global 'filename' set by main() is used.

    'From' is the sender of the first '<div class="receive">' line, 'To' the
    sender of the first '<div class="send">' line. If 'To' cannot be found
    the global defaultToAddr is used.
    Raises ValueError if either field still cannot be determined.
    The global variable 'debug' enables debugging output, if True.
    """
    if filename is None:
        # Backward compatibility: main() publishes the input path as a global.
        filename = globals().get('filename', '')

    fi.seek(0)  # make sure we are at the beginning of the file
    msgfrom = None
    msgto = None
    while msgfrom is None or msgto is None:
        line = fi.readline()
        if not line:
            break  # break loop at EOF
        if '<div class="receive">' in line:
            sender = _extractSender(line)
            if sender is not None:  # ignore lines lacking the sender markup
                msgfrom = sender
                if debug:
                    print("-- From username is " + msgfrom)
        if '<div class="send">' in line:
            sender = _extractSender(line)
            if sender is not None:
                msgto = sender
                if debug:
                    print("-- To username is " + msgto)
    if debug:
        print("-- Loop complete")

    if not msgfrom:
        if debug:
            print("-- Could not determine FROM field using log contents, falling back to filename.")
        msgfrom = os.path.basename(filename).split(' ')[0]
    if not msgto:
        if debug:
            print("-- Could not determine TO field using log contents, falling back to default.")
        msgto = defaultToAddr
    if not msgfrom or not msgto:
        # Raise instead of returning a status code the caller never checked.
        raise ValueError('Could not determine from or to field while processing ' + filename)

    msg_base['From'] = msgfrom
    msg_base['To'] = msgto
def determineHTMLLogHeaders(firstline, msg_base):
    """Process Pidgin/libpurple style HTML logs, which are complete HTML
    documents beginning with a title element containing header information
    about the chat. This is not used for Adium logs. Left in for future use.

    The title is sliced on the fixed markers "Conversation with ", " at "
    and " on "; the 'From', 'To', 'Date' and 'Subject' headers are set on
    msg_base from the pieces.

    Returns a datetime object which is the time of the log, for filtering
    purposes.
    Raises ValueError if the title element or one of the markers is missing,
    or if the date does not match '%m/%d/%Y %I:%M:%S %p'.
    The global variable 'debug' enables debugging output, if True.
    """
    title_start = firstline.find("<title>")
    title_end = firstline.find("</title>")
    if title_start == -1 or title_end == -1:
        raise ValueError("No <title> element found in first line of log")
    title = firstline[title_start+7:title_end]
    if debug:
        print("<title>: " + title)

    # Locate the markers once; a missing marker would otherwise turn into a
    # silent wrong slice because str.find() returns -1.
    # TODO: This would be better done with a regexp
    with_pos = title.find("Conversation with ")
    at_pos = title.find(" at ")
    on_pos = title.find(" on ")
    if with_pos == -1 or at_pos == -1 or on_pos == -1:
        raise ValueError("Log title not in expected format: " + title)

    # Determine the 'From' address of the chat
    msgfrom = title[with_pos+18:at_pos]
    msg_base['From'] = msgfrom
    if debug:
        print("-- From is: " + msgfrom)
    # Determine the 'To' address
    msgto = title[on_pos+4:]
    msg_base['To'] = msgto
    if debug:
        print("-- To is: " + msgto)

    # Now we have to deal with the date. This is messy.
    logdate = title[at_pos+4:on_pos]
    # Turn it into a datetime object
    d = datetime.datetime.strptime(logdate, '%m/%d/%Y %I:%M:%S %p')
    # Then write it out to RFC822 format
    # TODO: This is a naive/stupid way of handling timezone!
    msg_base['Date'] = d.strftime("%a, %d %b %Y %H:%M:%S" + " -0500 (EST)")
    # And the message subject
    msg_base['Subject'] = title[0:on_pos]
    return d
# Script entry point: run the converter and hand its status (0 on success,
# 1 on error) back to the shell as the process exit code.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
<?xml version="1.0"?>
<!-- this xslt is from http://trac.adium.im/ticket/6569 -->
<!-- H/T to https://gist.github.com/paulirish/1161725 -->
<!--
Adium is the legal property of its developers, whose names are listed in the copyright file included
with this source distribution.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU
General Public License as published by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along with this program; if not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
Purpose:
Format an Adium log file as XHTML
Parameters:
title A string to use for the page title
-->
<!-- Log elements live in the "adium" namespace declared below; output elements default to XHTML. -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:adium="http://purl.org/net/ulf/ns/0.4-02"
xmlns="http://www.w3.org/1999/xhtml"
exclude-result-prefixes="adium">
<!-- Serialize as indented UTF-8 XML with an XHTML 1.0 Strict doctype -->
<xsl:output method="xml"
doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
indent="yes" encoding="utf-8"/>
<xsl:strip-space elements="*"/>
<!-- Page title; defaults to the literal string 'Chat' when the caller passes no value -->
<xsl:param name="title" select="'Chat'"/>
<!-- Process chats: wrap the whole log in an html/head/body skeleton, then process its children -->
<xsl:template match="adium:chat">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title><xsl:value-of select="$title"/></title>
<!-- Styling comes from an external stylesheet referenced by URL -->
<link rel="stylesheet" href="https://gist.githubusercontent.com/paulirish/1161725/raw/c3ba3fbbfcaef2dad0e62afe3837469b6d0bd603/adiumlogs.css"/>
</head>
<body>
<xsl:apply-templates/>
</body>
</html>
</xsl:template>
<!-- Process events: emit "<type>: <time>" with the 'T' in the time replaced by a space -->
<xsl:template match="adium:event">
<div class="event">
<xsl:value-of select="@type"/>
<xsl:text>: </xsl:text>
<xsl:value-of select="translate(@time, 'T', ' ')"/>
</div>
</xsl:template>
<!-- Process messages -->
<xsl:template match="adium:message">
<!-- Record whether this is a follow-on message, i.e. the immediately
preceding sibling is a message with the same sender -->
<xsl:variable name="prec" select="preceding-sibling::*[1]"/>
<xsl:variable name="follow-on">
<xsl:if test="$prec[self::adium:message] and $prec/@sender = @sender">
<xsl:text>follow-on</xsl:text>
</xsl:if>
</xsl:variable>
<!-- Record whether this is from the principal account (sender equals the chat's account attribute) -->
<xsl:variable name="principal">
<xsl:if test="@sender = /adium:chat/@account">
<xsl:text>principal</xsl:text>
</xsl:if>
</xsl:variable>
<!-- Record if this is an auto message -->
<xsl:variable name="auto">
<xsl:if test="@auto = 'true'">
<xsl:text>auto</xsl:text>
</xsl:if>
</xsl:variable>
<!-- Create message div; the three variables above become extra CSS class names (each may be empty) -->
<div class="message {$principal} {$follow-on} {$auto}">
<!-- Include a sender box when this is not a follow-on message -->
<xsl:if test="$follow-on = ''">
<div class="sender">
<xsl:apply-templates select="@sender|@alias"/>
</div>
</xsl:if>
<!-- Process attributes (everything except sender and alias) -->
<div class="meta">
<xsl:apply-templates select="@*[not(name() = 'sender' or name() = 'alias')]"/>
</div>
<div class="content">
<!-- Process child elements etc. -->
<xsl:apply-templates select="node()"/>
</div>
</div>
</xsl:template>
<!-- Sender name: output the alias when there is one, otherwise the sender attribute itself -->
<xsl:template match="@sender|@alias">
<xsl:if test="name() = 'alias' or not(../@alias)">
<span class="sender">
<xsl:value-of select="."/>
</span>
</xsl:if>
</xsl:template>
<!-- Auto flag: copy the attribute value into a span -->
<xsl:template match="@auto">
<span class="auto">
<xsl:value-of select="."/>
</span>
</xsl:template>
<!-- Time: keep 8 characters starting at position 12; presumably the HH:MM:SS part
of an ISO-style timestamp such as 2006-07-14T12:42:01-0500 (confirm against real logs) -->
<xsl:template match="@time">
<span class="time">
<xsl:value-of select="substring(., 12, 8)"/>
</span>
</xsl:template>
<!-- Message attributes matched by this catch-all are reported through xsl:message -->
<xsl:template match="adium:message/@*" priority="0">
<xsl:message>Unhandled attribute: message/@<xsl:value-of select="name()"/>&#10;</xsl:message>
</xsl:template>
<!-- Copy elements but strip off the namespace -->
<xsl:template match="*">
<xsl:element name="{local-name()}">
<xsl:apply-templates select="node()|@*"/>
</xsl:element>
</xsl:template>
<!-- Copy attributes but strip off the namespace -->
<!-- NOTE(review): apply-templates with no select processes child nodes, and an attribute
node has none, so the copied attribute may end up with an empty value; confirm
whether xsl:value-of select="." was intended here. -->
<xsl:template match="@*">
<xsl:attribute name="{local-name()}">
<xsl:apply-templates/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>
<!-- HTML header section start -->
<!-- This fragment is prepended to an old-style log by the converter script; the
html element opened here is left open on purpose (presumably closed by
footer.htmlpart; confirm against that file). -->
<html>
<head>
<style>
/* Lines sent by the local user */
.send {
color: green;
}
/* Lines received from the remote user */
.receive {
color: blue;
}
/* Per-line timestamp */
.timestamp {
color: grey;
font-family: monospace;
}
/* Sender name preceding each message */
.sender {
font-family: sans-serif;
padding: 0.5em;
}
/* Message text, kept inline after the sender name */
.message {
display: inline;
color: black;
font-family: sans-serif;
padding: 0.5em;
}
/* Status lines */
.status {
font-family: sans-serif;
color: #4D4D4D;
}
</style>
</head>
<!-- Header ends -->
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment