Skip to content

Instantly share code, notes, and snippets.

@nbhasker
Created November 1, 2019 19:29
Combine Yahoo Groups email messages downloaded via YahooGroupArchiver to HTML (and then to PDF via browser print function)
import os
import json
import datetime
import io
# --- Run configuration -------------------------------------------------------
# Directory scanned for message files. Only names ending in ".json" that do
# not contain "_raw.json" are loaded.
dir_name = r'C:\Users\Bhasker\Documents\mydocs\src\YahooEmailToHTML\EmailFull'
# HTML file the selected messages are written to (opened in write mode, so an
# existing file of the same name is overwritten).
outfile_name = "output.html"
# Number of messages, counted in sorted (topicId, msgId) order, that are
# skipped before output starts.
start_msg_no = 15000
# Maximum number of messages written to the output file in one run.
num_msgs_per_file = 5000
# Load every message JSON file from dir_name into the in-memory list `msgs`.
#
# A file is loaded when its name ends in ".json" and does not contain
# "_raw.json"; everything else in the directory is ignored. While loading,
# attachment file names and messages with an empty "authorName" are reported
# on the console.
#
# Each print() takes a single pre-formatted string so the statement is valid
# on both Python 2 and Python 3. The doubled spaces reproduce the text the
# original comma-separated print statements emitted.
msgs = []
for f_name in os.listdir(dir_name):
    # Guard clause: skip anything that is not a wanted message file.
    if not f_name.endswith(".json") or "_raw.json" in f_name:
        continue

    path = os.path.join(dir_name, f_name)
    print("Loading  %s" % path)

    # Decode explicitly as UTF-8 (the json module's default for byte input on
    # Python 2) instead of relying on the platform's default text encoding.
    with io.open(path, encoding="utf-8") as j:
        d = json.load(j)
    msgs.append(d)

    # "attachmentsInfo" is optional; a missing or empty value means no
    # attachments to report.
    for a in d.get("attachmentsInfo") or []:
        print("Attachment Filename:  %s" % a["filename"])

    if not d["authorName"]:
        print("Zero length authorName. Using:  %s" % d["from"])

print("Found  %d  messages" % len(msgs))
# Sort the loaded messages and render one window of them to outfile_name.
#
# Messages are ordered by (topicId, msgId). The first start_msg_no messages
# in that order are skipped and at most num_msgs_per_file messages are then
# written. Each message becomes:
#   <h1>author : date</h1>, optional <h2>subject</h2>, the message body
#   verbatim, an optional attachment list, and a closing <hr>.
#
# NOTE(review): authorName, from, subject and attachment file names are
# inserted into the HTML without escaping, exactly as before. Whether the
# archive data is already HTML-escaped cannot be told from this file, so no
# escaping was added; confirm before feeding this untrusted data.
print("Sorting ...")
sorted_msgs = sorted(msgs, key=lambda i: (i["topicId"], i["msgId"]))
print("Done")

# Clamp both bounds so that negative settings behave like the original
# counter-based loop (negative start -> start at 0, negative limit -> nothing).
first = max(start_msg_no, 0)
last = first + max(num_msgs_per_file, 0)

msg_count = 0
# The with-block guarantees the file is flushed and closed even if a message
# is malformed and rendering raises part-way through.
with io.open(outfile_name, 'w', encoding='utf8') as outf:
    for m in sorted_msgs[first:last]:
        posted = datetime.datetime.utcfromtimestamp(float(m["postDate"]))
        # '%#d' (day without leading zero) only works with the Windows C
        # runtime; formatting the day as an int yields the same text on
        # every platform.
        t = "%s %d, %s" % (
            posted.strftime('%B'),
            posted.day,
            posted.strftime('%Y %H:%M:%S (UTC)'),
        )
        if isinstance(t, bytes):
            # Python 2: strftime returns a byte string; the file needs text.
            t = t.decode("utf-8")

        # Fall back to the sender address when the display name is empty.
        author = m["authorName"] or m["from"]

        # NOTE(review): the source lost its indentation; this progress line is
        # assumed to run once per message rather than only in the fallback
        # branch above.
        print(u"Processing  %s  /  %s" % (m["authorName"], m["from"]))

        parts = [u"<h1>" + author + u" : " + t + u"</h1>" + u"\n"]
        if "subject" in m:
            parts.append(u"<h2>" + m["subject"] + u"</h2>" + u"\n")
        if "messageBody" in m:
            # The body is written as-is; it is not wrapped or escaped.
            parts.append(m["messageBody"])

        attachments = m.get("attachmentsInfo")
        if attachments:
            parts.append(u"<h2>Attachments:</h2>\n")
            parts.append(u"\n" + u"<ul>" + u"\n")
            for a in attachments:
                parts.append(u"<li>" + a["filename"] + u"</li>" + u"\n")
            parts.append(u"</ul>" + u"\n")

        parts.append(u"<hr><br></br>\n")

        # One write per message instead of one per fragment.
        outf.write(u"".join(parts))
        msg_count += 1

print("Start Msg Number:  %s Processed Messages:  %s" % (start_msg_no, msg_count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment