#!/usr/bin/env ruby
#
# Reads emails from the `files` directory (as created by `psts-to-files.sh`
# and `fiddle-with-rtf-bodies.rb`), and writes them to `emails.csv`.

# Names of the headers that are copied into the CSV, in column order.
UsefulHeaders = %w(Bcc Cc Date From Message-Id Subject To).freeze

# Matches a single "Name: value" header line for any name in UsefulHeaders,
# ignoring case. Capture 1 is the header name, capture 2 is the value.
# The pattern is anchored at the start of the line so that a header such as
# "In-Reply-To: ..." is not mistaken for "To: ...".
UsefulHeadersRegexp = /\A(#{UsefulHeaders.join('|')}): (.*)/i

# Matches a message from its start up to the first
# "--boundary-LibPST-iamunique-<digits>_-_-" boundary line (optionally in its
# "alt-" form) that is immediately followed by a text/html or application/*
# Content-Type line. Capture 1 is everything before that boundary.
# The `m` flag lets `.` cross newlines; the `n` flag marks the pattern as
# encoding-less so it can be applied to files read in binary mode.
TrimAttachmentsRegexp = %r{\A(.*?)\r?\n--(?:alt-)?--boundary-LibPST-iamunique-\d+_-_-\r?\nContent-Type: (text/html|application)}mn

# Quotes a single value for inclusion in a CSV row.
#
# A value containing a double quote, a comma, or any control character in the
# range 0x00-0x1f (which includes CR and LF) is wrapped in double quotes, with
# each embedded double quote doubled. Any other value is returned unchanged.
# nil is treated as an empty field and yields ''.
def quote_csv_value(datum)
  return '' if datum.nil?

  if datum =~ /[\x00-\x1f",]/
    "\"#{datum.gsub('"', '""')}\""
  else
    datum
  end
end

# Builds one CSV line from an array of values.
#
# Each value is passed through quote_csv_value, the results are joined with
# commas, and a trailing "\n" is appended.
def array_to_csv_row(arr)
  "#{arr.map { |value| quote_csv_value(value) }.join(',')}\n"
end

# Converts one plain-text email into a CSV row.
#
# The blob is split at the first blank line into a header section and a body.
# Header lines matching UsefulHeadersRegexp are collected (keyed by downcased
# name; a repeated header keeps its last value). The text column is rebuilt as
# the useful headers that were present, one per line in UsefulHeaders order,
# followed by a blank line and the original body.
#
# filename_without_extension - value written to both the id and title columns.
# blob                       - the raw email text.
#
# Returns the row as produced by array_to_csv_row: id, title, one column per
# entry in UsefulHeaders ('' when the header is absent), then the text.
def plain_email_to_csv_row(filename_without_extension, blob)
  headers_string, body_string = blob.split(/\r?\n\r?\n/, 2)

  # An empty blob splits into [], leaving headers_string nil; to_s keeps that
  # case from raising.
  headers = headers_string.to_s.split(/\r?\n/).each_with_object({}) do |line, found|
    match = UsefulHeadersRegexp.match(line)
    found[match[1].downcase] = match[2] if match
  end

  header_values = UsefulHeaders.map { |name| headers[name.downcase] }

  summary = UsefulHeaders.zip(header_values)
    .reject { |_name, value| value.nil? }
    .map { |name, value| "#{name}: #{value}" }
    .join("\n")
  body = "#{summary}\n\n#{body_string}"

  row = [filename_without_extension, filename_without_extension]
  # The fallback must be applied before appending: `row << value || ''`
  # parses as `(row << value) || ''` and would append nil.
  header_values.each { |value| row << (value || '') }
  row << body

  array_to_csv_row(row)
end

# Converts an RTF file into a CSV row by running it through the external
# `unrtf` program (with --nopict --text) and handing the resulting text to
# plain_email_to_csv_row.
#
# rtf_filename - path of the RTF file; also passed on as the row's id/title.
#
# NOTE(review): the path is passed on with its ".rtf" extension even though
# the receiving parameter is named filename_without_extension - confirm
# whether the extension is meant to be kept in the id.
def rtf_filename_to_csv_row(rtf_filename)
  unrtf_output = IO.popen(['unrtf', rtf_filename, '--nopict', '--text'], binmode: true, &:read)

  # The unrtf --quiet option doesn't seem to work (v0.21.9), so the banner it
  # prints is cut off here instead: everything from the leading
  # "### Translation from RTF" up to the end of the dashed separator line.
  # The `m` flag is required so `.` can span the banner's several lines and
  # so the capture keeps the whole remaining text rather than a single line.
  banner = /\A###\s+Translation from RTF.*?---------\n(.*)/m
  match = banner.match(unrtf_output)
  text = match ? match[1] : unrtf_output

  plain_email_to_csv_row(rtf_filename, text)
end

# Converts an email file into a CSV row.
#
# The file is read as binary. If TrimAttachmentsRegexp matches, only the text
# before the matched boundary is kept and a "[ attachment(s) truncated ]"
# marker is appended; otherwise the whole file is used. The result is handed
# to plain_email_to_csv_row.
#
# eml_filename - path of the email file; also passed on as the row's id/title.
def eml_filename_to_csv_row(eml_filename)
  # File.binread never interprets the name as a command, unlike IO.read with
  # a name that starts with "|".
  full_text = File.binread(eml_filename)

  match = TrimAttachmentsRegexp.match(full_text)
  text = match ? "#{match[1]}\n\n [ attachment(s) truncated ]" : full_text

  plain_email_to_csv_row(eml_filename, text)
end

# Writes emails.csv: a header line, then one row per matching file found
# anywhere under `files`. Files named "<digits>.rtf" go through
# rtf_filename_to_csv_row, files named "<digits>" go through
# eml_filename_to_csv_row, and everything else is ignored.
File.open('emails.csv', 'wb') do |f|
  f.write("id,title,#{UsefulHeaders.join(',')},text\n")

  # Sorted so the row order does not depend on the Ruby version or filesystem.
  Dir['files/**/*'].sort.each do |filename|
    # The glob also returns directories; a directory with a numeric name
    # must not be read as an email.
    next unless File.file?(filename)

    case filename
    when %r{/[0-9]+\.rtf\z}
      f.write(rtf_filename_to_csv_row(filename))
    when %r{/[0-9]+\z}
      f.write(eml_filename_to_csv_row(filename))
    end
  end
end