Created
October 20, 2011 23:40
-
-
Save baudehlo/1302718 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared language detector instance used by check_language().
var LanguageDetect = require('languagedetect');
var ld = new LanguageDetect();
exports.hook_data = function (next, connection) { | |
// enable mail body parsing | |
connection.transaction.parse_body = true; | |
next(); | |
} | |
exports.hook_data_post = function (next, connection) { | |
check_language(this, connection.transaction, connection.transaction.body); | |
next(); | |
} | |
// Matches one HTML/XML tag (opening, closing or self-closing) together with
// its attributes, so the tag can be removed from the text.
var html_strip = /<\/?[\w:]+((\s+\w+(\s*=\s*(?:"[\s\S]*?"|'[\s\S]*?'|[^'">\s]+))?)+\s*|\s*)\/?>/g;

// Named character entities that html_to_text() decodes; any other named
// entity is left in the text untouched.
var html_entities = { gt: '>', lt: '<', nbsp: ' ', amp: '&' };

// Returns the decoded content-type of a body part, defaulting to text/plain
// when the part has no header or no content-type.
function content_type_of (part) {
    var header = part && part.header;
    return (header && header.get_decoded('content-type')) || 'text/plain';
}

// Decodes decimal numeric entities and the names listed in html_entities in
// a single pass, so already-decoded output is never decoded a second time.
function decode_entities (text) {
    return text.replace(/&(?:#([0-9]+)|(gt|lt|nbsp|amp));/gi, function (match, num, name) {
        if (num !== undefined) {
            return String.fromCharCode(parseInt(num, 10));
        }
        return html_entities[name.toLowerCase()];
    });
}

// Crude HTML-to-text conversion: good enough to feed a language detector,
// not a real HTML parser.
function html_to_text (html) {
    var text = html;
    text = text.replace(/<!DOCTYPE[^>]*>/i, '');
    text = text.replace(/<\?xml:namespace[^>]*>/, '');
    // Line/paragraph breaks become spaces so adjacent words do not merge.
    text = text.replace(/<(?:br|p)\s*\/?>/gi, ' ');
    // Strip tags BEFORE decoding entities: escaped markup in the message is
    // visible text and must not be mistaken for real tags.
    text = text.replace(html_strip, '');
    text = decode_entities(text);
    return text.replace(/^\s+|\s+$/g, '');
}

/**
 * Detects the most likely language of a message body and records it in
 * `tran.notes.body_language`, logging the result through `plugin.loginfo`.
 *
 * Part selection:
 *  - a multipart body recurses into its first child;
 *  - a non-multipart body that has children uses the first child whose
 *    content-type mentions "html", falling back to the first child;
 *  - otherwise the body itself is used.
 * NOTE(review): because multipart bodies always descend into their first
 * child, the HTML preference is never applied to them - confirm whether
 * that is intended.
 *
 * Nothing is recorded when the body, its header or its text is missing, when
 * a multipart body has no children, or when the detector returns no result.
 *
 * @param {Object} plugin - object providing `loginfo(message)`.
 * @param {Object} tran - transaction; `tran.notes.body_language` is written.
 * @param {Object} body - body part with `header.get_decoded()`, `children`
 *                        and `bodytext`.
 */
function check_language (plugin, tran, body) {
    if (!body || !body.header) {
        return;
    }

    var children = body.children || [];
    var ct = content_type_of(body);

    if (/multipart/i.test(ct)) {
        if (!children.length) {
            return;
        }
        return check_language(plugin, tran, children[0]);
    }

    var part = body;
    if (children.length) {
        // probably the preamble - ignore that, lets find the part we want
        // here we prefer HTML parts over text parts.
        var chosen = 0;
        for (var i = 0, l = children.length; i < l; i++) {
            if (/html/i.test(content_type_of(children[i]))) {
                chosen = i;
                break;
            }
        }
        part = children[chosen];
        ct = content_type_of(part);
    }

    if (!part || part.bodytext == null) {
        return;
    }

    var text = String(part.bodytext);
    if (/html/i.test(ct)) {
        text = html_to_text(text);
    }

    var langs = ld.detect(text);
    if (langs && langs.length) {
        tran.notes.body_language = langs[0][0];
        plugin.loginfo("Email is most likely in: " + tran.notes.body_language);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment