Skip to content

Instantly share code, notes, and snippets.

@baudehlo
Created October 20, 2011 23:40
Show Gist options
  • Save baudehlo/1302718 to your computer and use it in GitHub Desktop.
Save baudehlo/1302718 to your computer and use it in GitHub Desktop.
var ld = new (require('languagedetect'));
exports.hook_data = function (next, connection) {
// enable mail body parsing
connection.transaction.parse_body = true;
next();
}
exports.hook_data_post = function (next, connection) {
check_language(this, connection.transaction, connection.transaction.body);
next();
}
// Matches one opening, closing or self-closing tag so check_language() can
// delete it from HTML text:
//   "<", optional "/", a tag name made of word characters and/or colons,
//   then either one or more attributes (whitespace, a word-character name,
//   and optionally "=" followed by a double-quoted, single-quoted or unquoted
//   value) or just optional whitespace, then an optional "/" and ">".
// Not matched: anything starting with "<!" (doctype, comments) and tags whose
// attribute names contain non-word characters such as "-".
// The `g` flag makes String.prototype.replace remove every occurrence.
var html_strip = /<\/?[\w:]+((\s+\w+(\s*=\s*(?:"[\s\S]*?"|'[\s\S]*?'|[^'">\s]+))?)+\s*|\s*)\/?>/g;
/**
 * Turn a Unicode code point into a string, emitting a UTF-16 surrogate pair
 * for code points above U+FFFF (String.fromCharCode alone would truncate them).
 *
 * @param {number} code_point - integer in the range 0..0x10FFFF.
 * @returns {string} the corresponding character(s).
 */
function code_point_to_string (code_point) {
    if (code_point <= 0xFFFF) {
        return String.fromCharCode(code_point);
    }
    var offset = code_point - 0x10000;
    return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
}

/**
 * Decode the handful of HTML entities this plugin cares about (&gt; &lt;
 * &nbsp; &amp;, plus decimal and hexadecimal numeric references).
 *
 * Everything is decoded in a single pass so the output of one replacement is
 * never decoded a second time (e.g. "&amp;#65;" becomes "&#65;", not "A").
 *
 * @param {string} text - text possibly containing entities.
 * @returns {string} text with the recognised entities replaced.
 */
function decode_html_entities (text) {
    var named = { gt: '>', lt: '<', nbsp: ' ', amp: '&' };
    var entity = /&(?:#x([0-9a-f]+)|#([0-9]+)|(gt|lt|nbsp|amp));/gi;
    return text.replace(entity, function (match, hex, dec, name) {
        if (name) {
            return named[name.toLowerCase()];
        }
        var code_point = hex ? parseInt(hex, 16) : parseInt(dec, 10);
        // Not a valid Unicode code point: leave the reference untouched.
        if (code_point > 0x10FFFF) {
            return match;
        }
        return code_point_to_string(code_point);
    });
}

/**
 * Crude HTML-to-text conversion, good enough to feed a language detector.
 *
 * @param {string} html - raw HTML markup.
 * @returns {string} markup-free, entity-decoded, trimmed text.
 */
function html_to_text (html) {
    var text = html;
    text = text.replace(/<!DOCTYPE[^>]*>/gi, '');
    text = text.replace(/<\?xml:namespace[^>]*>/gi, '');
    // Tags that separate words become a space; otherwise "a<br />b" would be
    // glued into "ab" once the tag is stripped below.
    text = text.replace(/<\/?(?:br|p|div|li|tr|td|th|h[1-6])\b[^>]*>/gi, ' ');
    text = text.replace(html_strip, '');
    // Decode entities only after the tags are gone, so escaped text such as
    // "&lt;b&gt;" is kept as literal "<b>" instead of being stripped as a tag.
    text = decode_html_entities(text);
    return text.trim();
}

/**
 * Detect the most likely language of a message body part and record it.
 *
 * On success the language name is stored in `tran.notes.body_language` and
 * logged through `plugin.loginfo`. Nothing is stored when the body is missing,
 * has no text, or the detector returns no candidates.
 *
 * @param {Object} plugin - object providing `loginfo(message)`.
 * @param {Object} tran - transaction; `tran.notes` receives the result.
 * @param {Object} body - body part with `header.get_decoded()`, `bodytext`
 *   and `children` (presumably the mail parser's body object - confirm).
 * @returns {undefined}
 */
function check_language (plugin, tran, body) {
    // No body, or a part without headers: nothing to inspect.
    if (!body || !body.header) {
        return;
    }

    var children = body.children || [];
    var ct = body.header.get_decoded('content-type') || 'text/plain';

    if (/multipart/i.test(ct)) {
        // NOTE(review): only the first child of a multipart part is examined,
        // as in the original code - confirm this matches the parser's layout.
        if (!children.length) {
            return;
        }
        return check_language(plugin, tran, children[0]);
    }

    var target = body;
    if (children.length) {
        // This part is probably the preamble - ignore it and pick a child,
        // preferring the first HTML part over text parts.
        var part_we_want = 0;
        for (var i = 0, l = children.length; i < l; i++) {
            var child_ct = children[i].header.get_decoded('content-type') || 'text/plain';
            if (/html/i.test(child_ct)) {
                part_we_want = i;
                break;
            }
        }
        target = children[part_we_want];
        ct = target.header.get_decoded('content-type') || 'text/plain';
    }

    var text = target.bodytext;
    if (typeof text !== 'string') {
        return;
    }
    if (/html/i.test(ct)) {
        text = html_to_text(text);
    }

    var langs = ld.detect(text);
    if (langs && langs.length) {
        tran.notes.body_language = langs[0][0];
        plugin.loginfo("Email is most likely in: " + tran.notes.body_language);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment