// MIME parser used to turn a raw RFC 822 message into a structured object.
const PostalMime = require('postal-mime');

// @Author: @irazasyed - https://github.com/irazasyed
// MIT
|
/**
 * Unsubscribe phrases grouped by language code and then by regional or
 * register variant. Each leaf is an array of plain-text phrases (not regular
 * expressions); consumers are expected to escape them before building a
 * RegExp.
 *
 * Note: the top-level `uk` key is Ukrainian, while `en.uk` holds British
 * English variants.
 */
const unsubscribeKeywords = {
  // English (with regional variations)
  en: {
    general: ['unsubscribe', 'opt-out', 'optout', 'remove', 'stop receiving', 'cancel subscription'],
    uk: ['cease communications', 'stop emails', 'remove from mailing list'],
    au: ['stop subscription', 'cancel communications', 'end subscription'],
    nz: ['stop messages', 'cancel email subscription', 'remove from list'],
    ca: ['cancel emails', 'remove subscription', 'stop mail']
  },

  // Spanish (with regional variations)
  es: {
    general: ['cancelar', 'darse de baja', 'desuscribir', 'desuscribirse', 'anular suscripción'],
    es_mx: ['cancelar suscripción', 'ya no recibir correos', 'eliminar suscripción'],
    es_ar: ['dar de baja', 'cancelar envíos', 'no recibir más correos'],
    es_co: ['cancelar mensajes', 'retirar suscripción', 'eliminar de lista']
  },

  // French (with regional variations)
  fr: {
    general: ['désabonner', 'désabonnement', 'désinscription', 'se désinscrire'],
    fr_ca: ['annuler l\'abonnement', 'arrêter les courriels', 'retirer de la liste'],
    fr_be: ['arrêter l\'inscription', 'ne plus recevoir', 'supprimer l\'abonnement'],
    fr_ch: ['supprimer inscription', 'arrêter les messages', 'retirer abonnement']
  },

  // German (with regional variations)
  de: {
    general: ['abbestellen', 'abmelden', 'austragen', 'abonnement beenden'],
    de_at: ['newsletter abmelden', 'keine e-mails mehr', 'aus der liste entfernen'],
    de_ch: ['mitteilungen stoppen', 'abonnement kündigen', 'kommunikation beenden'],
    de_lu: ['nachrichten abbestellen', 'keine zusendungen mehr', 'aus verteiler entfernen']
  },

  // Chinese (with regional variations)
  zh: {
    simplified: ['退订', '取消订阅', '取消电子报', '停止接收', '退出订阅'],
    traditional: ['退訂', '取消訂閱', '取消電子報', '停止接收', '退出訂閱'],
    hk: ['停止訂閱', '取消電郵', '停止接收郵件'],
    tw: ['取消訂閱電子報', '停止接收電子報', '退出郵件列表']
  },

  // Japanese (with regional variations and common patterns)
  ja: {
    general: ['配信解除', '購読解除', '登録解除', '配信停止', '受信解除'],
    formal: ['配信を停止する', '購読を解除する', 'メールの配信を停止'],
    informal: ['メルマガ解除', '配信とめる', '登録をやめる'],
    keigo: ['配信を御解除', '購読を終了させていただく', '配信停止を承ります']
  },

  // Korean (with formal/informal variations)
  ko: {
    general: ['구독취소', '수신거부', '구독해지', '이메일수신거부'],
    formal: ['구독을 취소하겠습니다', '수신을 거부하겠습니다', '이메일 수신을 중지합니다'],
    informal: ['구독 그만하기', '메일 안받기', '수신 그만하기'],
    honorific: ['구독을 취소하시겠습니까', '수신을 거부하시겠습니까']
  },

  // Additional Asian Languages
  th: {
    general: ['ยกเลิกการสมัคร', 'ยกเลิกรับข้อมูล', 'ยกเลิกรับข่าวสาร'],
    formal: ['ขอยกเลิกการสมัครสมาชิก', 'ยกเลิกการรับจดหมายข่าว'],
    informal: ['ไม่รับเมลล์', 'เลิกรับข่าว', 'ยกเลิกเมลล์']
  },

  vi: {
    general: ['hủy đăng ký', 'ngừng nhận tin', 'hủy theo dõi'],
    formal: ['xin hủy đăng ký', 'ngừng nhận thông tin', 'xin ngừng theo dõi'],
    informal: ['không nhận mail', 'hủy mail', 'ngừng mail']
  },

  // Middle Eastern Languages
  ar: {
    general: ['إلغاء الاشتراك', 'الغاء الاشتراك', 'إيقاف التسجيل'],
    msa: ['وقف تلقي الرسائل', 'إلغاء التسجيل في القائمة'],
    levant: ['الغاء الاشتراك بالقائمة', 'ايقاف استلام الرسائل'],
    gulf: ['إلغاء التسجيل', 'وقف استلام البريد']
  },

  he: {
    general: ['ביטול הרשמה', 'הסרה מרשימת התפוצה', 'ביטול קבלת מיילים'],
    formal: ['אבקש להסיר את כתובתי', 'נא להסיר מרשימת התפוצה'],
    informal: ['להסיר אותי', 'לא לשלוח יותר', 'ביטול המייל']
  },

  // Indian Languages
  hi: {
    general: ['सदस्यता रद्द', 'अनसब्स्क्राइब', 'सब्सक्रिप्शन रद्द'],
    formal: ['कृपया सदस्यता समाप्त करें', 'मेल सूची से हटाएं'],
    informal: ['मेल बंद करो', 'सब्सक्रिप्शन हटाओ']
  },

  bn: {
    general: ['সদস্যতা বাতিল', 'আনসাবস্ক্রাইব', 'সাবস্ক্রিপশন বাতিল'],
    // The second phrase previously mixed Devanagari letters into the Bengali
    // word for "list", so it could never match genuine Bengali text.
    formal: ['সদস্যতা প্রত্যাহার করুন', 'মেল তালিকা থেকে সরান'],
    informal: ['মেল বন্ধ করুন', 'সাবস্ক্রিপশন তুলে নিন']
  },

  // European Languages
  pl: {
    general: ['wypisz się', 'rezygnacja z subskrypcji', 'anuluj subskrypcję'],
    formal: ['proszę o wypisanie', 'rezygnuję z newslettera'],
    informal: ['nie chcę więcej wiadomości', 'wypiszcie mnie']
  },

  uk: {
    general: ['відписатися', 'скасувати підписку', 'припинити розсилку'],
    formal: ['відмовитися від розсилки', 'припинити отримання листів'],
    informal: ['не хочу отримувати', 'вийти з розсилки']
  },

  tr: {
    general: ['aboneliği iptal', 'abonelikten çık', 'üyeliği iptal'],
    formal: ['bülteni durdurun', 'e-posta listesinden çıkarın'],
    informal: ['iptal et', 'listeden çıkar']
  }
};
|
|
|
/**
 * Regular expressions used as language hints, keyed by language code.
 *
 * The first group detects a language by the Unicode block of its script; the
 * second group looks for a handful of very common function words.
 */
const languagePatterns = {
  // Script-specific character ranges
  zh: /[\u4E00-\u9FFF]/,
  ja: /[\u3040-\u30FF\u31F0-\u31FF]/,
  ko: /[\uAC00-\uD7AF\u1100-\u11FF]/,
  ar: /[\u0600-\u06FF]/,
  he: /[\u0590-\u05FF]/,
  hi: /[\u0900-\u097F]/,
  bn: /[\u0980-\u09FF]/,
  th: /[\u0E00-\u0E7F]/,

  // Common function words for European languages
  en: /\b(the|and|or|if)\b/i,
  es: /\b(el|la|los|las|y|o|si)\b/i,
  fr: /\b(le|la|les|et|ou|si)\b/i,
  de: /\b(der|die|das|und|oder|wenn)\b/i,
  pl: /\b(i|lub|czy|ale|gdy)\b/i,
  // `\b` only recognises ASCII word characters in JavaScript, so it can never
  // delimit a Cyrillic word. Unicode-aware lookarounds are used instead.
  uk: /(?<![\p{L}\p{N}_])(і|або|чи|але|коли)(?![\p{L}\p{N}_])/iu,
  tr: /\b(ve|veya|eğer|ama|ne)\b/i
};
|
|
|
/**
 * Regular expressions for locating URLs in free text.
 *
 * The `standard` and `unsubscribe` patterns carry the `g` flag, so they keep
 * `lastIndex` state between `exec`/`test` calls; callers that stop iterating
 * early should reset `lastIndex` or work on a copy.
 */
const urlPatterns = {
  // Any absolute http(s) URL, terminated by whitespace, angle brackets or quotes
  standard: /https?:\/\/[^\s<>"']+/gi,

  // A run of one or more percent-encoded bytes (e.g. "%E2%9C%93").
  // Each byte needs its own "%" prefix, so the "%" is inside the repetition.
  encoded: /(?:%[0-9A-Fa-f]{2})+/,

  // URLs whose text itself suggests an unsubscribe or preference page
  unsubscribe: [
    /https?:\/\/[^\s<>"']*(?:unsubscribe|opt-?out|remove)[^\s<>"']*/gi,
    /https?:\/\/[^\s<>"']*(?:cancel|stop)[^\s<>"']*subscription[^\s<>"']*/gi,
    /https?:\/\/[^\s<>"']*preferences[^\s<>"']*/gi
  ]
};
|
|
|
/**
 * Regular expressions for pulling link targets and footer-like regions out of
 * raw HTML. For every pattern under `links`, capture group 1 is the URL.
 *
 * Element bodies are matched with `[\s\S]*?` rather than `.*?` because `.`
 * does not match line breaks, and markup in real messages routinely spans
 * several lines.
 *
 * All patterns carry the `g` flag and therefore keep `lastIndex` state between
 * `exec`/`test` calls.
 */
const htmlPatterns = {
  // Link elements
  links: {
    anchor: /<a[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>[\s\S]*?<\/a>/gi,
    // The handler text before the URL may contain the *other* quote character
    // (e.g. onclick="open('https://…')"), so it is skipped with [^>]*? instead
    // of a quote-free run.
    button: /<button[^>]*onclick=["'][^>]*?(https?:\/\/[^"']+)[^>]*>[\s\S]*?<\/button>/gi,
    form: /<form[^>]*action=["'](https?:\/\/[^"']+)["'][^>]*>[\s\S]*?<\/form>/gi
  },

  // Common unsubscribe elements
  unsubscribe: {
    footer: /<(?:div|footer)[^>]*class=["'][^"']*(?:footer|bottom)[^"']*["'][^>]*>[\s\S]*?<\/(?:div|footer)>/gi,
    preferences: /<div[^>]*class=["'][^"']*(?:preferences|settings)[^"']*["'][^>]*>[\s\S]*?<\/div>/gi
  }
};
|
|
|
/**
 * Guesses the language of an e-mail by scoring its content against
 * per-language regular expressions and unsubscribe keyword lists.
 *
 * Scoring: every pattern hit is worth 2 points and every keyword hit is worth
 * 3 points for the language it belongs to.
 */
class EmailLanguageDetector {
  /**
   * @param {object} [options]
   * @param {Object<string, RegExp>} [options.patterns] Language code -> hint
   *   pattern. Defaults to the module-level `languagePatterns`.
   * @param {Object<string, Object<string, string[]>>} [options.keywords]
   *   Language code -> variant -> phrases. Defaults to the module-level
   *   `unsubscribeKeywords`.
   */
  constructor(options = {}) {
    const { patterns = languagePatterns, keywords = unsubscribeKeywords } = options || {};
    this.patterns = patterns;
    this.keywords = keywords;
  }

  /**
   * Escapes regex metacharacters so a phrase is matched literally.
   * @param {string} text
   * @returns {string}
   */
  static escapeRegExp(text) {
    return String(text).replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
  }

  /**
   * Counts every occurrence of `pattern` in `text`.
   *
   * A global copy of the pattern is used: with a non-global regex,
   * `String.prototype.match` returns the first match plus its capture groups,
   * so its `length` says nothing about the number of occurrences.
   *
   * @param {string} text
   * @param {RegExp} pattern
   * @returns {number}
   */
  static countMatches(text, pattern) {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const matches = text.match(new RegExp(pattern.source, flags));
    return matches ? matches.length : 0;
  }

  /**
   * Computes the raw score of every known language for `content`.
   * @param {string} content
   * @returns {Object<string, number>} Language code -> score (0 when nothing matched).
   */
  scoreLanguages(content) {
    const text = content == null ? '' : String(content);
    const scores = {};
    const bump = (lang, amount) => {
      scores[lang] = (scores[lang] || 0) + amount;
    };

    // Script / function-word hints
    for (const [lang, pattern] of Object.entries(this.patterns)) {
      bump(lang, EmailLanguageDetector.countMatches(text, pattern) * 2);
    }

    // Unsubscribe keyword hits (phrases are plain text, hence the escaping)
    for (const [lang, variants] of Object.entries(this.keywords)) {
      for (const keyword of Object.values(variants).flat()) {
        const regex = new RegExp(EmailLanguageDetector.escapeRegExp(keyword), 'gi');
        bump(lang, EmailLanguageDetector.countMatches(text, regex) * 3);
      }
    }

    return scores;
  }

  /**
   * Returns the best-scoring language code for `content`.
   *
   * Ties go to the language listed first in the pattern/keyword tables. When
   * nothing matches at all, `'en'` is returned.
   *
   * @param {string} content
   * @returns {string}
   */
  detectLanguage(content) {
    let bestLang = 'en';
    let bestScore = 0;
    for (const [lang, score] of Object.entries(this.scoreLanguages(content))) {
      if (score > bestScore) {
        bestLang = lang;
        bestScore = score;
      }
    }
    return bestLang;
  }

  /**
   * Share of the total score that belongs to `detectedLang`.
   *
   * @param {string} content Same content that was passed to `detectLanguage`.
   * @param {string} detectedLang Language code to rate.
   * @returns {number} A value in [0, 1]; 0 when nothing matched or the
   *   language is unknown.
   */
  getConfidence(content, detectedLang) {
    const scores = this.scoreLanguages(content);
    const totalScore = Object.values(scores).reduce((sum, score) => sum + score, 0);
    if (totalScore === 0) return 0;
    return (scores[detectedLang] || 0) / totalScore;
  }
}
|
|
|
/**
 * Finds candidate unsubscribe links in the HTML and plain-text bodies of an
 * e-mail, using URL/HTML patterns plus language-specific keywords.
 */
class UnsubscribeLinkExtractor {
  /**
   * @param {object} [options]
   * @param {object} [options.urlPatterns] Must expose `unsubscribe: RegExp[]`.
   *   Defaults to the module-level `urlPatterns`.
   * @param {object} [options.htmlPatterns] Must expose `links: {name: RegExp}`
   *   where capture group 1 is the URL. Defaults to the module-level
   *   `htmlPatterns`.
   * @param {object} [options.keywords] Language -> variant -> phrases.
   *   Defaults to the module-level `unsubscribeKeywords`.
   * @param {number} [options.contextWindow=500] How many characters around a
   *   keyword are searched for a URL (HTML and text).
   */
  constructor(options = {}) {
    const {
      urlPatterns: urlPatternSet = urlPatterns,
      htmlPatterns: htmlPatternSet = htmlPatterns,
      keywords = unsubscribeKeywords,
      contextWindow = 500
    } = options || {};

    this.urlPatterns = urlPatternSet;
    this.htmlPatterns = htmlPatternSet;
    this.keywords = keywords;
    this.contextWindow =
      Number.isInteger(contextWindow) && contextWindow >= 0 ? contextWindow : 500;
  }

  /**
   * Returns every match of `pattern` in `text`.
   *
   * Works on a private global copy of the regex, so shared `g`-flagged
   * patterns never leak `lastIndex` state between calls and a non-global
   * pattern cannot cause an endless loop.
   *
   * @param {RegExp} pattern
   * @param {string} text
   * @returns {RegExpExecArray[]}
   */
  static findAll(pattern, text) {
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const regex = new RegExp(pattern.source, flags);
    const found = [];
    let match;
    while ((match = regex.exec(text)) !== null) {
      found.push(match);
      // Guard against zero-length matches, which would never advance.
      if (match[0] === '') regex.lastIndex += 1;
    }
    return found;
  }

  /**
   * Extracts links from an HTML body.
   *
   * Two passes are made: (1) the target of every link-like element matched by
   * `htmlPatterns.links`, and (2) unsubscribe-looking URLs found near any
   * element whose text contains a keyword of `language`.
   *
   * NOTE(review): pass (1) returns the target of *every* matched element, not
   * only unsubscribe ones; this mirrors the previous behaviour — confirm that
   * callers want the full list.
   *
   * @param {string} html
   * @param {string} language Language code used to pick keywords.
   * @returns {string[]} Unique, normalised URLs (never `null`).
   */
  extractFromHtml(html, language) {
    if (typeof html !== 'string' || html === '') return [];
    const links = new Set();

    // Pass 1: targets of anchors, buttons and forms
    for (const pattern of Object.values(this.htmlPatterns.links)) {
      for (const match of UnsubscribeLinkExtractor.findAll(pattern, html)) {
        const url = this.cleanUrl(match[1]);
        if (url) links.add(url);
      }
    }

    // Pass 2: URLs close to an element whose text contains a keyword
    for (const keyword of this.getKeywordsForLanguage(language)) {
      // The keyword is mandatory; an optional keyword would match any pair of
      // adjacent tags.
      const elementPattern = new RegExp(`<[^>]*>[^<]*${keyword}[^<]*</[^>]*>`, 'gi');

      for (const match of UnsubscribeLinkExtractor.findAll(elementPattern, html)) {
        const start = Math.max(0, match.index - this.contextWindow);
        let end = Math.min(html.length, match.index + match[0].length + this.contextWindow);
        // Do not cut a URL in half at the right edge of the window.
        while (end < html.length && !/[\s<>"']/.test(html[end])) end += 1;

        this.extractUrlsFromContext(html.slice(start, end), links);
      }
    }

    return [...links];
  }

  /**
   * Extracts links from a plain-text body.
   *
   * A URL is reported when it (a) contains a keyword itself, (b) is the first
   * URL within `contextWindow` characters after a keyword, or (c) is followed
   * by a keyword within `contextWindow` characters.
   *
   * @param {string} text
   * @param {string} language Language code used to pick keywords.
   * @returns {string[]} Unique, normalised URLs (never `null`).
   */
  extractFromText(text, language) {
    if (typeof text !== 'string' || text === '') return [];
    const links = new Set();
    const gap = `[\\s\\S]{0,${this.contextWindow}}?`;

    for (const keyword of this.getKeywordsForLanguage(language)) {
      const pattern = new RegExp(
        [
          // (a) the URL token itself contains the keyword
          `(https?://\\S*${keyword}\\S*)`,
          // (b) keyword, then the next URL nearby
          `${keyword}${gap}(https?://\\S+)`,
          // (c) URL, then the keyword nearby; (?!\S) stops the engine from
          //     backtracking into the URL and reporting a truncated prefix
          `(https?://\\S+)(?!\\S)${gap}${keyword}`
        ].join('|'),
        'gi'
      );

      for (const match of UnsubscribeLinkExtractor.findAll(pattern, text)) {
        const url = this.cleanUrl(match[1] || match[2] || match[3]);
        if (url) links.add(url);
      }
    }

    return [...links];
  }

  /**
   * Returns the regex-escaped keywords of every variant of `language`.
   * @param {string} language
   * @returns {string[]} Empty when the language is unknown.
   */
  getKeywordsForLanguage(language) {
    const table = this.keywords || {};
    if (!Object.prototype.hasOwnProperty.call(table, language)) return [];

    const langKeywords = table[language];
    if (!langKeywords) return [];

    return Object.values(langKeywords)
      .flat()
      .map((keyword) => keyword.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'));
  }

  /**
   * Normalises a raw URL candidate.
   *
   * Strips trailing punctuation, turns the HTML entity `&amp;` back into `&`,
   * undoes repeated percent-encoding and validates the result with `URL`.
   *
   * NOTE(review): fully percent-decoding a URL also decodes reserved
   * characters (`%26`, `%2B`, ...) inside query values, which can change the
   * meaning of tokens. This keeps the previous behaviour; confirm it is wanted.
   *
   * @param {string} url
   * @returns {string|null} The normalised URL, or `null` when it is not a
   *   valid absolute URL.
   */
  cleanUrl(url) {
    if (typeof url !== 'string') return null;

    let cleaned = url
      .trim()
      .replace(/[.,;:)\]}"'>]+$/, '')
      .replace(/&amp;/gi, '&');

    // Handle multiple levels of encoding. Each successful pass shortens the
    // string, so the loop terminates.
    for (;;) {
      let decoded;
      try {
        decoded = decodeURIComponent(cleaned);
      } catch {
        // A malformed escape (e.g. a lone "%") is not a reason to throw the
        // whole URL away; keep what has been decoded so far.
        break;
      }
      if (decoded === cleaned) break;
      cleaned = decoded;
    }

    // Validate and normalize URL
    try {
      return new URL(cleaned).toString();
    } catch {
      return null;
    }
  }

  /**
   * Adds every unsubscribe-looking URL found in `context` to `links`.
   * @param {string} context Text fragment to scan.
   * @param {Set<string>} links Collector that is mutated in place.
   * @returns {void}
   */
  extractUrlsFromContext(context, links) {
    for (const pattern of this.urlPatterns.unsubscribe) {
      for (const match of UnsubscribeLinkExtractor.findAll(pattern, context)) {
        const url = this.cleanUrl(match[0]);
        if (url) links.add(url);
      }
    }
  }
}
|
|
|
/**
 * Parses a raw e-mail and reports its unsubscribe links (from the
 * `List-Unsubscribe` header and from the body), the detected language and
 * some message metadata.
 */
class EmailUnsubscribeParser {
  /**
   * @param {object} [options]
   * @param {object} [options.languageDetector] Object exposing
   *   `detectLanguage(content)` and `getConfidence(content, lang)`. Defaults
   *   to a new `EmailLanguageDetector`.
   * @param {object} [options.linkExtractor] Object exposing `cleanUrl`,
   *   `extractFromHtml` and `extractFromText`. Defaults to a new
   *   `UnsubscribeLinkExtractor`.
   */
  constructor(options = {}) {
    const { languageDetector, linkExtractor } = options || {};
    this.languageDetector = languageDetector || new EmailLanguageDetector();
    this.linkExtractor = linkExtractor || new UnsubscribeLinkExtractor();
  }

  /**
   * Parses raw e-mail content with PostalMime and processes the result.
   *
   * @param {*} emailContent Raw message, in any form PostalMime accepts.
   * @param {object} [options]
   * @param {string} [options.language] Skip detection and use this language code.
   * @returns {Promise<object>} See `processEmail`.
   * @throws Re-throws any parsing or processing error after logging it.
   */
  async parse(emailContent, options = {}) {
    const parser = new PostalMime();
    try {
      const email = await parser.parse(emailContent);
      // `await` keeps processing failures inside this try/catch.
      return await this.processEmail(email, options);
    } catch (error) {
      console.error('Error parsing email:', error);
      throw error;
    }
  }

  /**
   * Builds the result object for an already parsed e-mail.
   *
   * @param {object} email Parsed message (`subject`, `text`, `html`, `headers`, ...).
   * @param {object} [options]
   * @param {string} [options.language] Skip detection and use this language code.
   * @returns {Promise<{headerLinks: object[], bodyLinks: string[],
   *   language: {detected: string, confidence: number}, metadata: object}>}
   */
  async processEmail(email, options = {}) {
    // Detect language
    const content = this.getEmailContent(email);
    const detectedLanguage =
      (options && options.language) || this.languageDetector.detectLanguage(content);

    const result = {
      headerLinks: this.extractHeaderLinks(email),
      bodyLinks: [],
      language: {
        detected: detectedLanguage,
        confidence: this.languageDetector.getConfidence(content, detectedLanguage)
      },
      metadata: this.extractMetadata(email)
    };

    const bodyLinks = [];

    // Extract links from HTML content
    if (email.html) {
      bodyLinks.push(...this.linkExtractor.extractFromHtml(email.html, detectedLanguage));
    }

    // Extract links from text content
    if (email.text) {
      bodyLinks.push(...this.linkExtractor.extractFromText(email.text, detectedLanguage));
    }

    // Remove duplicates and invalid links
    result.bodyLinks = [...new Set(bodyLinks)].filter(Boolean);

    return result;
  }

  /**
   * Concatenates subject, text body and HTML body for language detection.
   * @param {object} email
   * @returns {string}
   */
  getEmailContent(email) {
    return [email.subject || '', email.text || '', email.html || ''].join('\n');
  }

  /**
   * Normalises a header collection to `[name, value]` pairs.
   *
   * Accepts both an array of `{ key, value }` records and a plain
   * `{ name: value }` object.
   *
   * @param {Array<{key: string, value: *}>|object|null|undefined} headers
   * @returns {Array<[string, *]>}
   */
  headerEntries(headers) {
    if (!headers) return [];
    if (Array.isArray(headers)) {
      return headers
        .filter((header) => header && typeof header.key === 'string')
        .map((header) => [header.key, header.value]);
    }
    return Object.entries(headers);
  }

  /**
   * Extracts the targets listed in the `List-Unsubscribe` header(s).
   *
   * Every `<...>` entry is examined on its own, so commas inside a URL do not
   * split it. Whitespace inside the brackets is ignored.
   *
   * @param {object} email
   * @returns {Array<{type: 'mailto'|'http', value: string}>}
   */
  extractHeaderLinks(email) {
    const links = [];

    const headerValues = this.headerEntries(email.headers)
      .filter(([key]) => key.toLowerCase() === 'list-unsubscribe')
      .map(([, value]) => String(value));

    for (const headerValue of headerValues) {
      const entryPattern = /<([^<>]+)>/g;
      let entry;
      while ((entry = entryPattern.exec(headerValue)) !== null) {
        const target = entry[1].replace(/\s+/g, '');

        if (/^mailto:/i.test(target)) {
          // Keep everything after the scheme (address plus optional ?subject=...)
          links.push({ type: 'mailto', value: target.slice('mailto:'.length) });
        } else if (/^https?:\/\//i.test(target)) {
          const url = this.linkExtractor.cleanUrl(target);
          if (url) links.push({ type: 'http', value: url });
        }
      }
    }

    return links;
  }

  /**
   * Collects descriptive fields of the message.
   * @param {object} email
   * @returns {object}
   */
  extractMetadata(email) {
    return {
      subject: email.subject,
      from: email.from,
      to: email.to,
      date: email.date,
      hasHtml: Boolean(email.html),
      hasText: Boolean(email.text),
      headers: this.extractRelevantHeaders(email.headers)
    };
  }

  /**
   * Picks the list-management related headers.
   *
   * @param {Array<{key: string, value: *}>|object|null|undefined} headers
   * @returns {object} Header name -> value; when a header occurs more than
   *   once the last occurrence wins.
   */
  extractRelevantHeaders(headers) {
    const relevantHeaders = [
      'list-unsubscribe',
      'list-unsubscribe-post',
      'list-id',
      'precedence',
      'x-mailer',
      'x-unsubscribe-web'
    ];

    return Object.fromEntries(
      this.headerEntries(headers).filter(([key]) => relevantHeaders.includes(key.toLowerCase()))
    );
  }
}
|
|
|
// Utility functions for module usage

/**
 * Creates a new parser instance.
 * @param {object} [options] Passed through to the `EmailUnsubscribeParser` constructor.
 * @returns {EmailUnsubscribeParser}
 */
function createParser(options = {}) {
  return new EmailUnsubscribeParser(options);
}
|
|
|
/**
 * One-shot helper: creates a parser and parses `emailContent` with it.
 *
 * The same `options` object is handed to both the parser factory and the
 * `parse` call, so per-call settings such as `options.language` take effect.
 *
 * @param {*} emailContent Raw e-mail content.
 * @param {object} [options]
 * @returns {Promise<object>} The result of `parser.parse`.
 */
async function parseEmail(emailContent, options = {}) {
  const parser = createParser(options);
  return parser.parse(emailContent, options);
}
|
|
|
// Export the module |
|
module.exports = { |
|
createParser, |
|
parseEmail, |
|
EmailUnsubscribeParser, |
|
EmailLanguageDetector, |
|
UnsubscribeLinkExtractor, |
|
// Constants |
|
unsubscribeKeywords, |
|
languagePatterns, |
|
urlPatterns, |
|
htmlPatterns |
|
}; |