Skip to content

Instantly share code, notes, and snippets.

@irazasyed
Created November 17, 2024 20:21
Show Gist options
  • Save irazasyed/8c6fd88975e1fc5b0cf303003b1b65d8 to your computer and use it in GitHub Desktop.
Save irazasyed/8c6fd88975e1fc5b0cf303003b1b65d8 to your computer and use it in GitHub Desktop.
Smart Email Unsubscribe Link Parser with Multilingual support

Email Unsubscribe Parser with Multilingual support

By @irazasyed

A smart, multilingual email unsubscribe link parser. It can detect the email's language automatically, or you can provide one. It lets you find the unsubscribe link, which you can later trigger or otherwise process for any purpose.

E.g., to automatically unsubscribe from newsletters as part of a mailbox cleanup.

const { parseEmail } = require('./email-unsubscribe-parser');

/**
 * Usage example: parses one raw email and prints what was found.
 *
 * The calls are wrapped in an async function because `await` is not allowed
 * at the top level of a CommonJS script, and each result gets its own name
 * so the two examples can live in the same scope.
 *
 * @param {*} emailContent - Raw email source handed straight to parseEmail.
 * @returns {Promise<void>}
 */
async function main(emailContent) {
  // Simple usage
  const basicResult = await parseEmail(emailContent);
  console.log('Body Links (defaults):', basicResult.bodyLinks);

  // Advanced usage with options
  const result = await parseEmail(emailContent, {
    // Optional configuration
    language: 'auto', // or specific language code, eg. 'ja'
    includeMetadata: true,
    strictMode: false, // relaxed URL validation
    maxLinks: 10 // limit number of extracted links
  });

  console.log('Parsing Results:');
  // `language` is an object of the form { detected, confidence }.
  console.log('Detected Language:', result.language.detected);
  console.log('Header Links:', result.headerLinks);
  console.log('Body Links:', result.bodyLinks);
  console.log('Metadata:', result.metadata);
}
const PostalMime = require('postal-mime');
// @Author: @irazasyed - https://github.com/irazasyed
// MIT
/**
 * Unsubscribe phrases grouped by language code, then by variant
 * (regional or register). Every value is an array of lower-case phrases.
 * Consumers in this file flatten all variants of a language into one list
 * and match them case-insensitively.
 */
const unsubscribeKeywords = {
  // English (with regional variations)
  en: {
    general: ['unsubscribe', 'opt-out', 'optout', 'remove', 'stop receiving', 'cancel subscription'],
    uk: ['cease communications', 'stop emails', 'remove from mailing list'],
    au: ['stop subscription', 'cancel communications', 'end subscription'],
    nz: ['stop messages', 'cancel email subscription', 'remove from list'],
    ca: ['cancel emails', 'remove subscription', 'stop mail']
  },
  // Spanish (with regional variations)
  es: {
    general: ['cancelar', 'darse de baja', 'desuscribir', 'desuscribirse', 'anular suscripción'],
    es_mx: ['cancelar suscripción', 'ya no recibir correos', 'eliminar suscripción'],
    es_ar: ['dar de baja', 'cancelar envíos', 'no recibir más correos'],
    es_co: ['cancelar mensajes', 'retirar suscripción', 'eliminar de lista']
  },
  // French (with regional variations)
  fr: {
    general: ['désabonner', 'désabonnement', 'désinscription', 'se désinscrire'],
    fr_ca: ['annuler l\'abonnement', 'arrêter les courriels', 'retirer de la liste'],
    fr_be: ['arrêter l\'inscription', 'ne plus recevoir', 'supprimer l\'abonnement'],
    fr_ch: ['supprimer inscription', 'arrêter les messages', 'retirer abonnement']
  },
  // German (with regional variations)
  de: {
    general: ['abbestellen', 'abmelden', 'austragen', 'abonnement beenden'],
    de_at: ['newsletter abmelden', 'keine e-mails mehr', 'aus der liste entfernen'],
    de_ch: ['mitteilungen stoppen', 'abonnement kündigen', 'kommunikation beenden'],
    de_lu: ['nachrichten abbestellen', 'keine zusendungen mehr', 'aus verteiler entfernen']
  },
  // Chinese (with regional variations)
  zh: {
    simplified: ['退订', '取消订阅', '取消电子报', '停止接收', '退出订阅'],
    traditional: ['退訂', '取消訂閱', '取消電子報', '停止接收', '退出訂閱'],
    hk: ['停止訂閱', '取消電郵', '停止接收郵件'],
    tw: ['取消訂閱電子報', '停止接收電子報', '退出郵件列表']
  },
  // Japanese (with regional variations and common patterns)
  ja: {
    general: ['配信解除', '購読解除', '登録解除', '配信停止', '受信解除'],
    formal: ['配信を停止する', '購読を解除する', 'メールの配信を停止'],
    informal: ['メルマガ解除', '配信とめる', '登録をやめる'],
    keigo: ['配信を御解除', '購読を終了させていただく', '配信停止を承ります']
  },
  // Korean (with formal/informal variations)
  ko: {
    general: ['구독취소', '수신거부', '구독해지', '이메일수신거부'],
    formal: ['구독을 취소하겠습니다', '수신을 거부하겠습니다', '이메일 수신을 중지합니다'],
    informal: ['구독 그만하기', '메일 안받기', '수신 그만하기'],
    honorific: ['구독을 취소하시겠습니까', '수신을 거부하시겠습니까']
  },
  // Additional Asian Languages
  th: {
    general: ['ยกเลิกการสมัคร', 'ยกเลิกรับข้อมูล', 'ยกเลิกรับข่าวสาร'],
    formal: ['ขอยกเลิกการสมัครสมาชิก', 'ยกเลิกการรับจดหมายข่าว'],
    informal: ['ไม่รับเมลล์', 'เลิกรับข่าว', 'ยกเลิกเมลล์']
  },
  vi: {
    general: ['hủy đăng ký', 'ngừng nhận tin', 'hủy theo dõi'],
    formal: ['xin hủy đăng ký', 'ngừng nhận thông tin', 'xin ngừng theo dõi'],
    informal: ['không nhận mail', 'hủy mail', 'ngừng mail']
  },
  // Middle Eastern Languages
  ar: {
    general: ['إلغاء الاشتراك', 'الغاء الاشتراك', 'إيقاف التسجيل'],
    msa: ['وقف تلقي الرسائل', 'إلغاء التسجيل في القائمة'],
    levant: ['الغاء الاشتراك بالقائمة', 'ايقاف استلام الرسائل'],
    gulf: ['إلغاء التسجيل', 'وقف استلام البريد']
  },
  he: {
    general: ['ביטול הרשמה', 'הסרה מרשימת התפוצה', 'ביטול קבלת מיילים'],
    formal: ['אבקש להסיר את כתובתי', 'נא להסיר מרשימת התפוצה'],
    informal: ['להסיר אותי', 'לא לשלוח יותר', 'ביטול המייל']
  },
  // Indian Languages
  hi: {
    general: ['सदस्यता रद्द', 'अनसब्स्क्राइब', 'सब्सक्रिप्शन रद्द'],
    formal: ['कृपया सदस्यता समाप्त करें', 'मेल सूची से हटाएं'],
    informal: ['मेल बंद करो', 'सब्सक्रिप्शन हटाओ']
  },
  bn: {
    general: ['সদস্যতা বাতিল', 'আনসাবস্ক্রাইব', 'সাবস্ক্রিপশন বাতিল'],
    // The second phrase is written entirely in Bengali script; a stray
    // Devanagari syllable here would keep it from ever matching Bengali text.
    formal: ['সদস্যতা প্রত্যাহার করুন', 'মেল তালিকা থেকে সরান'],
    informal: ['মেল বন্ধ করুন', 'সাবস্ক্রিপশন তুলে নিন']
  },
  // European Languages
  pl: {
    general: ['wypisz się', 'rezygnacja z subskrypcji', 'anuluj subskrypcję'],
    formal: ['proszę o wypisanie', 'rezygnuję z newslettera'],
    informal: ['nie chcę więcej wiadomości', 'wypiszcie mnie']
  },
  uk: {
    general: ['відписатися', 'скасувати підписку', 'припинити розсилку'],
    formal: ['відмовитися від розсилки', 'припинити отримання листів'],
    informal: ['не хочу отримувати', 'вийти з розсилки']
  },
  tr: {
    general: ['aboneliği iptal', 'abonelikten çık', 'üyeliği iptal'],
    formal: ['bülteni durdurun', 'e-posta listesinden çıkarın'],
    informal: ['iptal et', 'listeden çıkar']
  }
};
// Language detection patterns for better accuracy
/**
 * One detection regex per language code.
 *
 * Non-Latin scripts are recognised by Unicode block; Latin-script and
 * Cyrillic languages are recognised by a handful of very common short words.
 * Every word-list pattern keeps a single capturing group so all of them have
 * the same shape.
 */
const languagePatterns = {
  // Common language-specific characters and patterns
  zh: /[\u4E00-\u9FFF]/,
  ja: /[\u3040-\u30FF\u31F0-\u31FF]/,
  ko: /[\uAC00-\uD7AF\u1100-\u11FF]/,
  ar: /[\u0600-\u06FF]/,
  he: /[\u0590-\u05FF]/,
  hi: /[\u0900-\u097F]/,
  bn: /[\u0980-\u09FF]/,
  th: /[\u0E00-\u0E7F]/,
  // Common words and patterns for European languages
  en: /\b(the|and|or|if)\b/i,
  es: /\b(el|la|los|las|y|o|si)\b/i,
  fr: /\b(le|la|les|et|ou|si)\b/i,
  de: /\b(der|die|das|und|oder|wenn)\b/i,
  pl: /\b(i|lub|czy|ale|gdy)\b/i,
  // `\b` only knows ASCII word characters, so it can never sit next to a
  // Cyrillic letter. Unicode-aware lookarounds provide the word boundary.
  uk: /(?<![\p{L}\p{N}_])(і|або|чи|але|коли)(?![\p{L}\p{N}_])/iu,
  tr: /\b(ve|veya|eğer|ama|ne)\b/i
};
// Enhanced URL pattern matching
// URL-matching regexes shared by the extractor class in this file.
// Every pattern carrying the `g` flag is stateful: exec()/test() advance
// `lastIndex`, so a loop over one of them must run until it returns null
// (or reset `lastIndex`) before the same regex object is reused.
const urlPatterns = {
// Any http:// or https:// URL: the scheme followed by a run of characters
// that are not whitespace, angle brackets, or quotes.
standard: /https?:\/\/[^\s<>"']+/gi,
// One or more consecutive percent-encoded bytes (%XX).
encoded: /%(?:[0-9A-Fa-f]{2})+/,
// URLs whose text suggests an unsubscribe/preferences endpoint:
//   1. contains "unsubscribe", "optout"/"opt-out", or "remove"
//   2. contains "cancel" or "stop" followed later by "subscription"
//   3. contains "preferences"
unsubscribe: [
/https?:\/\/[^\s<>"']*(?:unsubscribe|opt-?out|remove)[^\s<>"']*/gi,
/https?:\/\/[^\s<>"']*(?:cancel|stop)[^\s<>"']*subscription[^\s<>"']*/gi,
/https?:\/\/[^\s<>"']*preferences[^\s<>"']*/gi
]
};
// HTML Element patterns for better link extraction
/**
 * Regexes for pulling link-bearing elements out of an HTML email body.
 *
 * Element bodies are matched with `[\s\S]*?` rather than `.*?`, because `.`
 * does not match line breaks and real-world markup routinely spans lines.
 * All patterns are global, so callers must loop exec() to completion (or
 * reset `lastIndex`) before reusing them.
 */
const htmlPatterns = {
  // Link elements; capture group 1 is always the http(s) URL.
  links: {
    // `\b` keeps this from matching other tags that merely start with "a"
    // (<abbr>, <area>, <article>, ...).
    anchor: /<a\b[^>]*href=["'](https?:\/\/[^"']+)["'][^>]*>[\s\S]*?<\/a>/gi,
    // The URL inside an onclick handler is normally wrapped in its own
    // quotes (location.href='...'), so the text before it may contain quotes.
    button: /<button\b[^>]*onclick=["'][^>]*?(https?:\/\/[^"']+)[^>]*>[\s\S]*?<\/button>/gi,
    form: /<form\b[^>]*action=["'](https?:\/\/[^"']+)["'][^>]*>[\s\S]*?<\/form>/gi
  },
  // Common unsubscribe elements
  unsubscribe: {
    // <div>/<footer> whose class contains "footer" or "bottom". The match
    // ends at the first closing tag, so nested containers are cut short.
    footer: /<(?:div|footer)[^>]*class=["'][^"']*(?:footer|bottom)[^"']*["'][^>]*>[\s\S]*?<\/(?:div|footer)>/gi,
    // <div> whose class contains "preferences" or "settings".
    preferences: /<div[^>]*class=["'][^"']*(?:preferences|settings)[^"']*["'][^>]*>[\s\S]*?<\/div>/gi
  }
};
/**
 * Guesses the language of an email by scoring it against per-language
 * detection regexes and unsubscribe keywords.
 */
class EmailLanguageDetector {
  constructor() {
    this.patterns = languagePatterns;
    this.keywords = unsubscribeKeywords;
  }

  /**
   * Scores every known language against the given text.
   *
   * Each occurrence of a language's detection pattern is worth 2 points and
   * each occurrence of one of its unsubscribe keywords is worth 3 points.
   *
   * @param {string} content - Text to analyse; non-strings score as empty.
   * @returns {Object<string, number>} Score per language code.
   */
  scoreLanguages(content) {
    const text = typeof content === 'string' ? content : '';
    const scores = {};
    const countMatches = (regex) => {
      const found = text.match(regex);
      return found ? found.length : 0;
    };

    for (const [lang, pattern] of Object.entries(this.patterns)) {
      // A non-global match() returns [match, ...groups], so its length says
      // nothing about how often the pattern occurs. Count with a global copy
      // (which also leaves the shared regex object's state untouched).
      const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
      const occurrences = countMatches(new RegExp(pattern.source, flags));
      scores[lang] = (scores[lang] || 0) + occurrences * 2;
    }

    for (const [lang, variants] of Object.entries(this.keywords)) {
      for (const keyword of Object.values(variants).flat()) {
        if (!keyword) continue;
        // Keywords are plain phrases, not regex source: escape metacharacters.
        const escaped = keyword.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
        const occurrences = countMatches(new RegExp(escaped, 'gi'));
        scores[lang] = (scores[lang] || 0) + occurrences * 3;
      }
    }

    return scores;
  }

  /**
   * Returns the code of the best-scoring language.
   *
   * @param {string} content - Text to analyse.
   * @returns {string} Language code; 'en' when nothing matched at all.
   *   Ties go to the language listed first.
   */
  detectLanguage(content) {
    let bestLang = 'en';
    let bestScore = 0;
    for (const [lang, score] of Object.entries(this.scoreLanguages(content))) {
      if (score > bestScore) {
        bestLang = lang;
        bestScore = score;
      }
    }
    return bestLang;
  }

  /**
   * Share of the total score that belongs to the given language.
   *
   * @param {string} content - Text that was analysed.
   * @param {string} detectedLang - Language code to rate.
   * @returns {number} Value in [0, 1]; 0 when nothing matched or the
   *   language is unknown.
   */
  getConfidence(content, detectedLang) {
    const scores = this.scoreLanguages(content);
    const totalScore = Object.values(scores).reduce((a, b) => a + b, 0);
    if (totalScore === 0) return 0;
    return (scores[detectedLang] || 0) / totalScore;
  }
}
/**
 * Finds candidate unsubscribe URLs in the HTML and plain-text parts of an
 * email, using the keyword list of a given language.
 */
class UnsubscribeLinkExtractor {
  constructor() {
    this.urlPatterns = urlPatterns;
    this.htmlPatterns = htmlPatterns;
    // How many characters on each side of a keyword count as "nearby".
    this.contextWindow = 500;
  }

  /**
   * Collects links from an HTML body.
   *
   * Two passes: (1) every http(s) target of the configured link elements,
   * (2) unsubscribe-looking URLs found within `contextWindow` characters of
   * an element whose text contains one of the language's keywords.
   *
   * @param {string} html - HTML source.
   * @param {string} language - Language code selecting the keyword list.
   * @returns {string[]} Unique, normalised URLs (never null entries).
   */
  extractFromHtml(html, language) {
    if (typeof html !== 'string' || html.length === 0) return [];
    const links = new Set();

    // Extract from standard elements
    for (const pattern of Object.values(this.htmlPatterns.links)) {
      pattern.lastIndex = 0; // shared /g regex: always start from the top
      let match;
      while ((match = pattern.exec(html)) !== null) {
        const url = this.cleanUrl(match[1]);
        if (url) links.add(url);
      }
    }

    // Extract from language-specific content
    const radius = this.contextWindow;
    for (const keyword of this.getKeywordsForLanguage(language)) {
      // The keyword is mandatory: an element only counts when its text
      // actually contains the phrase.
      const pattern = new RegExp(`<[^>]*>[^<]*${keyword}[^<]*</[^>]*>`, 'gi');
      let match;
      while ((match = pattern.exec(html)) !== null) {
        const start = Math.max(0, match.index - radius);
        let end = Math.min(html.length, match.index + match[0].length + radius);
        // Grow the window to the next delimiter so a URL straddling the
        // boundary is not cut in half.
        while (end < html.length && !/[\s<>"']/.test(html[end])) end += 1;
        this.extractUrlsFromContext(html.slice(start, end), links);
      }
    }

    return [...links];
  }

  /**
   * Collects links from a plain-text body: every http(s) URL that lies
   * within `contextWindow` characters of a keyword occurrence, including
   * URLs that contain the keyword themselves.
   *
   * @param {string} text - Plain-text source.
   * @param {string} language - Language code selecting the keyword list.
   * @returns {string[]} Unique, normalised URLs (never null entries).
   */
  extractFromText(text, language) {
    if (typeof text !== 'string' || text.length === 0) return [];
    const keywords = this.getKeywordsForLanguage(language);
    if (keywords.length === 0) return [];

    // Locate all URLs once. Matching keyword and URL with a single regex
    // would let backtracking truncate URLs that contain the keyword and would
    // consume text needed to pair a keyword with the link that follows it.
    const urls = [];
    const urlPattern = /https?:\/\/\S+/gi;
    let match;
    while ((match = urlPattern.exec(text)) !== null) {
      urls.push({ start: match.index, end: match.index + match[0].length, raw: match[0] });
    }
    if (urls.length === 0) return [];

    const links = new Set();
    const radius = this.contextWindow;
    const keywordPattern = new RegExp(keywords.join('|'), 'gi');
    while ((match = keywordPattern.exec(text)) !== null) {
      const from = match.index - radius;
      const to = match.index + match[0].length + radius;
      for (const candidate of urls) {
        if (candidate.end < from || candidate.start > to) continue;
        const url = this.cleanUrl(candidate.raw);
        if (url) links.add(url);
      }
    }
    return [...links];
  }

  /**
   * Returns all keywords of a language, flattened across variants and
   * escaped for direct use inside a RegExp source.
   *
   * @param {string} language - Language code.
   * @returns {string[]} Regex-escaped keywords; empty for unknown languages.
   */
  getKeywordsForLanguage(language) {
    const langKeywords = unsubscribeKeywords[language];
    if (!langKeywords) return [];
    return Object.values(langKeywords)
      .flat()
      // An empty keyword would match everywhere (and never advance).
      .filter(keyword => typeof keyword === 'string' && keyword.length > 0)
      .map(keyword => keyword.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'));
  }

  /**
   * Normalises a raw URL candidate.
   *
   * Trims whitespace, strips trailing punctuation picked up from the
   * surrounding text, and un-escapes `&amp;`. Percent-decoding is applied
   * only when the whole URL arrived encoded (i.e. it does not yet start with
   * a literal http(s) scheme); decoding an already-absolute URL would turn
   * escaped data such as `%26` or `%2B` into live delimiters and change what
   * the URL points to.
   *
   * @param {string} url - Raw URL text.
   * @returns {string|null} Normalised URL, or null when it cannot be parsed.
   */
  cleanUrl(url) {
    if (typeof url !== 'string') return null;
    try {
      let cleaned = url
        .trim()
        .replace(/[.,;:)\]}"'>]+$/, '')
        .replace(/&amp;/g, '&');
      // Handle multiple levels of encoding of a wholly-encoded URL.
      for (let depth = 0; depth < 5 && !/^https?:\/\//i.test(cleaned); depth += 1) {
        const decoded = decodeURIComponent(cleaned);
        if (decoded === cleaned) break;
        cleaned = decoded;
      }
      // Validate and normalize URL
      return new URL(cleaned).toString();
    } catch {
      // Malformed escape sequence or unparsable URL: not a usable link.
      return null;
    }
  }

  /**
   * Adds every unsubscribe-looking URL found in `context` to `links`.
   *
   * @param {string} context - Text fragment to scan.
   * @param {Set<string>} links - Collector, mutated in place.
   */
  extractUrlsFromContext(context, links) {
    for (const pattern of this.urlPatterns.unsubscribe) {
      pattern.lastIndex = 0; // shared /g regex: always start from the top
      let match;
      while ((match = pattern.exec(context)) !== null) {
        const url = this.cleanUrl(match[0]);
        if (url) links.add(url);
      }
    }
  }
}
/**
 * Parses a raw email and reports the unsubscribe links found in its
 * List-Unsubscribe header and in its HTML/text bodies.
 *
 * Recognised options (constructor defaults, overridable per call):
 *   - language: language code, or 'auto'/omitted to auto-detect
 *   - includeMetadata: set to false to leave `metadata` out of the result
 *   - maxLinks: non-negative integer cap on the number of body links
 * Other option keys are accepted but not acted on here.
 */
class EmailUnsubscribeParser {
  /**
   * @param {Object} [options] - Default options applied to every parse.
   */
  constructor(options = {}) {
    this.defaultOptions = options && typeof options === 'object' ? { ...options } : {};
    this.languageDetector = new EmailLanguageDetector();
    this.linkExtractor = new UnsubscribeLinkExtractor();
  }

  /**
   * Parses raw email source with postal-mime and analyses the result.
   *
   * @param {*} emailContent - Raw email source, passed to PostalMime#parse.
   * @param {Object} [options] - Per-call options; override the defaults.
   * @returns {Promise<Object>} See processEmail.
   * @throws Re-throws any parsing or processing error after logging it.
   */
  async parse(emailContent, options = {}) {
    const parser = new PostalMime();
    try {
      const email = await parser.parse(emailContent);
      // `return await` keeps processing failures inside this try/catch.
      return await this.processEmail(email, options);
    } catch (error) {
      console.error('Error parsing email:', error);
      throw error;
    }
  }

  /**
   * Analyses an already-parsed email.
   *
   * @param {Object} email - Parsed email (subject, text, html, headers, ...).
   * @param {Object} [options] - Per-call options; override the defaults.
   * @returns {Promise<{headerLinks: Object[], bodyLinks: string[],
   *   language: {detected: string, confidence: number}, metadata?: Object}>}
   */
  async processEmail(email, options = {}) {
    const settings = { ...this.defaultOptions, ...options };

    // Detect language ('auto' and "not specified" both mean auto-detect)
    const content = this.getEmailContent(email);
    const requested = settings.language;
    const detectedLanguage = !requested || requested === 'auto'
      ? this.languageDetector.detectLanguage(content)
      : requested;

    const result = {
      headerLinks: this.extractHeaderLinks(email),
      bodyLinks: [],
      language: {
        detected: detectedLanguage,
        confidence: this.languageDetector.getConfidence(content, detectedLanguage)
      }
    };
    if (settings.includeMetadata !== false) {
      result.metadata = this.extractMetadata(email);
    }

    // Extract links from HTML content
    if (email.html) {
      result.bodyLinks.push(
        ...this.linkExtractor.extractFromHtml(email.html, detectedLanguage)
      );
    }
    // Extract links from text content
    if (email.text) {
      result.bodyLinks.push(
        ...this.linkExtractor.extractFromText(email.text, detectedLanguage)
      );
    }

    // Remove duplicates and invalid links, then apply the optional cap
    result.bodyLinks = [...new Set(result.bodyLinks)].filter(Boolean);
    const { maxLinks } = settings;
    if (Number.isInteger(maxLinks) && maxLinks >= 0) {
      result.bodyLinks = result.bodyLinks.slice(0, maxLinks);
    }
    return result;
  }

  /**
   * Joins subject, text body and HTML body into one string for language
   * detection.
   *
   * @param {Object} email - Parsed email.
   * @returns {string}
   */
  getEmailContent(email) {
    return [
      email.subject || '',
      email.text || '',
      email.html || ''
    ].join('\n');
  }

  /**
   * Normalises a header collection into `[key, value]` pairs.
   *
   * postal-mime exposes `headers` as an array of `{ key, value }` entries;
   * plain objects and Maps keyed by header name are accepted as well.
   *
   * @param {Array|Object|Map} headers - Header collection (may be nullish).
   * @returns {Array<[string, *]>} Pairs with the key exactly as supplied.
   */
  getHeaderEntries(headers) {
    if (!headers) return [];
    if (Array.isArray(headers)) {
      return headers
        .filter(header => header && typeof header.key === 'string')
        .map(header => [header.key, header.value]);
    }
    if (headers instanceof Map) {
      return [...headers.entries()].map(([key, value]) => [String(key), value]);
    }
    return Object.entries(headers);
  }

  /**
   * Extracts the targets listed in the List-Unsubscribe header.
   *
   * @param {Object} email - Parsed email.
   * @returns {Array<{type: 'mailto'|'http', value: string}>}
   */
  extractHeaderLinks(email) {
    const links = [];
    for (const [key, value] of this.getHeaderEntries(email.headers)) {
      if (key.toLowerCase() !== 'list-unsubscribe' || typeof value !== 'string') continue;
      // Each target is wrapped in <...>. Scan for the brackets instead of
      // splitting on commas, because a URL may itself contain a comma.
      const bracketed = /<([^>]*)>/g;
      let match;
      while ((match = bracketed.exec(value)) !== null) {
        const target = match[1].trim();
        if (/^mailto:/i.test(target)) {
          links.push({ type: 'mailto', value: target.slice('mailto:'.length) });
        } else if (/^https?:\/\//i.test(target)) {
          const url = this.linkExtractor.cleanUrl(target);
          if (url) links.push({ type: 'http', value: url });
        }
      }
    }
    return links;
  }

  /**
   * Summarises the email: basic envelope fields, which bodies are present,
   * and the mailing-list related headers.
   *
   * @param {Object} email - Parsed email.
   * @returns {Object}
   */
  extractMetadata(email) {
    return {
      subject: email.subject,
      from: email.from,
      to: email.to,
      date: email.date,
      hasHtml: Boolean(email.html),
      hasText: Boolean(email.text),
      headers: this.extractRelevantHeaders(email.headers)
    };
  }

  /**
   * Picks the mailing-list related headers out of a header collection.
   *
   * @param {Array|Object|Map} headers - Header collection (may be nullish).
   * @returns {Object<string, *>} Header name to value; when a header
   *   repeats, the last occurrence wins.
   */
  extractRelevantHeaders(headers) {
    const relevantHeaders = [
      'list-unsubscribe',
      'list-unsubscribe-post',
      'list-id',
      'precedence',
      'x-mailer',
      'x-unsubscribe-web'
    ];
    return Object.fromEntries(
      this.getHeaderEntries(headers)
        .filter(([key]) => relevantHeaders.includes(key.toLowerCase()))
    );
  }
}
// Utility functions for module usage
/**
 * Factory for EmailUnsubscribeParser instances.
 *
 * @param {Object} [options] - Handed to the EmailUnsubscribeParser constructor.
 * @returns {EmailUnsubscribeParser} A freshly constructed parser.
 */
function createParser(options = {}) {
  const parser = new EmailUnsubscribeParser(options);
  return parser;
}
/**
 * One-shot helper: builds a parser and parses a single email with it.
 *
 * @param {*} emailContent - Raw email source.
 * @param {Object} [options] - Parser options. They are given to the parser
 *   factory and also forwarded to the parse call itself, so per-call
 *   settings such as `language` actually reach the parsing step.
 * @returns {Promise<Object>} Whatever the parser's `parse` resolves to.
 */
async function parseEmail(emailContent, options = {}) {
  const parser = createParser(options);
  return parser.parse(emailContent, options);
}
// Export the module
// Public CommonJS API of this module.
module.exports = {
// Convenience entry points: a parser factory and a one-shot parse helper.
createParser,
parseEmail,
// Classes, exported so callers can instantiate or extend them directly.
EmailUnsubscribeParser,
EmailLanguageDetector,
UnsubscribeLinkExtractor,
// Constants: the keyword and regex tables the classes above read from.
// These are the same object references the classes use, not copies.
unsubscribeKeywords,
languagePatterns,
urlPatterns,
htmlPatterns
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment