Last active
March 17, 2023 20:13
-
-
Save gf3/761ca1469939057a7fa406bfc229f8ad to your computer and use it in GitHub Desktop.
Validate and correct misspelled emails using a fuzzy matcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* eslint-env jasmine */ | |
import { validate, suggest } from './email'; | |
// Jasmine specs for the validate() and suggest() helpers imported from ./email.
// Every spec collects promises and finishes with
// Promise.all(promises).then(done, done.fail): done runs once all of them
// fulfil, done.fail runs on the first rejection.
// NOTE(review): several address literals below read '[email protected]'. That
// looks like redaction applied by the page this code was copied from, not the
// author's input — restore the real addresses before relying on these cases.
describe('email', () => { | |
describe('validate', () => { | |
// Positive cases: each validate() promise is expected to fulfil.
it('should correctly validate correct email addresses', (done) => { | |
const promises = [ | |
validate('[email protected]'), | |
validate('[email protected]'), | |
validate('[email protected]'), | |
validate('[email protected]') | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
// Negative cases: the outcome is inverted per promise — a fulfilment throws
// 'fail' (rejecting the chained promise), a rejection is mapped to true.
it('should correctly validate incorrect email addresses', (done) => { | |
const promises = [ | |
validate('@').then(() => { throw 'fail'; }, () => true), | |
validate('user [email protected]').then(() => { throw 'fail'; }, () => true), | |
validate('user@domain lol.com').then(() => { throw 'fail'; }, () => true), | |
validate('[email protected] om').then(() => { throw 'fail'; }, () => true), | |
validate('gmail.com').then(() => { throw 'fail'; }, () => true), | |
validate('@gmail.com').then(() => { throw 'fail'; }, () => true), | |
validate('user@gmail.').then(() => { throw 'fail'; }, () => true), | |
validate('[email protected]').then(() => { throw 'fail'; }, () => true) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
// Non-ASCII characters in both the local part and the domain must be accepted.
it('should correctly validate an email address with unicode characters', (done) => { | |
validate('snædis@💩.com').then(done, done.fail); | |
}); | |
}); | |
describe('suggest', () => { | |
// In the "should not suggest" specs a fulfilled suggest() is the failure:
// the suggested value e is thrown inside the message, while a rejection
// (nothing to suggest) is mapped to true.
it('should not suggest correct domains & tlds', (done) => { | |
const promises = [ | |
suggest('[email protected]').then(e => { throw `fail: ${e}`; }, () => true), | |
suggest('[email protected]').then(e => { throw `fail: ${e}`; }, () => true) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
it('should not suggest correct but unknown domains & tlds', (done) => { | |
const promises = [ | |
suggest('[email protected]').then(e => { throw `fail: ${e}`; }, () => true) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
it('should not suggest emails that are too far gone', (done) => { | |
const promises = [ | |
suggest('user@gmaillolobuzz').then(e => { throw `fail: ${e}`; }, () => true), | |
suggest('[email protected]').then(e => { throw `fail: ${e}`; }, () => true) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
// In the "should suggest" specs the fulfilled value is compared with the
// expected correction; a rejection throws a message naming the input.
it('should suggest misspelled domains', (done) => { | |
const promises = [ | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }), | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
it('should suggest misspelled TLDs', (done) => { | |
const promises = [ | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }), | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }), | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }), | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
it('should suggest misspelled TLDs for unknown domains', (done) => { | |
const promises = [ | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }), | |
suggest('gianni@universecom').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: gianni@universecom'; }) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
// These inputs are wrong in the host part, the TLD part, or lack the
// separating dot altogether.
it('should suggest misspelled domains and TLDs', (done) => { | |
const promises = [ | |
suggest('user@gmailcom').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: user@gmailcom'; }), | |
suggest('user@gnailcon').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: user@gnailcon'; }), | |
suggest('[email protected]').then(e => expect(e).toEqual('[email protected]'), () => { throw 'fail: [email protected]'; }) | |
]; | |
Promise.all(promises).then(done, done.fail); | |
}); | |
}); | |
}); | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @flow
 */
import { sift4 } from './sift'; | |
//----------------------------------------------------------------------------- | |
// Misc. | |
//----------------------------------------------------------------------------- | |
/**
 * Loose shape check for an email address: one or more non-whitespace
 * characters, an `@`, one or more non-whitespace characters, a literal dot and
 * one or more non-whitespace characters. The `u` flag makes the pattern work
 * on whole code points, so non-ASCII addresses are matched as well.
 */
export const EMAIL = /^\S+@\S+\.\S+$/u;

/**
 * Complete mail domains (host + TLD) that are accepted as-is and used as
 * correction targets for near-misses.
 *
 * Every domain is listed exactly once, under the first group it belongs to;
 * order is significant because the closest-match search keeps the first entry
 * among equally distant candidates.
 */
const DOMAINS = [
  /* Default domains included */
  'aol.com', 'att.net', 'comcast.net', 'facebook.com', 'gmail.com', 'gmx.com', 'googlemail.com',
  'google.com', 'hotmail.com', 'hotmail.co.uk', 'mac.com', 'me.com', 'mail.com', 'msn.com',
  'live.com', 'sbcglobal.net', 'verizon.net', 'yahoo.com', 'yahoo.co.uk',

  /* Other global domains */
  'email.com', 'games.com' /* AOL */, 'gmx.net', 'hush.com', 'hushmail.com', 'icloud.com', 'inbox.com',
  'lavabit.com', 'love.com' /* AOL */, 'outlook.com', 'pobox.com', 'rocketmail.com' /* Yahoo */,
  'safe-mail.net', 'wow.com' /* AOL */, 'ygm.com' /* AOL */, 'ymail.com' /* Yahoo */, 'zoho.com', 'fastmail.fm',
  'yandex.com',

  /* United States ISP domains (comcast.net is already listed above) */
  'bellsouth.net', 'charter.net', 'cox.net', 'earthlink.net', 'juno.com',

  /* British ISP domains */
  'btinternet.com', 'virginmedia.com', 'blueyonder.co.uk', 'freeserve.co.uk', 'live.co.uk',
  'ntlworld.com', 'o2.co.uk', 'orange.net', 'sky.com', 'talktalk.co.uk', 'tiscali.co.uk',
  'virgin.net', 'wanadoo.co.uk', 'bt.com',

  /* Domains used in Asia */
  'sina.com', 'qq.com', 'naver.com', 'hanmail.net', 'daum.net', 'nate.com',
  'yahoo.co.jp', 'yahoo.co.kr', 'yahoo.co.id', 'yahoo.co.in', 'yahoo.com.sg', 'yahoo.com.ph',

  /* French ISP domains */
  'hotmail.fr', 'live.fr', 'laposte.net', 'yahoo.fr', 'wanadoo.fr', 'orange.fr', 'gmx.fr', 'sfr.fr', 'neuf.fr', 'free.fr',

  /* German ISP domains */
  'gmx.de', 'hotmail.de', 'live.de', 'online.de', 't-online.de' /* T-Mobile */, 'web.de', 'yahoo.de',

  /* Russian ISP domains */
  'mail.ru', 'rambler.ru', 'yandex.ru', 'ya.ru', 'list.ru',

  /* Belgian ISP domains */
  'hotmail.be', 'live.be', 'skynet.be', 'voo.be', 'tvcablenet.be', 'telenet.be',

  /* Argentinian ISP domains */
  'hotmail.com.ar', 'live.com.ar', 'yahoo.com.ar', 'fibertel.com.ar', 'speedy.com.ar', 'arnet.com.ar',

  /* Domains used in Mexico (hotmail.com, gmail.com, yahoo.com, live.com and msn.com are already listed above) */
  'yahoo.com.mx', 'live.com.mx', 'hotmail.es', 'hotmail.com.mx', 'prodigy.net.mx',

  /* Domains used in Brazil */
  'yahoo.com.br', 'hotmail.com.br', 'outlook.com.br', 'uol.com.br', 'bol.com.br', 'terra.com.br',
  'ig.com.br', 'itelefonica.com.br', 'r7.com', 'zipmail.com.br', 'globo.com', 'globomail.com', 'oi.com.br'
];

/** Host names (the part before the TLD) used as correction targets when the full domain is unknown. */
const HOSTS = [
  'aim', 'aol', 'att', 'bellsouth', 'btinternet', 'charter', 'comcast', 'cox',
  'earthlink', 'gmail', 'google', 'googlemail', 'icloud', 'mac', 'me', 'msn',
  'optonline', 'optusnet', 'qq', 'rocketmail', 'rogers', 'sbcglobal', 'shaw',
  'sky', 'sympatico', 'telus', 'verizon', 'web', 'xtra', 'ymail'
];

/** Top-level domains, without the leading dot, used as correction targets; each is listed once. */
const TLDS = [
  'com', 'com.au', 'com.tw', 'ca', 'co.nz', 'co.uk', 'de', 'fr', 'it', 'ru',
  'net', 'org', 'edu', 'gov', 'jp', 'nl', 'kr', 'se', 'eu', 'ie', 'co.il',
  'us', 'at', 'be', 'dk', 'hk', 'es', 'gr', 'ch', 'no', 'cz', 'in',
  'net.au', 'info', 'biz', 'mil', 'co.jp', 'sg', 'hu', 'uk'
];
//----------------------------------------------------------------------------- | |
// Validate | |
//----------------------------------------------------------------------------- | |
export function validate(email: string): Promise<void> { | |
if (!EMAIL.test(email)) { | |
return Promise.reject(); | |
} | |
return Promise.resolve(); | |
} | |
//----------------------------------------------------------------------------- | |
// Suggest | |
//----------------------------------------------------------------------------- | |
type Match = { | |
distance: number; | |
match: string; | |
}; | |
function findClosest(haystack: Array<string>, needle: string, threshold: number = 2): ?Match { | |
const closest: ?Match = haystack.reduce((prev: ?Match, d: string) => { | |
const distance = sift4(needle, d, 5, 13); | |
const current: Match = { | |
distance, | |
match: d | |
}; | |
if (!prev) { | |
return current; | |
} | |
return (current.distance < prev.distance) ? current : prev; | |
}, undefined); | |
if (closest && closest.distance > threshold) { | |
return undefined; | |
} | |
return closest; | |
} | |
export function suggest(email: string): Promise<string> { | |
const match = /(\S+?@)(\S+?(\.\S{2,}|\S{3})?)$/u.exec(email); | |
if (!match) { | |
return Promise.reject(); | |
} | |
const [, user, domain, tld] = match; | |
// Check full domain | |
if (DOMAINS.includes(domain)) { | |
return Promise.reject(); | |
} | |
const closestDomain = findClosest(DOMAINS, domain); | |
if (closestDomain) { | |
return Promise.resolve(`${user}${closestDomain.match}`); | |
} | |
// Check host and top-level domains | |
if (tld) { | |
const host = domain.slice(0, (-1 * tld.length)); | |
const closestHost = findClosest(HOSTS, host); | |
const strippedTld = (tld[0] === '.') | |
? tld.slice(1) | |
: tld; | |
const threshold = (domain.indexOf('.') >= 0) ? 2 : 1; | |
const closestTld = findClosest(TLDS, strippedTld, threshold); | |
if (!closestHost) { | |
if (!closestTld) { | |
return Promise.reject(); | |
} | |
let suggest = `${user}${host}.${closestTld.match}`; | |
if (suggest === email) { | |
return Promise.reject(); | |
} | |
return Promise.resolve(suggest); | |
} | |
else if (!closestTld || ((closestHost.distance === 0) && (closestTld.distance === 0))) { | |
return Promise.reject(); | |
} | |
return Promise.resolve(`${user}${closestHost.match}.${closestTld.match}`); | |
} | |
return Promise.reject(); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* eslint-disable */
/**
 * Sift4 (common version): an online algorithm that estimates the distance
 * between two strings in a single forward pass.
 *
 * The result is `max(length) - (matched characters) + (transpositions)`.
 * When one argument is missing or empty the length of the other one is
 * returned (0 when both are).
 *
 * @param {string} s1 first string
 * @param {string} s2 second string
 * @param {number} maxOffset number of characters to look ahead for a matching
 *        character after a mismatch; with a falsy value no look-ahead happens
 * @param {number} [maxDistance] when truthy, the computation stops as soon as
 *        the distance accumulated so far reaches this value and that running
 *        value is returned (the strings are too different anyway)
 * @returns {number} the rounded distance
 */
export function sift4(s1, s2, maxOffset, maxDistance) {
  if (!s1 || !s1.length) {
    if (!s2) {
      return 0;
    }
    return s2.length;
  }
  if (!s2 || !s2.length) {
    return s1.length;
  }

  const l1 = s1.length;
  const l2 = s2.length;
  let c1 = 0; // cursor for string 1
  let c2 = 0; // cursor for string 2
  let lcss = 0; // matched characters committed so far
  let localCs = 0; // length of the matched run currently in progress
  let trans = 0; // number of transpositions ('ab' vs 'ba')
  const offsets = []; // recorded match positions, used to detect transpositions

  while (c1 < l1 && c2 < l2) {
    if (s1.charAt(c1) === s2.charAt(c2)) {
      localCs++;
      let isTrans = false;
      // See whether the current match crosses an earlier one (a transposition).
      let i = 0;
      while (i < offsets.length) {
        const ofs = offsets[i];
        if (c1 <= ofs.c1 || c2 <= ofs.c2) {
          // When two matches cross, the one counted as the transposition is
          // the one with the larger difference between its offsets.
          isTrans = Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
          if (isTrans) {
            trans++;
          } else if (!ofs.trans) {
            ofs.trans = true;
            trans++;
          }
          break;
        } else if (c1 > ofs.c2 && c2 > ofs.c1) {
          // Both cursors are past this record; it can no longer be crossed.
          offsets.splice(i, 1);
        } else {
          i++;
        }
      }
      offsets.push({ c1, c2, trans: isTrans });
    } else {
      lcss += localCs;
      localCs = 0;
      if (c1 !== c2) {
        c1 = c2 = Math.min(c1, c2); // using min allows the computation of transpositions
      }
      // Look ahead in either string for a character matching the other cursor.
      // On success the cursors are left one step short of the match (they are
      // incremented right after this loop) so that the branch above is the
      // only place that handles matches.
      for (let i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
        if (c1 + i < l1 && s1.charAt(c1 + i) === s2.charAt(c2)) {
          c1 += i - 1;
          c2--;
          break;
        }
        if (c2 + i < l2 && s1.charAt(c1) === s2.charAt(c2 + i)) {
          c1--;
          c2 += i - 1;
          break;
        }
      }
    }
    c1++;
    c2++;
    if (maxDistance) {
      // The matched run in progress has not been added to lcss yet; it must be
      // counted here, otherwise long identical stretches look like distance
      // and trigger the early exit (identical strings of length >= maxDistance
      // would return maxDistance instead of 0).
      const temporaryDistance = Math.max(c1, c2) - (lcss + localCs) + trans;
      if (temporaryDistance >= maxDistance) {
        return Math.round(temporaryDistance);
      }
    }
    // Covers the case where the last match is on the last character, so that
    // transpositions are still computed correctly.
    if (c1 >= l1 || c2 >= l2) {
      lcss += localCs;
      localCs = 0;
      c1 = c2 = Math.min(c1, c2);
    }
  }
  lcss += localCs;
  // Add the cost of transpositions to the final result.
  return Math.round(Math.max(l1, l2) - lcss + trans);
}
/* eslint-enable */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment