Last active
November 23, 2015 03:50
-
-
Save scpike/1d21ec18b2baecad5689 to your computer and use it in GitHub Desktop.
deduplicate some names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// http://stackoverflow.com/questions/18233874/get-all-the-combinations-of-n-elements-of-multidimensional-array
/**
 * Builds every n-element combination of `arr`, keeping the elements of each
 * combination in their original relative order.
 *
 * The input array is left untouched (the previous version consumed it with
 * `shift()` and, because the loop bound shrank with it, silently dropped
 * combinations for inputs longer than four elements).
 *
 * @param {Array} arr - Source elements.
 * @param {number} n - Number of elements in each combination.
 * @returns {Array<Array>} All combinations; empty when `n < 1` or when `n`
 *   exceeds `arr.length`.
 */
function getCombinations(arr, n) {
  if (n < 1) {
    return [];
  }
  if (n === 1) {
    return arr.map((item) => [item]);
  }

  const combos = [];
  // Stop once fewer than n elements remain: no full combination can start there.
  for (let i = 0; i <= arr.length - n; i++) {
    const tails = getCombinations(arr.slice(i + 1), n - 1);
    for (const tail of tails) {
      combos.push([arr[i], ...tail]);
    }
  }
  return combos;
}
// https://gist.github.com/inactivist/7614182
/**
 * Collects every contiguous window of `size` items from `collection`.
 *
 * For a string the windows are substrings, so equal windows collapse into a
 * single Set entry. For an array each `slice` is a distinct object, and Set
 * membership is by identity, so equal-looking windows are NOT merged.
 *
 * @param {string|Array} collection - Anything with `length` and `slice`.
 * @param {number} size - Window length.
 * @returns {Set} The set of windows; empty when the collection is shorter
 *   than `size`.
 */
function shingle(collection, size) {
  const shingles = new Set();
  for (let start = 0; start < collection.length - size + 1; start++) {
    shingles.add(collection.slice(start, start + size));
  }
  return shingles;
}
/**
 * Jaccard similarity of two strings, computed over their 2-character shingles:
 * |intersection| / |union|.
 *
 * @param {string} s1 - First string.
 * @param {string} s2 - Second string.
 * @returns {number} A value in [0, 1]. Returns 0 when neither string yields a
 *   shingle (both shorter than 2 characters) instead of the NaN that 0/0
 *   would produce.
 */
function jaccard_coefficient(s1, s2) {
  // Declared locally: the earlier version leaked these as implicit globals,
  // which throws in strict mode / ES modules.
  const s1Shingles = shingle(s1, 2);
  const s2Shingles = shingle(s2, 2);

  const union = new Set([...s1Shingles, ...s2Shingles]);
  if (union.size === 0) {
    return 0;
  }

  // Set members are already unique, so the filtered array length equals the
  // intersection size.
  const shared = [...s1Shingles].filter((x) => s2Shingles.has(x));
  return shared.length / union.size;
}
/**
 * Lower-cases `string` and folds a fixed set of accented Latin letters and
 * ligatures to plain ASCII (e.g. "é" -> "e", "æ" -> "ae", "œ" -> "oe").
 * Characters outside the table are passed through unchanged.
 *
 * @param {string} string - Text to fold.
 * @returns {string} Lower-cased text with the listed characters replaced.
 */
function strip_special_characters(string) {
  // [pattern, ASCII replacement] pairs, applied in order to the lower-cased text.
  const replacements = [
    [/[àáâãäå]/g, 'a'],
    [/æ/g, 'ae'],
    [/ç/g, 'c'],
    [/[èéêë]/g, 'e'],
    [/[ìíîï]/g, 'i'],
    [/ñ/g, 'n'],
    [/[òóôõö]/g, 'o'],
    [/œ/g, 'oe'],
    [/[ùúûü]/g, 'u'],
    [/[ýÿ]/g, 'y'],
  ];

  return replacements.reduce(
    (text, [pattern, plain]) => text.replace(pattern, plain),
    string.toLowerCase(),
  );
}
/**
 * Replaces every "." and "&" with a space and deletes every double quote.
 *
 * Uses global regexes: `String.prototype.replace` with a string pattern only
 * touches the first occurrence, which left later punctuation in place.
 *
 * @param {string} s - Text to clean.
 * @returns {string} Text with the punctuation replaced/removed.
 */
function remove_punct(s) {
  return s.replace(/[.&]/g, ' ').replace(/"/g, '');
}
/**
 * Deletes every whole-word occurrence of common company/winery filler tokens
 * (llc, inc, ltd, corp, company, winery, wines, the, ...).
 *
 * Matching is case-sensitive against lower-case tokens, so the input must
 * already be lower-cased for a token to be removed. Surrounding whitespace
 * is left as-is.
 *
 * The `g` flag lives on the regex itself; the former third `flags` argument to
 * `replace` is non-standard and ignored by current engines, so only the first
 * token used to be removed.
 *
 * @param {string} s - Lower-cased text.
 * @returns {string} Text with the filler tokens removed.
 */
function remove_tokens(s) {
  return s.replace(
    /\b(llc|inc|ltd|pte|intl|gmbh|corp|corporation|company|co|sa|sl|winery|wines|bodega|slu|vineyard|winework|cellar|the)\b/g,
    '',
  );
}
/**
 * Produces a comparison key for a name: lower-case it, strip punctuation,
 * fold accented characters, drop filler tokens, then collapse every run of
 * whitespace to a single space and trim the ends.
 *
 * The whitespace regex carries the `g` flag; without it only the first run was
 * collapsed, leaving the gaps created by removed tokens in the key.
 *
 * @param {string} s - Raw name.
 * @returns {string} Normalized name.
 */
function normalize(s) {
  const withoutPunct = remove_punct(s.toLowerCase());
  const folded = strip_special_characters(withoutPunct);
  const withoutTokens = remove_tokens(folded);
  return withoutTokens.replace(/\s+/g, ' ').trim();
}
/**
 * Computes the blocking key for a name: the first two ASCII letters/digits,
 * lower-cased. Only names sharing a key are compared with each other.
 *
 * The character class is `[^A-Za-z0-9]`: the previous `A-z` range also let
 * `[ \ ] ^ _ \`` through, `0-0` was a typo for `0-9`, and the missing `g` flag
 * meant only the first unwanted character was stripped.
 *
 * @param {string} s - Raw name.
 * @returns {string} Up to two lower-case alphanumeric characters.
 */
function block(s) {
  return s.replace(/[^A-Za-z0-9]/g, '').substring(0, 2).toLowerCase();
}
/**
 * Extracts the digits of `s`, in order, as one string.
 *
 * The `g` flag is part of the regex; the former third `flags` argument to
 * `replace` is non-standard and ignored, so only the first non-digit was
 * being removed.
 *
 * @param {string} s - Raw name.
 * @returns {string} All ASCII digits from `s` concatenated ("" if none).
 */
function numeric_part(s) {
  return s.replace(/[^0-9]/g, '');
}
/**
 * Finds likely duplicate names in a newline-separated list.
 *
 * Names are grouped by `block()` key, every pair within a group is compared,
 * and a pair is reported when both names contain the same digits and the
 * Jaccard similarity of their normalized forms exceeds 0.5.
 *
 * This is an un-invoked function expression: append `(text)` to call it.
 *
 * @param {string} x - Names, one per line (LF or CRLF).
 * @returns {string} One "score,first,second" line per reported pair, highest
 *   score first; pairs with equal scores keep the order they were found in.
 */
(function (x) {
  // A Map avoids any clash between user-derived keys and Object.prototype.
  const blocks = new Map();
  for (const line of x.trim().split(/\r?\n/)) {
    if (line.trim() === '') {
      continue;
    }
    const key = block(line);
    if (!blocks.has(key)) {
      blocks.set(key, []);
    }
    blocks.get(key).push(line);
  }

  const results = [];
  for (const group of blocks.values()) {
    for (const [first, second] of getCombinations(group, 2)) {
      // Names whose digits differ (e.g. vintages, lot numbers) are never merged.
      if (numeric_part(first) !== numeric_part(second)) {
        continue;
      }
      const score = jaccard_coefficient(normalize(first), normalize(second));
      if (score > 0.5) {
        results.push([score, first, second]);
      }
    }
  }

  // Sort numerically by score; the default sort() compares stringified rows.
  results.sort((a, b) => b[0] - a[0]);
  return results.join("\n");
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment