Created
January 8, 2018 20:27
-
-
Save janwirth/7d18d7e266d1198ee01ed582912be0f3 to your computer and use it in GitHub Desktop.
Find fuzzy duplicates in a list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import levenshtein from 'fast-levenshtein'; | |
import { filter, join, isEmpty, sortBy, get, identity, flow, map, toLower } from 'lodash/fp' | |
// https://stackoverflow.com/questions/43241174/javascript-generating-all-combinations-of-elements-in-a-single-array-in-pairs
/**
 * Build every unordered pair of elements from `array`.
 *
 * Pairs are produced in index order: each element is combined with every
 * element that comes after it, so `[a, b]` is emitted but `[b, a]` is not.
 *
 * @param {Array} array - Items to pair up.
 * @returns {Array<Array>} Two-element arrays `[array[i], array[j]]` with
 *   `i < j`; empty when `array` has fewer than two items.
 */
function getCombinations(array) {
  const results = [];
  // The last element never starts a pair; it only ever appears as the
  // second member, so the outer loop stops one short of the end.
  for (let i = 0; i < array.length - 1; i++) {
    for (let j = i + 1; j < array.length; j++) {
      results.push([array[i], array[j]]);
    }
  }
  return results;
}
/**
 * Render scored pairs as text, one pair per line.
 *
 * Each input entry is expected to look like `[[first, second], score]`;
 * the output line is `"<score> <first> <second>"`, and lines are joined
 * with a newline.
 */
const format = flow(
  map(([pair, score]) => `${score} ${pair[0]} ${pair[1]}`),
  join('\n')
);
/**
 * Score every pair of non-empty entries in a list and order the pairs by
 * score.
 *
 * Pipeline:
 *   1. drop entries that lodash's `isEmpty` reports as empty,
 *   2. build all unordered pairs with `getCombinations`,
 *   3. turn each pair into `[pair, score]` via `processPair`,
 *   4. sort by the score stored at index 1.
 *
 * NOTE(review): inside this module the binding shadows Node's global
 * `process` object; the name is kept so existing callers keep working.
 */
const process = flow([
  filter((line) => !isEmpty(line)),
  getCombinations,
  // `processPair` is a `const` declared further down the file, so it must be
  // looked up lazily at call time rather than captured while this pipeline
  // is being built. Wrapping it also keeps Array#map's extra
  // (index, array) arguments away from it.
  (pairs) => pairs.map((pair) => processPair(pair)),
  sortBy(get('1'))
]);
/**
 * Attach a case-insensitive Levenshtein distance to a pair of values.
 *
 * Both members are lower-cased with lodash's `toLower` before comparison.
 *
 * @param {Array} pair - The two values to compare; only the first two
 *   elements are used.
 * @returns {Array} `[pair, score]`, where `pair` is the untouched input and
 *   `score` is the value returned by `levenshtein.get`.
 */
const processPair = (pair) => {
  const [first, second] = pair;
  // Pass exactly two strings: spreading the whole pair could leak a stray
  // third element into `levenshtein.get`'s options argument.
  const score = levenshtein.get(toLower(first), toLower(second));
  return [pair, score];
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment