Last active
November 2, 2017 14:47
-
-
Save lovasoa/253f43f99b664ef52fa360f99dd50e28 to your computer and use it in GitHub Desktop.
Generalized Damerau-Levenshtein algorithm implementation in java. Works with lists of any type.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.qwant.utils; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.stream.IntStream; | |
// Inspired From : https://github.com/KevinStern/software-and-algorithms | |
// This version has me modified to work with any element type, not only string | |
/* Copyright (c) 2012 Kevin L. Stern | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
* SOFTWARE. | |
*/ | |
/** | |
* The Damerau-Levenshtein Algorithm is an extension to the Levenshtein | |
* Algorithm which solves the edit distance problem between a source string and | |
* a target string with the following operations: | |
* <p> | |
* <ul> | |
* <li>Character Insertion</li> | |
* <li>Character Deletion</li> | |
* <li>Character Replacement</li> | |
* <li>Adjacent Character Swap</li> | |
* </ul> | |
* <p> | |
* Note that the adjacent character swap operation is an edit that may be | |
* applied when two adjacent characters in the source string match two adjacent | |
* characters in the target string, but in reverse order, rather than a general | |
* allowance for adjacent character swaps. | |
* <p> | |
* <p> | |
* This implementation allows the client to specify the costs of the various | |
* edit operations with the restriction that the cost of two swap operations | |
* must not be less than the cost of a delete operation followed by an insert | |
* operation. This restriction is required to preclude two swaps involving the | |
* same character being required for optimality which, in turn, enables a fast | |
* dynamic programming solution. | |
* <p> | |
* <p> | |
* The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is | |
* the length of the source string and m is the length of the target string. | |
* This implementation consumes O(n*m) space. | |
* | |
* @author Kevin L. Stern | |
*/ | |
public class DamerauLevenshteinAlgorithm { | |
private final int deleteCost; | |
private final int insertCost; | |
private final int replaceCost; | |
private final int swapCost; | |
/** | |
* Constructor. | |
* | |
* @param deleteCost the cost of deleting a character. | |
* @param insertCost the cost of inserting a character. | |
* @param replaceCost the cost of replacing a character. | |
* @param swapCost the cost of swapping two adjacent characters. | |
*/ | |
public DamerauLevenshteinAlgorithm(int deleteCost, int insertCost, | |
int replaceCost, int swapCost) { | |
/* | |
* Required to facilitate the premise to the algorithm that two swaps of the | |
* same element are never required for optimality. | |
*/ | |
if (2 * swapCost < insertCost + deleteCost) { | |
throw new IllegalArgumentException("Unsupported cost assignment"); | |
} | |
this.deleteCost = deleteCost; | |
this.insertCost = insertCost; | |
this.replaceCost = replaceCost; | |
this.swapCost = swapCost; | |
} | |
/** | |
* Compute the Damerau-Levenshtein distance between the specified source | |
* sequence and the specified target sequence. | |
* | |
* @param <T> any type. You need to override hashCode and equals on this type. | |
* @return the distance | |
*/ | |
public <T> int execute(List<T> source, List<T> target) { | |
if (source.isEmpty()) { | |
return target.size() * insertCost; | |
} | |
if (target.isEmpty()) { | |
return source.size() * deleteCost; | |
} | |
int[][] table = new int[source.size()][target.size()]; | |
Map<T, Integer> sourceIndexByElement = new HashMap<>(); | |
if (!source.get(0).equals(target.get(0))) { | |
table[0][0] = min(replaceCost, deleteCost + insertCost); | |
} | |
sourceIndexByElement.put(source.get(0), 0); | |
for (int i = 1; i < source.size(); i++) { | |
int deleteDistance = table[i - 1][0] + deleteCost; | |
int insertDistance = (i + 1) * deleteCost + insertCost; | |
int matchDistance = i * deleteCost | |
+ (source.get(i).equals(target.get(0)) ? 0 : replaceCost); | |
table[i][0] = min(deleteDistance, insertDistance, matchDistance); | |
} | |
for (int j = 1; j < target.size(); j++) { | |
int deleteDistance = (j + 1) * insertCost + deleteCost; | |
int insertDistance = table[0][j - 1] + insertCost; | |
int matchDistance = j * insertCost | |
+ (source.get(0).equals(target.get(j)) ? 0 : replaceCost); | |
table[0][j] = min(deleteDistance, insertDistance, matchDistance); | |
} | |
for (int i = 1; i < source.size(); i++) { | |
int maxSourceLetterMatchIndex = source.get(i).equals(target.get(0)) ? 0 : -1; | |
for (int j = 1; j < target.size(); j++) { | |
Integer candidateSwapIndex = sourceIndexByElement.get(target.get(j)); | |
int jSwap = maxSourceLetterMatchIndex; | |
int deleteDistance = table[i - 1][j] + deleteCost; | |
int insertDistance = table[i][j - 1] + insertCost; | |
int matchDistance = table[i - 1][j - 1]; | |
if (source.get(i) != target.get(j)) { | |
matchDistance += replaceCost; | |
} else { | |
maxSourceLetterMatchIndex = j; | |
} | |
int swapDistance; | |
if (candidateSwapIndex != null && jSwap != -1) { | |
int iSwap = candidateSwapIndex; | |
int preSwapCost; | |
if (iSwap == 0 && jSwap == 0) { | |
preSwapCost = 0; | |
} else { | |
preSwapCost = table[max(iSwap - 1, 0)][max(jSwap - 1, 0)]; | |
} | |
swapDistance = | |
preSwapCost + (i - iSwap - 1) * deleteCost | |
+ (j - jSwap - 1) * insertCost + swapCost; | |
} else { | |
swapDistance = Integer.MAX_VALUE; | |
} | |
table[i][j] = min(deleteDistance, insertDistance, matchDistance, swapDistance); | |
} | |
sourceIndexByElement.put(source.get(i), i); | |
} | |
return table[source.size() - 1][target.size() - 1]; | |
} | |
private static int max(int... nums) { | |
return IntStream.of(nums).max() | |
.orElseThrow(() -> new IllegalArgumentException("At least one argument required")); | |
} | |
private static int min(int... nums) { | |
return IntStream.of(nums).min() | |
.orElseThrow(() -> new IllegalArgumentException("At least one argument required")); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment