Last active
February 11, 2025 22:10
-
-
Save blasten/d42bd0d814b7df1addea to your computer and use it in GitHub Desktop.
String matching based on the KMP algorithm. This how `String.prototype.indexOf` is generally implemented.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Construct a table with table[i] as the length of the longest prefix of the substring 0..i | |
function longestPrefix(str) { | |
// create a table of size equal to the length of `str` | |
// table[i] will store the prefix of the longest prefix of the substring str[0..i] | |
var table = new Array(str.length); | |
var maxPrefix = 0; | |
// the longest prefix of the substring str[0] has length | |
table[0] = 0; | |
// for the substrings the following substrings, we have two cases | |
for (var i = 1; i < str.length; i++) { | |
// case 1. the current character doesn't match the last character of the longest prefix | |
while (maxPrefix > 0 && str.charAt(i) !== str.charAt(maxPrefix)) { | |
// if that is the case, we have to backtrack, and try find a character that will be equal to the current character | |
// if we reach 0, then we couldn't find a chracter | |
maxPrefix = table[maxPrefix - 1]; | |
} | |
// case 2. The last character of the longest prefix matches the current character in `str` | |
if (str.charAt(maxPrefix) === str.charAt(i)) { | |
// if that is the case, we know that the longest prefix at position i has one more character. | |
// for example consider `-` be any character not contained in the set [a-c] | |
// str = abc----abc | |
// consider `i` to be the last character `c` in `str` | |
// maxPrefix = will be 2 (the first `c` in `str`) | |
// maxPrefix now will be 3 | |
maxPrefix++; | |
// so the max prefix for table[9] is 3 | |
} | |
table[i] = maxPrefix; | |
} | |
return table; | |
} | |
// Find all the patterns that matches in a given string `str` | |
// this algorithm is based on the Knuth–Morris–Pratt algorithm. Its beauty consists in that it performs the matching in O(n) | |
function kmpMatching(str, pattern) { | |
// find the prefix table in O(n) | |
var prefixes = longestPrefix(pattern); | |
var matches = []; | |
// `j` is the index in `P` | |
var j = 0; | |
// `i` is the index in `S` | |
var i = 0; | |
while (i < str.length) { | |
// Case 1. S[i] == P[j] so we move to the next index in `S` and `P` | |
if (str.charAt(i) === pattern.charAt(j)) { | |
i++; | |
j++; | |
} | |
// Case 2. `j` is equal to the length of `P` | |
// that means that we reached the end of `P` and thus we found a match | |
if (j === pattern.length) { | |
matches.push(i-j); | |
// Next we have to update `j` because we want to save some time | |
// instead of updating to j = 0 , we can jump to the last character of the longest prefix well known so far. | |
// j-1 means the last character of `P` because j is actually `P.length` | |
// e.g. | |
// S = a b a b d e | |
// P = `a b`a b | |
// we will jump to `a b` and we will compare d and a in the next iteration | |
// a b a b `d` e | |
// a b `a` b | |
j = prefixes[j-1]; | |
} | |
// Case 3. | |
// S[i] != P[j] There's a mismatch! | |
else if (str.charAt(i) !== pattern.charAt(j)) { | |
// if we have found at least a character in common, do the same thing as in case 2 | |
if (j !== 0) { | |
j = prefixes[j-1]; | |
} else { | |
// otherwise, j = 0, and we can move to the next character S[i+1] | |
i++; | |
} | |
} | |
} | |
return matches; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for the good code. 👍👍 I also think that the indexOf of the string would have used KMP.
I know It depends on the browser but Is there a reference to check this? I've searched many other sites, but it's hard to find a clear basis.
Thank you!