Created
April 2, 2015 18:43
-
-
Save ariens/d3ae82c5d36182f8504a to your computer and use it in GitHub Desktop.
An Experiment in Fast Text Matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* To change this license header, choose License Headers in Project Properties. | |
* To change this template file, choose Tools | Templates | |
* and open the template in the editor. | |
*/ | |
package com.blackberry.bdp.samples; | |
import java.nio.charset.Charset; | |
import java.util.ArrayList; | |
import static java.util.Arrays.sort; | |
import java.util.List; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
public class StringMatcher | |
{ | |
List<SortedKeyword> keywords = new ArrayList<>(); | |
int textIndex; | |
int keywordIndex; | |
int keywordNum; | |
int keywordMatchIndex; | |
Logger LOG = LoggerFactory.getLogger(StringMatcher.class); | |
public StringMatcher(String[] keywordStrings) throws Exception | |
{ | |
sort(keywordStrings); | |
for (keywordNum = 0; keywordNum < keywordStrings.length; keywordNum++) | |
{ | |
int startingUniqueOffset = 0; | |
byte[] bytes = keywordStrings[keywordNum].getBytes(Charset.forName("UTF-8")); | |
// Compare subsequent keywords to the previous and determine | |
// the position of the first non-matching character | |
if (keywordNum > 0) | |
{ | |
if (keywordStrings[keywordNum].equals(keywordStrings[keywordNum - 1])) | |
{ | |
throw new Exception("Can't have duplicate keywords"); | |
} | |
for (int charNum = 0; charNum < keywordStrings[keywordNum].length(); charNum++) | |
{ | |
if(bytes[charNum] != keywords.get(keywordNum - 1).bytes[charNum]) | |
{ | |
startingUniqueOffset = charNum; | |
break; | |
} | |
} | |
} | |
keywords.add(new SortedKeyword(keywordStrings[keywordNum], bytes, startingUniqueOffset)); | |
} | |
} | |
public boolean utf8ByteArrayContainsAnyKeyword(byte[] bytes) | |
{ | |
textIndex = 0; | |
keywordNum = 0; | |
keywordIndex = 0; | |
keywordMatchIndex = 0; | |
LOG.trace("length of byteArray: {}", bytes.length); | |
// For every character in our input text... | |
while(textIndex < bytes.length) | |
{ | |
// For every keyword... | |
while (keywordNum < keywords.size()) | |
{ | |
// Starting at our unique offset... | |
keywordIndex = keywords.get(keywordNum).startingUniqueOffset; | |
// Checking every subsequent character in the keyword | |
while(keywordIndex < keywords.get(keywordNum).bytes.length) | |
{ | |
if (bytes[textIndex + keywordIndex] != keywords.get(keywordNum).bytes[keywordIndex]) | |
{ | |
break; // No match, the next keyword will be checked | |
} | |
else if (keywordIndex == keywords.get(keywordNum).bytes.length) | |
{ | |
// If this the keywordIndex is the length of the keyword, we matched | |
return true; | |
} | |
keywordIndex++; | |
} | |
keywordNum++; | |
} | |
// Advance the text index by as many positions as were matched in all keywords | |
textIndex+= keywordIndex; | |
} | |
return false; | |
} | |
private class SortedKeyword | |
{ | |
private final String keyword; | |
private final byte[] bytes; | |
private final int startingUniqueOffset; | |
private SortedKeyword(String keyword, byte[] bytes,int startingUniqueOffset) | |
{ | |
this.keyword = keyword; | |
this.bytes = bytes; | |
this.startingUniqueOffset = startingUniqueOffset; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment