Skip to content

Instantly share code, notes, and snippets.

@ariens
Created April 2, 2015 18:43
Show Gist options
  • Save ariens/d3ae82c5d36182f8504a to your computer and use it in GitHub Desktop.
Save ariens/d3ae82c5d36182f8504a to your computer and use it in GitHub Desktop.
An Experiment in Fast Text Matching
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.blackberry.bdp.samples;
import java.nio.charset.Charset;
import java.util.ArrayList;
import static java.util.Arrays.sort;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class StringMatcher
{
List<SortedKeyword> keywords = new ArrayList<>();
int textIndex;
int keywordIndex;
int keywordNum;
int keywordMatchIndex;
Logger LOG = LoggerFactory.getLogger(StringMatcher.class);
public StringMatcher(String[] keywordStrings) throws Exception
{
sort(keywordStrings);
for (keywordNum = 0; keywordNum < keywordStrings.length; keywordNum++)
{
int startingUniqueOffset = 0;
byte[] bytes = keywordStrings[keywordNum].getBytes(Charset.forName("UTF-8"));
// Compare subsequent keywords to the previous and determine
// the position of the first non-matching character
if (keywordNum > 0)
{
if (keywordStrings[keywordNum].equals(keywordStrings[keywordNum - 1]))
{
throw new Exception("Can't have duplicate keywords");
}
for (int charNum = 0; charNum < keywordStrings[keywordNum].length(); charNum++)
{
if(bytes[charNum] != keywords.get(keywordNum - 1).bytes[charNum])
{
startingUniqueOffset = charNum;
break;
}
}
}
keywords.add(new SortedKeyword(keywordStrings[keywordNum], bytes, startingUniqueOffset));
}
}
public boolean utf8ByteArrayContainsAnyKeyword(byte[] bytes)
{
textIndex = 0;
keywordNum = 0;
keywordIndex = 0;
keywordMatchIndex = 0;
LOG.trace("length of byteArray: {}", bytes.length);
// For every character in our input text...
while(textIndex < bytes.length)
{
// For every keyword...
while (keywordNum < keywords.size())
{
// Starting at our unique offset...
keywordIndex = keywords.get(keywordNum).startingUniqueOffset;
// Checking every subsequent character in the keyword
while(keywordIndex < keywords.get(keywordNum).bytes.length)
{
if (bytes[textIndex + keywordIndex] != keywords.get(keywordNum).bytes[keywordIndex])
{
break; // No match, the next keyword will be checked
}
else if (keywordIndex == keywords.get(keywordNum).bytes.length)
{
// If this the keywordIndex is the length of the keyword, we matched
return true;
}
keywordIndex++;
}
keywordNum++;
}
// Advance the text index by as many positions as were matched in all keywords
textIndex+= keywordIndex;
}
return false;
}
private class SortedKeyword
{
private final String keyword;
private final byte[] bytes;
private final int startingUniqueOffset;
private SortedKeyword(String keyword, byte[] bytes,int startingUniqueOffset)
{
this.keyword = keyword;
this.bytes = bytes;
this.startingUniqueOffset = startingUniqueOffset;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment