ariens · April 2, 2015 18:43
diff --git a/StringMatcher.java b/StringMatcher.java
 /*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
 package com.blackberry.bdp.samples;

 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import static java.util.Arrays.sort;
 import java.util.List;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 public class StringMatcher
 {
 	List<SortedKeyword> keywords = new ArrayList<>();
 		
 	int textIndex;
 	int keywordIndex;
 	int keywordNum;
 	int keywordMatchIndex;
 		
 	Logger LOG = LoggerFactory.getLogger(StringMatcher.class);
 	
 	public StringMatcher(String[] keywordStrings) throws Exception
 	{
 		sort(keywordStrings);
 		
 		for (keywordNum = 0; keywordNum < keywordStrings.length; keywordNum++)
 		{
 			int startingUniqueOffset = 0;
 			byte[] bytes = keywordStrings[keywordNum].getBytes(Charset.forName("UTF-8"));

 			// Compare subsequent keywords to the previous and determine 
 			// the position of the first non-matching character
 			
 			if (keywordNum > 0)
 			{
 				if (keywordStrings[keywordNum].equals(keywordStrings[keywordNum - 1]))
 				{
 					throw new Exception("Can't have duplicate keywords");						 
 				}
 				
 				for (int charNum = 0; charNum < keywordStrings[keywordNum].length(); charNum++)
 				{
 					if(bytes[charNum] != keywords.get(keywordNum - 1).bytes[charNum])
 					{
 						startingUniqueOffset = charNum;
 						break;
 					}
 				}
 			}
 			
 			keywords.add(new SortedKeyword(keywordStrings[keywordNum], bytes, startingUniqueOffset));
 		}
 	}
 	
 	public boolean utf8ByteArrayContainsAnyKeyword(byte[] bytes)
 	{
 		textIndex = 0;
 		keywordNum = 0;
 		keywordIndex = 0;
 		keywordMatchIndex = 0;
 		
 		LOG.trace("length of byteArray: {}", bytes.length);
 		
 		// For every character in our input text...
 		while(textIndex < bytes.length)
 		{
 			// For every keyword...
 			while (keywordNum < keywords.size())
 			{
 				// Starting at our unique offset...
 				keywordIndex = keywords.get(keywordNum).startingUniqueOffset;
 				
 				//  Checking every subsequent character in the keyword
 				while(keywordIndex < keywords.get(keywordNum).bytes.length)
 				{					
 					if (bytes[textIndex + keywordIndex] != keywords.get(keywordNum).bytes[keywordIndex])
 					{						
 						break; // No match, the next keyword will be checked
 					}
 					
 					else if (keywordIndex == keywords.get(keywordNum).bytes.length)
 					{
 						// If this the keywordIndex is the length of the keyword, we matched
 						return true;
 					}					
 					
 					keywordIndex++;
 				}	
 				
 				keywordNum++;
 			}
 			
 			// Advance the text index by as many positions as were matched in all keywords
 			
 			textIndex+= keywordIndex;
 		}		
 		
 		return false;
 	}
 	
 	private class SortedKeyword
 	{		
 		private final String keyword;
 		private final byte[] bytes;		
 		private final int startingUniqueOffset; 
 		
 		private SortedKeyword(String keyword, byte[] bytes,int startingUniqueOffset)
 		{
 			this.keyword = keyword;
 			this.bytes = bytes;
 			this.startingUniqueOffset = startingUniqueOffset;
 		}
 	}
 	
 }
	/*
	* To change this license header, choose License Headers in Project Properties.
	* To change this template file, choose Tools \| Templates
	* and open the template in the editor.
	*/
	package com.blackberry.bdp.samples;

	import java.nio.charset.Charset;
	import java.util.ArrayList;
	import static java.util.Arrays.sort;
	import java.util.List;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	public class StringMatcher
	{
	List<SortedKeyword> keywords = new ArrayList<>();

	int textIndex;
	int keywordIndex;
	int keywordNum;
	int keywordMatchIndex;

	Logger LOG = LoggerFactory.getLogger(StringMatcher.class);

	public StringMatcher(String[] keywordStrings) throws Exception
	{
	sort(keywordStrings);

	for (keywordNum = 0; keywordNum < keywordStrings.length; keywordNum++)
	{
	int startingUniqueOffset = 0;
	byte[] bytes = keywordStrings[keywordNum].getBytes(Charset.forName("UTF-8"));

	// Compare subsequent keywords to the previous and determine
	// the position of the first non-matching character

	if (keywordNum > 0)
	{
	if (keywordStrings[keywordNum].equals(keywordStrings[keywordNum - 1]))
	{
	throw new Exception("Can't have duplicate keywords");
	}

	for (int charNum = 0; charNum < keywordStrings[keywordNum].length(); charNum++)
	{
	if(bytes[charNum] != keywords.get(keywordNum - 1).bytes[charNum])
	{
	startingUniqueOffset = charNum;
	break;
	}
	}
	}

	keywords.add(new SortedKeyword(keywordStrings[keywordNum], bytes, startingUniqueOffset));
	}
	}

	public boolean utf8ByteArrayContainsAnyKeyword(byte[] bytes)
	{
	textIndex = 0;
	keywordNum = 0;
	keywordIndex = 0;
	keywordMatchIndex = 0;

	LOG.trace("length of byteArray: {}", bytes.length);

	// For every character in our input text...
	while(textIndex < bytes.length)
	{
	// For every keyword...
	while (keywordNum < keywords.size())
	{
	// Starting at our unique offset...
	keywordIndex = keywords.get(keywordNum).startingUniqueOffset;

	// Checking every subsequent character in the keyword
	while(keywordIndex < keywords.get(keywordNum).bytes.length)
	{
	if (bytes[textIndex + keywordIndex] != keywords.get(keywordNum).bytes[keywordIndex])
	{
	break; // No match, the next keyword will be checked
	}

	else if (keywordIndex == keywords.get(keywordNum).bytes.length)
	{
	// If this the keywordIndex is the length of the keyword, we matched
	return true;
	}

	keywordIndex++;
	}

	keywordNum++;
	}

	// Advance the text index by as many positions as were matched in all keywords

	textIndex+= keywordIndex;
	}

	return false;
	}

	private class SortedKeyword
	{
	private final String keyword;
	private final byte[] bytes;
	private final int startingUniqueOffset;

	private SortedKeyword(String keyword, byte[] bytes,int startingUniqueOffset)
	{
	this.keyword = keyword;
	this.bytes = bytes;
	this.startingUniqueOffset = startingUniqueOffset;
	}
	}

	}