Last active
May 25, 2019 08:22
-
-
Save SOF3/9b5528642c4db74c81fc97378424d6cc to your computer and use it in GitHub Desktop.
Java code to search haystack with a large amount of substrings efficiently
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.HashMap; | |
import java.util.Map; | |
class NeedleTree{ | |
private int codePoint; | |
private boolean selfSet = false; | |
private Map<Integer, NeedleTree> children = new HashMap<>(); | |
public NeedleTree(int codePoint){ | |
this.codePoint = codePoint; | |
} | |
public NeedleTree childOrNull(int codePoint){ | |
return children.get(codePoint); | |
} | |
public NeedleTree child(int codePoint){ | |
NeedleTree child = children.get(codePoint); | |
if(child == null){ | |
children.put(codePoint, child = new NeedleTree(codePoint)); | |
} | |
return child; | |
} | |
public boolean isSelfSet(){ | |
return selfSet; | |
} | |
public void markSelfSet(){ | |
selfSet = true; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.HashMap; | |
class StringSearcher{ | |
private NeedleTree needles = new NeedleTree(-1); | |
private boolean caseSensitive; | |
private List<Integer> lengths = new ArrayList<>(); | |
private int maxLength; | |
public StringSearcher(List<String> inputs, boolean caseSensitive){ | |
this.caseSensitive = caseSensitive; | |
for(String input : inputs){ | |
if(!lengths.contains(input.length())){ | |
lengths.add(input.length()); | |
} | |
NeedleTree tree = needles; | |
for(int i = 0; i < input.length(); i++){ | |
tree = tree.child(caseSensitive ? input.codePointAt(i) : Character.toLowerCase(input.codePointAt(i))); | |
} | |
tree.markSelfSet(); | |
} | |
maxLength = Collections.max(lengths); | |
} | |
public boolean matches(String haystack){ | |
if(!caseSensitive){ | |
haystack = haystack.toLowerCase(); | |
} | |
for(int i = 0; i < haystack.length(); i++){ | |
String substring = haystack.substring(i, Math.min(i + maxLength, haystack.length())); // maybe we can even skip this and use from haystack directly? | |
NeedleTree tree = needles; | |
for(int j = 0; j < substring.length(); j++){ | |
tree = tree.childOrNull(substring.codePointAt(j)); | |
if(tree == null){ | |
break; | |
} | |
if(tree.isSelfSet()){ | |
return true; | |
} | |
} | |
} | |
return false; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment