Created
March 22, 2012 15:14
-
-
Save justinkamerman/2158931 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.DataInputStream; | |
import java.io.EOFException; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.InputStreamReader; | |
import java.io.IOException; | |
import java.util.List; | |
import java.util.Set; | |
import java.util.logging.Logger; | |
import util.*; | |
public class AhoIndexer extends DocumentTask | |
{ | |
private static Logger log = Logger.getLogger (AhoIndexer.class.getName()); | |
private StateMachine __sm; | |
public AhoIndexer (StateMachine sm, Document document, List<Match> matches) | |
{ | |
super (document, matches); | |
__sm = sm; | |
} | |
/** | |
* Index a document and return match result | |
*/ | |
public Match work (Document document) | |
{ | |
log.fine ("Processing document " + document.getName()); | |
Match match = new Match (document); | |
ExecutionContext ctx = new ExecutionContext (__sm); | |
try | |
{ | |
InputStreamReader isr = new InputStreamReader (new FileInputStream (document.getFile()), "US-ASCII"); | |
BufferedReader br = new BufferedReader (isr); | |
int b; | |
while ((b = br.read()) != -1) | |
{ | |
char a = (char) Character.toLowerCase(b); | |
Set<String> output = ctx.goTo (a); | |
log.finest (String.format("--%c--> %d", a, ctx.getState().getId())); | |
// If there is any output from the target state, add it to the document match | |
if ( output != null ) | |
{ | |
match.addOccurrence (output, ctx.getPosition()); | |
} | |
} | |
br.close(); | |
} | |
catch (FileNotFoundException ex) | |
{ | |
log.severe ("File not found: " + document.getName() + ": " + ex.toString()); | |
} | |
catch (IOException ex) | |
{ | |
log.severe ("IO Exception reading document: " + ex.toString()); | |
} | |
return match; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.DataInputStream; | |
import java.io.EOFException; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.InputStreamReader; | |
import java.io.IOException; | |
import java.util.List; | |
import java.util.Set; | |
import java.util.logging.Logger; | |
import util.*; | |
public class AhoSearcher extends DocumentTask | |
{ | |
private static Logger log = Logger.getLogger (AhoSearcher.class.getName()); | |
private StateMachine __sm; | |
private Set<String> __searchWords; | |
public AhoSearcher (StateMachine sm, Document document, List<Match> matches, Set<String> searchWords) | |
{ | |
super (document, matches); | |
__sm = sm; | |
__searchWords = searchWords; | |
} | |
/** | |
* Returns true if the given document contains at least one | |
* occurrence of each keyword. | |
*/ | |
public Match work (Document document) | |
{ | |
log.fine ("Processing document " + document.getName()); | |
Match match = new Match (document); | |
ExecutionContext ctx = new ExecutionContext (__sm); | |
try | |
{ | |
InputStreamReader isr = new InputStreamReader (new FileInputStream (document.getFile()), "US-ASCII"); | |
BufferedReader br = new BufferedReader (isr); | |
int b; | |
while ((b = br.read()) != -1) | |
{ | |
char a = (char) Character.toLowerCase(b); | |
Set<String> output = ctx.goTo (a); | |
log.finest (String.format("--%c--> %d", a, ctx.getState().getId())); | |
// If there is any output from the target state, check | |
// it against the search word list | |
if ( output != null ) | |
{ | |
for (String matchword : output) | |
{ | |
if (__searchWords.contains (matchword)) | |
{ | |
match.addOccurrence (matchword, ctx.getPosition()); | |
if ( match.size() == __searchWords.size() ) return match; | |
} | |
} | |
// Check if we've found each search word | |
if (__searchWords.size() == 0) return match; | |
} | |
} | |
br.close (); | |
} | |
catch (FileNotFoundException ex) | |
{ | |
log.severe ("File not found: " + document.getName() + ": " + ex.toString()); | |
} | |
catch (IOException ex) | |
{ | |
log.severe ("IO Exception reading document: " + ex.toString()); | |
} | |
return null; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.DataInputStream; | |
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Set; | |
import java.util.logging.Logger; | |
import util.*; | |
/** | |
* This class externalizes an execution path through an Aho state | |
* machine so that a single machine can be used simultaneously by | |
* multiple threads. | |
*/ | |
public class ExecutionContext | |
{ | |
private static Logger log = Logger.getLogger (ExecutionContext.class.getName()); | |
private StateMachine __stateMachine; | |
private State __state; | |
private int __position; | |
/** | |
* Create execution context which can be used by a thread to | |
* navigate the state machine without effecting other threads | |
* doing the same. | |
*/ | |
private ExecutionContext () {}; | |
public ExecutionContext (StateMachine stateMachine) | |
{ | |
__stateMachine = stateMachine; | |
__state = __stateMachine.getStartState(); | |
__position = 0; | |
} | |
/** | |
* Return current state | |
*/ | |
public State getState () | |
{ | |
return __state; | |
} | |
/** | |
* Get current document position | |
*/ | |
public int getPosition () | |
{ | |
return __position; | |
} | |
/** | |
* Advance to the next state, following failure paths if necessary. | |
*/ | |
public Set<String> goTo (Character a) | |
{ | |
// Aho75: algorithm 1 | |
// Follow failure paths until we hit a state that has an edge | |
// matching the input character | |
while ( __state.goTo (a) == null ) __state = __state.getFailure (); | |
__state = __state.goTo (a); | |
__position++; | |
return __state.getOutput (); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.Set; | |
import java.util.logging.Logger; | |
public class State implements Iterable<State> | |
{ | |
private static Logger log = Logger.getLogger (State.class.getName()); | |
private int __id = 0; | |
private HashMap<Character, State> __transitions = new HashMap<Character, State> (); | |
private HashSet<String> __output = new HashSet<String>(); | |
private State __failure; | |
private State () {} | |
public State (int id) | |
{ | |
__id = id; | |
} | |
public int getId () | |
{ | |
return __id; | |
} | |
public HashMap<Character, State> getTransitions () | |
{ | |
return __transitions; | |
} | |
/** | |
* Create a new state and add a transition to it from this state | |
* with the given label. Return a reference to the new state (convenience) | |
* | |
* NOTE: all transitions must be added before the State is actually used. | |
*/ | |
public State addTransition (Character a, State s) | |
{ | |
__transitions.put (a, s); | |
return s; | |
} | |
public Set<String> getOutput () | |
{ | |
return __output; | |
} | |
public void addOutput (String output) | |
{ | |
if (output != null) | |
{ | |
__output.add (output); | |
} | |
else | |
{ | |
log.warning ("Adding null output string"); | |
} | |
} | |
public void mergeOutput (Set<String> output) | |
{ | |
__output.addAll (output); | |
} | |
public void setFailure (State s) | |
{ | |
__failure = s; | |
} | |
public State getFailure () | |
{ | |
return __failure; | |
} | |
public boolean isStart () | |
{ | |
return __id == 0; | |
} | |
/** | |
* Return state that we would transition to on input a. Returns | |
* null if there is no transition for the input character | |
* i.e. does not follow failure paths. | |
*/ | |
public State goTo (Character a) | |
{ | |
State target = __transitions.get (a); | |
if (target == null && this.isStart()) | |
{ | |
// Start state loops | |
return this; | |
} | |
else | |
{ | |
return target; | |
} | |
} | |
/** | |
* Iterable interface method | |
*/ | |
public Iterator<State> iterator () | |
{ | |
return new StateIterator (this); | |
} | |
public String toString () | |
{ | |
StringBuffer sb = new StringBuffer (); | |
sb.append (String.format("[id=%s][failure=%s][transitions[", | |
__id, __failure == null ? "" : __failure.getId())); | |
for ( Character c : __transitions.keySet() ) | |
{ | |
sb.append ("[" + c.toString() + " -> " + __transitions.get(c).getId() + "]"); | |
} | |
sb.append ("]"); | |
return sb.toString (); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.Iterator; | |
import java.util.Stack; | |
import java.util.logging.Logger; | |
public class StateIterator implements Iterator<State> | |
{ | |
private static Logger log = Logger.getLogger (StateIterator.class.getName()); | |
private Stack<State> __stack = new Stack<State>(); | |
private ArrayList<State> __visited = new ArrayList<State>(); | |
private StateIterator () {} | |
public StateIterator (State state) | |
{ | |
__stack.push (state); | |
} | |
public boolean hasNext () | |
{ | |
return (__stack.size() != 0); | |
} | |
public State next () | |
{ | |
State current = __stack.pop (); | |
for ( State child : current.getTransitions().values() ) | |
{ | |
// Put state on the stack if we haven't visited already | |
if ( ! __visited.contains (child) ) | |
{ | |
__stack.push (child); | |
} | |
} | |
return current; | |
} | |
public void remove () | |
{ | |
throw new UnsupportedOperationException (); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.Iterator; | |
import java.util.LinkedList; | |
import java.util.List; | |
import java.util.Queue; | |
import java.util.logging.Logger; | |
public class StateMachine | |
{ | |
private static Logger log = Logger.getLogger (StateMachine.class.getName()); | |
private static int __stateIdSequence = 0; | |
private State __startState; | |
private StateMachine () {} | |
public StateMachine (List<String> keywords) | |
{ | |
__startState = new State (nextStateId()); | |
// Construct goto function (Aho75 alogorithm 2) | |
for (String keyword : keywords ) | |
{ | |
log.finest ("Adding keyword: " + keyword); | |
enter (keyword); | |
} | |
log.finest ("Added " + keywords.size() + " keywords"); | |
// Construct failure function (Aho75 Algorithm 3) | |
constructFailureFunction (); | |
// Eliminate failure transitions - Aho75 algorithm 4 | |
} | |
public State getStartState () | |
{ | |
return __startState; | |
} | |
private static int nextStateId () | |
{ | |
return __stateIdSequence++; | |
} | |
/** | |
* Add pathways for a new keyword | |
*/ | |
private void enter (String keyword) | |
{ | |
State s = __startState; | |
int j = 0; | |
// Follow existing path as far as possible, don't use goTo() | |
// because it has pseudo loops on start state | |
for (j = 0; j < keyword.length(); j++) | |
{ | |
if (s.getTransitions().get(keyword.charAt(j)) == null) break; | |
s = s.getTransitions().get(keyword.charAt(j)); | |
} | |
// Extend path | |
for (int p = j; p < keyword.length(); p++) | |
{ | |
State newState = new State (nextStateId()); | |
s = s.addTransition (keyword.charAt(j++), newState); | |
} | |
s.addOutput (keyword); | |
} | |
/** | |
* Construct failure function | |
*/ | |
private void constructFailureFunction () | |
{ | |
Queue<State> queue = new LinkedList<State>(); | |
for (Character a : __startState.getTransitions().keySet()) | |
{ | |
State s = __startState.goTo(a); | |
log.finest ("Constructing failure function for level one state " + s.getId()); | |
if ( s.getId() != 0 ) | |
{ | |
s.setFailure (__startState); | |
queue.add (s); | |
} | |
} | |
while ( queue.size() != 0 ) | |
{ | |
State r = queue.remove (); | |
for (Character a : r.getTransitions().keySet()) | |
{ | |
State s = r.getTransitions().get(a); | |
queue.add (s); | |
State state = r.getFailure (); | |
// Special condition for start state because we didn't | |
// add failure edges for this state | |
while (state.goTo(a) == null && ! state.isStart()) | |
{ | |
state = state.getFailure(); | |
} | |
s.setFailure (state.goTo(a)); | |
s.mergeOutput (s.getFailure().getOutput()); | |
} | |
} | |
} | |
public String dot () | |
{ | |
StringBuffer sb = new StringBuffer (); | |
State s; | |
sb.append ("digraph G {\n"); | |
Iterator<State> iter = __startState.iterator (); | |
while (iter.hasNext()) | |
{ | |
s = iter.next (); | |
sb.append (String.format ("\t%s [label=\"%s\", shape=circle];\n", | |
s.getId(), | |
s.getId() + " " + s.getOutput())); | |
// Goto edges | |
for (Character a : s.getTransitions().keySet()) | |
{ | |
sb.append (String.format ("\t%s -> %s [label=\"%s\"];\n", | |
s.getId(), | |
s.getTransitions().get(a).getId(), | |
a)); | |
} | |
// Failure function | |
if ( s.getFailure() != null) | |
{ | |
sb.append (String.format ("\t%s -> %s [color=\"red\"];\n", | |
s.getId(), | |
s.getFailure().getId())); | |
} | |
} | |
sb.append ("}"); | |
return sb.toString (); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment