Created
October 18, 2012 05:45
-
-
Save stepchowfun/3910075 to your computer and use it in GitHub Desktop.
This header declares structures and interfaces for manipulating finite automata, both deterministic and nondeterministic.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
This header declares structures and interfaces for manipulating finite automata, | |
both deterministic and nondeterministic. | |
The code is written in a portable subset of C++11. The only C++11 features used | |
are std::unordered_map and std::unordered_set, which easily can be replaced with | |
the (less-efficient) C++03 equivalents: std::map and std::set. | |
*/ | |
#ifndef FINITE_AUTOMATON_H | |
#define FINITE_AUTOMATON_H | |
#include <string> | |
#include <utility> | |
#include <vector> | |
#include <map> | |
#include <sstream> | |
#include <algorithm> | |
#include <unordered_map> | |
#include <unordered_set> | |
#include <cmath> | |
namespace finite_automata { | |
// exception class | |
class automaton_error { | |
public: | |
std::string message; | |
automaton_error() { | |
} | |
automaton_error(std::string msg) { | |
message = msg; | |
} | |
}; | |
// hash functionality for std::pair<size_t, size_t> | |
class hash_pair { | |
public: | |
size_t operator() (const std::pair<size_t, size_t> &x) const { | |
// the hash is simply the sum of the constituent integers | |
// size_t is unsigned, so overflow on addition is well-defined (modular arithmetic) | |
// if the constituent integers are uniformly distributed, the hash will be also | |
return (x.first << (sizeof(size_t)/2)) + x.second; | |
} | |
}; | |
// hash functionality for std::unordered_set<size_t> | |
class hash_set { | |
public: | |
size_t operator() (const std::unordered_set<size_t> &x) const { | |
// we don't want to rely on a hash function which assumes an ordering of the elements in this set | |
// we can let the hash be the sum of the constituent integers, since addition is commutative | |
// size_t is unsigned, so overflow on addition is well-defined (modular arithmetic) | |
// if the constituent integers are uniformly distributed, the hash will be also | |
size_t result = 0; | |
for (typename std::unordered_set<size_t>::const_iterator it = x.begin(); it != x.end(); ++it) | |
result += *it; | |
return result; | |
} | |
}; | |
// main class for finite automata | |
template <class SymbolType, class PayloadType> class finite_automaton { | |
public: | |
// each state stores a set of user-defined payloads that are preserved across transformations | |
std::vector<std::unordered_set<PayloadType> > states; | |
// there can be multiple start states, which are represented as indices into the states vector | |
std::unordered_set<size_t> start_states; | |
// there can be multiple end states, which are represented as indices into the states vector | |
std::unordered_set<size_t> accept_states; | |
// the non-epsilon transitions are represented as (current_state_id, next_state_id) -> [symbol] | |
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> transitions; | |
// the epsilon transitions are represented as (current_state_id, next_state_id) -> [symbol] | |
std::unordered_set<std::pair<size_t, size_t>, hash_pair> epsilon_transitions; | |
// the alphabet | |
std::unordered_set<SymbolType> alphabet; | |
// constructors | |
finite_automaton() { | |
// each member field has a default constructor, | |
// so there is no initialization to do here | |
} | |
finite_automaton(std::vector<std::unordered_set<PayloadType> > fa_states, | |
std::unordered_set<size_t> fa_start_states, | |
std::unordered_set<size_t> fa_accept_states, | |
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> fa_transitions, | |
std::unordered_set<std::pair<size_t, size_t>, hash_pair> fa_epsilon_transitions, | |
std::unordered_set<SymbolType> fa_alphabet) { | |
// make sure each start state exists | |
for (typename std::unordered_set<size_t>::const_iterator it = fa_start_states.begin(); it != fa_start_states.end(); ++it) { | |
if (*it >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "start state " << *it << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
} | |
// make sure each accept state exists | |
for (typename std::unordered_set<size_t>::const_iterator it = fa_accept_states.begin(); it != fa_accept_states.end(); ++it) { | |
if (*it >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "accept state " << *it << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
} | |
// make sure the transitions go to and from existing states over symbols in the alphabet | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = fa_transitions.begin(); it != fa_transitions.end(); ++it) { | |
if (it->first.first >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "state " << it->first.first << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
if (it->first.second >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "state " << it->first.second << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
for (typename std::unordered_set<SymbolType>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) { | |
if (fa_alphabet.find(*that) == fa_alphabet.end()) { | |
std::stringstream ss; | |
ss << "symbol " << *that << " is not in alphabet"; | |
throw automaton_error(ss.str()); | |
} | |
} | |
} | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = fa_epsilon_transitions.begin(); it != fa_epsilon_transitions.end(); ++it) { | |
if (it->first >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "state " << it->first << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
if (it->second >= fa_states.size()) { | |
std::stringstream ss; | |
ss << "state " << it->second << " does not exist"; | |
throw automaton_error(ss.str()); | |
} | |
} | |
// assign the member fields | |
states = fa_states; | |
start_states = fa_start_states; | |
accept_states = fa_accept_states; | |
transitions = fa_transitions; | |
epsilon_transitions = fa_epsilon_transitions; | |
alphabet = fa_alphabet; | |
} | |
// get a string representation of the finite automaton | |
std::string get_repr() { | |
// build the result in a stringstream buffer | |
std::stringstream ss; | |
// print the states | |
ss << "states: "; | |
for (size_t i = 0; i < states.size(); i++) { | |
if (i != 0) | |
ss << ", "; | |
ss << i << " {"; | |
for (typename std::unordered_set<PayloadType>::const_iterator it = states[i].begin(); it != states[i].end(); ++it) { | |
if (it != states[i].begin()) | |
ss << ", "; | |
ss << *it; | |
} | |
ss << "}"; | |
} | |
// print the start states | |
ss << "\nstart states:"; | |
for (typename std::unordered_set<size_t>::const_iterator it = start_states.begin(); it != start_states.end(); ++it) | |
ss << " " << *it; | |
// print the accept states | |
ss << "\naccept states:"; | |
for (typename std::unordered_set<size_t>::const_iterator it = accept_states.begin(); it != accept_states.end(); ++it) | |
ss << " " << *it; | |
// print the transitions | |
ss << "\ntransitions:\n"; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = transitions.begin(); it != transitions.end(); ++it) { | |
ss << " " << it->first.first << " -> ("; | |
for (typename std::unordered_set<SymbolType>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) { | |
if (that != it->second.begin()) | |
ss << ", "; | |
ss << *that; | |
} | |
ss << ") -> " << it->first.second << "\n"; | |
} | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = epsilon_transitions.begin(); it != epsilon_transitions.end(); ++it) | |
ss << " " << it->first << " -> () -> " << it->second << "\n"; | |
// convert the buffer to a string and return it | |
return ss.str(); | |
} | |
// take the complement of a finite automaton | |
static finite_automaton<SymbolType, PayloadType> automaton_complement(const finite_automaton<SymbolType, PayloadType> &a) { | |
// convert the automaton to a DFA | |
finite_automaton<SymbolType, PayloadType> result = automaton_dfa(a); | |
// swap accept states with non-accept states | |
std::unordered_set<size_t> new_accept_states; | |
for (size_t i = 0; i < result.states.size(); ++i) { | |
if (result.accept_states.find(i) == result.accept_states.end()) | |
new_accept_states.insert(i); | |
} | |
result.accept_states = new_accept_states; | |
// return the automaton | |
return result; | |
} | |
// construct a finite automaton which recognizes the reverse of the language recognized by a finite automaton | |
static finite_automaton<SymbolType, PayloadType> automaton_reverse(const finite_automaton<SymbolType, PayloadType> &a) { | |
// create a copy of the automaton | |
finite_automaton<SymbolType, PayloadType> result = a; | |
result.alphabet = a.alphabet; | |
// reverse the transitions | |
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> transitions; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = result.transitions.begin(); it != result.transitions.end(); ++it) | |
transitions[std::pair<size_t, size_t>(it->first.second, it->first.first)] = it->second; | |
result.transitions = transitions; | |
// reverse the epsilon transitions | |
std::unordered_set<std::pair<size_t, size_t>, hash_pair> epsilon_transitions; | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = result.epsilon_transitions.begin(); it != result.epsilon_transitions.end(); ++it) | |
epsilon_transitions.insert(std::pair<size_t, size_t>(it->second, it->first)); | |
result.epsilon_transitions = epsilon_transitions; | |
// swap start and accept states | |
std::unordered_set<size_t> old_accept_states = result.accept_states; | |
result.accept_states = result.start_states; | |
result.start_states = old_accept_states; | |
// return the automaton | |
return result; | |
} | |
// take the union of two finite automata | |
static finite_automaton<SymbolType, PayloadType> automaton_union(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) { | |
// start with a new automaton | |
finite_automaton<SymbolType, PayloadType> result; | |
result.alphabet = a.alphabet; | |
if (a.alphabet != b.alphabet) | |
throw automaton_error("automata alphabets do not match"); | |
// take the union of the states of the two automata | |
result.states = a.states; | |
result.states.insert(result.states.end(), b.states.begin(), b.states.end()); | |
// take the union of the start states of the two automata | |
result.start_states = a.start_states; | |
for (typename std::unordered_set<size_t>::const_iterator it = b.start_states.begin(); it != b.start_states.end(); ++it) | |
result.start_states.insert((*it) + a.states.size()); | |
// take the union of the accept states of the two automata | |
result.accept_states = a.accept_states; | |
for (typename std::unordered_set<size_t>::const_iterator it = b.accept_states.begin(); it != b.accept_states.end(); ++it) | |
result.accept_states.insert((*it) + a.states.size()); | |
// preserve the transitions of the original automata | |
result.transitions = a.transitions; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = b.transitions.begin(); it != b.transitions.end(); ++it) | |
result.transitions[std::pair<size_t, size_t>(it->first.first + a.states.size(), it->first.second + a.states.size())] = it->second; | |
// preserve the epsilon transitions of the original automata | |
result.epsilon_transitions = a.epsilon_transitions; | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = b.epsilon_transitions.begin(); it != b.epsilon_transitions.end(); ++it) | |
result.epsilon_transitions.insert(std::pair<size_t, size_t>(it->first + a.states.size(), it->second + a.states.size())); | |
// return the automaton | |
return result; | |
} | |
// take the intersection of two finite automata | |
static finite_automaton<SymbolType, PayloadType> automaton_intersection(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) { | |
// the intersection of two automata is the complement of the union of their complements | |
return automaton_complement(automaton_union(automaton_complement(a), automaton_complement(b))); | |
} | |
// create a finite automaton which recognizes the concatenation of the regular languages recognized by two finite automata | |
static finite_automaton<SymbolType, PayloadType> automaton_concat(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) { | |
// start with a new automaton | |
finite_automaton<SymbolType, PayloadType> result; | |
result.alphabet = a.alphabet; | |
if (a.alphabet != b.alphabet) | |
throw automaton_error("automata alphabets do not match"); | |
// take the union of the states of a and b | |
result.states = a.states; | |
result.states.insert(result.states.end(), b.states.begin(), b.states.end()); | |
// set the start states to be the start states of a | |
result.start_states = a.start_states; | |
// set the accept states to be the accept states of b | |
for (typename std::unordered_set<size_t>::const_iterator it = b.accept_states.begin(); it != b.accept_states.end(); ++it) | |
result.accept_states.insert((*it) + a.states.size()); | |
// preserve the transitions of the original automata | |
result.transitions = a.transitions; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = b.transitions.begin(); it != b.transitions.end(); ++it) | |
result.transitions[std::pair<size_t, size_t>(it->first.first + a.states.size(), it->first.second + a.states.size())] = it->second; | |
// preserve the epsilon transitions of the original automata | |
result.epsilon_transitions = a.epsilon_transitions; | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = b.epsilon_transitions.begin(); it != b.epsilon_transitions.end(); ++it) | |
result.epsilon_transitions.insert(std::pair<size_t, size_t>(it->first + a.states.size(), it->second + a.states.size())); | |
// add epsilon transitions from the accept states of a to the start states of b | |
for (typename std::unordered_set<size_t>::const_iterator ita = a.accept_states.begin(); ita != a.accept_states.end(); ++ita) { | |
for (typename std::unordered_set<size_t>::const_iterator itb = b.start_states.begin(); itb != b.start_states.end(); ++itb) | |
result.epsilon_transitions.insert(std::pair<size_t, size_t>(*ita, *itb + a.states.size())); | |
} | |
// return the automaton | |
return result; | |
} | |
// create a finite automaton which recognizes the Kleene star of the regular languages recognized by two finite automata | |
static finite_automaton<SymbolType, PayloadType> automaton_star(const finite_automaton<SymbolType, PayloadType> &a) { | |
// create a copy of the automaton | |
finite_automaton<SymbolType, PayloadType> result = a; | |
result.alphabet = a.alphabet; | |
// add epsilon transitions from each accept state to each start state | |
for (typename std::unordered_set<size_t>::const_iterator ita = a.accept_states.begin(); ita != a.accept_states.end(); ++ita) { | |
for (typename std::unordered_set<size_t>::const_iterator its = a.start_states.begin(); its != a.start_states.end(); ++its) | |
result.epsilon_transitions.insert(std::pair<size_t, size_t>(*ita, *its)); | |
} | |
// create a new start state | |
std::unordered_set<PayloadType> payload_values; | |
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it) | |
payload_values.insert(a.states[*it].begin(), a.states[*it].end()); | |
result.states.push_back(payload_values); | |
// create epsilon transitions from the new start state to the old ones | |
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it) | |
result.epsilon_transitions.insert(std::pair<size_t, size_t>(result.states.size() - 1, *it)); | |
// set the new state to be the only start state and add it to the accept states | |
result.start_states.clear(); | |
result.start_states.insert(result.states.size() - 1); | |
result.accept_states.insert(result.states.size() - 1); | |
// return the automaton | |
return result; | |
} | |
// convert a possibly nondeterministic finite state automaton into a deterministic one | |
static finite_automaton<SymbolType, PayloadType> automaton_dfa(const finite_automaton<SymbolType, PayloadType> &a, PayloadType new_sink_state) { | |
// start with a new automaton | |
finite_automaton<SymbolType, PayloadType> result; | |
result.alphabet = a.alphabet; | |
// keep track of NFA states on the frontier and states that have already been constructed | |
std::unordered_set<std::unordered_set<size_t>, hash_set> visited; | |
std::unordered_set<std::unordered_set<size_t>, hash_set> frontier; | |
// keep a map from DFA state to index | |
std::unordered_map<std::unordered_set<size_t>, size_t, hash_set> dfa_state_map; | |
// compute the epsilon closure of every state for later | |
std::unordered_map<size_t, std::unordered_set<size_t> > epsilon_closures; | |
for (size_t i = 0; i < a.states.size(); ++i) { | |
epsilon_closures[i] = std::unordered_set<size_t>(); | |
epsilon_closures[i].insert(i); | |
} | |
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = a.epsilon_transitions.begin(); it != a.epsilon_transitions.end(); ++it) | |
epsilon_closures[it->first].insert(it->second); | |
// compute the union of the epsilon closures of the DFA start states | |
std::unordered_set<size_t> epsilon_closure_start_states; | |
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it) { | |
std::unordered_set<size_t> epsilon_closure = epsilon_closures[*it]; | |
for (typename std::unordered_set<size_t>::const_iterator that = epsilon_closure.begin(); that != epsilon_closure.end(); ++that) | |
epsilon_closure_start_states.insert(*that); | |
} | |
// add this start state to the frontier | |
frontier.insert(epsilon_closure_start_states); | |
// construct the DFA | |
while (!frontier.empty()) { | |
// take an arbitrary DFA state from the frontier | |
std::unordered_set<size_t> dfa_state = *(frontier.begin()); | |
frontier.erase(frontier.begin()); | |
// create the DFA state to represent this subset of NFA states | |
if (dfa_state_map.find(dfa_state) == dfa_state_map.end()) { | |
std::unordered_set<PayloadType> payload_values; | |
for (typename std::unordered_set<size_t>::const_iterator it = dfa_state.begin(); it != dfa_state.end(); ++it) | |
payload_values.insert(a.states[*it].begin(), a.states[*it].end()); | |
result.states.push_back(payload_values); | |
dfa_state_map[dfa_state] = result.states.size()-1; | |
// add an accept state if necessary | |
bool accepting = false; | |
for (typename std::unordered_set<size_t>::const_iterator it = dfa_state.begin(); it != dfa_state.end(); ++it) { | |
if (a.accept_states.find(*it) != a.accept_states.end()) { | |
accepting = true; | |
break; | |
} | |
} | |
if (accepting) | |
result.accept_states.insert(dfa_state_map[dfa_state]); | |
} | |
// find the epsilon closure of all NFA states that can be transitioned to from this one | |
std::unordered_map<SymbolType, std::unordered_set<size_t> > subset_row; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = a.transitions.begin(); it != a.transitions.end(); ++it) { | |
// check if the transition leaves from this DFA state | |
if (dfa_state.find(it->first.first) != dfa_state.end()) { | |
std::unordered_set<SymbolType> symbols = it->second; | |
for (typename std::unordered_set<SymbolType>::const_iterator that = symbols.begin(); that != symbols.end(); ++that) | |
subset_row[*that].insert(it->first.second); | |
} | |
} | |
for (typename std::unordered_map<SymbolType, std::unordered_set<size_t> >::const_iterator it = subset_row.begin(); it != subset_row.end(); ++it) { | |
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) { | |
std::unordered_set<size_t> epsilon_closure = epsilon_closures[*that]; | |
subset_row[it->first].insert(epsilon_closure.begin(), epsilon_closure.end()); | |
} | |
} | |
// add the new states to the frontier | |
for (typename std::unordered_map<SymbolType, std::unordered_set<size_t> >::const_iterator it = subset_row.begin(); it != subset_row.end(); ++it) { | |
// add the new state to the state vector so we can index it | |
if (dfa_state_map.find(it->second) == dfa_state_map.end()) { | |
std::unordered_set<PayloadType> payload_values; | |
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) | |
payload_values.insert(a.states[*that].begin(), a.states[*that].end()); | |
result.states.push_back(payload_values); | |
dfa_state_map[it->second] = result.states.size()-1; | |
// add an accept state if necessary | |
bool accepting = false; | |
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) { | |
if (a.accept_states.find(*that) != a.accept_states.end()) { | |
accepting = true; | |
break; | |
} | |
} | |
if (accepting) | |
result.accept_states.insert(dfa_state_map[it->second]); | |
} | |
// add the transitions | |
result.transitions[std::pair<size_t, size_t>(dfa_state_map[dfa_state], dfa_state_map[it->second])].insert(it->first); | |
// make sure it hasn't been visited already | |
if (visited.find(it->second) != visited.end()) | |
continue; | |
// make sure it hasn't been added already | |
if (frontier.find(it->second) != frontier.end()) | |
continue; | |
// make sure we didn't just take it from the frontier | |
if (it->second == dfa_state) | |
continue; | |
// add it to the frontier | |
frontier.insert(it->second); | |
} | |
// mark this DFA state as visited | |
visited.insert(dfa_state); | |
} | |
// create a sink node if necessary | |
bool sink_created = false; | |
size_t sink_state = 0; | |
std::unordered_map<size_t, std::unordered_set<SymbolType> > outgoing_transitions; | |
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = result.transitions.begin(); it != result.transitions.end(); ++it) | |
outgoing_transitions[it->first.first].insert(it->second.begin(), it->second.end()); | |
for (size_t i = 0; i < result.states.size(); ++i) { | |
for (typename std::unordered_set<SymbolType>::const_iterator it = result.alphabet.begin(); it != result.alphabet.end(); ++it) { | |
if (outgoing_transitions[i].find(*it) == outgoing_transitions[i].end()) { | |
if (!sink_created) { | |
sink_state = result.states.size(); | |
result.states.push_back(std::unordered_set<PayloadType>()); | |
result.states.back().insert(new_sink_state); | |
for (typename std::unordered_set<SymbolType>::const_iterator that = result.alphabet.begin(); that != result.alphabet.end(); ++that) | |
result.transitions[std::pair<size_t, size_t>(sink_state, sink_state)].insert(*that); | |
sink_created = true; | |
} | |
result.transitions[std::pair<size_t, size_t>(i, sink_state)].insert(*it); | |
} | |
} | |
} | |
// add the start state | |
if (!result.states.empty()) | |
result.start_states.insert(0); | |
// return the automaton | |
return result; | |
} | |
// convert a possibly nondeterministic finite state automaton into an optimal deterministic one using Brzozowski's algorithm | |
static finite_automaton<SymbolType, PayloadType> automaton_optimal_dfa(const finite_automaton<SymbolType, PayloadType> &a, PayloadType new_sink_state) { | |
// this is Brzozowski's algorithm | |
return automaton_dfa(automaton_reverse(automaton_dfa(automaton_reverse(a), new_sink_state)), new_sink_state); | |
} | |
}; | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment