Skip to content

Instantly share code, notes, and snippets.

@stepchowfun
Created October 18, 2012 05:45
Show Gist options
  • Save stepchowfun/3910075 to your computer and use it in GitHub Desktop.
Save stepchowfun/3910075 to your computer and use it in GitHub Desktop.
This header declares structures and interfaces for manipulating finite automata, both deterministic and nondeterministic.
/*
This header declares structures and interfaces for manipulating finite automata,
both deterministic and nondeterministic.
The code is written in a portable subset of C++11. The only C++11 features used
are std::unordered_map and std::unordered_set, which easily can be replaced with
the (less-efficient) C++03 equivalents: std::map and std::set.
*/
#ifndef FINITE_AUTOMATON_H
#define FINITE_AUTOMATON_H
#include <string>
#include <utility>
#include <vector>
#include <map>
#include <sstream>
#include <algorithm>
#include <unordered_map>
#include <unordered_set>
#include <cmath>
namespace finite_automata {
// exception class
class automaton_error {
public:
std::string message;
automaton_error() {
}
automaton_error(std::string msg) {
message = msg;
}
};
// hash functionality for std::pair<size_t, size_t>
class hash_pair {
public:
size_t operator() (const std::pair<size_t, size_t> &x) const {
// the hash is simply the sum of the constituent integers
// size_t is unsigned, so overflow on addition is well-defined (modular arithmetic)
// if the constituent integers are uniformly distributed, the hash will be also
return (x.first << (sizeof(size_t)/2)) + x.second;
}
};
// hash functionality for std::unordered_set<size_t>
class hash_set {
public:
size_t operator() (const std::unordered_set<size_t> &x) const {
// we don't want to rely on a hash function which assumes an ordering of the elements in this set
// we can let the hash be the sum of the constituent integers, since addition is commutative
// size_t is unsigned, so overflow on addition is well-defined (modular arithmetic)
// if the constituent integers are uniformly distributed, the hash will be also
size_t result = 0;
for (typename std::unordered_set<size_t>::const_iterator it = x.begin(); it != x.end(); ++it)
result += *it;
return result;
}
};
// main class for finite automata
template <class SymbolType, class PayloadType> class finite_automaton {
public:
// each state stores a set of user-defined payloads that are preserved across transformations
std::vector<std::unordered_set<PayloadType> > states;
// there can be multiple start states, which are represented as indices into the states vector
std::unordered_set<size_t> start_states;
// there can be multiple end states, which are represented as indices into the states vector
std::unordered_set<size_t> accept_states;
// the non-epsilon transitions are represented as (current_state_id, next_state_id) -> [symbol]
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> transitions;
// the epsilon transitions are represented as (current_state_id, next_state_id) -> [symbol]
std::unordered_set<std::pair<size_t, size_t>, hash_pair> epsilon_transitions;
// the alphabet
std::unordered_set<SymbolType> alphabet;
// constructors
finite_automaton() {
// each member field has a default constructor,
// so there is no initialization to do here
}
finite_automaton(std::vector<std::unordered_set<PayloadType> > fa_states,
std::unordered_set<size_t> fa_start_states,
std::unordered_set<size_t> fa_accept_states,
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> fa_transitions,
std::unordered_set<std::pair<size_t, size_t>, hash_pair> fa_epsilon_transitions,
std::unordered_set<SymbolType> fa_alphabet) {
// make sure each start state exists
for (typename std::unordered_set<size_t>::const_iterator it = fa_start_states.begin(); it != fa_start_states.end(); ++it) {
if (*it >= fa_states.size()) {
std::stringstream ss;
ss << "start state " << *it << " does not exist";
throw automaton_error(ss.str());
}
}
// make sure each accept state exists
for (typename std::unordered_set<size_t>::const_iterator it = fa_accept_states.begin(); it != fa_accept_states.end(); ++it) {
if (*it >= fa_states.size()) {
std::stringstream ss;
ss << "accept state " << *it << " does not exist";
throw automaton_error(ss.str());
}
}
// make sure the transitions go to and from existing states over symbols in the alphabet
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = fa_transitions.begin(); it != fa_transitions.end(); ++it) {
if (it->first.first >= fa_states.size()) {
std::stringstream ss;
ss << "state " << it->first.first << " does not exist";
throw automaton_error(ss.str());
}
if (it->first.second >= fa_states.size()) {
std::stringstream ss;
ss << "state " << it->first.second << " does not exist";
throw automaton_error(ss.str());
}
for (typename std::unordered_set<SymbolType>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) {
if (fa_alphabet.find(*that) == fa_alphabet.end()) {
std::stringstream ss;
ss << "symbol " << *that << " is not in alphabet";
throw automaton_error(ss.str());
}
}
}
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = fa_epsilon_transitions.begin(); it != fa_epsilon_transitions.end(); ++it) {
if (it->first >= fa_states.size()) {
std::stringstream ss;
ss << "state " << it->first << " does not exist";
throw automaton_error(ss.str());
}
if (it->second >= fa_states.size()) {
std::stringstream ss;
ss << "state " << it->second << " does not exist";
throw automaton_error(ss.str());
}
}
// assign the member fields
states = fa_states;
start_states = fa_start_states;
accept_states = fa_accept_states;
transitions = fa_transitions;
epsilon_transitions = fa_epsilon_transitions;
alphabet = fa_alphabet;
}
// get a string representation of the finite automaton
std::string get_repr() {
// build the result in a stringstream buffer
std::stringstream ss;
// print the states
ss << "states: ";
for (size_t i = 0; i < states.size(); i++) {
if (i != 0)
ss << ", ";
ss << i << " {";
for (typename std::unordered_set<PayloadType>::const_iterator it = states[i].begin(); it != states[i].end(); ++it) {
if (it != states[i].begin())
ss << ", ";
ss << *it;
}
ss << "}";
}
// print the start states
ss << "\nstart states:";
for (typename std::unordered_set<size_t>::const_iterator it = start_states.begin(); it != start_states.end(); ++it)
ss << " " << *it;
// print the accept states
ss << "\naccept states:";
for (typename std::unordered_set<size_t>::const_iterator it = accept_states.begin(); it != accept_states.end(); ++it)
ss << " " << *it;
// print the transitions
ss << "\ntransitions:\n";
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = transitions.begin(); it != transitions.end(); ++it) {
ss << " " << it->first.first << " -> (";
for (typename std::unordered_set<SymbolType>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) {
if (that != it->second.begin())
ss << ", ";
ss << *that;
}
ss << ") -> " << it->first.second << "\n";
}
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = epsilon_transitions.begin(); it != epsilon_transitions.end(); ++it)
ss << " " << it->first << " -> () -> " << it->second << "\n";
// convert the buffer to a string and return it
return ss.str();
}
// take the complement of a finite automaton
static finite_automaton<SymbolType, PayloadType> automaton_complement(const finite_automaton<SymbolType, PayloadType> &a) {
// convert the automaton to a DFA
finite_automaton<SymbolType, PayloadType> result = automaton_dfa(a);
// swap accept states with non-accept states
std::unordered_set<size_t> new_accept_states;
for (size_t i = 0; i < result.states.size(); ++i) {
if (result.accept_states.find(i) == result.accept_states.end())
new_accept_states.insert(i);
}
result.accept_states = new_accept_states;
// return the automaton
return result;
}
// construct a finite automaton which recognizes the reverse of the language recognized by a finite automaton
static finite_automaton<SymbolType, PayloadType> automaton_reverse(const finite_automaton<SymbolType, PayloadType> &a) {
// create a copy of the automaton
finite_automaton<SymbolType, PayloadType> result = a;
result.alphabet = a.alphabet;
// reverse the transitions
std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair> transitions;
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = result.transitions.begin(); it != result.transitions.end(); ++it)
transitions[std::pair<size_t, size_t>(it->first.second, it->first.first)] = it->second;
result.transitions = transitions;
// reverse the epsilon transitions
std::unordered_set<std::pair<size_t, size_t>, hash_pair> epsilon_transitions;
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = result.epsilon_transitions.begin(); it != result.epsilon_transitions.end(); ++it)
epsilon_transitions.insert(std::pair<size_t, size_t>(it->second, it->first));
result.epsilon_transitions = epsilon_transitions;
// swap start and accept states
std::unordered_set<size_t> old_accept_states = result.accept_states;
result.accept_states = result.start_states;
result.start_states = old_accept_states;
// return the automaton
return result;
}
// take the union of two finite automata
static finite_automaton<SymbolType, PayloadType> automaton_union(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) {
// start with a new automaton
finite_automaton<SymbolType, PayloadType> result;
result.alphabet = a.alphabet;
if (a.alphabet != b.alphabet)
throw automaton_error("automata alphabets do not match");
// take the union of the states of the two automata
result.states = a.states;
result.states.insert(result.states.end(), b.states.begin(), b.states.end());
// take the union of the start states of the two automata
result.start_states = a.start_states;
for (typename std::unordered_set<size_t>::const_iterator it = b.start_states.begin(); it != b.start_states.end(); ++it)
result.start_states.insert((*it) + a.states.size());
// take the union of the accept states of the two automata
result.accept_states = a.accept_states;
for (typename std::unordered_set<size_t>::const_iterator it = b.accept_states.begin(); it != b.accept_states.end(); ++it)
result.accept_states.insert((*it) + a.states.size());
// preserve the transitions of the original automata
result.transitions = a.transitions;
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = b.transitions.begin(); it != b.transitions.end(); ++it)
result.transitions[std::pair<size_t, size_t>(it->first.first + a.states.size(), it->first.second + a.states.size())] = it->second;
// preserve the epsilon transitions of the original automata
result.epsilon_transitions = a.epsilon_transitions;
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = b.epsilon_transitions.begin(); it != b.epsilon_transitions.end(); ++it)
result.epsilon_transitions.insert(std::pair<size_t, size_t>(it->first + a.states.size(), it->second + a.states.size()));
// return the automaton
return result;
}
// take the intersection of two finite automata
static finite_automaton<SymbolType, PayloadType> automaton_intersection(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) {
// the intersection of two automata is the complement of the union of their complements
return automaton_complement(automaton_union(automaton_complement(a), automaton_complement(b)));
}
// create a finite automaton which recognizes the concatenation of the regular languages recognized by two finite automata
static finite_automaton<SymbolType, PayloadType> automaton_concat(const finite_automaton<SymbolType, PayloadType> &a, const finite_automaton<SymbolType, PayloadType> &b) {
// start with a new automaton
finite_automaton<SymbolType, PayloadType> result;
result.alphabet = a.alphabet;
if (a.alphabet != b.alphabet)
throw automaton_error("automata alphabets do not match");
// take the union of the states of a and b
result.states = a.states;
result.states.insert(result.states.end(), b.states.begin(), b.states.end());
// set the start states to be the start states of a
result.start_states = a.start_states;
// set the accept states to be the accept states of b
for (typename std::unordered_set<size_t>::const_iterator it = b.accept_states.begin(); it != b.accept_states.end(); ++it)
result.accept_states.insert((*it) + a.states.size());
// preserve the transitions of the original automata
result.transitions = a.transitions;
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = b.transitions.begin(); it != b.transitions.end(); ++it)
result.transitions[std::pair<size_t, size_t>(it->first.first + a.states.size(), it->first.second + a.states.size())] = it->second;
// preserve the epsilon transitions of the original automata
result.epsilon_transitions = a.epsilon_transitions;
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = b.epsilon_transitions.begin(); it != b.epsilon_transitions.end(); ++it)
result.epsilon_transitions.insert(std::pair<size_t, size_t>(it->first + a.states.size(), it->second + a.states.size()));
// add epsilon transitions from the accept states of a to the start states of b
for (typename std::unordered_set<size_t>::const_iterator ita = a.accept_states.begin(); ita != a.accept_states.end(); ++ita) {
for (typename std::unordered_set<size_t>::const_iterator itb = b.start_states.begin(); itb != b.start_states.end(); ++itb)
result.epsilon_transitions.insert(std::pair<size_t, size_t>(*ita, *itb + a.states.size()));
}
// return the automaton
return result;
}
// create a finite automaton which recognizes the Kleene star of the regular languages recognized by two finite automata
static finite_automaton<SymbolType, PayloadType> automaton_star(const finite_automaton<SymbolType, PayloadType> &a) {
// create a copy of the automaton
finite_automaton<SymbolType, PayloadType> result = a;
result.alphabet = a.alphabet;
// add epsilon transitions from each accept state to each start state
for (typename std::unordered_set<size_t>::const_iterator ita = a.accept_states.begin(); ita != a.accept_states.end(); ++ita) {
for (typename std::unordered_set<size_t>::const_iterator its = a.start_states.begin(); its != a.start_states.end(); ++its)
result.epsilon_transitions.insert(std::pair<size_t, size_t>(*ita, *its));
}
// create a new start state
std::unordered_set<PayloadType> payload_values;
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it)
payload_values.insert(a.states[*it].begin(), a.states[*it].end());
result.states.push_back(payload_values);
// create epsilon transitions from the new start state to the old ones
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it)
result.epsilon_transitions.insert(std::pair<size_t, size_t>(result.states.size() - 1, *it));
// set the new state to be the only start state and add it to the accept states
result.start_states.clear();
result.start_states.insert(result.states.size() - 1);
result.accept_states.insert(result.states.size() - 1);
// return the automaton
return result;
}
// convert a possibly nondeterministic finite state automaton into a deterministic one
static finite_automaton<SymbolType, PayloadType> automaton_dfa(const finite_automaton<SymbolType, PayloadType> &a, PayloadType new_sink_state) {
// start with a new automaton
finite_automaton<SymbolType, PayloadType> result;
result.alphabet = a.alphabet;
// keep track of NFA states on the frontier and states that have already been constructed
std::unordered_set<std::unordered_set<size_t>, hash_set> visited;
std::unordered_set<std::unordered_set<size_t>, hash_set> frontier;
// keep a map from DFA state to index
std::unordered_map<std::unordered_set<size_t>, size_t, hash_set> dfa_state_map;
// compute the epsilon closure of every state for later
std::unordered_map<size_t, std::unordered_set<size_t> > epsilon_closures;
for (size_t i = 0; i < a.states.size(); ++i) {
epsilon_closures[i] = std::unordered_set<size_t>();
epsilon_closures[i].insert(i);
}
for (typename std::unordered_set<std::pair<size_t, size_t>, hash_pair>::const_iterator it = a.epsilon_transitions.begin(); it != a.epsilon_transitions.end(); ++it)
epsilon_closures[it->first].insert(it->second);
// compute the union of the epsilon closures of the DFA start states
std::unordered_set<size_t> epsilon_closure_start_states;
for (typename std::unordered_set<size_t>::const_iterator it = a.start_states.begin(); it != a.start_states.end(); ++it) {
std::unordered_set<size_t> epsilon_closure = epsilon_closures[*it];
for (typename std::unordered_set<size_t>::const_iterator that = epsilon_closure.begin(); that != epsilon_closure.end(); ++that)
epsilon_closure_start_states.insert(*that);
}
// add this start state to the frontier
frontier.insert(epsilon_closure_start_states);
// construct the DFA
while (!frontier.empty()) {
// take an arbitrary DFA state from the frontier
std::unordered_set<size_t> dfa_state = *(frontier.begin());
frontier.erase(frontier.begin());
// create the DFA state to represent this subset of NFA states
if (dfa_state_map.find(dfa_state) == dfa_state_map.end()) {
std::unordered_set<PayloadType> payload_values;
for (typename std::unordered_set<size_t>::const_iterator it = dfa_state.begin(); it != dfa_state.end(); ++it)
payload_values.insert(a.states[*it].begin(), a.states[*it].end());
result.states.push_back(payload_values);
dfa_state_map[dfa_state] = result.states.size()-1;
// add an accept state if necessary
bool accepting = false;
for (typename std::unordered_set<size_t>::const_iterator it = dfa_state.begin(); it != dfa_state.end(); ++it) {
if (a.accept_states.find(*it) != a.accept_states.end()) {
accepting = true;
break;
}
}
if (accepting)
result.accept_states.insert(dfa_state_map[dfa_state]);
}
// find the epsilon closure of all NFA states that can be transitioned to from this one
std::unordered_map<SymbolType, std::unordered_set<size_t> > subset_row;
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = a.transitions.begin(); it != a.transitions.end(); ++it) {
// check if the transition leaves from this DFA state
if (dfa_state.find(it->first.first) != dfa_state.end()) {
std::unordered_set<SymbolType> symbols = it->second;
for (typename std::unordered_set<SymbolType>::const_iterator that = symbols.begin(); that != symbols.end(); ++that)
subset_row[*that].insert(it->first.second);
}
}
for (typename std::unordered_map<SymbolType, std::unordered_set<size_t> >::const_iterator it = subset_row.begin(); it != subset_row.end(); ++it) {
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) {
std::unordered_set<size_t> epsilon_closure = epsilon_closures[*that];
subset_row[it->first].insert(epsilon_closure.begin(), epsilon_closure.end());
}
}
// add the new states to the frontier
for (typename std::unordered_map<SymbolType, std::unordered_set<size_t> >::const_iterator it = subset_row.begin(); it != subset_row.end(); ++it) {
// add the new state to the state vector so we can index it
if (dfa_state_map.find(it->second) == dfa_state_map.end()) {
std::unordered_set<PayloadType> payload_values;
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that)
payload_values.insert(a.states[*that].begin(), a.states[*that].end());
result.states.push_back(payload_values);
dfa_state_map[it->second] = result.states.size()-1;
// add an accept state if necessary
bool accepting = false;
for (typename std::unordered_set<size_t>::const_iterator that = it->second.begin(); that != it->second.end(); ++that) {
if (a.accept_states.find(*that) != a.accept_states.end()) {
accepting = true;
break;
}
}
if (accepting)
result.accept_states.insert(dfa_state_map[it->second]);
}
// add the transitions
result.transitions[std::pair<size_t, size_t>(dfa_state_map[dfa_state], dfa_state_map[it->second])].insert(it->first);
// make sure it hasn't been visited already
if (visited.find(it->second) != visited.end())
continue;
// make sure it hasn't been added already
if (frontier.find(it->second) != frontier.end())
continue;
// make sure we didn't just take it from the frontier
if (it->second == dfa_state)
continue;
// add it to the frontier
frontier.insert(it->second);
}
// mark this DFA state as visited
visited.insert(dfa_state);
}
// create a sink node if necessary
bool sink_created = false;
size_t sink_state = 0;
std::unordered_map<size_t, std::unordered_set<SymbolType> > outgoing_transitions;
for (typename std::unordered_map<std::pair<size_t, size_t>, std::unordered_set<SymbolType>, hash_pair>::const_iterator it = result.transitions.begin(); it != result.transitions.end(); ++it)
outgoing_transitions[it->first.first].insert(it->second.begin(), it->second.end());
for (size_t i = 0; i < result.states.size(); ++i) {
for (typename std::unordered_set<SymbolType>::const_iterator it = result.alphabet.begin(); it != result.alphabet.end(); ++it) {
if (outgoing_transitions[i].find(*it) == outgoing_transitions[i].end()) {
if (!sink_created) {
sink_state = result.states.size();
result.states.push_back(std::unordered_set<PayloadType>());
result.states.back().insert(new_sink_state);
for (typename std::unordered_set<SymbolType>::const_iterator that = result.alphabet.begin(); that != result.alphabet.end(); ++that)
result.transitions[std::pair<size_t, size_t>(sink_state, sink_state)].insert(*that);
sink_created = true;
}
result.transitions[std::pair<size_t, size_t>(i, sink_state)].insert(*it);
}
}
}
// add the start state
if (!result.states.empty())
result.start_states.insert(0);
// return the automaton
return result;
}
// convert a possibly nondeterministic finite state automaton into an optimal deterministic one using Brzozowski's algorithm
static finite_automaton<SymbolType, PayloadType> automaton_optimal_dfa(const finite_automaton<SymbolType, PayloadType> &a, PayloadType new_sink_state) {
// this is Brzozowski's algorithm
return automaton_dfa(automaton_reverse(automaton_dfa(automaton_reverse(a), new_sink_state)), new_sink_state);
}
};
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment