Last active
June 30, 2019 12:46
-
-
Save unixod/368a70f0235a596d9f62b8f545512915 to your computer and use it in GitHub Desktop.
NFA to DFA (a part of not yet published library). For the entry point see a function makeDfa at the end of the file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (c) 2019 Eldar Zakirov | |
* | |
* Dear Reader, | |
* | |
* This software can be freely used for any purposes that don't go against | |
* norms of Islam. Hence, this software shouldn't be used to implement, fix | |
* or improve the solutions for riba-based banks and other organizations | |
* that are prohibited in Islam. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
* | |
*/ | |
#ifndef RE2MEM_NFA_TO_DFA_H | |
#define RE2MEM_NFA_TO_DFA_H | |
#include <map> | |
#include <set> | |
#include <queue> | |
#include <functional> | |
#include "re2mem/nfa.h" | |
#include "re2mem/definitions.h" | |
namespace re2mem { | |
using NfaStateIdx = std::size_t; | |
// A single byte symbol decoder. Consumes one re2mem::Symbol | |
// and produces corresponding std::byte. | |
struct SingleByteSymbolDecoder { | |
using SymbolType = std::byte; | |
using ConsumedNumberOfConsecutiveStates = std::size_t; | |
static std::pair<SymbolType, ConsumedNumberOfConsecutiveStates> | |
decode(const Nfa& nfa, NfaStateIdx i) | |
{ | |
assert(i < nfa.size()); | |
return {std::get<re2mem::Symbol>(nfa[i]).ch, 1}; | |
} | |
}; | |
/// A lightweight reference to a subset of NFA states. | |
template<typename SymbolDecoder> | |
struct NfaStatesSubsetRef { | |
using SymbolType = typename SymbolDecoder::SymbolType; | |
NfaStatesSubsetRef(const Nfa& nfa, std::vector<NfaStateIdx> statesSubset = {}) | |
: nfa_{nfa} | |
{ | |
// If a set of states explicitly isn't specified, then construct | |
// the start set subset (epsilon closure over start set). | |
if (!nfa.empty() && statesSubset.empty()) { | |
statesSubset.push_back(0); | |
} | |
assert((nfa.empty() || !statesSubset.empty()) && | |
"If NFA is empty, then the set of sets must be empty too" | |
"because only valid states are accepted by the constructor."); | |
std::vector<std::byte> auxData; | |
auxData.reserve(sizeof(std::size_t)); | |
std::set<NfaStateIdx> visitedStates; | |
for (const auto& i : statesSubset) { | |
assert((i <= nfa.size()) && | |
"NfaStatesSubsetRef must be initialized with set of valid states." | |
"If i == nfa.size() then i is final state, despite the fact that" | |
"physically it isn't present in nfa."); | |
// Skip already processed states. | |
if (visitedStates.insert(i).second == false) { | |
continue; | |
} | |
std::queue<NfaStateIdx> statesToVisit; | |
statesToVisit.push(i); | |
while (!statesToVisit.empty()) { | |
auto s = statesToVisit.front(); | |
statesToVisit.pop(); | |
// If state is final skip its further processing, just mark | |
// that currently built subset contains a final state. This | |
// mark is used in the informative public method 'isAnyFinal'. | |
if (s == nfa.size()) { | |
someOfTheStatesIsFinal_ = true; | |
continue; | |
} | |
assert(s < nfa.size()); | |
utils::match(nfa[s], | |
[s, this](const re2mem::Symbol&) { | |
states_.push_back(s); | |
}, | |
[](const re2mem::Range&) { | |
throw std::runtime_error{INTERNAL_ERROR "Not yet implemented."}; | |
}, | |
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Alt<re2mem::Direction::Bakward> a) { | |
auto [_, notVisited] = visitedStates.insert(s+1); | |
if (notVisited) { | |
statesToVisit.push(s+1); | |
auxData.push_back(a.next); | |
auto offset = re2mem::utils::deserializeInteger(auxData); | |
auxData.clear(); | |
statesToVisit.push(s-offset); | |
} | |
}, | |
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Alt<re2mem::Direction::Forward> a) { | |
auto [_, notVisited] = visitedStates.insert(s+1); | |
if (notVisited) { | |
statesToVisit.push(s+1); | |
auxData.push_back(a.next); | |
auto offset = re2mem::utils::deserializeInteger(auxData); | |
auxData.clear(); | |
statesToVisit.push(s+offset); | |
} | |
}, | |
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Jump<re2mem::Direction::Bakward> j) { | |
auto [_, notVisited] = visitedStates.insert(s+1); | |
if (notVisited) { | |
auxData.push_back(j.next); | |
auto offset = re2mem::utils::deserializeInteger(auxData); | |
auxData.clear(); | |
statesToVisit.push(s-offset); | |
} | |
}, | |
[s, &auxData, &visitedStates, &statesToVisit](re2mem::Jump<re2mem::Direction::Forward> j) { | |
auto [_, notVisited] = visitedStates.insert(s+1); | |
if (notVisited) { | |
auxData.push_back(j.next); | |
auto offset = re2mem::utils::deserializeInteger(auxData); | |
auxData.clear(); | |
statesToVisit.push(s+offset); | |
} | |
}, | |
[&auxData](re2mem::Aux aux) { | |
auxData.push_back(aux.b); | |
} | |
); | |
} | |
} | |
} | |
std::map<SymbolType, NfaStatesSubsetRef> outgoingEdges() const | |
{ | |
// Form an auxiliary map holding a set of outgoing edges. | |
std::map<SymbolType, std::vector<NfaStateIdx>> aux; | |
for (const auto& s : states_) { | |
assert(s < nfa_.size() && | |
"All states must be valid (i.e. belong to NFA). The final state, due to " | |
"its ephemeral nature is represented by someOfTheStatesIsFinal_ flag."); | |
assert(std::holds_alternative<re2mem::Symbol>(nfa_[s]) && | |
"Due to epsilon closure searching at construction time, categories of all" | |
"states must be re2mem::Symbol."); | |
// Read a symbol of transition. | |
auto [symbol, numberOfConsumedStates] = SymbolDecoder::decode(nfa_, s); | |
assert(numberOfConsumedStates > 0); | |
// Get an index of target state. | |
const auto t = s + numberOfConsumedStates; | |
aux[symbol].push_back(t); | |
} | |
// Form a result by simply taking a map gotten at previous step | |
// and turning its second element into an instance of NfaStatesSubsetRef. | |
std::map<SymbolType, NfaStatesSubsetRef> result; | |
for (auto& [s, v] : aux) { | |
[[maybe_unused]] auto [_, inserted] = | |
result.emplace(s, NfaStatesSubsetRef{nfa_, std::move(v)}); | |
assert(inserted); | |
} | |
return result; | |
} | |
/// Check if any of references states is final. | |
bool isAnyFinal() const | |
{ | |
return someOfTheStatesIsFinal_; | |
} | |
// For serving as a key in std::map. | |
bool operator < (const NfaStatesSubsetRef& oth) const noexcept | |
{ | |
assert((&nfa_ == &oth.nfa_) && "The sebsets must belong to the same NFA."); | |
return states_ < oth.states_; | |
} | |
private: | |
const Nfa& nfa_; | |
std::vector<NfaStateIdx> states_; // Synthetic states aren't stored. | |
bool someOfTheStatesIsFinal_ = false; | |
}; | |
template<typename SymbolDecoder> | |
inline NfaStatesSubsetRef<SymbolDecoder>& epsilonClosure(NfaStatesSubsetRef<SymbolDecoder>& statesSubset) | |
{ | |
// No explicit searching of epsilon closure for generic version of | |
// NfaStatesSubsetRef is required. This is drawn by the fact that the | |
// searching of epsilone closure is done in constructor of the class. | |
return statesSubset; | |
} | |
/// @name DFA | |
/// @{ | |
/// A reference to a DFA state. | |
using DfaStateIdx = std::size_t; | |
/// A DFA state. | |
template<typename Symbol> | |
struct DfaState { | |
std::map<Symbol, DfaStateIdx> transitions; | |
bool isFinal = false; | |
}; | |
/// A DFA | |
template<typename Symbol> | |
using Dfa = std::vector<DfaState<Symbol>>; | |
/// @} | |
/// A DFA builder. | |
template<typename NfaStatesSubset> | |
struct DfaBuilder { | |
using SymbolType = typename NfaStatesSubset::SymbolType; | |
/// A lightweight reference to a certain DFA state. | |
class DfaStateRef { | |
public: | |
DfaStateRef(Dfa<SymbolType>& dfa, DfaStateIdx& dfaStateIdx, const NfaStatesSubset& nfaStatesSubsetRef) noexcept | |
: dfa_{dfa}, dfaStateIdx_{dfaStateIdx}, nfaStatesSubsetRef_{nfaStatesSubsetRef} | |
{} | |
// Prevent putting rvalues. | |
DfaStateRef(DfaStateIdx&& dfaStateRef, NfaStatesSubset&& nfaStatesSubsetRef) = delete; | |
/// Return a subset of NFA states corresponding to this DFA state. | |
const NfaStatesSubset& nfaStatesSubset() const noexcept | |
{ | |
return nfaStatesSubsetRef_; | |
} | |
/// Add to a DFA a transition on @a s from this state to @a target state. | |
void addTransition(SymbolType s, DfaStateRef target) | |
{ | |
assert((dfaStateIdx_ < dfa_.size()) && "The source state must exist in DFA."); | |
assert((target.dfaStateIdx_ < dfa_.size()) && "The source state must exist in DFA."); | |
assert((&dfa_ == &target.dfa_) && "Both source and target states must belong to the same DFA."); | |
dfa_[dfaStateIdx_].transitions[s] = target.dfaStateIdx_; | |
} | |
private: | |
Dfa<SymbolType>& dfa_; | |
DfaStateIdx& dfaStateIdx_; | |
const NfaStatesSubset& nfaStatesSubsetRef_; | |
}; | |
/// Get a DFA state for specified subset of NFA states. If the DFA | |
/// doesn't contain the requested DFA state, it is first created | |
/// and then returned. | |
/// | |
/// In case of a creation, if any state within @a nfaStatesSubset | |
/// is final, then the created DFA state is set to be final too. | |
/// | |
/// @return A pair consisting of a requested DFA state and a bool | |
/// denoting whether of not the state is brand new (wasn't existed | |
/// before). | |
std::pair<DfaStateRef, bool> getOrAddDfaStateFor(NfaStatesSubset nfaStatesSubset) | |
{ | |
const DfaStateIdx nextNonExistentState = dfa_.size(); | |
auto [i, inserted] = m_.emplace(nfaStatesSubset, nextNonExistentState); | |
if (inserted) { | |
dfa_.emplace_back(DfaState<SymbolType>{}); | |
dfa_.back().isFinal = i->first.isAnyFinal(); | |
} | |
return {DfaStateRef{dfa_, i->second, i->first}, inserted}; | |
} | |
/// Get resulting DFA. | |
Dfa<SymbolType> getDfa() | |
{ | |
dfa_.shrink_to_fit(); | |
return dfa_; | |
} | |
private: | |
/// A DFA | |
Dfa<SymbolType> dfa_; | |
/// An map between NFA states subsets and corresponding DFA states. | |
/// | |
/// @note Since instances of DfaStateRef internaly store references | |
/// to elements of container, the container type must ensure the | |
/// validity of references to its elements in case of adding new | |
/// elements to the container. | |
/// | |
/// @todo Think about using std::unordered_map. | |
std::map<NfaStatesSubset, DfaStateIdx> m_; | |
}; | |
template<typename NfaStatesSubset> | |
inline auto makeDfa(NfaStatesSubset nfaStartState) | |
{ | |
/// A builder of resulting DFA. | |
DfaBuilder<NfaStatesSubset> dfaBuilder; | |
/// A work list. | |
std::queue<typename decltype(dfaBuilder)::DfaStateRef> queue; | |
auto [dfaStartState, _] = dfaBuilder.getOrAddDfaStateFor(epsilonClosure(nfaStartState)); | |
queue.push(dfaStartState); | |
while (!queue.empty()) { | |
auto dfaState = std::move(queue.front()); | |
queue.pop(); | |
for (auto [symbol, nfaTargetStates] : dfaState.nfaStatesSubset().outgoingEdges()) { | |
auto [dfaTargetState, created] = dfaBuilder.getOrAddDfaStateFor(epsilonClosure(nfaTargetStates)); | |
dfaState.addTransition(symbol, dfaTargetState); | |
if (created) { | |
queue.push(dfaTargetState); | |
} | |
} | |
} | |
return dfaBuilder.getDfa(); | |
} | |
} // namespace re2mem | |
#endif // RE2MEM_NFA_TO_DFA_H |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment