Created
November 10, 2020 18:56
-
-
Save DasNaCl/aaf0a1d3eb4fb8a66ce3b40be5225662 to your computer and use it in GitHub Desktop.
Shift-Or-Algorithm with arbitrary Pattern length
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* This implements the Shift-Or-Algorithm. | |
* Text input is via STDIN, pattern is expected as cmd argument. | |
* Example call: echo "12121321231" | ./a.out 123 | |
* | |
* The algorithm exits early as soon as a match was found. It can be easily modified | |
* to count all pattern occurences or even to provide all start positions. | |
* | |
* Compilation: g++ -O3 shiftor.cpp -o program | |
* Execution: ./program AA text.txt | |
*/ | |
#include <unordered_set> | |
#include <iostream> | |
#include <iterator> | |
#include <fstream> | |
#include <cassert> | |
#include <vector> | |
int main(int argc, char** argv) | |
{ | |
if(argc != 3) | |
{ | |
std::cerr << "Pattern and file containing the text expected as cmd argument.\n"; | |
return -1; | |
} | |
const std::string pattern = argv[1]; | |
std::string text; | |
std::fstream infile(argv[2]); | |
for(std::string str; std::getline(infile, str); text += str + "\n") | |
{ ; } | |
// Setup alphabet hashset | |
std::unordered_set<char> alphabet; | |
alphabet.insert(pattern.begin(), pattern.end()); | |
alphabet.insert(text.begin(), text.end()); | |
// We represent the pattern with a vector of 32 Bit unsigned ints | |
// -> The number of ints is `m / 32 + parity` | |
const std::size_t width = pattern.size() / 32 + (pattern.size() % 32 == 0 ? 0 : 1); | |
std::vector<std::uint32_t> masks; | |
masks.resize(alphabet.size() * width); | |
// Init masks | |
for(auto it = alphabet.begin(); it != alphabet.end(); ++it) | |
{ | |
const char letter = *it; | |
const std::size_t id = std::distance(alphabet.begin(), it); | |
for(std::size_t i = 0; i < pattern.size(); ++i) | |
{ | |
auto& mask = masks[(i / 32) + id * width]; | |
if(pattern[i] == letter) | |
mask &= ~(1UL << (i % 32)); // unset bit | |
else | |
mask |= (1UL << (i % 32)); // set bit | |
} | |
} | |
// Start pattern matching | |
std::vector<std::uint32_t> state; | |
state.resize(width, 0xFFFFFFFF); | |
std::vector<std::size_t> positions; | |
for(auto it = text.begin(); it != text.end(); ++it) | |
{ | |
const char letter = *it; | |
// Determine number used in to refer to the correct mask | |
const std::size_t id = std::distance(alphabet.begin(), alphabet.find(letter)); | |
// Perform shift | |
int carry = 0; | |
for(std::size_t i = 0; i < state.size(); ++i) | |
{ | |
int new_carry = ((state[i] & (1U << 31U)) > 0 ? 1 : 0); | |
// State update is technically just shift, or, and lookup | |
// However, if we have a state consisting of 2 ints and the right one has 1 at position 31, | |
// then the 1 would be lost upon shifting. So we still need to carry it over. | |
state[i] = ((state[i] << 1U) + carry) | masks[i + id * width]; | |
carry = new_carry; | |
} | |
// We have a matich if the last state in the NFA is active, which is represented as 0 in Shift-Or | |
if((state.back() & (1U << ((pattern.size() - 1) % 32U))) == 0) | |
positions.push_back(((it - text.begin()) - pattern.size()) + 1); | |
} | |
// Print the results to STDOUT | |
if(!positions.empty()) | |
{ | |
std::copy(positions.begin(), std::prev(positions.end()), | |
std::ostream_iterator<std::size_t>(std::cout, ",")); | |
std::cout << positions.back() << "\n"; | |
} | |
else | |
{ | |
std::cout << "Not Found.\n"; | |
} | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment