Skip to content

Instantly share code, notes, and snippets.

@DasNaCl
Created November 10, 2020 18:56
Show Gist options
  • Save DasNaCl/aaf0a1d3eb4fb8a66ce3b40be5225662 to your computer and use it in GitHub Desktop.
Save DasNaCl/aaf0a1d3eb4fb8a66ce3b40be5225662 to your computer and use it in GitHub Desktop.
Shift-Or-Algorithm with arbitrary Pattern length
/**
* This implements the Shift-Or-Algorithm.
* Text input is via STDIN, pattern is expected as cmd argument.
* Example call: echo "12121321231" | ./a.out 123
*
* The algorithm exits early as soon as a match was found. It can be easily modified
* to count all pattern occurences or even to provide all start positions.
*
* Compilation: g++ -O3 shiftor.cpp -o program
* Execution: ./program AA text.txt
*/
#include <unordered_set>
#include <iostream>
#include <iterator>
#include <fstream>
#include <cassert>
#include <vector>
int main(int argc, char** argv)
{
if(argc != 3)
{
std::cerr << "Pattern and file containing the text expected as cmd argument.\n";
return -1;
}
const std::string pattern = argv[1];
std::string text;
std::fstream infile(argv[2]);
for(std::string str; std::getline(infile, str); text += str + "\n")
{ ; }
// Setup alphabet hashset
std::unordered_set<char> alphabet;
alphabet.insert(pattern.begin(), pattern.end());
alphabet.insert(text.begin(), text.end());
// We represent the pattern with a vector of 32 Bit unsigned ints
// -> The number of ints is `m / 32 + parity`
const std::size_t width = pattern.size() / 32 + (pattern.size() % 32 == 0 ? 0 : 1);
std::vector<std::uint32_t> masks;
masks.resize(alphabet.size() * width);
// Init masks
for(auto it = alphabet.begin(); it != alphabet.end(); ++it)
{
const char letter = *it;
const std::size_t id = std::distance(alphabet.begin(), it);
for(std::size_t i = 0; i < pattern.size(); ++i)
{
auto& mask = masks[(i / 32) + id * width];
if(pattern[i] == letter)
mask &= ~(1UL << (i % 32)); // unset bit
else
mask |= (1UL << (i % 32)); // set bit
}
}
// Start pattern matching
std::vector<std::uint32_t> state;
state.resize(width, 0xFFFFFFFF);
std::vector<std::size_t> positions;
for(auto it = text.begin(); it != text.end(); ++it)
{
const char letter = *it;
// Determine number used in to refer to the correct mask
const std::size_t id = std::distance(alphabet.begin(), alphabet.find(letter));
// Perform shift
int carry = 0;
for(std::size_t i = 0; i < state.size(); ++i)
{
int new_carry = ((state[i] & (1U << 31U)) > 0 ? 1 : 0);
// State update is technically just shift, or, and lookup
// However, if we have a state consisting of 2 ints and the right one has 1 at position 31,
// then the 1 would be lost upon shifting. So we still need to carry it over.
state[i] = ((state[i] << 1U) + carry) | masks[i + id * width];
carry = new_carry;
}
// We have a matich if the last state in the NFA is active, which is represented as 0 in Shift-Or
if((state.back() & (1U << ((pattern.size() - 1) % 32U))) == 0)
positions.push_back(((it - text.begin()) - pattern.size()) + 1);
}
// Print the results to STDOUT
if(!positions.empty())
{
std::copy(positions.begin(), std::prev(positions.end()),
std::ostream_iterator<std::size_t>(std::cout, ","));
std::cout << positions.back() << "\n";
}
else
{
std::cout << "Not Found.\n";
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment