Skip to content

Instantly share code, notes, and snippets.

@wiso
Last active November 28, 2015 16:42
Show Gist options
  • Save wiso/1ecc4374f133a7c64590 to your computer and use it in GitHub Desktop.
Save wiso/1ecc4374f133a7c64590 to your computer and use it in GitHub Desktop.
Find pattern in blocked stream
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"application/javascript": [
"require(['codemirror/mode/clike/clike'], function(Clike) { console.log('ROOTaaS - C++ CodeMirror module loaded'); });"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/javascript": [
"IPython.CodeCell.config_defaults.highlight_modes['magic_text/x-c++src'] = {'reg':[/^%%cpp|^%%dcl/]};"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome to ROOTaas Beta\n"
]
}
],
"source": [
"from ROOTaaS.iPyROOT import ROOT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inspired from this: http://stackoverflow.com/questions/1572290/fastest-way-to-scan-for-bit-pattern-in-a-stream-of-bits"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"***Warning: this can give false positive for the first block!***"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define the stream and fuction to read blocks"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%dcl\n",
"#include <string> \n",
"#include <bitset>\n",
"#include <iostream>\n",
"#include <cstring>\n",
"#include <vector> \n",
" \n",
"const char full_stream_str[] = \"101010100010110111010101010101000101010100110010011100011000010100010100101001001001101011011011\";\n",
"std::string full_stream_string = full_stream_str; // only for python, for some reason const char[] are not exported (?!)\n",
"const int full_stream_size = sizeof(full_stream_str) - 1;\n",
"\n",
"// this function is just for the test: it returns a different block every time it is called\n",
"// bits are read from the right\n",
"struct Stream\n",
"{\n",
" // a bit messy since we want to read from the left\n",
" std::bitset<full_stream_size> m_buffer;\n",
" Stream() : m_buffer(full_stream_str) {}\n",
" uint8_t read()\n",
" {\n",
" static std::bitset<full_stream_size> mask = std::bitset<full_stream_size>(0xFF) << (full_stream_size - 8);\n",
" uint8_t result = ((m_buffer & mask) >> (full_stream_size - 8)).to_ulong();\n",
" m_buffer <<= 8;\n",
" return result;\n",
" }\n",
"};\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Try it"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10101010\n",
"00101101\n",
"11010101\n",
"01010100\n",
"01010101\n",
"00110010\n",
"01110001\n",
"10000101\n",
"00010100\n",
"10100100\n",
"10011010\n",
"11011011\n"
]
}
],
"source": [
"%%cpp\n",
"{ \n",
" Stream stream;\n",
" for (int i = 0; i < full_stream_size / 8; ++i) {\n",
" const auto block = stream.read();\n",
" std::cout << std::bitset<8>(block) << std::endl;\n",
" }\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define the pattern to match"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%dcl\n",
"const int PATTERN_LEN = 5; \n",
"const char pattern_str[] = (\"10101\");\n",
"const uint8_t pattern = std::bitset<8>(pattern_str).to_ulong();"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pattern to match: 10101\u0000\n"
]
}
],
"source": [
"print \"pattern to match:\", ROOT.pattern_str"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find the pattern with python using string and regex from the full stream"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"1: <b>10101</b>0100010110111010101010101000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"2: 10<b>10101</b>00010110111010101010101000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"3: 10101010001011011<b>10101</b>01010101000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"4: 1010101000101101110<b>10101</b>010101000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"5: 101010100010110111010<b>10101</b>0101000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"6: 10101010001011011101010<b>10101</b>01000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"7: 1010101000101101110101010<b>10101</b>000101010100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"8: 101010100010110111010101010101000<b>10101</b>0100110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"9: 10101010001011011101010101010100010<b>10101</b>00110010011100011000010100010100101001001001101011011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"10: 101010100010110111010101010101000101010100110010011100011000010100010100101001001001<b>10101</b>1011011"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import re\n",
"from IPython.display import HTML, display\n",
"full_stream_str = str(ROOT.full_stream_str)[:-1] # remove last byte since char string are null terminated\n",
"pattern_str = str(ROOT.pattern_str)[:-1]\n",
"\n",
"regex = re.compile(r'(?=(' + pattern_str + '))') # use lookahead for overlapping\n",
"for i, m in enumerate(regex.finditer(full_stream_str), 1):\n",
" display(HTML(\"{i}: {left}<b>{matched}</b>{right}\".format(i=i, left=full_stream_str[:m.start()],\n",
" matched=m.group(1),\n",
" right=full_stream_str[(m.start() + ROOT.PATTERN_LEN):])))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%dcl \n",
"struct BlockBitsMatcher { // insert better name here please\n",
" const uint8_t m_pattern; // use uint_8 for pattern, even if it is shorter\n",
" const int m_pattern_length;\n",
" uint16_t m_all_patterns[8];\n",
" uint16_t m_all_masks[8];\n",
" \n",
" \n",
" BlockBitsMatcher(uint8_t pattern, int pattern_length) : m_pattern(pattern), m_pattern_length(pattern_length) {\n",
" // precompute all the possible shifted patterns in 2 blocks\n",
" const uint16_t mask = (1 << (m_pattern_length)) - 1;\n",
" for (int i = 0; i < 8; ++i) {\n",
" m_all_patterns[i] = m_pattern << i;\n",
" m_all_masks[i] = mask << i;\n",
" }\n",
" }\n",
" \n",
" std::vector<int> read_block(const uint8_t block) { \n",
" std::vector<int> matched_pos;\n",
" two_blocks = (two_blocks << 8) + block;\n",
" for (int i = 0; i < 8; ++i) {\n",
" if ((two_blocks & m_all_masks[i]) == m_all_patterns[i]) {\n",
" matched_pos.push_back(i);\n",
" }\n",
" }\n",
" return matched_pos;\n",
" }\n",
"private:\n",
" uint16_t two_blocks = 0;\n",
"};"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run it (position are from the right)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading 10101010\n",
" matched 2, pos = 1, 3\n",
"reading 00101101\n",
"reading 11010101\n",
" matched 1, pos = 0\n",
"reading 01010100\n",
" matched 3, pos = 2, 4, 6\n",
"reading 01010101\n",
" matched 2, pos = 0, 2\n",
"reading 00110010\n",
"reading 01110001\n",
"reading 10000101\n",
"reading 00010100\n",
"reading 10100100\n",
"reading 10011010\n",
"reading 11011011\n"
]
}
],
"source": [
"matcher = ROOT.BlockBitsMatcher(ROOT.pattern, 6)\n",
"stream = ROOT.Stream()\n",
"for i in xrange(ROOT.full_stream_size / 8):\n",
" block = stream.read()\n",
" print \"reading {:08b}\".format(ord(block))\n",
" matched_pos = list(matcher.read_block(block))\n",
" if matched_pos:\n",
" print \" matched %d, pos = %s\" % (len(matched_pos), ', '.join(map(str, matched_pos)))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"reading 10101010\n",
" matched = 2 pos = 1 3 \n",
"reading 00101101\n",
"reading 11010101\n",
" matched = 2 pos = 0 2 \n",
"reading 01010100\n",
" matched = 3 pos = 2 4 6 \n",
"reading 01010101\n",
" matched = 2 pos = 0 2 \n",
"reading 00110010\n",
"reading 01110001\n",
"reading 10000101\n",
"reading 00010100\n",
"reading 10100100\n",
"reading 10011010\n",
"reading 11011011\n",
" matched = 1 pos = 7 \n"
]
}
],
"source": [
"%%cpp\n",
"{\n",
" BlockBitsMatcher matcher(pattern, 5);\n",
" Stream stream;\n",
" for (int i = 0; i < full_stream_size / 8; ++i) {\n",
" const auto block = stream.read();\n",
" std::cout << \"reading \" << std::bitset<8>(block) << std::endl;\n",
" const auto matched_pos = matcher.read_block(block);\n",
" if (!matched_pos.empty()) {\n",
" std::cout << \" matched = \" << matched_pos.size() << \" pos = \";\n",
" for (auto p : matched_pos) { std::cout << p << \" \";}\n",
" std::cout << endl;\n",
" }\n",
" }\n",
"}\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment