Last active
November 28, 2015 16:42
-
-
Save wiso/1ecc4374f133a7c64590 to your computer and use it in GitHub Desktop.
Find pattern in blocked stream
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/javascript": [ | |
"require(['codemirror/mode/clike/clike'], function(Clike) { console.log('ROOTaaS - C++ CodeMirror module loaded'); });" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"application/javascript": [ | |
"IPython.CodeCell.config_defaults.highlight_modes['magic_text/x-c++src'] = {'reg':[/^%%cpp|^%%dcl/]};" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Welcome to ROOTaas Beta\n" | |
] | |
} | |
], | |
"source": [ | |
"from ROOTaaS.iPyROOT import ROOT" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Inspired from this: http://stackoverflow.com/questions/1572290/fastest-way-to-scan-for-bit-pattern-in-a-stream-of-bits" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"***Warning: this can give false positive for the first block!***" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Define the stream and fuction to read blocks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%%dcl\n", | |
"#include <string> \n", | |
"#include <bitset>\n", | |
"#include <iostream>\n", | |
"#include <cstring>\n", | |
"#include <vector> \n", | |
" \n", | |
"const char full_stream_str[] = \"101010100010110111010101010101000101010100110010011100011000010100010100101001001001101011011011\";\n", | |
"std::string full_stream_string = full_stream_str; // only for python, for some reason const char[] are not exported (?!)\n", | |
"const int full_stream_size = sizeof(full_stream_str) - 1;\n", | |
"\n", | |
"// this function is just for the test: it returns a different block every time it is called\n", | |
"// bits are read from the right\n", | |
"struct Stream\n", | |
"{\n", | |
" // a bit messy since we want to read from the left\n", | |
" std::bitset<full_stream_size> m_buffer;\n", | |
" Stream() : m_buffer(full_stream_str) {}\n", | |
" uint8_t read()\n", | |
" {\n", | |
" static std::bitset<full_stream_size> mask = std::bitset<full_stream_size>(0xFF) << (full_stream_size - 8);\n", | |
" uint8_t result = ((m_buffer & mask) >> (full_stream_size - 8)).to_ulong();\n", | |
" m_buffer <<= 8;\n", | |
" return result;\n", | |
" }\n", | |
"};\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Try it" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10101010\n", | |
"00101101\n", | |
"11010101\n", | |
"01010100\n", | |
"01010101\n", | |
"00110010\n", | |
"01110001\n", | |
"10000101\n", | |
"00010100\n", | |
"10100100\n", | |
"10011010\n", | |
"11011011\n" | |
] | |
} | |
], | |
"source": [ | |
"%%cpp\n", | |
"{ \n", | |
" Stream stream;\n", | |
" for (int i = 0; i < full_stream_size / 8; ++i) {\n", | |
" const auto block = stream.read();\n", | |
" std::cout << std::bitset<8>(block) << std::endl;\n", | |
" }\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Define the pattern to match" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%%dcl\n", | |
"const int PATTERN_LEN = 5; \n", | |
"const char pattern_str[] = (\"10101\");\n", | |
"const uint8_t pattern = std::bitset<8>(pattern_str).to_ulong();" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"pattern to match: 10101\u0000\n" | |
] | |
} | |
], | |
"source": [ | |
"print \"pattern to match:\", ROOT.pattern_str" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find the pattern with python using string and regex from the full stream" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"1: <b>10101</b>0100010110111010101010101000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"2: 10<b>10101</b>00010110111010101010101000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"3: 10101010001011011<b>10101</b>01010101000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"4: 1010101000101101110<b>10101</b>010101000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"5: 101010100010110111010<b>10101</b>0101000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"6: 10101010001011011101010<b>10101</b>01000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"7: 1010101000101101110101010<b>10101</b>000101010100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"8: 101010100010110111010101010101000<b>10101</b>0100110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"9: 10101010001011011101010101010100010<b>10101</b>00110010011100011000010100010100101001001001101011011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"10: 101010100010110111010101010101000101010100110010011100011000010100010100101001001001<b>10101</b>1011011" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import re\n", | |
"from IPython.display import HTML, display\n", | |
"full_stream_str = str(ROOT.full_stream_str)[:-1] # remove last byte since char string are null terminated\n", | |
"pattern_str = str(ROOT.pattern_str)[:-1]\n", | |
"\n", | |
"regex = re.compile(r'(?=(' + pattern_str + '))') # use lookahead for overlapping\n", | |
"for i, m in enumerate(regex.finditer(full_stream_str), 1):\n", | |
" display(HTML(\"{i}: {left}<b>{matched}</b>{right}\".format(i=i, left=full_stream_str[:m.start()],\n", | |
" matched=m.group(1),\n", | |
" right=full_stream_str[(m.start() + ROOT.PATTERN_LEN):])))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%%dcl \n", | |
"struct BlockBitsMatcher { // insert better name here please\n", | |
" const uint8_t m_pattern; // use uint_8 for pattern, even if it is shorter\n", | |
" const int m_pattern_length;\n", | |
" uint16_t m_all_patterns[8];\n", | |
" uint16_t m_all_masks[8];\n", | |
" \n", | |
" \n", | |
" BlockBitsMatcher(uint8_t pattern, int pattern_length) : m_pattern(pattern), m_pattern_length(pattern_length) {\n", | |
" // precompute all the possible shifted patterns in 2 blocks\n", | |
" const uint16_t mask = (1 << (m_pattern_length)) - 1;\n", | |
" for (int i = 0; i < 8; ++i) {\n", | |
" m_all_patterns[i] = m_pattern << i;\n", | |
" m_all_masks[i] = mask << i;\n", | |
" }\n", | |
" }\n", | |
" \n", | |
" std::vector<int> read_block(const uint8_t block) { \n", | |
" std::vector<int> matched_pos;\n", | |
" two_blocks = (two_blocks << 8) + block;\n", | |
" for (int i = 0; i < 8; ++i) {\n", | |
" if ((two_blocks & m_all_masks[i]) == m_all_patterns[i]) {\n", | |
" matched_pos.push_back(i);\n", | |
" }\n", | |
" }\n", | |
" return matched_pos;\n", | |
" }\n", | |
"private:\n", | |
" uint16_t two_blocks = 0;\n", | |
"};" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Run it (position are from the right)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"reading 10101010\n", | |
" matched 2, pos = 1, 3\n", | |
"reading 00101101\n", | |
"reading 11010101\n", | |
" matched 1, pos = 0\n", | |
"reading 01010100\n", | |
" matched 3, pos = 2, 4, 6\n", | |
"reading 01010101\n", | |
" matched 2, pos = 0, 2\n", | |
"reading 00110010\n", | |
"reading 01110001\n", | |
"reading 10000101\n", | |
"reading 00010100\n", | |
"reading 10100100\n", | |
"reading 10011010\n", | |
"reading 11011011\n" | |
] | |
} | |
], | |
"source": [ | |
"matcher = ROOT.BlockBitsMatcher(ROOT.pattern, 6)\n", | |
"stream = ROOT.Stream()\n", | |
"for i in xrange(ROOT.full_stream_size / 8):\n", | |
" block = stream.read()\n", | |
" print \"reading {:08b}\".format(ord(block))\n", | |
" matched_pos = list(matcher.read_block(block))\n", | |
" if matched_pos:\n", | |
" print \" matched %d, pos = %s\" % (len(matched_pos), ', '.join(map(str, matched_pos)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"reading 10101010\n", | |
" matched = 2 pos = 1 3 \n", | |
"reading 00101101\n", | |
"reading 11010101\n", | |
" matched = 2 pos = 0 2 \n", | |
"reading 01010100\n", | |
" matched = 3 pos = 2 4 6 \n", | |
"reading 01010101\n", | |
" matched = 2 pos = 0 2 \n", | |
"reading 00110010\n", | |
"reading 01110001\n", | |
"reading 10000101\n", | |
"reading 00010100\n", | |
"reading 10100100\n", | |
"reading 10011010\n", | |
"reading 11011011\n", | |
" matched = 1 pos = 7 \n" | |
] | |
} | |
], | |
"source": [ | |
"%%cpp\n", | |
"{\n", | |
" BlockBitsMatcher matcher(pattern, 5);\n", | |
" Stream stream;\n", | |
" for (int i = 0; i < full_stream_size / 8; ++i) {\n", | |
" const auto block = stream.read();\n", | |
" std::cout << \"reading \" << std::bitset<8>(block) << std::endl;\n", | |
" const auto matched_pos = matcher.read_block(block);\n", | |
" if (!matched_pos.empty()) {\n", | |
" std::cout << \" matched = \" << matched_pos.size() << \" pos = \";\n", | |
" for (auto p : matched_pos) { std::cout << p << \" \";}\n", | |
" std::cout << endl;\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment