Last active
December 11, 2015 13:21
-
-
Save vstakhov/6d1991729b1a41fb7802 to your computer and use it in GitHub Desktop.
hyperscan/pcre benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <fstream> | |
#include <vector> | |
#include <stdexcept> | |
#include <algorithm> | |
#include <set> | |
#include "pcre.h" | |
#include "hs.h" | |
#include <time.h> | |
#ifdef __APPLE__ | |
#include <mach/mach_time.h> | |
#endif | |
using namespace std; | |
/**
 * Read a monotonic clock and return its value in seconds.
 *
 * The absolute value is only meaningful as a difference between two calls.
 *
 * @return monotonic time stamp, in seconds, as a double
 */
double
get_ticks(void)
{
    double res;
#if defined(__APPLE__)
    // mach_absolute_time() counts in platform-specific ticks, not in
    // nanoseconds; the numer/denom pair from mach_timebase_info() converts
    // ticks to nanoseconds. The conversion factors are queried only once.
    static const mach_timebase_info_data_t timebase = [] {
        mach_timebase_info_data_t info;
        mach_timebase_info(&info);
        return info;
    }();
    const double nanos = static_cast<double>(mach_absolute_time())
        * timebase.numer / timebase.denom;
    res = nanos / 1000000000.;
#else
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    res = static_cast<double>(ts.tv_sec) + ts.tv_nsec / 1000000000.;
#endif
    return res;
}
struct pcre_regexp { | |
pcre* re; | |
pcre_extra* extra; | |
pcre_regexp(const string& pattern) | |
{ | |
const char* err; | |
int err_off; | |
re = pcre_compile(pattern.c_str(), PCRE_NEWLINE_ANYCRLF, &err, &err_off, NULL); | |
if (re == NULL) { | |
throw invalid_argument(string("cannot compile: '") + pattern + "' error: " + err + " at offset: " + to_string(err_off)); | |
} | |
extra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &err); | |
if (extra == NULL) { | |
throw invalid_argument(string("cannot study: '") + pattern + "' error: " + err + " at offset: " + to_string(err_off)); | |
} | |
} | |
}; | |
struct cb_context { | |
set<int> approx_re; | |
vector<pcre_regexp> pcre_vec; | |
}; | |
/**
 * Per-scan data handed to the Hyperscan match callback through its
 * `void* context` argument.
 *
 * Pointer members default to nullptr so that a default-constructed object
 * never carries indeterminate pointers.
 */
struct cb_data {
    struct cb_context* ctx = nullptr; // shared pattern state (not owned)
    std::vector<int> matched;         // match counter per pattern id
    const std::string* str = nullptr; // scanned input (not owned)
};
bool remove_uncompileable(const string& s, int id, struct cb_context* ctx) | |
{ | |
hs_compile_error_t* hs_errors; | |
hs_database_t* hs_db; | |
if (hs_compile(s.c_str(), HS_FLAG_ALLOWEMPTY, HS_MODE_BLOCK, NULL, &hs_db, &hs_errors) != HS_SUCCESS) { | |
cout << "pattern: '" << s << "', error: " << hs_errors->message << endl; | |
if (hs_compile(s.c_str(), HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER, HS_MODE_BLOCK, NULL, &hs_db, &hs_errors) != HS_SUCCESS) { | |
cout << "completely bad pattern: '" << s << "', error: " << hs_errors->message << endl; | |
return true; | |
} else { | |
ctx->approx_re.insert(id); | |
} | |
} else { | |
hs_free_database(hs_db); | |
} | |
return false; | |
} | |
int match_cb(unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void* context) | |
{ | |
auto cbdata = (struct cb_data*)context; | |
auto& matched = cbdata->matched; | |
if (cbdata->ctx->approx_re.find(id) != cbdata->ctx->approx_re.end()) { | |
int ovec[3]; | |
auto re = cbdata->ctx->pcre_vec[id]; | |
auto* begin = cbdata->str->data(); | |
auto* p = begin; | |
auto sz = cbdata->str->size(); | |
while (pcre_exec(re.re, re.extra, p, sz - (p - begin), 0, 0, ovec, 3) > 0) { | |
p = p + ovec[1]; | |
matched[id]++; | |
} | |
} else { | |
matched[id]++; | |
} | |
return 0; | |
} | |
int main(int argc, char** argv) | |
{ | |
ifstream refile(argv[1]); | |
vector<string> re_vec; | |
double t1, t2, total_ticks = 0; | |
struct cb_context ctx; | |
int ls; | |
pcre_config(PCRE_CONFIG_LINK_SIZE, &ls); | |
cout << ls << endl; | |
for (std::string line; std::getline(refile, line);) { | |
re_vec.push_back(line); | |
} | |
string re_pipe; | |
const char** pats = new const char*[re_vec.size()]; | |
unsigned int i = 0, *ids = new unsigned int[re_vec.size()]; | |
//re_vec.erase(remove_if(re_vec.begin(), re_vec.end(), remove_uncompileable), re_vec.end()); | |
for (i = 0; i < re_vec.size(); i++) { | |
const auto& r = re_vec[i]; | |
remove_uncompileable(r, i, &ctx); | |
pats[i] = r.c_str(); | |
ids[i] = i; | |
re_pipe = re_pipe + string("(") + r + string(")|"); | |
} | |
// Last | | |
re_pipe.erase(re_pipe.size() - 1); | |
total_ticks = 0; | |
for (const auto& r : re_vec) { | |
t1 = get_ticks(); | |
ctx.pcre_vec.emplace_back(r); | |
t2 = get_ticks(); | |
total_ticks += t2 - t1; | |
} | |
cout << "PCRE compile time: " << total_ticks << endl; | |
ifstream input(argv[2]); | |
std::string in_str((std::istreambuf_iterator<char>(input)), | |
std::istreambuf_iterator<char>()); | |
hs_compile_error_t* hs_errors; | |
hs_database_t* hs_db; | |
hs_platform_info_t plt; | |
hs_populate_platform(&plt); | |
unsigned int* flags = new unsigned int[re_vec.size()]; | |
for (i = 0; i < re_vec.size(); i++) { | |
if (ctx.approx_re.find(i) != ctx.approx_re.end()) { | |
flags[i] = HS_FLAG_PREFILTER; | |
} else { | |
flags[i] = 0; | |
} | |
} | |
t1 = get_ticks(); | |
if (hs_compile_multi(pats, flags, ids, re_vec.size(), HS_MODE_BLOCK, &plt, &hs_db, &hs_errors) != HS_SUCCESS) { | |
cout << "BAD pattern: '" << re_vec[hs_errors->expression] << "', error: " << hs_errors->message << endl; | |
return -101; | |
} | |
t2 = get_ticks(); | |
cout << "Hyperscan compile time: " << (t2 - t1) << "; approx re: " | |
<< ctx.approx_re.size() << "; total re: " << re_vec.size() << endl; | |
char* bytes = NULL; | |
size_t bytes_len; | |
t1 = get_ticks(); | |
if (hs_serialize_database(hs_db, &bytes, &bytes_len) != HS_SUCCESS) { | |
cout << "BAD" << endl; | |
return -101; | |
} | |
t2 = get_ticks(); | |
cout << "Hyperscan serialize time: " << (t2 - t1) << "; size: " << bytes_len << " bytes" << endl; | |
hs_database_t* hs_db1 = NULL; | |
t1 = get_ticks(); | |
if (hs_deserialize_database(bytes, bytes_len, &hs_db1) != HS_SUCCESS) { | |
cout << "BAD1" << endl; | |
return -101; | |
} | |
t2 = get_ticks(); | |
cout << "Hyperscan deserialize time: " << (t2 - t1) << "; size: " << bytes_len << " bytes" << endl; | |
auto matches = 0; | |
total_ticks = 0; | |
for (const auto& re : ctx.pcre_vec) { | |
int ovec[3]; | |
auto* begin = in_str.data(); | |
auto* p = begin; | |
auto sz = in_str.size(); | |
t1 = get_ticks(); | |
while (pcre_exec(re.re, re.extra, p, sz - (p - begin), 0, 0, ovec, 3) > 0) { | |
p = p + ovec[1]; | |
matches++; | |
} | |
t2 = get_ticks(); | |
total_ticks += t2 - t1; | |
} | |
//cout << re_pipe << endl; | |
cout << "Time for individual re: " << total_ticks << "; matches: " << matches << endl; | |
//cout << "Time for piped re: " << (t2 - t1) << endl; | |
hs_scratch_t* scratch = NULL; | |
int rc; | |
if ((rc = hs_alloc_scratch(hs_db1, &scratch)) != HS_SUCCESS) { | |
cout << "bad scratch: " << rc << endl; | |
return -102; | |
} | |
struct cb_data cbdata; | |
cbdata.ctx = &ctx; | |
cbdata.matched = vector<int>(re_vec.size(), 0); | |
cbdata.str = &in_str; | |
t1 = get_ticks(); | |
if ((rc = hs_scan(hs_db1, in_str.data(), in_str.size(), 0, scratch, | |
match_cb, &cbdata)) | |
!= HS_SUCCESS) { | |
cout << "bad scan: " << rc << endl; | |
return -103; | |
} | |
t2 = get_ticks(); | |
matches = 0; | |
for_each(cbdata.matched.begin(), cbdata.matched.end(), [&matches](int elt) { matches += elt; }); | |
cout << "Time for hyperscan re: " << (t2 - t1) << "; matches: " << matches << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment