Skip to content

Instantly share code, notes, and snippets.

@vstakhov
Last active December 11, 2015 13:21
Show Gist options
  • Save vstakhov/6d1991729b1a41fb7802 to your computer and use it in GitHub Desktop.
Save vstakhov/6d1991729b1a41fb7802 to your computer and use it in GitHub Desktop.
hyperscan/pcre benchmark
#include <time.h>
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif

#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include "hs.h"
#include "pcre.h"
using namespace std;
/**
 * Returns a monotonic timestamp in seconds, intended for measuring intervals
 * (subtract two readings); the absolute value has no defined epoch.
 *
 * On Apple platforms the reading comes from mach_absolute_time(), elsewhere
 * from clock_gettime(CLOCK_MONOTONIC).
 */
double
get_ticks(void)
{
    double res;
#if defined(__APPLE__)
    // mach_absolute_time() counts in platform-specific tick units, not
    // nanoseconds; the numer/denom pair from mach_timebase_info() converts
    // ticks to nanoseconds. Query it once and cache it.
    static mach_timebase_info_data_t timebase = { 0, 0 };
    if (timebase.denom == 0) {
        if (mach_timebase_info(&timebase) != KERN_SUCCESS || timebase.denom == 0) {
            // Fall back to treating ticks as nanoseconds rather than dividing by zero.
            timebase.numer = 1;
            timebase.denom = 1;
        }
    }
    res = (double)mach_absolute_time() * timebase.numer / timebase.denom / 1000000000.;
#else
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    res = (double)ts.tv_sec + ts.tv_nsec / 1000000000.;
#endif
    return res;
}
struct pcre_regexp {
pcre* re;
pcre_extra* extra;
pcre_regexp(const string& pattern)
{
const char* err;
int err_off;
re = pcre_compile(pattern.c_str(), PCRE_NEWLINE_ANYCRLF, &err, &err_off, NULL);
if (re == NULL) {
throw invalid_argument(string("cannot compile: '") + pattern + "' error: " + err + " at offset: " + to_string(err_off));
}
extra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &err);
if (extra == NULL) {
throw invalid_argument(string("cannot study: '") + pattern + "' error: " + err + " at offset: " + to_string(err_off));
}
}
};
struct cb_context {
set<int> approx_re;
vector<pcre_regexp> pcre_vec;
};
struct cb_data {
struct cb_context* ctx;
vector<int> matched;
const std::string* str;
};
bool remove_uncompileable(const string& s, int id, struct cb_context* ctx)
{
hs_compile_error_t* hs_errors;
hs_database_t* hs_db;
if (hs_compile(s.c_str(), HS_FLAG_ALLOWEMPTY, HS_MODE_BLOCK, NULL, &hs_db, &hs_errors) != HS_SUCCESS) {
cout << "pattern: '" << s << "', error: " << hs_errors->message << endl;
if (hs_compile(s.c_str(), HS_FLAG_ALLOWEMPTY | HS_FLAG_PREFILTER, HS_MODE_BLOCK, NULL, &hs_db, &hs_errors) != HS_SUCCESS) {
cout << "completely bad pattern: '" << s << "', error: " << hs_errors->message << endl;
return true;
} else {
ctx->approx_re.insert(id);
}
} else {
hs_free_database(hs_db);
}
return false;
}
int match_cb(unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void* context)
{
auto cbdata = (struct cb_data*)context;
auto& matched = cbdata->matched;
if (cbdata->ctx->approx_re.find(id) != cbdata->ctx->approx_re.end()) {
int ovec[3];
auto re = cbdata->ctx->pcre_vec[id];
auto* begin = cbdata->str->data();
auto* p = begin;
auto sz = cbdata->str->size();
while (pcre_exec(re.re, re.extra, p, sz - (p - begin), 0, 0, ovec, 3) > 0) {
p = p + ovec[1];
matched[id]++;
}
} else {
matched[id]++;
}
return 0;
}
int main(int argc, char** argv)
{
ifstream refile(argv[1]);
vector<string> re_vec;
double t1, t2, total_ticks = 0;
struct cb_context ctx;
int ls;
pcre_config(PCRE_CONFIG_LINK_SIZE, &ls);
cout << ls << endl;
for (std::string line; std::getline(refile, line);) {
re_vec.push_back(line);
}
string re_pipe;
const char** pats = new const char*[re_vec.size()];
unsigned int i = 0, *ids = new unsigned int[re_vec.size()];
//re_vec.erase(remove_if(re_vec.begin(), re_vec.end(), remove_uncompileable), re_vec.end());
for (i = 0; i < re_vec.size(); i++) {
const auto& r = re_vec[i];
remove_uncompileable(r, i, &ctx);
pats[i] = r.c_str();
ids[i] = i;
re_pipe = re_pipe + string("(") + r + string(")|");
}
// Last |
re_pipe.erase(re_pipe.size() - 1);
total_ticks = 0;
for (const auto& r : re_vec) {
t1 = get_ticks();
ctx.pcre_vec.emplace_back(r);
t2 = get_ticks();
total_ticks += t2 - t1;
}
cout << "PCRE compile time: " << total_ticks << endl;
ifstream input(argv[2]);
std::string in_str((std::istreambuf_iterator<char>(input)),
std::istreambuf_iterator<char>());
hs_compile_error_t* hs_errors;
hs_database_t* hs_db;
hs_platform_info_t plt;
hs_populate_platform(&plt);
unsigned int* flags = new unsigned int[re_vec.size()];
for (i = 0; i < re_vec.size(); i++) {
if (ctx.approx_re.find(i) != ctx.approx_re.end()) {
flags[i] = HS_FLAG_PREFILTER;
} else {
flags[i] = 0;
}
}
t1 = get_ticks();
if (hs_compile_multi(pats, flags, ids, re_vec.size(), HS_MODE_BLOCK, &plt, &hs_db, &hs_errors) != HS_SUCCESS) {
cout << "BAD pattern: '" << re_vec[hs_errors->expression] << "', error: " << hs_errors->message << endl;
return -101;
}
t2 = get_ticks();
cout << "Hyperscan compile time: " << (t2 - t1) << "; approx re: "
<< ctx.approx_re.size() << "; total re: " << re_vec.size() << endl;
char* bytes = NULL;
size_t bytes_len;
t1 = get_ticks();
if (hs_serialize_database(hs_db, &bytes, &bytes_len) != HS_SUCCESS) {
cout << "BAD" << endl;
return -101;
}
t2 = get_ticks();
cout << "Hyperscan serialize time: " << (t2 - t1) << "; size: " << bytes_len << " bytes" << endl;
hs_database_t* hs_db1 = NULL;
t1 = get_ticks();
if (hs_deserialize_database(bytes, bytes_len, &hs_db1) != HS_SUCCESS) {
cout << "BAD1" << endl;
return -101;
}
t2 = get_ticks();
cout << "Hyperscan deserialize time: " << (t2 - t1) << "; size: " << bytes_len << " bytes" << endl;
auto matches = 0;
total_ticks = 0;
for (const auto& re : ctx.pcre_vec) {
int ovec[3];
auto* begin = in_str.data();
auto* p = begin;
auto sz = in_str.size();
t1 = get_ticks();
while (pcre_exec(re.re, re.extra, p, sz - (p - begin), 0, 0, ovec, 3) > 0) {
p = p + ovec[1];
matches++;
}
t2 = get_ticks();
total_ticks += t2 - t1;
}
//cout << re_pipe << endl;
cout << "Time for individual re: " << total_ticks << "; matches: " << matches << endl;
//cout << "Time for piped re: " << (t2 - t1) << endl;
hs_scratch_t* scratch = NULL;
int rc;
if ((rc = hs_alloc_scratch(hs_db1, &scratch)) != HS_SUCCESS) {
cout << "bad scratch: " << rc << endl;
return -102;
}
struct cb_data cbdata;
cbdata.ctx = &ctx;
cbdata.matched = vector<int>(re_vec.size(), 0);
cbdata.str = &in_str;
t1 = get_ticks();
if ((rc = hs_scan(hs_db1, in_str.data(), in_str.size(), 0, scratch,
match_cb, &cbdata))
!= HS_SUCCESS) {
cout << "bad scan: " << rc << endl;
return -103;
}
t2 = get_ticks();
matches = 0;
for_each(cbdata.matched.begin(), cbdata.matched.end(), [&matches](int elt) { matches += elt; });
cout << "Time for hyperscan re: " << (t2 - t1) << "; matches: " << matches << endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment