Created
April 3, 2020 08:46
-
-
Save lava/8c96fa32363f29f4534d51273197222e to your computer and use it in GitHub Desktop.
AMQ Data Structures Benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> // needed by vacuum.h | |
#include <atomic> | |
#include <cuckoofilter.h> | |
#include <xorfilter.h> | |
#include <iostream> | |
#include <morton_filter.h> | |
#include <ostream> | |
#include <random> | |
#include <string> | |
#include <thread> | |
#include <unordered_set> | |
#include <vacuum.h> | |
#include "measuring.h" | |
#include <bits/stdint-uintn.h> | |
#include <vast/bloom_filter.hpp> | |
#include <vast/concept/hashable/xxhash.hpp> | |
#include <vast/concept/hashable/hash_append.hpp> | |
// All the filters expect uint64 as default input, so we don't bother | |
// generating and hashing strings for this benchmark. | |
std::vector<uint64_t> generate_unique_input(size_t N) { | |
static thread_local std::mt19937* rng; | |
if (!rng) { | |
size_t seed = 43; | |
rng = new std::mt19937(seed); | |
} | |
static std::uniform_int_distribution<uint64_t> dist; | |
std::unordered_set<uint64_t> seen; | |
std::vector<uint64_t> result(N); | |
for (int i = 0; i < N; ++i) { | |
uint64_t x; | |
do { | |
x = dist(*rng); | |
result[i] = x; | |
} while (seen.count(x)); | |
seen.insert(x); | |
} | |
return result; | |
} | |
struct XorFacade; | |
// The third argument gives the proportion of filter elements in the input, | |
// i.e. step == 3 => every third element matches => 33% true positive rate | |
template <typename Filter> | |
void benchmark_filter(const std::vector<uint64_t>& true_input, | |
const std::vector<uint64_t>& false_input, | |
double hit_percentage) { | |
size_t S = Filter::ARGUMENT_442_MIB; | |
std::atomic<size_t> lookups{0}; | |
std::atomic<size_t> true_positives{0}; | |
std::atomic<size_t> false_positives{0}; | |
std::atomic<bool> done{false}; | |
Filter filter(S); | |
if constexpr (std::is_same_v<Filter, XorFacade>) { | |
filter.insert_many(true_input); | |
} else { | |
for (auto x : true_input) { | |
filter.insert(x); | |
} | |
} | |
std::thread measurement( | |
tenzir::benchmark::measure_for_n_seconds, 5, &done, | |
std::map<std::string, std::atomic<size_t>*>{ | |
{"lookups", &lookups}, | |
{"false_positives", &false_positives}, | |
{"true_positives", &true_positives}}, | |
[](const std::map<std::string, size_t>& old_values, | |
const std::map<std::string, size_t>& new_values) { | |
if (new_values.empty()) | |
return; | |
auto lookups_delta = new_values.at("lookups") - old_values.at("lookups"); | |
auto tp_delta | |
= new_values.at("true_positives") - old_values.at("true_positives"); | |
auto fp_delta | |
= new_values.at("false_positives") - old_values.at("false_positives"); | |
// We don't include true positives in the error rate calculation, because | |
// they represent unavoidable work. | |
double error_rate = 1. * fp_delta / (lookups_delta - tp_delta); | |
std::cout << "epsilon: " << error_rate << "\n" | |
<< "false positives: " << fp_delta << "\n" | |
<< "lookups: " << lookups_delta << "\n"; | |
}); | |
size_t true_proportion = 100. * hit_percentage; | |
size_t false_proportion = 100 - true_proportion; | |
size_t false_negatives = 0; | |
size_t true_idx = 0, false_idx = 0; | |
while (!done.load(std::memory_order_relaxed)) { | |
for (int i=0; i<true_proportion; ++i) { | |
auto x = true_input[true_idx++ % true_input.size()]; | |
if (filter.lookup(x)) { | |
++true_positives; | |
} else { | |
++false_negatives; | |
} | |
++lookups; | |
} | |
for (int i=0; i<false_proportion; ++i) { | |
auto x = false_input[false_idx++ % false_input.size()]; | |
if (filter.lookup(x)) { | |
++false_positives; | |
} | |
++lookups; | |
} | |
} | |
if (false_negatives > 0) { | |
std::cerr << "FALSE NEGATIVES DETECTED: " << false_negatives << "\n"; | |
} | |
measurement.join(); | |
} | |
// All three filters have effectively the same interface, except that the | |
// functions for insertion/lookup have different names. To avoid using macros, | |
// we define these facade classes and hope the compiler will optimize them away. | |
using MortonFilter = CompressedCuckoo::Morton3_8; | |
struct MortonFacade : private MortonFilter { | |
constexpr static size_t ARGUMENT_442_MIB = 33*10'000'000; | |
MortonFacade(size_t S) : MortonFilter(S) { | |
} | |
using MortonFilter::insert; | |
bool lookup(uint64_t x) { | |
return MortonFilter::likely_contains(x); | |
} | |
}; | |
using MortonFilter16 = CompressedCuckoo::Morton7_16; | |
struct Morton716Facade : private MortonFilter16 { | |
constexpr static size_t ARGUMENT_442_MIB = 20*10'000'000; | |
Morton716Facade(size_t S) : MortonFilter16(S) { | |
} | |
using MortonFilter16::insert; | |
bool lookup(uint64_t x) { | |
return MortonFilter16::likely_contains(x); | |
} | |
}; | |
using CuckooFilter = cuckoofilter::CuckooFilter<uint64_t, 12>; | |
struct CuckooFacade : private CuckooFilter { | |
constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000; | |
CuckooFacade(size_t S) : CuckooFilter(S){}; | |
void insert(uint64_t x) { | |
CuckooFilter::Add(x); | |
} | |
bool lookup(uint64_t x) { | |
return CuckooFilter::Contain(x) == cuckoofilter::Ok; | |
} | |
}; | |
using CuckooFilter16 = cuckoofilter::CuckooFilter<uint64_t, 16>; | |
struct CuckooFacade16 : private CuckooFilter16 { | |
constexpr static size_t ARGUMENT_442_MIB = 25*10'000'000; | |
CuckooFacade16(size_t S) : CuckooFilter16(S){}; | |
void insert(uint64_t x) { | |
if (CuckooFilter16::Add(x) != cuckoofilter::Ok) | |
std::cerr << "ERROR INSERTING\n"; | |
} | |
bool lookup(uint64_t x) { | |
return CuckooFilter16::Contain(x) == cuckoofilter::Ok; | |
} | |
}; | |
struct XorFacade { | |
constexpr static size_t ARGUMENT_442_MIB = 21*10'000'000; | |
XorFacade(size_t S) { xor16_allocate(S, &xf); }; | |
~XorFacade() { xor16_free(&xf); } | |
// Single-item insert is *really* slow for xor filters | |
void insert_many(const std::vector<uint64_t>& x) { | |
xor16_populate(x.data(), x.size(), &xf); | |
} | |
bool lookup(uint64_t x) { | |
return xor16_contain(x, &xf); | |
} | |
xor16_t xf; | |
}; | |
using VacuumFilter_ = VacuumFilter<uint16_t, 16>; | |
struct VacuumFacade : private VacuumFilter_ { | |
constexpr static size_t ARGUMENT_442_MIB = 24*10'000'000; | |
VacuumFacade(size_t S) { VacuumFilter_::init(S, 4, 400); }; // max_item_numbers, slots per bucket, max_kick_steps | |
using VacuumFilter_::insert; | |
using VacuumFilter_::lookup; | |
}; | |
using BloomFilter = vast::bloom_filter<vast::xxhash>; | |
struct BloomFacade : private BloomFilter { | |
constexpr static size_t ARGUMENT_442_MIB = 8*46*10'000'000ull; | |
BloomFacade(size_t S): BloomFilter(S) {} | |
using BloomFilter::lookup; | |
void insert(uint64_t x) { | |
BloomFilter::add(x); | |
} | |
}; | |
int main() { | |
for (int i=0; i<6; ++i) { | |
int N = 500'000 + i*10'000'000; | |
std::cout << "# Generating input... " << std::endl; | |
auto true_input = generate_unique_input(N); | |
auto false_input = generate_unique_input(4096ull*4096ull); | |
double hit_percentage = 0.1; | |
std::cout << "# Measuring Vacuum Filter\n"; | |
benchmark_filter<VacuumFacade>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Cuckoo Filter\n"; | |
benchmark_filter<CuckooFacade>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Morton Filter\n"; | |
benchmark_filter<MortonFacade>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Xor Filter\n"; | |
benchmark_filter<XorFacade>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Bloom Filter\n"; | |
benchmark_filter<BloomFacade>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Cuckoo16 Filter\n"; | |
benchmark_filter<CuckooFacade16>(true_input, false_input, hit_percentage); | |
std::cout << "# Measuring Morton16 Filter\n"; | |
benchmark_filter<Morton716Facade>(true_input, false_input, hit_percentage); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment