Skip to content

Instantly share code, notes, and snippets.

@daniel-j-h
Last active December 30, 2015 08:49
Show Gist options
  • Save daniel-j-h/7804852 to your computer and use it in GitHub Desktop.
Save daniel-j-h/7804852 to your computer and use it in GitHub Desktop.
Simple cardinality estimator. You can provide hash<T> specializations for custom types, as long as the hash values roughly follow a uniform distribution. Otherwise the estimation gets skewed.
#include <cstddef>
#include <functional>
#include <algorithm>
#include <iterator>
#include <limits>
#include <iostream>
#include <string>
// g++ -Wall -Wextra -pedantic -std=c++11 count_min.cpp
// ./a.out < words_with_duplicates.txt
namespace {
template <typename T, typename hash_fn_type = std::hash<T>>
class count_min final {
public:
// cache friendly, we estimate rarely
using value_type = std::vector<typename hash_fn_type::result_type>;
using size_type = typename value_type::size_type;
count_min() = default;
template <typename InputIt>
explicit count_min(InputIt first, InputIt last) {
using item_type = typename InputIt::value_type;
std::for_each(first, last, [this](const item_type& item){ insert(item); });
}
void insert(const T& item) {
acc.emplace_back(hash_fn(item));
}
size_type real() {
std::sort(std::begin(acc), std::end(acc));
acc.erase(std::unique(std::begin(acc), std::end(acc)), std::end(acc));
return acc.size();
}
size_type estimate() const {
if(acc.empty())
return 0;
const auto min = *std::min_element(std::begin(acc), std::end(acc));
using range_type = typename hash_fn_type::result_type;
return std::numeric_limits<range_type>::max() / (min != 0 ? min : 1);
}
private:
value_type acc;
hash_fn_type hash_fn;
};
}
int main() {
count_min<std::string> cm{
std::istream_iterator<std::string>(std::cin),
std::istream_iterator<std::string>(),
};
std::cout << "estimate: " << cm.estimate() << std::endl;
std::cout << "real: " << cm.real() << std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment