Last active
December 30, 2015 08:49
-
-
Save daniel-j-h/7804852 to your computer and use it in GitHub Desktop.
Simple cardinality estimator. You can provide hash<T> specializations for custom types, as long as the hash values roughly follow a uniform distribution. Otherwise the estimation gets skewed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef> | |
#include <functional> | |
#include <algorithm> | |
#include <iterator> | |
#include <limits> | |
#include <iostream> | |
#include <string> | |
// g++ -Wall -Wextra -pedantic -std=c++11 count_min.cpp | |
// ./a.out < words_with_duplicates.txt | |
namespace { | |
template <typename T, typename hash_fn_type = std::hash<T>> | |
class count_min final { | |
public: | |
// cache friendly, we estimate rarely | |
using value_type = std::vector<typename hash_fn_type::result_type>; | |
using size_type = typename value_type::size_type; | |
count_min() = default; | |
template <typename InputIt> | |
explicit count_min(InputIt first, InputIt last) { | |
using item_type = typename InputIt::value_type; | |
std::for_each(first, last, [this](const item_type& item){ insert(item); }); | |
} | |
void insert(const T& item) { | |
acc.emplace_back(hash_fn(item)); | |
} | |
size_type real() { | |
std::sort(std::begin(acc), std::end(acc)); | |
acc.erase(std::unique(std::begin(acc), std::end(acc)), std::end(acc)); | |
return acc.size(); | |
} | |
size_type estimate() const { | |
if(acc.empty()) | |
return 0; | |
const auto min = *std::min_element(std::begin(acc), std::end(acc)); | |
using range_type = typename hash_fn_type::result_type; | |
return std::numeric_limits<range_type>::max() / (min != 0 ? min : 1); | |
} | |
private: | |
value_type acc; | |
hash_fn_type hash_fn; | |
}; | |
} | |
int main() { | |
count_min<std::string> cm{ | |
std::istream_iterator<std::string>(std::cin), | |
std::istream_iterator<std::string>(), | |
}; | |
std::cout << "estimate: " << cm.estimate() << std::endl; | |
std::cout << "real: " << cm.real() << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment