Created
September 19, 2016 19:36
-
-
Save markpapadakis/410dda4d8da5cd91942ca6147cdccca6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "llqhistogram.h" | |
#include <alloca.h> | |
#ifndef HAVE_SWITCH | |
#include <cmath> | |
#include <string.h> | |
#include <algorithm> | |
#endif | |
double const LLQHistogram::power_of_ten[256] = { | |
1, 10, 100, 1000, 10000, 100000, 1e+06, 1e+07, 1e+08, 1e+09, 1e+10, | |
1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, | |
1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, | |
1e+31, 1e+32, 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40, | |
1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, 1e+49, 1e+50, | |
1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60, | |
1e+61, 1e+62, 1e+63, 1e+64, 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, | |
1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80, | |
1e+81, 1e+82, 1e+83, 1e+84, 1e+85, 1e+86, 1e+87, 1e+88, 1e+89, 1e+90, | |
1e+91, 1e+92, 1e+93, 1e+94, 1e+95, 1e+96, 1e+97, 1e+98, 1e+99, 1e+100, | |
1e+101, 1e+102, 1e+103, 1e+104, 1e+105, 1e+106, 1e+107, 1e+108, 1e+109, | |
1e+110, 1e+111, 1e+112, 1e+113, 1e+114, 1e+115, 1e+116, 1e+117, 1e+118, | |
1e+119, 1e+120, 1e+121, 1e+122, 1e+123, 1e+124, 1e+125, 1e+126, 1e+127, | |
1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120, | |
1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111, | |
1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102, | |
1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, | |
1e-95, 1e-94, 1e-93, 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, | |
1e-85, 1e-84, 1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, | |
1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, | |
1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, | |
1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, | |
1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, | |
1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28, 1e-27, 1e-26, | |
1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, | |
1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, | |
1e-05, 0.0001, 0.001, 0.01, 0.1}; | |
LLQHistogram::bucket::bucket(int64_t value, int32_t scale) | |
{ | |
if (!value) | |
{ | |
val = 0; | |
exp = 0; | |
return; | |
} | |
int8_t sign; | |
if (value < 0) | |
{ | |
value = 0 - value; | |
sign = -1; | |
} | |
else | |
sign = 1; | |
++scale; | |
if (value < 10) | |
{ | |
value *= 10; | |
scale -= 1; | |
} | |
while (value > 100) | |
{ | |
value /= 10; | |
++scale; | |
} | |
if (scale < -128) | |
{ | |
val = 0; | |
exp = 0; | |
return; | |
} | |
else if (scale > 127) | |
{ | |
value = int8_t(0xff); | |
exp = 0; | |
return; | |
} | |
val = sign * value; | |
exp = scale; | |
} | |
LLQHistogram::bucket::bucket(double d) | |
{ | |
if (std::isnan(d) || std::isinf(d)) | |
{ | |
val = int8_t(0xff); | |
exp = 0; | |
return; | |
} | |
else if (d == 0) | |
val = 0; | |
else | |
{ | |
const int8_t sign = (d < 0) ? -1 : 1; | |
d = fabs(d); | |
const auto big_exp = (int32_t)floor(log10(d)); | |
exp = (int8_t)big_exp; | |
if (exp != big_exp) | |
{ | |
// overflow | |
if (big_exp < 0) | |
{ | |
// d is in [0 .. 1e-128). Return 0 | |
val = 0; | |
exp = 0; | |
} | |
else | |
{ | |
// d is >= 1e128. Return NaN | |
val = (int8_t)0xff; | |
exp = 0; | |
} | |
return; | |
} | |
const auto *const pidx = (uint8_t *)&exp; | |
d /= power_of_ten[*pidx]; | |
d *= 10; | |
// avoid rounding problem at the bucket boundary | |
// e.g. d=0.11 results in hb.val = 10 (shoud be 11) | |
// by allowing a error margin (in the order or magintude | |
// of the exected rounding errors of the above transformations) | |
val = sign * (int)floor(d + 1e-13); | |
if (val == 100 || val == -100) | |
{ | |
if (exp < 127) | |
{ | |
val /= 10; | |
++exp; | |
} | |
else | |
{ | |
// can't increase exponent. Return NaN | |
val = (int8_t)0xff; | |
exp = 0; | |
} | |
} | |
if (val == 0) | |
{ | |
exp = 0; | |
} | |
else if (!((val >= 10 && val < 100) || (val <= -10 && val > -100))) | |
{ | |
val = (int8_t)0xff; | |
exp = 0; | |
} | |
} | |
} | |
std::pair<bool, uint32_t> LLQHistogram::find(const bucket b) const noexcept | |
{ | |
// This is a simple binary search returning the idx as pair.second in which | |
// the specified bucket belongs... returning true if it is there | |
// or false if the value would need to be inserted here (moving the | |
// rest of the buckets forward one. | |
if (used == 0) | |
return {false, 0}; | |
int32_t l{0}, r{used - 1}; | |
while (l < r) | |
{ | |
const int32_t check = (r + l) / 2; | |
const auto rv = bvs[check].bucket.cmp(b); | |
if (rv == 0) | |
return {true, check}; | |
else if (rv > 0) | |
l = check + 1; | |
else | |
r = check - 1; | |
} | |
if (bvs[l].bucket.cmp(b) < 0) | |
return {false, l}; | |
else | |
return {false, l + 1}; | |
} | |
LLQHistogram::count_t LLQHistogram::insert_impl(const bucket hb, count_t count) | |
{ | |
const auto findRes = find(hb); | |
const auto idx = findRes.second; | |
if (!findRes.first) | |
{ | |
if (used == allocd) | |
{ | |
allocd += DefaultHistSize; | |
auto *const newBvs = (decltype(bvs))malloc((allocd) * sizeof(*bvs)); | |
assert(newBvs); | |
if (idx > 0) | |
memcpy(newBvs, bvs, idx * sizeof(*bvs)); | |
newBvs[idx].bucket = hb; | |
newBvs[idx].count = count; | |
if (idx < used) | |
memcpy(newBvs + idx + 1, bvs + idx, (used - idx) * sizeof(*bvs)); | |
free(bvs); | |
bvs = newBvs; | |
} | |
else | |
{ | |
memmove(bvs + idx + 1, bvs + idx, (used - idx) * sizeof(*bvs)); | |
bvs[idx].bucket = hb; | |
bvs[idx].count = count; | |
} | |
++used; | |
} | |
else | |
{ | |
// Just need to update the counters | |
auto newval = bvs[idx].count + count; | |
if (newval < bvs[idx].count) | |
{ | |
// rolled | |
newval = ~(count_t)0; | |
} | |
count = newval - bvs[idx].count; | |
bvs[idx].count = newval; | |
} | |
return count; | |
} | |
LLQHistogram::count_t LLQHistogram::remove(const double val, count_t count) noexcept | |
{ | |
const auto res = find(val); | |
if (res.first) | |
{ | |
const auto idx = res.second; | |
auto newval = bvs[idx].count - count; | |
if (newval > bvs[idx].count) | |
newval = 0; | |
count = bvs[idx].count - newval; | |
bvs[idx].count = newval; | |
return count; | |
} | |
else | |
return 0; | |
} | |
double LLQHistogram::approx_sum() const noexcept | |
{ | |
double sum{0}; | |
for (uint32_t i{0}; i != used; ++i) | |
{ | |
if (bvs[i].bucket.isnan()) | |
continue; | |
const auto value = double(bvs[i].bucket); | |
const auto cardinality = double(bvs[i].count); | |
sum += value * cardinality; | |
} | |
return sum; | |
} | |
int8_t LLQHistogram::approx_quantile(const double *const q_in, const uint32_t nq, double *const q_out) const noexcept | |
{ | |
if (!nq) | |
return 0; | |
double total_cnt{0}; | |
// Sum up all samples from all the bins */ | |
for (uint32_t i{0}; i != used; ++i) | |
{ | |
if (bvs[i].bucket.isnan()) | |
continue; | |
total_cnt += double(bvs[i].count); | |
} | |
// Run through the quantiles and make sure they are in order | |
for (uint32_t i{1}; i < nq; ++i) | |
{ | |
if (q_in[i - 1] > q_in[i]) | |
return -2; | |
} | |
// We use q_out as temporary space to hold the count-normailzed quantiles | |
for (uint32_t i{0}; i != nq; ++i) | |
{ | |
if (q_in[i] < 0.0 || q_in[i] > 1.0) | |
return -3; | |
q_out[i] = total_cnt * q_in[i]; | |
} | |
#define TRACK_VARS(idx) \ | |
do \ | |
{ \ | |
bucket_width = bvs[idx].bucket.bin_width(); \ | |
bucket_left = bvs[idx].bucket.left(); \ | |
lower_cnt = upper_cnt; \ | |
upper_cnt = lower_cnt + bvs[idx].count; \ | |
} while (0) | |
// Find the least bin (first) | |
double lower_cnt{0}, upper_cnt{0}, bucket_left{0}, bucket_width{0}; | |
uint32_t i_b; | |
for (i_b = 0; i_b != used; ++i_b) | |
{ | |
if (bvs[i_b].bucket.isnan()) | |
continue; | |
TRACK_VARS(i_b); | |
break; | |
} | |
// Next walk the bins and the quantiles together | |
for (uint32_t i{0}; i != nq; ++i) | |
{ | |
// And within that, advance the bins as needed | |
while (i_b < used - 1 && upper_cnt < q_out[i]) | |
{ | |
++i_b; | |
TRACK_VARS(i_b); | |
} | |
if (lower_cnt == q_out[i]) | |
q_out[i] = bucket_left; | |
else if (upper_cnt == q_out[i]) | |
q_out[i] = bucket_left + bucket_width; | |
else | |
{ | |
if (bucket_width == 0) | |
q_out[i] = bucket_left; | |
else | |
q_out[i] = bucket_left + (q_out[i] - lower_cnt) / (upper_cnt - lower_cnt) * bucket_width; | |
} | |
} | |
#undef TRACK_VARS | |
return 0; | |
} | |
double LLQHistogram::approx_mean() const noexcept | |
{ | |
double divisor{0}, sum{0}; | |
for (uint32_t i{0}; i != used; ++i) | |
{ | |
if (bvs[i].bucket.isnan()) | |
continue; | |
const auto midpoint = bvs[i].bucket.midpoint(); | |
const auto cardinality = double(bvs[i].count); | |
divisor += cardinality; | |
sum += midpoint * cardinality; | |
} | |
return divisor ? sum / divisor : LLQHistogram::nan(); | |
} | |
void LLQHistogram::accumulate(const LLQHistogram **const src, const uint32_t srcCnt) | |
{ | |
auto list = (LLQHistogram **)alloca(sizeof(LLQHistogram *) * (srcCnt + 1)); | |
uint16_t cnt{0}; | |
uint32_t max{0}; | |
assert(list); | |
for (uint32_t i{0}; i != srcCnt; ++i) | |
{ | |
if (const auto n = src[i]->used) | |
{ | |
list[cnt++] = (LLQHistogram *)src[i]; | |
max = std::max<uint32_t>(max, n); | |
} | |
} | |
if (const auto n = used) | |
{ | |
list[cnt++] = this; | |
max = std::max<uint32_t>(max, n); | |
} | |
if (0 == cnt) | |
return; | |
auto toAdvance = (uint16_t *)alloca(cnt * sizeof(uint16_t)), itv = (uint16_t *)alloca(cnt * sizeof(uint16_t)); | |
auto newList = (decltype(bvs))malloc(sizeof(*bvs) * max); | |
uint16_t toAdvanceCnt; | |
uint32_t out{0}; | |
assert(toAdvance); | |
assert(itv); | |
memset(itv, 0, cnt * sizeof(uint16_t)); | |
do | |
{ | |
auto lowest = list[0]->bvs[itv[0]].bucket; | |
toAdvanceCnt = 1; | |
toAdvance[0] = 0; | |
for (uint32_t i{1}; i < cnt; ++i) | |
{ | |
const auto &b = list[i]->bvs[itv[i]].bucket; | |
const auto r = b.cmp(lowest); | |
if (r == 0) | |
toAdvance[toAdvanceCnt++] = i; | |
else if (r < 0) | |
{ | |
lowest = b; | |
toAdvanceCnt = 1; | |
toAdvance[0] = i; | |
} | |
} | |
if (out == max) | |
{ | |
max += DefaultHistSize; | |
newList = (decltype(newList))realloc(newList, max * sizeof(*bvs)); | |
} | |
count_t oldValue{0}; | |
newList[out].bucket = lowest; | |
while (toAdvanceCnt) | |
{ | |
const auto idx = toAdvance[--toAdvanceCnt]; | |
const auto ptr = &list[idx]->bvs[itv[idx]++]; | |
auto newVal = oldValue + ptr->count; | |
if (newVal < oldValue) | |
newVal = ~count_t(0); | |
oldValue = newVal; | |
if (itv[idx] == list[idx]->used) | |
{ | |
const auto span = (--cnt - idx); | |
memmove(list + idx, list + idx + 1, sizeof(LLQHistogram *) * span); | |
memmove(itv + idx, itv + idx + 1, sizeof(uint16_t) * span); | |
} | |
} | |
newList[out++].count = oldValue; | |
} while (cnt); | |
if (bvs) | |
free(bvs); | |
bvs = newList; | |
used = out; | |
allocd = max; | |
} | |
#ifdef HAVE_SWITCH | |
void LLQHistogram::serialize(IOBuffer *const out) const | |
{ | |
out->SerializeVarUInt32(used); | |
for (const auto it : Switch::make_range(bvs, used)) | |
{ | |
out->Serialize(it->bucket.val); | |
out->Serialize(it->bucket.exp); | |
out->SerializeVarUInt32(it->count); | |
} | |
} | |
LLQHistogram::LLQHistogram(IOBuffer *const in) | |
{ | |
used = in->UnserializeVarUInt32(); | |
allocd = used; | |
if (allocd) | |
{ | |
bvs = (decltype(bvs))malloc(sizeof(*bvs) * allocd); | |
for (uint32_t i{0}; i != used; ++i) | |
{ | |
auto it = bvs + i; | |
it->bucket.val = in->Unserialize<uint8_t>(); | |
it->bucket.exp = in->Unserialize<uint8_t>(); | |
it->count = in->UnserializeVarUInt32(); | |
} | |
} | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment