Created
November 30, 2021 16:53
-
-
Save develar/974a587f8180e183ce25cf308ffae39a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2000-2021 Jim Apple. Use of this source code is governed by the Apache 2.0 license that can be found in https://github.com/jbapple/libfilter/blob/master/LICENSE.txt. | |
package com.intellij.util.lang; | |
import java.nio.ByteBuffer; | |
/** | |
* BlockFilter is a block Bloom filter (from Putze et al.'s "Cache-, Hash- and | |
* Space-Efficient Bloom Filters") with some twists: | |
* <ol> | |
* <li> Each block is a split Bloom filter - see Section 2.1 of Broder and Mitzenmacher's | |
* "Network Applications of Bloom Filters: A Survey". | |
* <li> The number of bits set during each key addition is constant in order to try and | |
* take advantage of SIMD instructions. | |
* </ol> | |
* | |
* https://github.com/jbapple/libfilter | |
*/ | |
@SuppressWarnings("CommentedOutCode") | |
public final class BlockFilter { | |
private final int[] payload; | |
private final int bucketCount; | |
public BlockFilter(int[] payload) { | |
this.payload = payload; | |
this.bucketCount = payload.length / 8; | |
} | |
public BlockFilter(int numberOfDistinctValues, double falsePositiveProbability) { | |
payload = new int[Math.max(8, (getBytesNeeded(numberOfDistinctValues, falsePositiveProbability) / 4) / 8 * 8)]; | |
bucketCount = payload.length / 8; | |
} | |
public BlockFilter(ByteBuffer buffer) { | |
int size = buffer.getInt(); | |
payload = new int[size]; | |
buffer.asIntBuffer().get(payload); | |
buffer.position(buffer.position() + (payload.length * Integer.BYTES)); | |
bucketCount = payload.length / 8; | |
} | |
public int sizeInBytes() { | |
return Integer.BYTES + (payload.length * Integer.BYTES); | |
} | |
public void write(ByteBuffer buffer) { | |
buffer.putInt(payload.length); | |
buffer.asIntBuffer().put(payload); | |
buffer.position(buffer.position() + (payload.length * Integer.BYTES)); | |
} | |
/** | |
* Calculates the expected false positive probability from the size (in bytes) and fill | |
* factor (in number of distinct values, <code>ndv</code>). | |
* | |
* @param ndv the number of distinct values | |
* @param bytes the size of the filter | |
* @return the false positive probability | |
*/ | |
private static double computeFalsePositiveProbability(double ndv, double bytes) { | |
double word_bits = 32; | |
double bucket_words = 8; | |
double hash_bits = 32; | |
// From equation 3 of Putze et al. | |
// | |
// m = bytes * CHAR_BIT | |
// n = ndv | |
// c = m / n | |
// b = m / (bucket_words * word_bits) | |
// B = n * c / b = n * (m / n) / (m / (bucket_words * word_bits)) = m / (m / | |
// (bucket_words * word_bits)) = bucket_words * word_bits k = number of hashes, in our | |
// case: bucket_words | |
if (ndv == 0) return 0.0; | |
if (bytes <= 0) return 1.0; | |
//noinspection PointlessArithmeticExpression | |
if (1.0 * ndv / (1.0 * bytes * 8) > 3) { | |
return 1.0; | |
} | |
double result = 0; | |
double lam = bucket_words * word_bits / ((bytes * 8) / ndv); | |
double logLam = Math.log(lam); | |
double log1collide = -hash_bits * Math.log(2.0); | |
int MAX_J = 10000; | |
for (int j = 0; j < MAX_J; ++j) { | |
int i = MAX_J - 1 - j; | |
double logP = i * logLam - lam - logGamma(i + 1); | |
double logFinner = bucket_words * Math.log(1.0 - Math.pow(1.0 - 1.0 / word_bits, i)); | |
double logCollide = Math.log(i) + log1collide; | |
result += Math.exp(logP + logFinner) + Math.exp(logP + logCollide); | |
} | |
//noinspection ManualMinMaxCalculation | |
return (result > 1.0) ? 1.0 : result; | |
} | |
/** | |
* Calculates the number of bytes needed to create a filter with the given number of | |
* distinct values and false positive probability. | |
* | |
* @param ndv the number of distinct values | |
* @param fpp the desired false positive probability | |
* @return the number of bytes needed | |
*/ | |
public static int getBytesNeeded(int ndv, double fpp) { | |
double word_bits = 32; | |
double bucket_words = 8; | |
int bucket_bytes = (int)((word_bits * bucket_words) / 8); | |
double result = 1; | |
while (computeFalsePositiveProbability(ndv, result) > fpp) { | |
result *= 2; | |
} | |
if (result <= bucket_bytes) return bucket_bytes; | |
double lo = 0; | |
while (lo + 1 < result) { | |
double mid = lo + (result - lo) / 2; | |
double test = computeFalsePositiveProbability(ndv, mid); | |
if (test < fpp) { | |
result = mid; | |
} | |
else if (test == fpp) { | |
result = ((mid + bucket_bytes - 1) / bucket_bytes) * bucket_bytes; | |
if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE; | |
return (int)result; | |
} | |
else { | |
lo = mid; | |
} | |
} | |
result = ((result + bucket_bytes - 1) / bucket_bytes) * bucket_bytes; | |
if (result > Integer.MAX_VALUE) return Integer.MAX_VALUE; | |
return (int)result; | |
} | |
/** | |
* Calculates the number of distinct elements that can be stored in a filter of size | |
* <code>bytes</code> without exceeding the given false positive probability. | |
* | |
* @param bytes the size of the filter | |
* @param fpp the desired false positive probability | |
* @return the maximum number of distinct values that can be added | |
*/ | |
public static double getTotalHashCapacity(double bytes, double fpp) { | |
double result = 1; | |
while (computeFalsePositiveProbability(result, bytes) < fpp) { | |
result *= 2; | |
} | |
if (result == 1) return 0; | |
double lo = 0; | |
while (lo + 1 < result) { | |
double mid = lo + (result - lo) / 2; | |
double test = computeFalsePositiveProbability(mid, bytes); | |
if (test < fpp) { | |
lo = mid; | |
} | |
else if (test == fpp) { | |
return mid; | |
} | |
else { | |
result = mid; | |
} | |
} | |
return lo; | |
} | |
private static final int[] INTERNAL_HASH_SEEDS = {0x47b6137b, 0x44974d91, 0x8824ad5b, | |
0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31}; | |
private static int index(long hash, int num_buckets) { | |
return (int)(((hash >>> 32) * ((long)num_buckets)) >>> 32); | |
} | |
private static void makeMask(long hash, int[] result) { | |
// the loops are done individually for easier auto-vectorization by the JIT | |
for (int i = 0; i < 8; ++i) { | |
result[0] = (int)hash * INTERNAL_HASH_SEEDS[i]; | |
} | |
for (int i = 0; i < 8; ++i) { | |
result[i] >>>= (32 - 5); | |
} | |
for (int i = 0; i < 8; ++i) { | |
result[i] = 1 << result[i]; | |
} | |
} | |
/** | |
* Add a hash value to the filter. | |
* <p> | |
* Do not mix with <code>FindHash32</code> - a hash value that is present with | |
* <code>FindHash64(x)</code> will not be present when calling | |
* <code>FindHash32((int)x)</code>. | |
* <p> | |
* Do not pass values to this function, only their hashes. | |
* <p> | |
* <em>Hashes must be 64-bits</em>. 32-bit hashes will result in a useless filter where | |
* <code>FindHash64</code> always returns <code>true</code> for any input. | |
* | |
* @param hash the 64-bit hash value of the element you wish to insert | |
*/ | |
public void addHash64(long hash) { | |
int bucketIndex = index(hash, bucketCount); | |
int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9}; | |
makeMask(hash, mask); | |
for (int i = 0; i < 8; ++i) { | |
payload[bucketIndex * 8 + i] = mask[i] | payload[bucketIndex * 8 + i]; | |
} | |
} | |
/** | |
* Find a hash value in the filter. | |
* <p> | |
* Do not mix with <code>AddHash32</code> - a hash value that is added with | |
* <code>AddHash32</code> will not be present when calling <code>FindHash64</code>. | |
* <p> | |
* Do not pass values to this function, only their hashes. | |
* <p> | |
* <em>Hashes must be 64-bits</em>. 32-bit hashes will return incorrect results. | |
* | |
* @param hash the 64-bit hash value of the element you are checking the presence of | |
*/ | |
public boolean mightContain(long hash) { | |
int bucketIndex = index(hash, bucketCount); | |
int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9}; | |
makeMask(hash, mask); | |
for (int i = 0; i < 8; ++i) { | |
if (0 == (payload[bucketIndex * 8 + i] & mask[i])) { | |
return false; | |
} | |
} | |
return true; | |
} | |
//private static final long REHASH_32 = (((long)0xd1012a3a) << 32) | (long)0x7a1f4a8a; | |
///** | |
// * Add a hash value to the filter. | |
// * <p> | |
// * Do not mix with <code>FindHash64</code> - a hash value that is present with | |
// * <code>FindHash64(x)</code> will not be present when calling | |
// * <code>FindHash32((int)x)</code>. | |
// * | |
// * @param hash the 32-bit hash value of the element you wish to insert | |
// */ | |
//public boolean AddHash32(int hash) { | |
// long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash; | |
// int bucket_idx = index(hash64, payload.length / 8); | |
// int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9}; | |
// makeMask(hash, mask); | |
// for (int i = 0; i < 8; ++i) { | |
// payload[bucket_idx * 8 + i] = mask[i] | payload[bucket_idx * 8 + i]; | |
// } | |
// return true; | |
//} | |
///** | |
// * Find a hash value in the filter. | |
// * <p> | |
// * Do not mix with <code>AddHash64</code> - a hash value that is added with | |
// * <code>AddHash64</code> will not be present when calling <code>FindHash32</code>. | |
// * | |
// * @param hash the 32-bit hash value of the element you are checking the presence of | |
// */ | |
//public boolean FindHash32(int hash) { | |
// long hash64 = (((REHASH_32 * (long)hash) >>> 32) << 32) | hash; | |
// int bucket_idx = index(hash64, payload.length / 8); | |
// int[] mask = {0xca11, 8, 6, 7, 5, 3, 0, 9}; | |
// makeMask(hash, mask); | |
// for (int i = 0; i < 8; ++i) { | |
// if (0 == (payload[bucket_idx * 8 + i] & mask[i])) { | |
// return false; | |
// } | |
// } | |
// return true; | |
//} | |
private static double logGamma(double x) { | |
double ret; | |
if (Double.isNaN(x) || (x <= 0.0)) { | |
ret = Double.NaN; | |
} | |
else if (x < 0.5) { | |
return logGamma1p(x) - Math.log(x); | |
} | |
else if (x <= 2.5) { | |
return logGamma1p((x - 0.5) - 0.5); | |
} | |
else if (x <= 8.0) { | |
final int n = (int)Math.floor(x - 1.5); | |
double prod = 1.0; | |
for (int i = 1; i <= n; i++) { | |
prod *= x - i; | |
} | |
return logGamma1p(x - (n + 1)) + Math.log(prod); | |
} | |
else { | |
double sum = lanczos(x); | |
double tmp = x + LANCZOS_G + .5; | |
ret = ((x + .5) * Math.log(tmp)) - tmp + | |
HALF_LOG_2_PI + Math.log(sum / x); | |
} | |
return ret; | |
} | |
@SuppressWarnings("DuplicatedCode") | |
private static double invGamma1pm1(final double x) { | |
if (x < -0.5) { | |
throw new RuntimeException("NumberIsTooSmallException"); | |
} | |
if (x > 1.5) { | |
throw new RuntimeException("NumberIsTooLargeException"); | |
} | |
final double ret; | |
final double t = x <= 0.5 ? x : (x - 0.5) - 0.5; | |
if (t < 0.0) { | |
final double a = INV_GAMMA1P_M1_A0 + t * INV_GAMMA1P_M1_A1; | |
double b = INV_GAMMA1P_M1_B8; | |
b = INV_GAMMA1P_M1_B7 + t * b; | |
b = INV_GAMMA1P_M1_B6 + t * b; | |
b = INV_GAMMA1P_M1_B5 + t * b; | |
b = INV_GAMMA1P_M1_B4 + t * b; | |
b = INV_GAMMA1P_M1_B3 + t * b; | |
b = INV_GAMMA1P_M1_B2 + t * b; | |
b = INV_GAMMA1P_M1_B1 + t * b; | |
b = 1.0 + t * b; | |
double c = INV_GAMMA1P_M1_C13 + t * (a / b); | |
c = INV_GAMMA1P_M1_C12 + t * c; | |
c = INV_GAMMA1P_M1_C11 + t * c; | |
c = INV_GAMMA1P_M1_C10 + t * c; | |
c = INV_GAMMA1P_M1_C9 + t * c; | |
c = INV_GAMMA1P_M1_C8 + t * c; | |
c = INV_GAMMA1P_M1_C7 + t * c; | |
c = INV_GAMMA1P_M1_C6 + t * c; | |
c = INV_GAMMA1P_M1_C5 + t * c; | |
c = INV_GAMMA1P_M1_C4 + t * c; | |
c = INV_GAMMA1P_M1_C3 + t * c; | |
c = INV_GAMMA1P_M1_C2 + t * c; | |
c = INV_GAMMA1P_M1_C1 + t * c; | |
c = INV_GAMMA1P_M1_C + t * c; | |
if (x > 0.5) { | |
ret = t * c / x; | |
} | |
else { | |
ret = x * ((c + 0.5) + 0.5); | |
} | |
} | |
else { | |
double p = INV_GAMMA1P_M1_P6; | |
p = INV_GAMMA1P_M1_P5 + t * p; | |
p = INV_GAMMA1P_M1_P4 + t * p; | |
p = INV_GAMMA1P_M1_P3 + t * p; | |
p = INV_GAMMA1P_M1_P2 + t * p; | |
p = INV_GAMMA1P_M1_P1 + t * p; | |
p = INV_GAMMA1P_M1_P0 + t * p; | |
double q = INV_GAMMA1P_M1_Q4; | |
q = INV_GAMMA1P_M1_Q3 + t * q; | |
q = INV_GAMMA1P_M1_Q2 + t * q; | |
q = INV_GAMMA1P_M1_Q1 + t * q; | |
q = 1.0 + t * q; | |
double c = INV_GAMMA1P_M1_C13 + (p / q) * t; | |
c = INV_GAMMA1P_M1_C12 + t * c; | |
c = INV_GAMMA1P_M1_C11 + t * c; | |
c = INV_GAMMA1P_M1_C10 + t * c; | |
c = INV_GAMMA1P_M1_C9 + t * c; | |
c = INV_GAMMA1P_M1_C8 + t * c; | |
c = INV_GAMMA1P_M1_C7 + t * c; | |
c = INV_GAMMA1P_M1_C6 + t * c; | |
c = INV_GAMMA1P_M1_C5 + t * c; | |
c = INV_GAMMA1P_M1_C4 + t * c; | |
c = INV_GAMMA1P_M1_C3 + t * c; | |
c = INV_GAMMA1P_M1_C2 + t * c; | |
c = INV_GAMMA1P_M1_C1 + t * c; | |
c = INV_GAMMA1P_M1_C0 + t * c; | |
if (x > 0.5) { | |
ret = (t / x) * ((c - 0.5) - 0.5); | |
} | |
else { | |
ret = x * c; | |
} | |
} | |
return ret; | |
} | |
private static double lanczos(final double x) { | |
double sum = 0.0; | |
for (int i = LANCZOS.length - 1; i > 0; --i) { | |
sum += (LANCZOS[i] / (x + i)); | |
} | |
return sum + LANCZOS[0]; | |
} | |
private static double logGamma1p(final double x) { | |
if (x < -0.5) { | |
throw new RuntimeException("NumberIsTooSmallException"); | |
} | |
if (x > 1.5) { | |
throw new RuntimeException("NumberIsTooLargeException"); | |
} | |
return -Math.log1p(invGamma1pm1(x)); | |
} | |
private static final double HALF_LOG_2_PI = 0.5 * Math.log(2.0 * Math.PI); | |
private static final double LANCZOS_G = 607.0 / 128.0; | |
private static final double[] LANCZOS = { | |
0.99999999999999709182, | |
57.156235665862923517, | |
-59.597960355475491248, | |
14.136097974741747174, | |
-0.49191381609762019978, | |
.33994649984811888699e-4, | |
.46523628927048575665e-4, | |
-.98374475304879564677e-4, | |
.15808870322491248884e-3, | |
-.21026444172410488319e-3, | |
.21743961811521264320e-3, | |
-.16431810653676389022e-3, | |
.84418223983852743293e-4, | |
-.26190838401581408670e-4, | |
.36899182659531622704e-5, | |
}; | |
private static final double INV_GAMMA1P_M1_A0 = .611609510448141581788E-08; | |
private static final double INV_GAMMA1P_M1_A1 = .624730830116465516210E-08; | |
private static final double INV_GAMMA1P_M1_B1 = .203610414066806987300E+00; | |
private static final double INV_GAMMA1P_M1_B2 = .266205348428949217746E-01; | |
private static final double INV_GAMMA1P_M1_B3 = .493944979382446875238E-03; | |
private static final double INV_GAMMA1P_M1_B4 = -.851419432440314906588E-05; | |
private static final double INV_GAMMA1P_M1_B5 = -.643045481779353022248E-05; | |
private static final double INV_GAMMA1P_M1_B6 = .992641840672773722196E-06; | |
private static final double INV_GAMMA1P_M1_B7 = -.607761895722825260739E-07; | |
private static final double INV_GAMMA1P_M1_B8 = .195755836614639731882E-09; | |
private static final double INV_GAMMA1P_M1_P0 = .6116095104481415817861E-08; | |
private static final double INV_GAMMA1P_M1_P1 = .6871674113067198736152E-08; | |
private static final double INV_GAMMA1P_M1_P2 = .6820161668496170657918E-09; | |
private static final double INV_GAMMA1P_M1_P3 = .4686843322948848031080E-10; | |
private static final double INV_GAMMA1P_M1_P4 = .1572833027710446286995E-11; | |
private static final double INV_GAMMA1P_M1_P5 = -.1249441572276366213222E-12; | |
private static final double INV_GAMMA1P_M1_P6 = .4343529937408594255178E-14; | |
private static final double INV_GAMMA1P_M1_Q1 = .3056961078365221025009E+00; | |
private static final double INV_GAMMA1P_M1_Q2 = .5464213086042296536016E-01; | |
private static final double INV_GAMMA1P_M1_Q3 = .4956830093825887312020E-02; | |
private static final double INV_GAMMA1P_M1_Q4 = .2692369466186361192876E-03; | |
private static final double INV_GAMMA1P_M1_C = -.422784335098467139393487909917598E+00; | |
private static final double INV_GAMMA1P_M1_C0 = .577215664901532860606512090082402E+00; | |
private static final double INV_GAMMA1P_M1_C1 = -.655878071520253881077019515145390E+00; | |
private static final double INV_GAMMA1P_M1_C2 = -.420026350340952355290039348754298E-01; | |
private static final double INV_GAMMA1P_M1_C3 = .166538611382291489501700795102105E+00; | |
private static final double INV_GAMMA1P_M1_C4 = -.421977345555443367482083012891874E-01; | |
private static final double INV_GAMMA1P_M1_C5 = -.962197152787697356211492167234820E-02; | |
private static final double INV_GAMMA1P_M1_C6 = .721894324666309954239501034044657E-02; | |
private static final double INV_GAMMA1P_M1_C7 = -.116516759185906511211397108401839E-02; | |
private static final double INV_GAMMA1P_M1_C8 = -.215241674114950972815729963053648E-03; | |
private static final double INV_GAMMA1P_M1_C9 = .128050282388116186153198626328164E-03; | |
private static final double INV_GAMMA1P_M1_C10 = -.201348547807882386556893914210218E-04; | |
private static final double INV_GAMMA1P_M1_C11 = -.125049348214267065734535947383309E-05; | |
private static final double INV_GAMMA1P_M1_C12 = .113302723198169588237412962033074E-05; | |
private static final double INV_GAMMA1P_M1_C13 = -.205633841697760710345015413002057E-06; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment