From e252787268025c1e733d642052fefbe89c0bf599 Mon Sep 17 00:00:00 2001 From: Oliver Schonrock Date: Tue, 3 Dec 2024 07:44:58 +0000 Subject: [PATCH 1/4] make this repo depend on the up2date xor_singleheader lib via a gitsubmodule at https://github.com/FastFilter/xor_singleheader.git rather than an older and local copy of that lib --- .gitmodules | 3 + dependencies/xor_singleheader | 1 + .../include/binaryfusefilter.h | 740 ---------- .../xor_singleheader/include/xorfilter.h | 1283 ----------------- 4 files changed, 4 insertions(+), 2023 deletions(-) create mode 160000 dependencies/xor_singleheader delete mode 100644 dependencies/xor_singleheader/include/binaryfusefilter.h delete mode 100644 dependencies/xor_singleheader/include/xorfilter.h diff --git a/.gitmodules b/.gitmodules index 7f9cfe6..37f2f13 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "dependencies/fastfilter_cpp"] path = dependencies/fastfilter_cpp url = https://github.com/FastFilter/fastfilter_cpp.git +[submodule "dependencies/xor_singleheader"] + path = dependencies/xor_singleheader + url = https://github.com/FastFilter/xor_singleheader.git diff --git a/dependencies/xor_singleheader b/dependencies/xor_singleheader new file mode 160000 index 0000000..3c0fd15 --- /dev/null +++ b/dependencies/xor_singleheader @@ -0,0 +1 @@ +Subproject commit 3c0fd15b1e04281b2ada00cf82ddffc4b3292dee diff --git a/dependencies/xor_singleheader/include/binaryfusefilter.h b/dependencies/xor_singleheader/include/binaryfusefilter.h deleted file mode 100644 index 5cc1651..0000000 --- a/dependencies/xor_singleheader/include/binaryfusefilter.h +++ /dev/null @@ -1,740 +0,0 @@ -#ifndef BINARYFUSEFILTER_H -#define BINARYFUSEFILTER_H -#include -#include -#include -#include -#include -#include -#include -#ifndef XOR_MAX_ITERATIONS -#define XOR_MAX_ITERATIONS \ - 100 // probability of success should always be > 0.5 so 100 iterations is - // highly unlikely -#endif - -static int binary_fuse_cmpfunc(const void * a, 
const void * b) { - return ( *(const uint64_t*)a - *(const uint64_t*)b ); -} - -static size_t binary_fuse_sort_and_remove_dup(uint64_t* keys, size_t length) { - qsort(keys, length, sizeof(uint64_t), binary_fuse_cmpfunc); - size_t j = 0; - for(size_t i = 1; i < length; i++) { - if(keys[i] != keys[i-1]) { - keys[j] = keys[i]; - j++; - } - } - return j+1; -} - -/** - * We start with a few utilities. - ***/ -static inline uint64_t binary_fuse_murmur64(uint64_t h) { - h ^= h >> 33; - h *= UINT64_C(0xff51afd7ed558ccd); - h ^= h >> 33; - h *= UINT64_C(0xc4ceb9fe1a85ec53); - h ^= h >> 33; - return h; -} -static inline uint64_t binary_fuse_mix_split(uint64_t key, uint64_t seed) { - return binary_fuse_murmur64(key + seed); -} -static inline uint64_t binary_fuse_rotl64(uint64_t n, unsigned int c) { - return (n << (c & 63)) | (n >> ((-c) & 63)); -} -static inline uint32_t binary_fuse_reduce(uint32_t hash, uint32_t n) { - // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - return (uint32_t)(((uint64_t)hash * n) >> 32); -} -static inline uint64_t binary_fuse8_fingerprint(uint64_t hash) { - return hash ^ (hash >> 32); -} - -/** - * We need a decent random number generator. 
- **/ - -// returns random number, modifies the seed -static inline uint64_t binary_fuse_rng_splitmix64(uint64_t *seed) { - uint64_t z = (*seed += UINT64_C(0x9E3779B97F4A7C15)); - z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); - z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); - return z ^ (z >> 31); -} - -typedef struct binary_fuse8_s { - uint64_t Seed; - uint32_t SegmentLength; - uint32_t SegmentLengthMask; - uint32_t SegmentCount; - uint32_t SegmentCountLength; - uint32_t ArrayLength; - uint8_t *Fingerprints; -} binary_fuse8_t; - -// #ifdefs adapted from: -// https://stackoverflow.com/a/50958815 -#ifdef __SIZEOF_INT128__ // compilers supporting __uint128, e.g., gcc, clang -static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { - return ((__uint128_t)a * b) >> 64; -} -#elif defined(_M_X64) || defined(_MARM64) // MSVC -static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { - return __umulh(a, b); -} -#elif defined(_M_IA64) // also MSVC -static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { - unsigned __int64 hi; - (void) _umul128(a, b, &hi); - return hi; -} -#else // portable implementation using uint64_t -static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { - // Adapted from: - // https://stackoverflow.com/a/51587262 - - /* - This is implementing schoolbook multiplication: - - a1 a0 - X b1 b0 - ------------- - 00 LOW PART - ------------- - 00 - 10 10 MIDDLE PART - + 01 - ------------- - 01 - + 11 11 HIGH PART - ------------- - */ - - const uint64_t a0 = (uint32_t) a; - const uint64_t a1 = a >> 32; - const uint64_t b0 = (uint32_t) b; - const uint64_t b1 = b >> 32; - const uint64_t p11 = a1 * b1; - const uint64_t p01 = a0 * b1; - const uint64_t p10 = a1 * b0; - const uint64_t p00 = a0 * b0; - - // 64-bit product + two 32-bit values - const uint64_t middle = p10 + (p00 >> 32) + (uint32_t) p01; - - /* - Proof that 64-bit products can accumulate two more 32-bit values - without overflowing: - - Max 32-bit 
value is 2^32 - 1. - PSum = (2^32-1) * (2^32-1) + (2^32-1) + (2^32-1) - = 2^64 - 2^32 - 2^32 + 1 + 2^32 - 1 + 2^32 - 1 - = 2^64 - 1 - Therefore the high half below cannot overflow regardless of input. - */ - - // high half - return p11 + (middle >> 32) + (p01 >> 32); - - // low half (which we don't care about, but here it is) - // (middle << 32) | (uint32_t) p00; -} -#endif - -typedef struct binary_hashes_s { - uint32_t h0; - uint32_t h1; - uint32_t h2; -} binary_hashes_t; - -static inline binary_hashes_t binary_fuse8_hash_batch(uint64_t hash, - const binary_fuse8_t *filter) { - uint64_t hi = binary_fuse_mulhi(hash, filter->SegmentCountLength); - binary_hashes_t ans; - ans.h0 = (uint32_t)hi; - ans.h1 = ans.h0 + filter->SegmentLength; - ans.h2 = ans.h1 + filter->SegmentLength; - ans.h1 ^= (uint32_t)(hash >> 18) & filter->SegmentLengthMask; - ans.h2 ^= (uint32_t)(hash)&filter->SegmentLengthMask; - return ans; -} - -static inline uint32_t binary_fuse8_hash(int index, uint64_t hash, - const binary_fuse8_t *filter) { - uint64_t h = binary_fuse_mulhi(hash, filter->SegmentCountLength); - h += index * filter->SegmentLength; - // keep the lower 36 bits - uint64_t hh = hash & ((1UL << 36) - 1); - // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift - h ^= (size_t)((hh >> (36 - 18 * index)) & filter->SegmentLengthMask); - return h; -} - -// Report if the key is in the set, with false positive rate. -static inline bool binary_fuse8_contain(uint64_t key, - const binary_fuse8_t *filter) { - uint64_t hash = binary_fuse_mix_split(key, filter->Seed); - uint8_t f = binary_fuse8_fingerprint(hash); - binary_hashes_t hashes = binary_fuse8_hash_batch(hash, filter); - f ^= filter->Fingerprints[hashes.h0] ^ filter->Fingerprints[hashes.h1] ^ - filter->Fingerprints[hashes.h2]; - return f == 0; -} - -static inline uint32_t binary_fuse_calculate_segment_length(uint32_t arity, - uint32_t size) { - // These parameters are very sensitive. 
Replacing 'floor' by 'round' can - // substantially affect the construction time. - if (arity == 3) { - return ((uint32_t)1) << (int)(floor(log((double)(size)) / log(3.33) + 2.25)); - } else if (arity == 4) { - return ((uint32_t)1) << (int)(floor(log((double)(size)) / log(2.91) - 0.5)); - } else { - return 65536; - } -} - -static inline double binary_fuse_max(double a, double b) { - if (a < b) { - return b; - } - return a; -} - -static inline double binary_fuse_calculate_size_factor(uint32_t arity, - uint32_t size) { - if (arity == 3) { - return binary_fuse_max(1.125, 0.875 + 0.25 * log(1000000.0) / log((double)size)); - } else if (arity == 4) { - return binary_fuse_max(1.075, 0.77 + 0.305 * log(600000.0) / log((double)size)); - } else { - return 2.0; - } -} - -// allocate enough capacity for a set containing up to 'size' elements -// caller is responsible to call binary_fuse8_free(filter) -// size should be at least 2. -static inline bool binary_fuse8_allocate(uint32_t size, - binary_fuse8_t *filter) { - uint32_t arity = 3; - filter->SegmentLength = size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size); - if (filter->SegmentLength > 262144) { - filter->SegmentLength = 262144; - } - filter->SegmentLengthMask = filter->SegmentLength - 1; - double sizeFactor = size <= 1 ? 0 : binary_fuse_calculate_size_factor(arity, size); - uint32_t capacity = size <= 1 ? 
0 : (uint32_t)(round((double)size * sizeFactor)); - uint32_t initSegmentCount = - (capacity + filter->SegmentLength - 1) / filter->SegmentLength - - (arity - 1); - filter->ArrayLength = (initSegmentCount + arity - 1) * filter->SegmentLength; - filter->SegmentCount = - (filter->ArrayLength + filter->SegmentLength - 1) / filter->SegmentLength; - if (filter->SegmentCount <= arity - 1) { - filter->SegmentCount = 1; - } else { - filter->SegmentCount = filter->SegmentCount - (arity - 1); - } - filter->ArrayLength = - (filter->SegmentCount + arity - 1) * filter->SegmentLength; - filter->SegmentCountLength = filter->SegmentCount * filter->SegmentLength; - filter->Fingerprints = (uint8_t*)malloc(filter->ArrayLength); - return filter->Fingerprints != NULL; -} - -// report memory usage -static inline size_t binary_fuse8_size_in_bytes(const binary_fuse8_t *filter) { - return filter->ArrayLength * sizeof(uint8_t) + sizeof(binary_fuse8_t); -} - -// release memory -static inline void binary_fuse8_free(binary_fuse8_t *filter) { - free(filter->Fingerprints); - filter->Fingerprints = NULL; - filter->Seed = 0; - filter->SegmentLength = 0; - filter->SegmentLengthMask = 0; - filter->SegmentCount = 0; - filter->SegmentCountLength = 0; - filter->ArrayLength = 0; -} - -static inline uint8_t binary_fuse_mod3(uint8_t x) { - return x > 2 ? x - 3 : x; -} - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling binary_fuse8_allocate(size,filter) -// before. For best performance, the caller should ensure that there are not too -// many duplicated keys. 
-static inline bool binary_fuse8_populate(uint64_t *keys, uint32_t size, - binary_fuse8_t *filter) { - uint64_t rng_counter = 0x726b2b9d438b9d4d; - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - uint64_t *reverseOrder = (uint64_t *)calloc((size + 1), sizeof(uint64_t)); - uint32_t capacity = filter->ArrayLength; - uint32_t *alone = (uint32_t *)malloc(capacity * sizeof(uint32_t)); - uint8_t *t2count = (uint8_t *)calloc(capacity, sizeof(uint8_t)); - uint8_t *reverseH = (uint8_t *)malloc(size * sizeof(uint8_t)); - uint64_t *t2hash = (uint64_t *)calloc(capacity, sizeof(uint64_t)); - - uint32_t blockBits = 1; - while (((uint32_t)1 << blockBits) < filter->SegmentCount) { - blockBits += 1; - } - uint32_t block = ((uint32_t)1 << blockBits); - uint32_t *startPos = (uint32_t *)malloc((1 << blockBits) * sizeof(uint32_t)); - uint32_t h012[5]; - - if ((alone == NULL) || (t2count == NULL) || (reverseH == NULL) || - (t2hash == NULL) || (reverseOrder == NULL) || (startPos == NULL)) { - free(alone); - free(t2count); - free(reverseH); - free(t2hash); - free(reverseOrder); - free(startPos); - return false; - } - reverseOrder[size] = 1; - for (int loop = 0; true; ++loop) { - if (loop + 1 > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system) - memset(filter->Fingerprints, ~0, filter->ArrayLength); - free(alone); - free(t2count); - free(reverseH); - free(t2hash); - free(reverseOrder); - free(startPos); - return false; - } - - for (uint32_t i = 0; i < block; i++) { - // important : i * size would overflow as a 32-bit number in some - // cases. 
- startPos[i] = ((uint64_t)i * size) >> blockBits; - } - - uint64_t maskblock = block - 1; - for (uint32_t i = 0; i < size; i++) { - uint64_t hash = binary_fuse_murmur64(keys[i] + filter->Seed); - uint64_t segment_index = hash >> (64 - blockBits); - while (reverseOrder[startPos[segment_index]] != 0) { - segment_index++; - segment_index &= maskblock; - } - reverseOrder[startPos[segment_index]] = hash; - startPos[segment_index]++; - } - int error = 0; - uint32_t duplicates = 0; - for (uint32_t i = 0; i < size; i++) { - uint64_t hash = reverseOrder[i]; - uint32_t h0 = binary_fuse8_hash(0, hash, filter); - t2count[h0] += 4; - t2hash[h0] ^= hash; - uint32_t h1= binary_fuse8_hash(1, hash, filter); - t2count[h1] += 4; - t2count[h1] ^= 1; - t2hash[h1] ^= hash; - uint32_t h2 = binary_fuse8_hash(2, hash, filter); - t2count[h2] += 4; - t2hash[h2] ^= hash; - t2count[h2] ^= 2; - if ((t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0) { - if (((t2hash[h0] == 0) && (t2count[h0] == 8)) - || ((t2hash[h1] == 0) && (t2count[h1] == 8)) - || ((t2hash[h2] == 0) && (t2count[h2] == 8))) { - duplicates += 1; - t2count[h0] -= 4; - t2hash[h0] ^= hash; - t2count[h1] -= 4; - t2count[h1] ^= 1; - t2hash[h1] ^= hash; - t2count[h2] -= 4; - t2count[h2] ^= 2; - t2hash[h2] ^= hash; - } - } - error = (t2count[h0] < 4) ? 1 : error; - error = (t2count[h1] < 4) ? 1 : error; - error = (t2count[h2] < 4) ? 1 : error; - } - if(error) { - memset(reverseOrder, 0, sizeof(uint64_t) * size); - memset(t2count, 0, sizeof(uint8_t) * capacity); - memset(t2hash, 0, sizeof(uint64_t) * capacity); - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - continue; - } - - // End of key addition - uint32_t Qsize = 0; - // Add sets with one key to the queue. - for (uint32_t i = 0; i < capacity; i++) { - alone[Qsize] = i; - Qsize += ((t2count[i] >> 2) == 1) ? 
1 : 0; - } - uint32_t stacksize = 0; - while (Qsize > 0) { - Qsize--; - uint32_t index = alone[Qsize]; - if ((t2count[index] >> 2) == 1) { - uint64_t hash = t2hash[index]; - - //h012[0] = binary_fuse8_hash(0, hash, filter); - h012[1] = binary_fuse8_hash(1, hash, filter); - h012[2] = binary_fuse8_hash(2, hash, filter); - h012[3] = binary_fuse8_hash(0, hash, filter); // == h012[0]; - h012[4] = h012[1]; - uint8_t found = t2count[index] & 3; - reverseH[stacksize] = found; - reverseOrder[stacksize] = hash; - stacksize++; - uint32_t other_index1 = h012[found + 1]; - alone[Qsize] = other_index1; - Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0); - - t2count[other_index1] -= 4; - t2count[other_index1] ^= binary_fuse_mod3(found + 1); - t2hash[other_index1] ^= hash; - - uint32_t other_index2 = h012[found + 2]; - alone[Qsize] = other_index2; - Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0); - t2count[other_index2] -= 4; - t2count[other_index2] ^= binary_fuse_mod3(found + 2); - t2hash[other_index2] ^= hash; - } - } - if (stacksize + duplicates == size) { - // success - size = stacksize; - break; - } else if(duplicates > 0) { - size = binary_fuse_sort_and_remove_dup(keys, size); - } - memset(reverseOrder, 0, sizeof(uint64_t) * size); - memset(t2count, 0, sizeof(uint8_t) * capacity); - memset(t2hash, 0, sizeof(uint64_t) * capacity); - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - } - - for (uint32_t i = size - 1; i < size; i--) { - // the hash of the key we insert next - uint64_t hash = reverseOrder[i]; - uint8_t xor2 = binary_fuse8_fingerprint(hash); - uint8_t found = reverseH[i]; - h012[0] = binary_fuse8_hash(0, hash, filter); - h012[1] = binary_fuse8_hash(1, hash, filter); - h012[2] = binary_fuse8_hash(2, hash, filter); - h012[3] = h012[0]; - h012[4] = h012[1]; - filter->Fingerprints[h012[found]] = xor2 ^ - filter->Fingerprints[h012[found + 1]] ^ - filter->Fingerprints[h012[found + 2]]; - } - free(alone); - free(t2count); - free(reverseH); - 
free(t2hash); - free(reverseOrder); - free(startPos); - return true; -} - -////////////////// -// fuse16 -////////////////// - -typedef struct binary_fuse16_s { - uint64_t Seed; - uint32_t SegmentLength; - uint32_t SegmentLengthMask; - uint32_t SegmentCount; - uint32_t SegmentCountLength; - uint32_t ArrayLength; - uint16_t *Fingerprints; -} binary_fuse16_t; - -static inline uint64_t binary_fuse16_fingerprint(uint64_t hash) { - return hash ^ (hash >> 32); -} - -static inline binary_hashes_t binary_fuse16_hash_batch(uint64_t hash, - const binary_fuse16_t *filter) { - uint64_t hi = binary_fuse_mulhi(hash, filter->SegmentCountLength); - binary_hashes_t ans; - ans.h0 = (uint32_t)hi; - ans.h1 = ans.h0 + filter->SegmentLength; - ans.h2 = ans.h1 + filter->SegmentLength; - ans.h1 ^= (uint32_t)(hash >> 18) & filter->SegmentLengthMask; - ans.h2 ^= (uint32_t)(hash)&filter->SegmentLengthMask; - return ans; -} -static inline uint32_t binary_fuse16_hash(int index, uint64_t hash, - const binary_fuse16_t *filter) { - uint64_t h = binary_fuse_mulhi(hash, filter->SegmentCountLength); - h += index * filter->SegmentLength; - // keep the lower 36 bits - uint64_t hh = hash & ((1UL << 36) - 1); - // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift - h ^= (size_t)((hh >> (36 - 18 * index)) & filter->SegmentLengthMask); - return h; -} - -// Report if the key is in the set, with false positive rate. -static inline bool binary_fuse16_contain(uint64_t key, - const binary_fuse16_t *filter) { - uint64_t hash = binary_fuse_mix_split(key, filter->Seed); - uint16_t f = binary_fuse16_fingerprint(hash); - binary_hashes_t hashes = binary_fuse16_hash_batch(hash, filter); - f ^= filter->Fingerprints[hashes.h0] ^ filter->Fingerprints[hashes.h1] ^ - filter->Fingerprints[hashes.h2]; - return f == 0; -} - - -// allocate enough capacity for a set containing up to 'size' elements -// caller is responsible to call binary_fuse16_free(filter) -// size should be at least 2. 
-static inline bool binary_fuse16_allocate(uint32_t size, - binary_fuse16_t *filter) { - uint32_t arity = 3; - filter->SegmentLength = size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size); - if (filter->SegmentLength > 262144) { - filter->SegmentLength = 262144; - } - filter->SegmentLengthMask = filter->SegmentLength - 1; - double sizeFactor = size <= 1 ? 0 : binary_fuse_calculate_size_factor(arity, size); - uint32_t capacity = size <= 1 ? 0 : (uint32_t)(round((double)size * sizeFactor)); - uint32_t initSegmentCount = - (capacity + filter->SegmentLength - 1) / filter->SegmentLength - - (arity - 1); - filter->ArrayLength = (initSegmentCount + arity - 1) * filter->SegmentLength; - filter->SegmentCount = - (filter->ArrayLength + filter->SegmentLength - 1) / filter->SegmentLength; - if (filter->SegmentCount <= arity - 1) { - filter->SegmentCount = 1; - } else { - filter->SegmentCount = filter->SegmentCount - (arity - 1); - } - filter->ArrayLength = - (filter->SegmentCount + arity - 1) * filter->SegmentLength; - filter->SegmentCountLength = filter->SegmentCount * filter->SegmentLength; - filter->Fingerprints = (uint16_t*)malloc(filter->ArrayLength * sizeof(uint16_t)); - return filter->Fingerprints != NULL; -} - -// report memory usage -static inline size_t binary_fuse16_size_in_bytes(const binary_fuse16_t *filter) { - return filter->ArrayLength * sizeof(uint16_t) + sizeof(binary_fuse16_t); -} - -// release memory -static inline void binary_fuse16_free(binary_fuse16_t *filter) { - free(filter->Fingerprints); - filter->Fingerprints = NULL; - filter->Seed = 0; - filter->SegmentLength = 0; - filter->SegmentLengthMask = 0; - filter->SegmentCount = 0; - filter->SegmentCountLength = 0; - filter->ArrayLength = 0; -} - - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling binary_fuse8_allocate(size,filter) -// before. 
For best performance, the caller should ensure that there are not too -// many duplicated keys. -static inline bool binary_fuse16_populate(uint64_t *keys, uint32_t size, - binary_fuse16_t *filter) { - uint64_t rng_counter = 0x726b2b9d438b9d4d; - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - uint64_t *reverseOrder = (uint64_t *)calloc((size + 1), sizeof(uint64_t)); - uint32_t capacity = filter->ArrayLength; - uint32_t *alone = (uint32_t *)malloc(capacity * sizeof(uint32_t)); - uint8_t *t2count = (uint8_t *)calloc(capacity, sizeof(uint8_t)); - uint8_t *reverseH = (uint8_t *)malloc(size * sizeof(uint8_t)); - uint64_t *t2hash = (uint64_t *)calloc(capacity, sizeof(uint64_t)); - - uint32_t blockBits = 1; - while (((uint32_t)1 << blockBits) < filter->SegmentCount) { - blockBits += 1; - } - uint32_t block = ((uint32_t)1 << blockBits); - uint32_t *startPos = (uint32_t *)malloc((1 << blockBits) * sizeof(uint32_t)); - uint32_t h012[5]; - - if ((alone == NULL) || (t2count == NULL) || (reverseH == NULL) || - (t2hash == NULL) || (reverseOrder == NULL) || (startPos == NULL)) { - free(alone); - free(t2count); - free(reverseH); - free(t2hash); - free(reverseOrder); - free(startPos); - return false; - } - reverseOrder[size] = 1; - for (int loop = 0; true; ++loop) { - if (loop + 1 > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). - free(alone); - free(t2count); - free(reverseH); - free(t2hash); - free(reverseOrder); - free(startPos); - return false; - } - - for (uint32_t i = 0; i < block; i++) { - // important : i * size would overflow as a 32-bit number in some - // cases. 
- startPos[i] = ((uint64_t)i * size) >> blockBits; - } - - uint64_t maskblock = block - 1; - for (uint32_t i = 0; i < size; i++) { - uint64_t hash = binary_fuse_murmur64(keys[i] + filter->Seed); - uint64_t segment_index = hash >> (64 - blockBits); - while (reverseOrder[startPos[segment_index]] != 0) { - segment_index++; - segment_index &= maskblock; - } - reverseOrder[startPos[segment_index]] = hash; - startPos[segment_index]++; - } - int error = 0; - uint32_t duplicates = 0; - for (uint32_t i = 0; i < size; i++) { - uint64_t hash = reverseOrder[i]; - uint32_t h0 = binary_fuse16_hash(0, hash, filter); - t2count[h0] += 4; - t2hash[h0] ^= hash; - uint32_t h1= binary_fuse16_hash(1, hash, filter); - t2count[h1] += 4; - t2count[h1] ^= 1; - t2hash[h1] ^= hash; - uint32_t h2 = binary_fuse16_hash(2, hash, filter); - t2count[h2] += 4; - t2hash[h2] ^= hash; - t2count[h2] ^= 2; - if ((t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0) { - if (((t2hash[h0] == 0) && (t2count[h0] == 8)) - || ((t2hash[h1] == 0) && (t2count[h1] == 8)) - || ((t2hash[h2] == 0) && (t2count[h2] == 8))) { - duplicates += 1; - t2count[h0] -= 4; - t2hash[h0] ^= hash; - t2count[h1] -= 4; - t2count[h1] ^= 1; - t2hash[h1] ^= hash; - t2count[h2] -= 4; - t2count[h2] ^= 2; - t2hash[h2] ^= hash; - } - } - error = (t2count[h0] < 4) ? 1 : error; - error = (t2count[h1] < 4) ? 1 : error; - error = (t2count[h2] < 4) ? 1 : error; - } - if(error) { - memset(reverseOrder, 0, sizeof(uint64_t) * size); - memset(t2count, 0, sizeof(uint8_t) * capacity); - memset(t2hash, 0, sizeof(uint64_t) * capacity); - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - continue; - } - - // End of key addition - uint32_t Qsize = 0; - // Add sets with one key to the queue. - for (uint32_t i = 0; i < capacity; i++) { - alone[Qsize] = i; - Qsize += ((t2count[i] >> 2) == 1) ? 
1 : 0; - } - uint32_t stacksize = 0; - while (Qsize > 0) { - Qsize--; - uint32_t index = alone[Qsize]; - if ((t2count[index] >> 2) == 1) { - uint64_t hash = t2hash[index]; - - //h012[0] = binary_fuse16_hash(0, hash, filter); - h012[1] = binary_fuse16_hash(1, hash, filter); - h012[2] = binary_fuse16_hash(2, hash, filter); - h012[3] = binary_fuse16_hash(0, hash, filter); // == h012[0]; - h012[4] = h012[1]; - uint8_t found = t2count[index] & 3; - reverseH[stacksize] = found; - reverseOrder[stacksize] = hash; - stacksize++; - uint32_t other_index1 = h012[found + 1]; - alone[Qsize] = other_index1; - Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0); - - t2count[other_index1] -= 4; - t2count[other_index1] ^= binary_fuse_mod3(found + 1); - t2hash[other_index1] ^= hash; - - uint32_t other_index2 = h012[found + 2]; - alone[Qsize] = other_index2; - Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0); - t2count[other_index2] -= 4; - t2count[other_index2] ^= binary_fuse_mod3(found + 2); - t2hash[other_index2] ^= hash; - } - } - if (stacksize + duplicates == size) { - // success - size = stacksize; - break; - } else if(duplicates > 0) { - size = binary_fuse_sort_and_remove_dup(keys, size); - } - memset(reverseOrder, 0, sizeof(uint64_t) * size); - memset(t2count, 0, sizeof(uint8_t) * capacity); - memset(t2hash, 0, sizeof(uint64_t) * capacity); - filter->Seed = binary_fuse_rng_splitmix64(&rng_counter); - } - - for (uint32_t i = size - 1; i < size; i--) { - // the hash of the key we insert next - uint64_t hash = reverseOrder[i]; - uint16_t xor2 = binary_fuse16_fingerprint(hash); - uint8_t found = reverseH[i]; - h012[0] = binary_fuse16_hash(0, hash, filter); - h012[1] = binary_fuse16_hash(1, hash, filter); - h012[2] = binary_fuse16_hash(2, hash, filter); - h012[3] = h012[0]; - h012[4] = h012[1]; - filter->Fingerprints[h012[found]] = xor2 ^ - filter->Fingerprints[h012[found + 1]] ^ - filter->Fingerprints[h012[found + 2]]; - } - free(alone); - free(t2count); - free(reverseH); - 
free(t2hash); - free(reverseOrder); - free(startPos); - return true; -} - - - - -#endif diff --git a/dependencies/xor_singleheader/include/xorfilter.h b/dependencies/xor_singleheader/include/xorfilter.h deleted file mode 100644 index e2aff91..0000000 --- a/dependencies/xor_singleheader/include/xorfilter.h +++ /dev/null @@ -1,1283 +0,0 @@ -#ifndef XORFILTER_H -#define XORFILTER_H -#include -#include -#include -#include -#include -#include - -#ifndef XOR_SORT_ITERATIONS -#define XOR_SORT_ITERATIONS 10 // after 10 iterations, we sort and remove duplicates -#endif - -#ifndef XOR_MAX_ITERATIONS -#define XOR_MAX_ITERATIONS 100 // probabillity of success should always be > 0.5 so 100 iterations is highly unlikely -#endif - - -static int xor_cmpfunc(const void * a, const void * b) { - return ( *(const uint64_t*)a - *(const uint64_t*)b ); -} - -static size_t xor_sort_and_remove_dup(uint64_t* keys, size_t length) { - qsort(keys, length, sizeof(uint64_t), xor_cmpfunc); - size_t j = 0; - for(size_t i = 1; i < length; i++) { - if(keys[i] != keys[i-1]) { - keys[j] = keys[i]; - j++; - } - } - return j+1; -} -/** - * We assume that you have a large set of 64-bit integers - * and you want a data structure to do membership tests using - * no more than ~8 or ~16 bits per key. If your initial set - * is made of strings or other types, you first need to hash them - * to a 64-bit integer. - */ - -/** - * We start with a few utilities. 
- ***/ -static inline uint64_t xor_murmur64(uint64_t h) { - h ^= h >> 33; - h *= UINT64_C(0xff51afd7ed558ccd); - h ^= h >> 33; - h *= UINT64_C(0xc4ceb9fe1a85ec53); - h ^= h >> 33; - return h; -} - -static inline uint64_t xor_mix_split(uint64_t key, uint64_t seed) { - return xor_murmur64(key + seed); -} - -static inline uint64_t xor_rotl64(uint64_t n, unsigned int c) { - return (n << (c & 63)) | (n >> ((-c) & 63)); -} - -static inline uint32_t xor_reduce(uint32_t hash, uint32_t n) { - // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - return (uint32_t)(((uint64_t)hash * n) >> 32); -} - -static inline uint64_t xor_fingerprint(uint64_t hash) { - return hash ^ (hash >> 32); -} - -/** - * We need a decent random number generator. - **/ - -// returns random number, modifies the seed -static inline uint64_t xor_rng_splitmix64(uint64_t *seed) { - uint64_t z = (*seed += UINT64_C(0x9E3779B97F4A7C15)); - z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); - z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); - return z ^ (z >> 31); -} - -/** - * xor8 is the recommended default, no more than - * a 0.3% false-positive probability. - */ -typedef struct xor8_s { - uint64_t seed; - uint64_t blockLength; - uint8_t - *fingerprints; // after xor8_allocate, will point to 3*blockLength values -} xor8_t; - -// Report if the key is in the set, with false positive rate. 
-static inline bool xor8_contain(uint64_t key, const xor8_t *filter) { - uint64_t hash = xor_mix_split(key, filter->seed); - uint8_t f = xor_fingerprint(hash); - uint32_t r0 = (uint32_t)hash; - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - uint32_t h0 = xor_reduce(r0, filter->blockLength); - uint32_t h1 = xor_reduce(r1, filter->blockLength) + filter->blockLength; - uint32_t h2 = xor_reduce(r2, filter->blockLength) + 2 * filter->blockLength; - return f == (filter->fingerprints[h0] ^ filter->fingerprints[h1] ^ - filter->fingerprints[h2]); -} - -typedef struct xor16_s { - uint64_t seed; - uint64_t blockLength; - uint16_t - *fingerprints; // after xor16_allocate, will point to 3*blockLength values -} xor16_t; - -// Report if the key is in the set, with false positive rate. -static inline bool xor16_contain(uint64_t key, const xor16_t *filter) { - uint64_t hash = xor_mix_split(key, filter->seed); - uint16_t f = xor_fingerprint(hash); - uint32_t r0 = (uint32_t)hash; - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - uint32_t h0 = xor_reduce(r0, filter->blockLength); - uint32_t h1 = xor_reduce(r1, filter->blockLength) + filter->blockLength; - uint32_t h2 = xor_reduce(r2, filter->blockLength) + 2 * filter->blockLength; - return f == (filter->fingerprints[h0] ^ filter->fingerprints[h1] ^ - filter->fingerprints[h2]); -} - -// allocate enough capacity for a set containing up to 'size' elements -// caller is responsible to call xor8_free(filter) -static inline bool xor8_allocate(uint32_t size, xor8_t *filter) { - size_t capacity = 32 + 1.23 * size; - capacity = capacity / 3 * 3; - filter->fingerprints = (uint8_t *)malloc(capacity * sizeof(uint8_t)); - if (filter->fingerprints != NULL) { - filter->blockLength = capacity / 3; - return true; - } else { - return false; - } -} - -// allocate enough capacity for a set containing up to 'size' elements -// caller is responsible to call 
xor16_free(filter) -static inline bool xor16_allocate(uint32_t size, xor16_t *filter) { - size_t capacity = 32 + 1.23 * size; - capacity = capacity / 3 * 3; - filter->fingerprints = (uint16_t *)malloc(capacity * sizeof(uint16_t)); - if (filter->fingerprints != NULL) { - filter->blockLength = capacity / 3; - return true; - } else { - return false; - } -} - -// report memory usage -static inline size_t xor8_size_in_bytes(const xor8_t *filter) { - return 3 * filter->blockLength * sizeof(uint8_t) + sizeof(xor8_t); -} - -// report memory usage -static inline size_t xor16_size_in_bytes(const xor16_t *filter) { - return 3 * filter->blockLength * sizeof(uint16_t) + sizeof(xor16_t); -} - -// release memory -static inline void xor8_free(xor8_t *filter) { - free(filter->fingerprints); - filter->fingerprints = NULL; - filter->blockLength = 0; -} - -// release memory -static inline void xor16_free(xor16_t *filter) { - free(filter->fingerprints); - filter->fingerprints = NULL; - filter->blockLength = 0; -} - -struct xor_xorset_s { - uint64_t xormask; - uint32_t count; -}; - -typedef struct xor_xorset_s xor_xorset_t; - -struct xor_hashes_s { - uint64_t h; - uint32_t h0; - uint32_t h1; - uint32_t h2; -}; - -typedef struct xor_hashes_s xor_hashes_t; - -static inline xor_hashes_t xor8_get_h0_h1_h2(uint64_t k, const xor8_t *filter) { - uint64_t hash = xor_mix_split(k, filter->seed); - xor_hashes_t answer; - answer.h = hash; - uint32_t r0 = (uint32_t)hash; - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - - answer.h0 = xor_reduce(r0, filter->blockLength); - answer.h1 = xor_reduce(r1, filter->blockLength); - answer.h2 = xor_reduce(r2, filter->blockLength); - return answer; -} - -struct xor_h0h1h2_s { - uint32_t h0; - uint32_t h1; - uint32_t h2; -}; - -typedef struct xor_h0h1h2_s xor_h0h1h2_t; - -static inline uint32_t xor8_get_h0(uint64_t hash, const xor8_t *filter) { - uint32_t r0 = (uint32_t)hash; - return xor_reduce(r0, 
filter->blockLength); -} -static inline uint32_t xor8_get_h1(uint64_t hash, const xor8_t *filter) { - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - return xor_reduce(r1, filter->blockLength); -} -static inline uint32_t xor8_get_h2(uint64_t hash, const xor8_t *filter) { - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - return xor_reduce(r2, filter->blockLength); -} -static inline uint32_t xor16_get_h0(uint64_t hash, const xor16_t *filter) { - uint32_t r0 = (uint32_t)hash; - return xor_reduce(r0, filter->blockLength); -} -static inline uint32_t xor16_get_h1(uint64_t hash, const xor16_t *filter) { - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - return xor_reduce(r1, filter->blockLength); -} -static inline uint32_t xor16_get_h2(uint64_t hash, const xor16_t *filter) { - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - return xor_reduce(r2, filter->blockLength); -} -static inline xor_hashes_t xor16_get_h0_h1_h2(uint64_t k, - const xor16_t *filter) { - uint64_t hash = xor_mix_split(k, filter->seed); - xor_hashes_t answer; - answer.h = hash; - uint32_t r0 = (uint32_t)hash; - uint32_t r1 = (uint32_t)xor_rotl64(hash, 21); - uint32_t r2 = (uint32_t)xor_rotl64(hash, 42); - - answer.h0 = xor_reduce(r0, filter->blockLength); - answer.h1 = xor_reduce(r1, filter->blockLength); - answer.h2 = xor_reduce(r2, filter->blockLength); - return answer; -} - -struct xor_keyindex_s { - uint64_t hash; - uint32_t index; -}; - -typedef struct xor_keyindex_s xor_keyindex_t; - -struct xor_setbuffer_s { - xor_keyindex_t *buffer; - uint32_t *counts; - int insignificantbits; - uint32_t slotsize; // should be 1<< insignificantbits - uint32_t slotcount; - size_t originalsize; -}; - -typedef struct xor_setbuffer_s xor_setbuffer_t; - -static inline bool xor_init_buffer(xor_setbuffer_t *buffer, size_t size) { - buffer->originalsize = size; - buffer->insignificantbits = 18; - buffer->slotsize = UINT32_C(1) << buffer->insignificantbits; - buffer->slotcount = (size + buffer->slotsize - 1) / 
buffer->slotsize; - buffer->buffer = (xor_keyindex_t *)malloc( - buffer->slotcount * buffer->slotsize * sizeof(xor_keyindex_t)); - buffer->counts = (uint32_t *)malloc(buffer->slotcount * sizeof(uint32_t)); - if ((buffer->counts == NULL) || (buffer->buffer == NULL)) { - free(buffer->counts); - free(buffer->buffer); - return false; - } - memset(buffer->counts, 0, buffer->slotcount * sizeof(uint32_t)); - return true; -} - -static inline void xor_free_buffer(xor_setbuffer_t *buffer) { - free(buffer->counts); - free(buffer->buffer); - buffer->counts = NULL; - buffer->buffer = NULL; -} - -static inline void xor_buffered_increment_counter(uint32_t index, uint64_t hash, - xor_setbuffer_t *buffer, - xor_xorset_t *sets) { - uint32_t slot = index >> buffer->insignificantbits; - size_t addr = buffer->counts[slot] + (slot << buffer->insignificantbits); - buffer->buffer[addr].index = index; - buffer->buffer[addr].hash = hash; - buffer->counts[slot]++; - size_t offset = (slot << buffer->insignificantbits); - if (buffer->counts[slot] == buffer->slotsize) { - // must empty the buffer - for (size_t i = offset; i < buffer->slotsize + offset; i++) { - xor_keyindex_t ki = - buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count++; - } - buffer->counts[slot] = 0; - } -} - -static inline void xor_make_buffer_current(xor_setbuffer_t *buffer, - xor_xorset_t *sets, uint32_t index, - xor_keyindex_t *Q, size_t *Qsize) { - uint32_t slot = index >> buffer->insignificantbits; - if(buffer->counts[slot] > 0) { // uncommon! 
- size_t qsize = *Qsize; - size_t offset = (slot << buffer->insignificantbits); - for (size_t i = offset; i < buffer->counts[slot] + offset; i++) { - xor_keyindex_t ki = buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count--; - if (sets[ki.index].count == 1) {// this branch might be hard to predict - ki.hash = sets[ki.index].xormask; - Q[qsize] = ki; - qsize += 1; - } - } - *Qsize = qsize; - buffer->counts[slot] = 0; - } -} - - - -static inline void xor_buffered_decrement_counter(uint32_t index, uint64_t hash, - xor_setbuffer_t *buffer, - xor_xorset_t *sets, - xor_keyindex_t *Q, - size_t *Qsize) { - uint32_t slot = index >> buffer->insignificantbits; - size_t addr = buffer->counts[slot] + (slot << buffer->insignificantbits); - buffer->buffer[addr].index = index; - buffer->buffer[addr].hash = hash; - buffer->counts[slot]++; - if (buffer->counts[slot] == buffer->slotsize) { - size_t qsize = *Qsize; - size_t offset = (slot << buffer->insignificantbits); - for (size_t i = offset; i < buffer->counts[slot] + offset; i++) { - xor_keyindex_t ki = - buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count--; - if (sets[ki.index].count == 1) { - ki.hash = sets[ki.index].xormask; - Q[qsize] = ki; - qsize += 1; - } - } - *Qsize = qsize; - buffer->counts[slot] = 0; - } -} - -static inline void xor_flush_increment_buffer(xor_setbuffer_t *buffer, - xor_xorset_t *sets) { - for (uint32_t slot = 0; slot < buffer->slotcount; slot++) { - size_t offset = (slot << buffer->insignificantbits); - for (size_t i = offset; i < buffer->counts[slot] + offset; i++) { - xor_keyindex_t ki = - buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count++; - } - buffer->counts[slot] = 0; - } -} - -static inline void xor_flush_decrement_buffer(xor_setbuffer_t *buffer, - xor_xorset_t *sets, - xor_keyindex_t *Q, - size_t *Qsize) { - size_t qsize = *Qsize; - for (uint32_t slot = 0; slot < buffer->slotcount; slot++) { - uint32_t base = 
(slot << buffer->insignificantbits); - for (size_t i = base; i < buffer->counts[slot] + base; i++) { - xor_keyindex_t ki = buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count--; - if (sets[ki.index].count == 1) { - ki.hash = sets[ki.index].xormask; - Q[qsize] = ki; - qsize += 1; - } - } - buffer->counts[slot] = 0; - } - *Qsize = qsize; -} - -static inline uint32_t xor_flushone_decrement_buffer(xor_setbuffer_t *buffer, - xor_xorset_t *sets, - xor_keyindex_t *Q, - size_t *Qsize) { - uint32_t bestslot = 0; - uint32_t bestcount = buffer->counts[bestslot]; - for (uint32_t slot = 1; slot < buffer->slotcount; slot++) { - if (buffer->counts[slot] > bestcount) { - bestslot = slot; - bestcount = buffer->counts[slot]; - } - } - uint32_t slot = bestslot; - size_t qsize = *Qsize; - // for(uint32_t slot = 0; slot < buffer->slotcount; slot++) { - uint32_t base = (slot << buffer->insignificantbits); - for (size_t i = base; i < buffer->counts[slot] + base; i++) { - xor_keyindex_t ki = buffer->buffer[i]; - sets[ki.index].xormask ^= ki.hash; - sets[ki.index].count--; - if (sets[ki.index].count == 1) { - ki.hash = sets[ki.index].xormask; - Q[qsize] = ki; - qsize += 1; - } - } - *Qsize = qsize; - buffer->counts[slot] = 0; - //} - return bestslot; -} - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling xor8_allocate(size,filter) -// before. For best performance, the caller should ensure that there are not too -// many duplicated keys. 
-static inline bool xor8_buffered_populate(uint64_t *keys, uint32_t size, xor8_t *filter) { - if(size == 0) { return false; } - uint64_t rng_counter = 1; - filter->seed = xor_rng_splitmix64(&rng_counter); - size_t arrayLength = filter->blockLength * 3; // size of the backing array - xor_setbuffer_t buffer0, buffer1, buffer2; - size_t blockLength = filter->blockLength; - bool ok0 = xor_init_buffer(&buffer0, blockLength); - bool ok1 = xor_init_buffer(&buffer1, blockLength); - bool ok2 = xor_init_buffer(&buffer2, blockLength); - if (!ok0 || !ok1 || !ok2) { - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - return false; - } - - xor_xorset_t *sets = - (xor_xorset_t *)malloc(arrayLength * sizeof(xor_xorset_t)); - xor_xorset_t *sets0 = sets; - - xor_keyindex_t *Q = - (xor_keyindex_t *)malloc(arrayLength * sizeof(xor_keyindex_t)); - - xor_keyindex_t *stack = - (xor_keyindex_t *)malloc(size * sizeof(xor_keyindex_t)); - - if ((sets == NULL) || (Q == NULL) || (stack == NULL)) { - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - free(sets); - free(Q); - free(stack); - return false; - } - xor_xorset_t *sets1 = sets + blockLength; - xor_xorset_t *sets2 = sets + 2 * blockLength; - xor_keyindex_t *Q0 = Q; - xor_keyindex_t *Q1 = Q + blockLength; - xor_keyindex_t *Q2 = Q + 2 * blockLength; - - int iterations = 0; - - while (true) { - iterations ++; - if(iterations == XOR_SORT_ITERATIONS) { - size = xor_sort_and_remove_dup(keys, size); - } - if(iterations > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). 
- xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - free(sets); - free(Q); - free(stack); - return false; - } - memset(sets, 0, sizeof(xor_xorset_t) * arrayLength); - for (size_t i = 0; i < size; i++) { - uint64_t key = keys[i]; - xor_hashes_t hs = xor8_get_h0_h1_h2(key, filter); - xor_buffered_increment_counter(hs.h0, hs.h, &buffer0, sets0); - xor_buffered_increment_counter(hs.h1, hs.h, &buffer1, - sets1); - xor_buffered_increment_counter(hs.h2, hs.h, &buffer2, - sets2); - } - xor_flush_increment_buffer(&buffer0, sets0); - xor_flush_increment_buffer(&buffer1, sets1); - xor_flush_increment_buffer(&buffer2, sets2); - // todo: the flush should be sync with the detection that follows - // scan for values with a count of one - size_t Q0size = 0, Q1size = 0, Q2size = 0; - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets0[i].count == 1) { - Q0[Q0size].index = i; - Q0[Q0size].hash = sets0[i].xormask; - Q0size++; - } - } - - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets1[i].count == 1) { - Q1[Q1size].index = i; - Q1[Q1size].hash = sets1[i].xormask; - Q1size++; - } - } - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets2[i].count == 1) { - Q2[Q2size].index = i; - Q2[Q2size].hash = sets2[i].xormask; - Q2size++; - } - } - - size_t stack_size = 0; - while (Q0size + Q1size + Q2size > 0) { - while (Q0size > 0) { - xor_keyindex_t keyindex = Q0[--Q0size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer0, sets0, index, Q0, &Q0size); - - if (sets0[index].count == 0) - continue; // not actually possible after the initial scan. 
- //sets0[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h1 = xor8_get_h1(hash, filter); - uint32_t h2 = xor8_get_h2(hash, filter); - - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h1, hash, &buffer1, sets1, - Q1, &Q1size); - xor_buffered_decrement_counter(h2, hash, &buffer2, - sets2, Q2, &Q2size); - } - if (Q1size == 0) - xor_flushone_decrement_buffer(&buffer1, sets1, Q1, &Q1size); - - while (Q1size > 0) { - xor_keyindex_t keyindex = Q1[--Q1size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer1, sets1, index, Q1, &Q1size); - - if (sets1[index].count == 0) - continue; - //sets1[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h0 = xor8_get_h0(hash, filter); - uint32_t h2 = xor8_get_h2(hash, filter); - keyindex.index += blockLength; - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h0, hash, &buffer0, sets0, Q0, &Q0size); - xor_buffered_decrement_counter(h2, hash, &buffer2, - sets2, Q2, &Q2size); - } - if (Q2size == 0) - xor_flushone_decrement_buffer(&buffer2, sets2, Q2, &Q2size); - while (Q2size > 0) { - xor_keyindex_t keyindex = Q2[--Q2size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer2, sets2, index, Q2, &Q2size); - if (sets2[index].count == 0) - continue; - - //sets2[index].count = 0; - uint64_t hash = keyindex.hash; - - uint32_t h0 = xor8_get_h0(hash, filter); - uint32_t h1 = xor8_get_h1(hash, filter); - keyindex.index += 2 * blockLength; - - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h0, hash, &buffer0, sets0, Q0, &Q0size); - xor_buffered_decrement_counter(h1, hash, &buffer1, sets1, - Q1, &Q1size); - } - if (Q0size == 0) - xor_flushone_decrement_buffer(&buffer0, sets0, Q0, &Q0size); - if ((Q0size + Q1size + Q2size == 0) && (stack_size < size)) { - // this should basically never happen - xor_flush_decrement_buffer(&buffer0, sets0, Q0, &Q0size); - xor_flush_decrement_buffer(&buffer1, sets1, 
Q1, &Q1size); - xor_flush_decrement_buffer(&buffer2, sets2, Q2, &Q2size); - } - } - if (stack_size == size) { - // success - break; - } - - filter->seed = xor_rng_splitmix64(&rng_counter); - } - uint8_t * fingerprints0 = filter->fingerprints; - uint8_t * fingerprints1 = filter->fingerprints + blockLength; - uint8_t * fingerprints2 = filter->fingerprints + 2 * blockLength; - - size_t stack_size = size; - while (stack_size > 0) { - xor_keyindex_t ki = stack[--stack_size]; - uint64_t val = xor_fingerprint(ki.hash); - if(ki.index < blockLength) { - val ^= fingerprints1[xor8_get_h1(ki.hash,filter)] ^ fingerprints2[xor8_get_h2(ki.hash,filter)]; - } else if(ki.index < 2 * blockLength) { - val ^= fingerprints0[xor8_get_h0(ki.hash,filter)] ^ fingerprints2[xor8_get_h2(ki.hash,filter)]; - } else { - val ^= fingerprints0[xor8_get_h0(ki.hash,filter)] ^ fingerprints1[xor8_get_h1(ki.hash,filter)]; - } - filter->fingerprints[ki.index] = val; - } - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - - free(sets); - free(Q); - free(stack); - return true; -} - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling xor8_allocate(size,filter) -// before. For best performance, the caller should ensure that there are not too -// many duplicated keys. 
-static inline bool xor8_populate(uint64_t *keys, uint32_t size, xor8_t *filter) { - if(size == 0) { return false; } - uint64_t rng_counter = 1; - filter->seed = xor_rng_splitmix64(&rng_counter); - size_t arrayLength = filter->blockLength * 3; // size of the backing array - size_t blockLength = filter->blockLength; - - xor_xorset_t *sets = - (xor_xorset_t *)malloc(arrayLength * sizeof(xor_xorset_t)); - - xor_keyindex_t *Q = - (xor_keyindex_t *)malloc(arrayLength * sizeof(xor_keyindex_t)); - - xor_keyindex_t *stack = - (xor_keyindex_t *)malloc(size * sizeof(xor_keyindex_t)); - - if ((sets == NULL) || (Q == NULL) || (stack == NULL)) { - free(sets); - free(Q); - free(stack); - return false; - } - xor_xorset_t *sets0 = sets; - xor_xorset_t *sets1 = sets + blockLength; - xor_xorset_t *sets2 = sets + 2 * blockLength; - xor_keyindex_t *Q0 = Q; - xor_keyindex_t *Q1 = Q + blockLength; - xor_keyindex_t *Q2 = Q + 2 * blockLength; - - int iterations = 0; - - while (true) { - iterations ++; - if(iterations == XOR_SORT_ITERATIONS) { - size = xor_sort_and_remove_dup(keys, size); - } - if(iterations > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). 
- free(sets); - free(Q); - free(stack); - return false; - } - - memset(sets, 0, sizeof(xor_xorset_t) * arrayLength); - for (size_t i = 0; i < size; i++) { - uint64_t key = keys[i]; - xor_hashes_t hs = xor8_get_h0_h1_h2(key, filter); - sets0[hs.h0].xormask ^= hs.h; - sets0[hs.h0].count++; - sets1[hs.h1].xormask ^= hs.h; - sets1[hs.h1].count++; - sets2[hs.h2].xormask ^= hs.h; - sets2[hs.h2].count++; - } - // todo: the flush should be sync with the detection that follows - // scan for values with a count of one - size_t Q0size = 0, Q1size = 0, Q2size = 0; - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets0[i].count == 1) { - Q0[Q0size].index = i; - Q0[Q0size].hash = sets0[i].xormask; - Q0size++; - } - } - - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets1[i].count == 1) { - Q1[Q1size].index = i; - Q1[Q1size].hash = sets1[i].xormask; - Q1size++; - } - } - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets2[i].count == 1) { - Q2[Q2size].index = i; - Q2[Q2size].hash = sets2[i].xormask; - Q2size++; - } - } - - size_t stack_size = 0; - while (Q0size + Q1size + Q2size > 0) { - while (Q0size > 0) { - xor_keyindex_t keyindex = Q0[--Q0size]; - size_t index = keyindex.index; - if (sets0[index].count == 0) - continue; // not actually possible after the initial scan. 
- //sets0[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h1 = xor8_get_h1(hash, filter); - uint32_t h2 = xor8_get_h2(hash, filter); - - stack[stack_size] = keyindex; - stack_size++; - sets1[h1].xormask ^= hash; - sets1[h1].count--; - if (sets1[h1].count == 1) { - Q1[Q1size].index = h1; - Q1[Q1size].hash = sets1[h1].xormask; - Q1size++; - } - sets2[h2].xormask ^= hash; - sets2[h2].count--; - if (sets2[h2].count == 1) { - Q2[Q2size].index = h2; - Q2[Q2size].hash = sets2[h2].xormask; - Q2size++; - } - } - while (Q1size > 0) { - xor_keyindex_t keyindex = Q1[--Q1size]; - size_t index = keyindex.index; - if (sets1[index].count == 0) - continue; - //sets1[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h0 = xor8_get_h0(hash, filter); - uint32_t h2 = xor8_get_h2(hash, filter); - keyindex.index += blockLength; - stack[stack_size] = keyindex; - stack_size++; - sets0[h0].xormask ^= hash; - sets0[h0].count--; - if (sets0[h0].count == 1) { - Q0[Q0size].index = h0; - Q0[Q0size].hash = sets0[h0].xormask; - Q0size++; - } - sets2[h2].xormask ^= hash; - sets2[h2].count--; - if (sets2[h2].count == 1) { - Q2[Q2size].index = h2; - Q2[Q2size].hash = sets2[h2].xormask; - Q2size++; - } - } - while (Q2size > 0) { - xor_keyindex_t keyindex = Q2[--Q2size]; - size_t index = keyindex.index; - if (sets2[index].count == 0) - continue; - - //sets2[index].count = 0; - uint64_t hash = keyindex.hash; - - uint32_t h0 = xor8_get_h0(hash, filter); - uint32_t h1 = xor8_get_h1(hash, filter); - keyindex.index += 2 * blockLength; - - stack[stack_size] = keyindex; - stack_size++; - sets0[h0].xormask ^= hash; - sets0[h0].count--; - if (sets0[h0].count == 1) { - Q0[Q0size].index = h0; - Q0[Q0size].hash = sets0[h0].xormask; - Q0size++; - } - sets1[h1].xormask ^= hash; - sets1[h1].count--; - if (sets1[h1].count == 1) { - Q1[Q1size].index = h1; - Q1[Q1size].hash = sets1[h1].xormask; - Q1size++; - } - - } - } - if (stack_size == size) { - // success - break; - } - - filter->seed = 
xor_rng_splitmix64(&rng_counter); - } - uint8_t * fingerprints0 = filter->fingerprints; - uint8_t * fingerprints1 = filter->fingerprints + blockLength; - uint8_t * fingerprints2 = filter->fingerprints + 2 * blockLength; - - size_t stack_size = size; - while (stack_size > 0) { - xor_keyindex_t ki = stack[--stack_size]; - uint64_t val = xor_fingerprint(ki.hash); - if(ki.index < blockLength) { - val ^= fingerprints1[xor8_get_h1(ki.hash,filter)] ^ fingerprints2[xor8_get_h2(ki.hash,filter)]; - } else if(ki.index < 2 * blockLength) { - val ^= fingerprints0[xor8_get_h0(ki.hash,filter)] ^ fingerprints2[xor8_get_h2(ki.hash,filter)]; - } else { - val ^= fingerprints0[xor8_get_h0(ki.hash,filter)] ^ fingerprints1[xor8_get_h1(ki.hash,filter)]; - } - filter->fingerprints[ki.index] = val; - } - - free(sets); - free(Q); - free(stack); - return true; -} - - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling xor16_allocate(size,filter) -// before. For best performance, the caller should ensure that there are not too -// many duplicated keys. 
-static inline bool xor16_buffered_populate(uint64_t *keys, uint32_t size, xor16_t *filter) { - if(size == 0) { return false; } - uint64_t rng_counter = 1; - filter->seed = xor_rng_splitmix64(&rng_counter); - size_t arrayLength = filter->blockLength * 3; // size of the backing array - xor_setbuffer_t buffer0, buffer1, buffer2; - size_t blockLength = filter->blockLength; - bool ok0 = xor_init_buffer(&buffer0, blockLength); - bool ok1 = xor_init_buffer(&buffer1, blockLength); - bool ok2 = xor_init_buffer(&buffer2, blockLength); - if (!ok0 || !ok1 || !ok2) { - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - return false; - } - - xor_xorset_t *sets = - (xor_xorset_t *)malloc(arrayLength * sizeof(xor_xorset_t)); - - xor_keyindex_t *Q = - (xor_keyindex_t *)malloc(arrayLength * sizeof(xor_keyindex_t)); - - xor_keyindex_t *stack = - (xor_keyindex_t *)malloc(size * sizeof(xor_keyindex_t)); - - if ((sets == NULL) || (Q == NULL) || (stack == NULL)) { - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - free(sets); - free(Q); - free(stack); - return false; - } - xor_xorset_t *sets0 = sets; - xor_xorset_t *sets1 = sets + blockLength; - xor_xorset_t *sets2 = sets + 2 * blockLength; - xor_keyindex_t *Q0 = Q; - xor_keyindex_t *Q1 = Q + blockLength; - xor_keyindex_t *Q2 = Q + 2 * blockLength; - - int iterations = 0; - - while (true) { - iterations ++; - if(iterations == XOR_SORT_ITERATIONS) { - size = xor_sort_and_remove_dup(keys, size); - } - if(iterations > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system)é - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - free(sets); - free(Q); - free(stack); - return false; - } - - memset(sets, 0, sizeof(xor_xorset_t) * arrayLength); - for (size_t i = 0; i < size; i++) { - uint64_t key = keys[i]; - xor_hashes_t hs = 
xor16_get_h0_h1_h2(key, filter); - xor_buffered_increment_counter(hs.h0, hs.h, &buffer0, sets0); - xor_buffered_increment_counter(hs.h1, hs.h, &buffer1, - sets1); - xor_buffered_increment_counter(hs.h2, hs.h, &buffer2, - sets2); - } - xor_flush_increment_buffer(&buffer0, sets0); - xor_flush_increment_buffer(&buffer1, sets1); - xor_flush_increment_buffer(&buffer2, sets2); - // todo: the flush should be sync with the detection that follows - // scan for values with a count of one - size_t Q0size = 0, Q1size = 0, Q2size = 0; - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets0[i].count == 1) { - Q0[Q0size].index = i; - Q0[Q0size].hash = sets0[i].xormask; - Q0size++; - } - } - - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets1[i].count == 1) { - Q1[Q1size].index = i; - Q1[Q1size].hash = sets1[i].xormask; - Q1size++; - } - } - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets2[i].count == 1) { - Q2[Q2size].index = i; - Q2[Q2size].hash = sets2[i].xormask; - Q2size++; - } - } - - size_t stack_size = 0; - while (Q0size + Q1size + Q2size > 0) { - while (Q0size > 0) { - xor_keyindex_t keyindex = Q0[--Q0size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer0, sets0, index, Q0, &Q0size); - - if (sets0[index].count == 0) - continue; // not actually possible after the initial scan. 
- //sets0[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h1 = xor16_get_h1(hash, filter); - uint32_t h2 = xor16_get_h2(hash, filter); - - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h1, hash, &buffer1, sets1, - Q1, &Q1size); - xor_buffered_decrement_counter(h2, hash, &buffer2, - sets2, Q2, &Q2size); - } - if (Q1size == 0) - xor_flushone_decrement_buffer(&buffer1, sets1, Q1, &Q1size); - - while (Q1size > 0) { - xor_keyindex_t keyindex = Q1[--Q1size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer1, sets1, index, Q1, &Q1size); - - if (sets1[index].count == 0) - continue; - //sets1[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h0 = xor16_get_h0(hash, filter); - uint32_t h2 = xor16_get_h2(hash, filter); - keyindex.index += blockLength; - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h0, hash, &buffer0, sets0, Q0, &Q0size); - xor_buffered_decrement_counter(h2, hash, &buffer2, - sets2, Q2, &Q2size); - } - if (Q2size == 0) - xor_flushone_decrement_buffer(&buffer2, sets2, Q2, &Q2size); - while (Q2size > 0) { - xor_keyindex_t keyindex = Q2[--Q2size]; - size_t index = keyindex.index; - xor_make_buffer_current(&buffer2, sets2, index, Q2, &Q2size); - if (sets2[index].count == 0) - continue; - - //sets2[index].count = 0; - uint64_t hash = keyindex.hash; - - uint32_t h0 = xor16_get_h0(hash, filter); - uint32_t h1 = xor16_get_h1(hash, filter); - keyindex.index += 2 * blockLength; - - stack[stack_size] = keyindex; - stack_size++; - xor_buffered_decrement_counter(h0, hash, &buffer0, sets0, Q0, &Q0size); - xor_buffered_decrement_counter(h1, hash, &buffer1, sets1, - Q1, &Q1size); - } - if (Q0size == 0) - xor_flushone_decrement_buffer(&buffer0, sets0, Q0, &Q0size); - if ((Q0size + Q1size + Q2size == 0) && (stack_size < size)) { - // this should basically never happen - xor_flush_decrement_buffer(&buffer0, sets0, Q0, &Q0size); - xor_flush_decrement_buffer(&buffer1, 
sets1, Q1, &Q1size); - xor_flush_decrement_buffer(&buffer2, sets2, Q2, &Q2size); - } - } - if (stack_size == size) { - // success - break; - } - - filter->seed = xor_rng_splitmix64(&rng_counter); - } - uint16_t * fingerprints0 = filter->fingerprints; - uint16_t * fingerprints1 = filter->fingerprints + blockLength; - uint16_t * fingerprints2 = filter->fingerprints + 2 * blockLength; - - size_t stack_size = size; - while (stack_size > 0) { - xor_keyindex_t ki = stack[--stack_size]; - uint64_t val = xor_fingerprint(ki.hash); - if(ki.index < blockLength) { - val ^= fingerprints1[xor16_get_h1(ki.hash,filter)] ^ fingerprints2[xor16_get_h2(ki.hash,filter)]; - } else if(ki.index < 2 * blockLength) { - val ^= fingerprints0[xor16_get_h0(ki.hash,filter)] ^ fingerprints2[xor16_get_h2(ki.hash,filter)]; - } else { - val ^= fingerprints0[xor16_get_h0(ki.hash,filter)] ^ fingerprints1[xor16_get_h1(ki.hash,filter)]; - } - filter->fingerprints[ki.index] = val; - } - xor_free_buffer(&buffer0); - xor_free_buffer(&buffer1); - xor_free_buffer(&buffer2); - - free(sets); - free(Q); - free(stack); - return true; -} - - - -// Construct the filter, returns true on success, false on failure. -// The algorithm fails when there is insufficient memory. -// The caller is responsable for calling xor16_allocate(size,filter) -// before. For best performance, the caller should ensure that there are not too -// many duplicated keys. 
-static inline bool xor16_populate(uint64_t *keys, uint32_t size, xor16_t *filter) { - if(size == 0) { return false; } - uint64_t rng_counter = 1; - filter->seed = xor_rng_splitmix64(&rng_counter); - size_t arrayLength = filter->blockLength * 3; // size of the backing array - size_t blockLength = filter->blockLength; - - xor_xorset_t *sets = - (xor_xorset_t *)malloc(arrayLength * sizeof(xor_xorset_t)); - - xor_keyindex_t *Q = - (xor_keyindex_t *)malloc(arrayLength * sizeof(xor_keyindex_t)); - - xor_keyindex_t *stack = - (xor_keyindex_t *)malloc(size * sizeof(xor_keyindex_t)); - - if ((sets == NULL) || (Q == NULL) || (stack == NULL)) { - free(sets); - free(Q); - free(stack); - return false; - } - xor_xorset_t *sets0 = sets; - xor_xorset_t *sets1 = sets + blockLength; - xor_xorset_t *sets2 = sets + 2 * blockLength; - - xor_keyindex_t *Q0 = Q; - xor_keyindex_t *Q1 = Q + blockLength; - xor_keyindex_t *Q2 = Q + 2 * blockLength; - - int iterations = 0; - - while (true) { - iterations ++; - if(iterations == XOR_SORT_ITERATIONS) { - size = xor_sort_and_remove_dup(keys, size); - } - if(iterations > XOR_MAX_ITERATIONS) { - // The probability of this happening is lower than the - // the cosmic-ray probability (i.e., a cosmic ray corrupts your system). 
- free(sets); - free(Q); - free(stack); - return false; - } - - memset(sets, 0, sizeof(xor_xorset_t) * arrayLength); - for (size_t i = 0; i < size; i++) { - uint64_t key = keys[i]; - xor_hashes_t hs = xor16_get_h0_h1_h2(key, filter); - sets0[hs.h0].xormask ^= hs.h; - sets0[hs.h0].count++; - sets1[hs.h1].xormask ^= hs.h; - sets1[hs.h1].count++; - sets2[hs.h2].xormask ^= hs.h; - sets2[hs.h2].count++; - } - // todo: the flush should be sync with the detection that follows - // scan for values with a count of one - size_t Q0size = 0, Q1size = 0, Q2size = 0; - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets0[i].count == 1) { - Q0[Q0size].index = i; - Q0[Q0size].hash = sets0[i].xormask; - Q0size++; - } - } - - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets1[i].count == 1) { - Q1[Q1size].index = i; - Q1[Q1size].hash = sets1[i].xormask; - Q1size++; - } - } - for (size_t i = 0; i < filter->blockLength; i++) { - if (sets2[i].count == 1) { - Q2[Q2size].index = i; - Q2[Q2size].hash = sets2[i].xormask; - Q2size++; - } - } - - size_t stack_size = 0; - while (Q0size + Q1size + Q2size > 0) { - while (Q0size > 0) { - xor_keyindex_t keyindex = Q0[--Q0size]; - size_t index = keyindex.index; - if (sets0[index].count == 0) - continue; // not actually possible after the initial scan. 
- //sets0[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h1 = xor16_get_h1(hash, filter); - uint32_t h2 = xor16_get_h2(hash, filter); - - stack[stack_size] = keyindex; - stack_size++; - sets1[h1].xormask ^= hash; - sets1[h1].count--; - if (sets1[h1].count == 1) { - Q1[Q1size].index = h1; - Q1[Q1size].hash = sets1[h1].xormask; - Q1size++; - } - sets2[h2].xormask ^= hash; - sets2[h2].count--; - if (sets2[h2].count == 1) { - Q2[Q2size].index = h2; - Q2[Q2size].hash = sets2[h2].xormask; - Q2size++; - } - } - while (Q1size > 0) { - xor_keyindex_t keyindex = Q1[--Q1size]; - size_t index = keyindex.index; - if (sets1[index].count == 0) - continue; - //sets1[index].count = 0; - uint64_t hash = keyindex.hash; - uint32_t h0 = xor16_get_h0(hash, filter); - uint32_t h2 = xor16_get_h2(hash, filter); - keyindex.index += blockLength; - stack[stack_size] = keyindex; - stack_size++; - sets0[h0].xormask ^= hash; - sets0[h0].count--; - if (sets0[h0].count == 1) { - Q0[Q0size].index = h0; - Q0[Q0size].hash = sets0[h0].xormask; - Q0size++; - } - sets2[h2].xormask ^= hash; - sets2[h2].count--; - if (sets2[h2].count == 1) { - Q2[Q2size].index = h2; - Q2[Q2size].hash = sets2[h2].xormask; - Q2size++; - } - } - while (Q2size > 0) { - xor_keyindex_t keyindex = Q2[--Q2size]; - size_t index = keyindex.index; - if (sets2[index].count == 0) - continue; - - //sets2[index].count = 0; - uint64_t hash = keyindex.hash; - - uint32_t h0 = xor16_get_h0(hash, filter); - uint32_t h1 = xor16_get_h1(hash, filter); - keyindex.index += 2 * blockLength; - - stack[stack_size] = keyindex; - stack_size++; - sets0[h0].xormask ^= hash; - sets0[h0].count--; - if (sets0[h0].count == 1) { - Q0[Q0size].index = h0; - Q0[Q0size].hash = sets0[h0].xormask; - Q0size++; - } - sets1[h1].xormask ^= hash; - sets1[h1].count--; - if (sets1[h1].count == 1) { - Q1[Q1size].index = h1; - Q1[Q1size].hash = sets1[h1].xormask; - Q1size++; - } - - } - } - if (stack_size == size) { - // success - break; - } - - filter->seed 
= xor_rng_splitmix64(&rng_counter); - } - uint16_t * fingerprints0 = filter->fingerprints; - uint16_t * fingerprints1 = filter->fingerprints + blockLength; - uint16_t * fingerprints2 = filter->fingerprints + 2 * blockLength; - - size_t stack_size = size; - while (stack_size > 0) { - xor_keyindex_t ki = stack[--stack_size]; - uint64_t val = xor_fingerprint(ki.hash); - if(ki.index < blockLength) { - val ^= fingerprints1[xor16_get_h1(ki.hash,filter)] ^ fingerprints2[xor16_get_h2(ki.hash,filter)]; - } else if(ki.index < 2 * blockLength) { - val ^= fingerprints0[xor16_get_h0(ki.hash,filter)] ^ fingerprints2[xor16_get_h2(ki.hash,filter)]; - } else { - val ^= fingerprints0[xor16_get_h0(ki.hash,filter)] ^ fingerprints1[xor16_get_h1(ki.hash,filter)]; - } - filter->fingerprints[ki.index] = val; - } - - free(sets); - free(Q); - free(stack); - return true; -} - - - -#endif From fb0ed32933fa78db58699a3d1377050589cbfeb3 Mon Sep 17 00:00:00 2001 From: Oliver Schonrock Date: Tue, 3 Dec 2024 08:58:07 +0000 Subject: [PATCH 2/4] tidying up Makfile one dependency per line to make things clearer dependcies for each target made more complete by generating with g++ -M -MF - init submodules for deps from either foreign repo --- Makefile | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9a17efd..036c172 100644 --- a/Makefile +++ b/Makefile @@ -3,15 +3,39 @@ all: build_filter query_filter dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h: git submodule update --init --recursive +dependencies/xor_singleheader/include/binaryfusefilter.h: + git submodule update --init --recursive -query_filter: src/query_filter.cpp src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h - c++ -O3 -o query_filter src/query_filter.cpp -Wall -std=c++11 -Idependencies/fastfilter_cpp/src -Idependencies +query_filter: src/query_filter.cpp \ + src/hexutil.h \ + src/mappeablebloomfilter.h \ + src/util.h \ + src/sha.h \ + 
dependencies/xor_singleheader/include/binaryfusefilter.h \ + dependencies/xor_singleheader/include/xorfilter.h + c++ -O3 -o query_filter src/query_filter.cpp -Wall -std=c++11 -Idependencies -build_filter: src/build_filter.cpp dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h dependencies/fastfilter_cpp/src/xorfilter/xorfilter_plus.h src/hexutil.h dependencies/xor_singleheader/include/xorfilter.h +build_filter: src/build_filter.cpp \ + src/hexutil.h \ + src/mappeablebloomfilter.h \ + src/util.h \ + dependencies/fastfilter_cpp/src/bloom/bloom.h \ + dependencies/fastfilter_cpp/src/hashutil.h \ + dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h \ + dependencies/xor_singleheader/include/binaryfusefilter.h \ + dependencies/xor_singleheader/include/xorfilter.h c++ -O3 -o build_filter src/build_filter.cpp -std=c++11 -Wall -Idependencies/fastfilter_cpp/src -Idependencies test: build_filter query_filter - ./build_filter -V -f xor8 -o filter.bin sample.txt && ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && ./build_filter -V -f binaryfuse8 -o filter.bin sample.txt && ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && ./build_filter -V -f binaryfuse16 -o filter.bin sample.txt && ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && ./build_filter -V -f bloom12 -o filter.bin sample.txt && ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && echo "SUCCESS" || (echo "Failure. 
There is a bug."| exit -1) + ./build_filter -V -f xor8 -o filter.bin sample.txt && \ + ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && \ + ./build_filter -V -f binaryfuse8 -o filter.bin sample.txt && \ + ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && \ + ./build_filter -V -f binaryfuse16 -o filter.bin sample.txt && \ + ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && \ + ./build_filter -V -f bloom12 -o filter.bin sample.txt && \ + ./query_filter filter.bin 7C4A8D09CA3762AF | grep "Probably in the set" && \ + echo "SUCCESS" || (echo "Failure. There is a bug."| exit -1) clean: rm -f build_filter query_filter From 5cdb78103d0c525f84049cc4e2c8f06e823a8d13 Mon Sep 17 00:00:00 2001 From: Oliver Schonrock Date: Tue, 3 Dec 2024 09:05:27 +0000 Subject: [PATCH 3/4] removing unused #includes which also means we are now only including one xorfilter.h from one repo change Makefile to reflect this --- Makefile | 1 - src/build_filter.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/Makefile b/Makefile index 036c172..9394373 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,6 @@ build_filter: src/build_filter.cpp \ src/util.h \ dependencies/fastfilter_cpp/src/bloom/bloom.h \ dependencies/fastfilter_cpp/src/hashutil.h \ - dependencies/fastfilter_cpp/src/xorfilter/xorfilter.h \ dependencies/xor_singleheader/include/binaryfusefilter.h \ dependencies/xor_singleheader/include/xorfilter.h c++ -O3 -o build_filter src/build_filter.cpp -std=c++11 -Wall -Idependencies/fastfilter_cpp/src -Idependencies diff --git a/src/build_filter.cpp b/src/build_filter.cpp index 4a26e88..bef630f 100644 --- a/src/build_filter.cpp +++ b/src/build_filter.cpp @@ -1,8 +1,6 @@ #include #include -#include #include -#include #include #include #include @@ -11,15 +9,12 @@ #include "hexutil.h" #include "xor_singleheader/include/binaryfusefilter.h" #include "xor_singleheader/include/xorfilter.h" -#include "xorfilter/xorfilter.h" 
#include "mappeablebloomfilter.h" static void printusage(char *command) { printf(" Try %s -f binaryfuse8 -o filter.bin mydatabase \n", command); - ; printf("The supported filters are xor8, binaryfuse8, binaryfuse16 and bloom12.\n"); - printf("The -V flag verifies the resulting filter.\n"); } From a9630fa29873f54ec22f22593422912a766250e7 Mon Sep 17 00:00:00 2001 From: Oliver Schonrock Date: Tue, 3 Dec 2024 09:08:27 +0000 Subject: [PATCH 4/4] also remove unused includes from query_filter.cpp no impact on Makefile here --- src/query_filter.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/query_filter.cpp b/src/query_filter.cpp index 19e7f54..8a8dc0e 100644 --- a/src/query_filter.cpp +++ b/src/query_filter.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include #include #include #include