diff --git a/CMakeLists.txt b/CMakeLists.txt index e3505c19..056bb701 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,7 @@ add_subdirectory(quantiles) add_subdirectory(count) add_subdirectory(density) add_subdirectory(tdigest) +add_subdirectory(filters) if (WITH_PYTHON) add_subdirectory(python) diff --git a/filters/CMakeLists.txt b/filters/CMakeLists.txt new file mode 100644 index 00000000..cd4b92d0 --- /dev/null +++ b/filters/CMakeLists.txt @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# Build definition for the header-only "filters" module (quotient filter).
#
# NOTE(review): both generator expressions in target_include_directories were
# garbled in this copy of the patch ("$" and "$/include>"). They are restored to
# the BUILD_INTERFACE / INSTALL_INTERFACE form below -- confirm against the
# sibling modules (quantiles, count, density, tdigest).

# header-only library: nothing to compile, only usage requirements
add_library(filters INTERFACE)

add_library(${PROJECT_NAME}::FILTERS ALIAS filters)

if (BUILD_TESTS)
  add_subdirectory(test)
endif()

target_include_directories(filters
  INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)

target_link_libraries(filters INTERFACE common)

install(TARGETS filters
  EXPORT ${PROJECT_NAME}
)

install(FILES
    include/quotient_filter.hpp
    include/quotient_filter_impl.hpp
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")

# ---- next file in this patch ----
# diff --git a/filters/include/quotient_filter.hpp b/filters/include/quotient_filter.hpp
# new file mode 100755
# index 00000000..532e1f90
# --- /dev/null
# +++ b/filters/include/quotient_filter.hpp
# @@ -0,0 +1,167 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
+ */ + +#ifndef QUOTIENT_FILTER_HPP_ +#define QUOTIENT_FILTER_HPP_ + +#include +#include + +#include "common_defs.hpp" + +namespace datasketches { + +// forward declarations +template class quotient_filter_alloc; + +/// Quotient filter alias with default allocator +using quotient_filter = quotient_filter_alloc>; + +template +class quotient_filter_alloc { +public: + using vector_bytes = std::vector::template rebind_alloc>; + + static constexpr float DEFAULT_LOAD_FACTOR = 0.8; + + /** + * @param lg_q + * @param num_fingerprint_bits length of fingerprint in bits + * @param load_factor threshold for the ratio of the number of entries to the number of slots for expansion + * @param allocator for use by this sketch to allocate memory + */ + explicit quotient_filter_alloc(uint8_t lg_q, uint8_t num_fingerprint_bits, float load_factor = DEFAULT_LOAD_FACTOR, const Allocator& allocator = Allocator()); + + /** + * Update this filter with given unsigned 64-bit integer. + * @param value uint64_t to update the filter with + * @return true if the filter was updated + */ + bool update(uint64_t value); + + /** + * Update this filter with given data of any type. + * This is a "universal" update that covers all cases above, + * but may produce different hashes. + * Be very careful to hash input values consistently using the same approach + * both over time and on different platforms + * and while passing filters between different languages. + * For instance, for signed 32-bit values call update(int32_t) method above, + * which does widening conversion to int64_t, if compatibility with Java is expected + * @param data pointer to the data + * @param length of the data in bytes + * @return true if the filter was updated + */ + bool update(const void* data, size_t length); + + /** + * Queries the filter with the given unsigned 64-bit integer and returns whether + * the value might have been seen previously. 
The filter's expected + * False Positive Probability determines the chances of a true result being + * a false positive. False negatives are never possible. + * @param value uint64_t with which to query the filter + * @return The result of querying the filter with the given value + */ + bool query(uint64_t value) const; + + /** + * Queries the filter with given data of any type. + * This is a "universal" query that covers all cases above, + * but may produce different hashes. + * Be very careful to hash input values consistently using the same approach + * both over time and on different platforms + * and while passing filters between different languages. + * For instance, for signed 32-bit values call update(int32_t) method above, + * which does widening conversion to int64_t, if compatibility with Java is expected + * @param data pointer to the data + * @param length of the data in bytes + * @return The result of querying the filter with the given value + */ + bool query(const void* data, size_t length) const; + + + void merge(const quotient_filter_alloc& other); + + size_t get_num_entries() const; + + uint8_t get_lg_q() const; + + uint8_t get_num_bits_per_entry() const; + + uint8_t get_num_bits_in_value() const; + + uint8_t get_num_expansions() const; + + /** + * Returns an instance of the allocator for this filter. 
+ * @return allocator + */ + Allocator get_allocator() const; + + /** + * Provides a human-readable summary of this filter as a string + * @param print_entries if true include the list of entries + * @return filter summary as a string + */ + string to_string(bool print_entries = false) const; + + void serialize(std::ostream& os) const; + +private: + Allocator allocator_; + uint8_t lg_q_; + uint8_t num_fingerprint_bits_; + uint8_t num_expansions_; + float load_factor_; + size_t num_entries_; + vector_bytes bytes_; + + inline size_t get_q() const; + inline size_t get_slot_mask() const; + inline uint64_t get_value_mask() const; + + inline size_t quotient_from_hash(uint64_t hash) const; + inline uint64_t value_from_hash(uint64_t hash) const; + + inline bool get_bit(size_t bit_index) const; + inline bool get_is_occupied(size_t slot) const; + inline bool get_is_continuation(size_t slot) const; + inline bool get_is_shifted(size_t slot) const; + inline bool is_slot_empty(size_t slot) const; + inline uint64_t get_value(size_t slot) const; + inline void set_bit(size_t bit_index, bool state); + inline void set_is_occupied(size_t slot, bool state); + inline void set_is_continuation(size_t slot, bool state); + inline void set_is_shifted(size_t slot, bool state); + inline void set_value(size_t slot, uint64_t value); + + size_t find_run_start(size_t quotient) const; + std::pair find_in_run(size_t run_start, uint64_t value) const; + + bool insert(size_t quotient, uint64_t value); + void insert_and_shift(size_t quotient, size_t slot, uint64_t value, bool is_new_run, bool is_run_start); + + void expand(); +}; + +} /* namespace datasketches */ + +#include "quotient_filter_impl.hpp" + +#endif diff --git a/filters/include/quotient_filter_impl.hpp b/filters/include/quotient_filter_impl.hpp new file mode 100755 index 00000000..887c9e3a --- /dev/null +++ b/filters/include/quotient_filter_impl.hpp @@ -0,0 +1,447 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef QUOTIENT_FILTER_IMPL_HPP_ +#define QUOTIENT_FILTER_IMPL_HPP_ + +#include +#include +#include + +#include "MurmurHash3.h" + +namespace datasketches { + +template +T u64_to_hold_bits(T bits) { + static_assert(std::is_integral::value, "integral type expected"); + return (bits >> 6) + ((bits & 0x3f) > 0); +} + +static inline void put_bits(uint64_t value, uint8_t bits, uint8_t* ptr, uint8_t offset) { + if (offset > 0) { + const uint8_t chunk = 8 - offset; + if (bits < chunk) { + const uint8_t mask = ((1 << bits) - 1) << offset; + *ptr &= ~mask; + *ptr |= (value << offset) & mask; + return; + } + const uint8_t mask = ((1 << chunk) - 1) << offset; + *ptr &= ~mask; + *ptr++ |= (value << offset) & mask; + bits -= chunk; + value >>= chunk; + } + while (bits >= 8) { + *ptr++ = value; + bits -= 8; + value >>= 8; + } + if (bits > 0) { + const uint8_t mask = (1 << bits) - 1; + *ptr &= ~mask; + *ptr |= value & mask; + } +} + +static inline uint64_t get_bits(uint8_t bits, const uint8_t* ptr, uint8_t offset) { + const uint8_t avail_bits = 8 - offset; + const uint8_t chunk_bits = std::min(avail_bits, bits); + const uint8_t mask = ((1 << chunk_bits) - 1); + uint64_t value = 0; + value = (*ptr >> offset) & mask; + ptr += avail_bits == 
chunk_bits; + offset = chunk_bits; + bits -= chunk_bits; + while (bits >= 8) { + value |= static_cast(*ptr++) << offset; + bits -= 8; + offset += 8; + } + if (bits > 0) { + const uint8_t mask = (1 << bits) - 1; + value |= static_cast(*ptr & mask) << offset; + } + return value; +} + +template +quotient_filter_alloc::quotient_filter_alloc(uint8_t lg_q, uint8_t num_fingerprint_bits, float load_factor, const A& allocator): +allocator_(allocator), +lg_q_(lg_q), +num_fingerprint_bits_(num_fingerprint_bits), +num_expansions_(0), +load_factor_(load_factor), +num_entries_(0), +bytes_(allocator) +{ + // check input + // allocate multiples of 8 bytes to match Java + bytes_.resize(u64_to_hold_bits(get_q() * get_num_bits_per_entry()) * sizeof(uint64_t)); +} + +template +bool quotient_filter_alloc::update(uint64_t value) { + return update(&value, sizeof(value)); +} + +template +bool quotient_filter_alloc::update(const void* data, size_t length) { + HashState hashes; + MurmurHash3_x64_128(data, length, DEFAULT_SEED, hashes); + const size_t quotient = quotient_from_hash(hashes.h1); + const uint64_t remainder = value_from_hash(hashes.h1); + return insert(quotient, remainder); +} + +template +bool quotient_filter_alloc::insert(size_t quotient, uint64_t value) { + const size_t run_start = find_run_start(quotient); + if (!get_is_occupied(quotient)) { + insert_and_shift(quotient, run_start, value, true, true); + return true; + } + const auto pair = find_in_run(run_start, value); + if (pair.second) return false; + insert_and_shift(quotient, pair.first, value, false, pair.first == run_start); + return true; +} + +template +bool quotient_filter_alloc::query(uint64_t value) const { + return query(&value, sizeof(value)); +} + +template +bool quotient_filter_alloc::query(const void* data, size_t length) const { + HashState hashes; + MurmurHash3_x64_128(data, length, DEFAULT_SEED, hashes); + const size_t quotient = quotient_from_hash(hashes.h1); + if (!get_is_occupied(quotient)) return false; 
+ const size_t run_start = find_run_start(quotient); + const uint64_t remainder = value_from_hash(hashes.h1); + const auto pair = find_in_run(run_start, remainder); + return pair.second; +} + +template +void quotient_filter_alloc::merge(const quotient_filter_alloc& other) { + if (lg_q_ + num_fingerprint_bits_ != other.lg_q_ + other.num_fingerprint_bits_) { + throw std::invalid_argument("incompatible sketches in merge"); + } + // find cluster start + size_t i = 0; + if (!other.is_slot_empty(i)) while (other.get_is_shifted(i)) i = (i - 1) & other.get_slot_mask(); + + std::queue fifo; + size_t count = 0; + while (count < other.num_entries_) { + if (!other.is_slot_empty(i)) { + if (other.get_is_occupied(i)) fifo.push(i); + const size_t quotient = fifo.front(); + const uint64_t value = other.get_value(i); + const uint64_t hash = quotient << other.num_fingerprint_bits_ | value; + insert(quotient_from_hash(hash), value_from_hash(hash)); + ++count; + } + i = (i + 1) & other.get_slot_mask(); + if (!fifo.empty() && !other.get_is_continuation(i)) fifo.pop(); + } +} + +template +size_t quotient_filter_alloc::get_q() const { + return static_cast(1) << get_lg_q(); +} + +template +size_t quotient_filter_alloc::get_slot_mask() const { + return get_q() - 1; +} + +template +uint64_t quotient_filter_alloc::get_value_mask() const { + return (static_cast(1) << get_num_bits_in_value()) - 1; +} + +template +size_t quotient_filter_alloc::quotient_from_hash(uint64_t hash) const { + return (hash >> get_num_bits_in_value()) & get_slot_mask(); +} + +template +uint64_t quotient_filter_alloc::value_from_hash(uint64_t hash) const { + return hash & get_value_mask(); +} + +template +size_t quotient_filter_alloc::get_num_entries() const { + return num_entries_; +} + +template +uint8_t quotient_filter_alloc::get_lg_q() const { + return lg_q_; +} + +template +uint8_t quotient_filter_alloc::get_num_bits_per_entry() const { + return num_fingerprint_bits_ + 3; +} + +template +uint8_t 
quotient_filter_alloc::get_num_bits_in_value() const { + return num_fingerprint_bits_; +} + +template +uint8_t quotient_filter_alloc::get_num_expansions() const { + return num_expansions_; +} + +template +A quotient_filter_alloc::get_allocator() const { + return allocator_; +} + +template +string quotient_filter_alloc::to_string(bool print_entries) const { + // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. + // The stream does not support passing an allocator instance, and alternatives are complicated. + std::ostringstream os; + os << "### Quotient filter summary:" << std::endl; + os << " LgQ : " << std::to_string(lg_q_) << std::endl; + os << " Fingerprint bits : " << std::to_string(num_fingerprint_bits_) << std::endl; + os << " Load factor : " << std::to_string(load_factor_) << std::endl; + os << " Num expansions : " << std::to_string(num_expansions_) << std::endl; + os << " Num entries : " << num_entries_ << std::endl; + os << "### End filter summary" << std::endl; + + if (print_entries) { + os << "### Quotient filter entries:" << std::endl; + for (size_t slot = 0; slot < get_q(); ++slot) { + os << slot << ": "; + os << (get_is_occupied(slot) ? "1" : "0"); + os << (get_is_continuation(slot) ? "1" : "0"); + os << (get_is_shifted(slot) ? "1" : "0"); + os << " "; + os << std::hex << get_value(slot) << std::dec; + os << std::endl; + } + +// for (size_t bit = 0; bit < get_q() * bits_per_entry_; ++bit) { +// size_t remainder = bit % bits_per_entry_; +// if (remainder == 0) { +// size_t slot = bit / bits_per_entry_; +// os << slot << ": "; +// } +// if (remainder == 3) os << " "; +// os << (get_bit(bit) ? 
"1" : "0"); +// if (remainder == bits_per_entry_ - 1) os << "\n"; +// } + + os << "### End filter entries" << std::endl; + } + return string(os.str().c_str(), get_allocator()); +} + +template +size_t quotient_filter_alloc::find_run_start(size_t slot) const { + size_t num_runs_to_skip = 0; + while (get_is_shifted(slot)) { + slot = (slot - 1) & get_slot_mask(); + if (get_is_occupied(slot)) ++num_runs_to_skip; + } + while (num_runs_to_skip > 0) { + slot = (slot + 1) & get_slot_mask(); + if (!get_is_continuation(slot)) --num_runs_to_skip; + } + return slot; +} + +template +std::pair quotient_filter_alloc::find_in_run(size_t slot, uint64_t value) const { + do { + const uint64_t value_from_entry = get_value(slot); + if (value_from_entry >= value) return std::make_pair(slot, value_from_entry == value); + slot = (slot + 1) & get_slot_mask(); + } while (get_is_continuation(slot)); + return std::make_pair(slot, false); +} + +template +void quotient_filter_alloc::insert_and_shift(size_t quotient, size_t slot, uint64_t value, bool is_new_run, bool is_run_start) { +// std::cout << "insert " << quotient << ":" << std::hex << value << std::dec << " at " << slot << " as " << (is_new_run ? 
"new run\n" : "existing run\n"); + + // in the first shifted entry set is_continuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + bool force_continuation = !is_new_run && is_run_start; + + // prepare flags for the current slot + bool is_continuation = !is_run_start; + bool is_shifted = slot != quotient; + + // remember the existing entry from the current slot to be shifted to the next slot + // is_occupied flag belongs to the slot, therefore it is never shifted + // is_shifted flag is always true for all shifted entries, no need to remember it + uint64_t existing_value = get_value(slot); + bool existing_is_continuation = get_is_continuation(slot); + + while (!is_slot_empty(slot)) { + // set the current slot + set_value(slot, value); + set_is_continuation(slot, is_continuation); + set_is_shifted(slot, is_shifted); + + // prepare values for the next slot + value = existing_value; + is_continuation = existing_is_continuation | force_continuation; + is_shifted = true; + + slot = (slot + 1) & get_slot_mask(); + + // remember the existing entry to be shifted + existing_value = get_value(slot); + existing_is_continuation = get_is_continuation(slot); + + force_continuation = false; // this is needed for the first shift only + } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming value or the last shifted one + set_value(slot, value); + set_is_continuation(slot, is_continuation); + set_is_shifted(slot, is_shifted); + + if (is_new_run) set_is_occupied(quotient, true); + ++num_entries_; + if (num_entries_ == static_cast(get_q() * load_factor_)) expand(); +} + +template +void quotient_filter_alloc::expand() { + if (get_num_bits_in_value() < 2) throw std::logic_error("for expansion value must have at least 2 bits"); + quotient_filter_alloc other(lg_q_ + 1, num_fingerprint_bits_ - 1, load_factor_, allocator_); + + // find cluster start + size_t i = 0; + if 
(!is_slot_empty(i)) while (get_is_shifted(i)) i = (i - 1) & get_slot_mask(); + + std::queue fifo; + size_t count = 0; + while (count < num_entries_) { + if (!is_slot_empty(i)) { + if (get_is_occupied(i)) fifo.push(i); + const uint64_t value = get_value(i); + const size_t new_quotient = (fifo.front() << 1) | (value >> other.get_num_bits_in_value()); + other.insert(new_quotient, value & other.get_value_mask()); + ++count; + } + i = (i + 1) & get_slot_mask(); + if (!fifo.empty() && !get_is_continuation(i)) fifo.pop(); + } + std::swap(*this, other); + num_expansions_ = other.num_expansions_ + 1; +} + +template +bool quotient_filter_alloc::get_bit(size_t bit_index) const { + const size_t byte_offset = bit_index >> 3; + const uint8_t bit_offset = bit_index & 7; + const uint8_t* ptr = bytes_.data() + byte_offset; + return *ptr & (1 << bit_offset); +} + +template +bool quotient_filter_alloc::get_is_occupied(size_t slot) const { + return get_bit(slot * get_num_bits_per_entry()); +} + +template +bool quotient_filter_alloc::get_is_continuation(size_t slot) const { + return get_bit(slot * get_num_bits_per_entry() + 1); +} + +template +bool quotient_filter_alloc::get_is_shifted(size_t slot) const { + return get_bit(slot * get_num_bits_per_entry() + 2); +} + +template +bool quotient_filter_alloc::is_slot_empty(size_t slot) const { + return !get_is_occupied(slot) && !get_is_continuation(slot) && !get_is_shifted(slot); +} + +template +uint64_t quotient_filter_alloc::get_value(size_t slot) const { + const size_t bits = slot * get_num_bits_per_entry() + 3; + const size_t byte_offset = bits >> 3; + const uint8_t bit_offset = bits & 7; + const uint8_t* ptr = bytes_.data() + byte_offset; + return get_bits(get_num_bits_in_value(), ptr, bit_offset); +} + +template +void quotient_filter_alloc::set_bit(size_t bit_index, bool state) { + const size_t byte_offset = bit_index >> 3; + const uint8_t bit_offset = bit_index & 7; + uint8_t* ptr = bytes_.data() + byte_offset; + if (state) { + *ptr 
|= 1 << bit_offset; + } else { + *ptr &= ~(1 << bit_offset); + } +} + +template +void quotient_filter_alloc::set_is_occupied(size_t slot, bool state) { + set_bit(slot * get_num_bits_per_entry(), state); +} + +template +void quotient_filter_alloc::set_is_continuation(size_t slot, bool state) { + set_bit(slot * get_num_bits_per_entry() + 1, state); +} + +template +void quotient_filter_alloc::set_is_shifted(size_t slot, bool state) { + set_bit(slot * get_num_bits_per_entry() + 2, state); +} + +template +void quotient_filter_alloc::set_value(size_t slot, uint64_t value) { + const size_t bits = slot * get_num_bits_per_entry() + 3; + const size_t byte_offset = bits >> 3; + const uint8_t bit_offset = bits & 7; + uint8_t* ptr = bytes_.data() + byte_offset; + put_bits(value, get_num_bits_in_value(), ptr, bit_offset); +} + +template +void quotient_filter_alloc::serialize(std::ostream& os) const { + write(os, 0); // placeholders, to be implemented + write(os, 0); + write(os, 0); + write(os, 0); + write(os, bytes_.data(), bytes_.size()); +} + +} /* namespace datasketches */ + +#endif diff --git a/filters/test/CMakeLists.txt b/filters/test/CMakeLists.txt new file mode 100644 index 00000000..b7870d18 --- /dev/null +++ b/filters/test/CMakeLists.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
# See the License for the
# specific language governing permissions and limitations
# under the License.

# Unit tests of the "filters" module: one executable registered with CTest.
add_executable(filters_test)

target_sources(filters_test
  PRIVATE
    quotient_filter_test.cpp
)

target_link_libraries(filters_test filters common_test_lib)

set_target_properties(filters_test PROPERTIES
  CXX_STANDARD_REQUIRED YES
)

add_test(
  NAME filters_test
  COMMAND filters_test
)

# ---- next file in this patch ----
# diff --git a/filters/test/quotient_filter_test.cpp b/filters/test/quotient_filter_test.cpp
# new file mode 100755
# index 00000000..84e3da18
# --- /dev/null
# +++ b/filters/test/quotient_filter_test.cpp
# @@ -0,0 +1,243 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
+ */ + +#include +#include +#include + +#include + +namespace datasketches { + +TEST_CASE("empty", "[quotient_filter]") { + quotient_filter f(10, 6); + REQUIRE(f.get_lg_q() == 10); + REQUIRE(f.get_num_bits_in_value() == 6); + REQUIRE(f.get_num_entries() == 0); +} + +TEST_CASE("one entry", "[quotient_filter]") { + quotient_filter f(4, 6); + REQUIRE_FALSE(f.query(1)); + REQUIRE(f.update(1)); + REQUIRE(f.query(1)); + REQUIRE(f.get_num_entries() == 1); + REQUIRE_FALSE(f.update(1)); +} + +TEST_CASE("several entries", "[quotient_filter]") { + quotient_filter f(5, 9); + f.update(1); + f.update(2); + f.update(3); +// std::cout << f.to_string(true); + REQUIRE(f.query(1)); + REQUIRE(f.query(2)); + REQUIRE(f.query(3)); + REQUIRE(f.get_num_entries() == 3); +} + +TEST_CASE("many entries no expansion 1", "[quotient_filter]") { + quotient_filter f(4, 9); + const size_t n = 11; + for (size_t i = 0; i < n; ++i) f.update(i); +// std::cout << f.to_string(true); + + REQUIRE(f.get_num_expansions() == 0); + REQUIRE(f.get_num_entries() == n); + size_t positives = 0; + for (size_t i = 0; i < n; ++i) { + if (f.query(i)) ++positives; + } + REQUIRE(positives == n); + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives; + REQUIRE(positives < 2); +} + +TEST_CASE("many entries no expansion 2", "[quotient_filter]") { + quotient_filter f(6, 12); + const size_t n = 40; + for (size_t i = 0; i < n; ++i) f.update(i); +// std::cout << f.to_string(true); + REQUIRE(f.get_num_expansions() == 0); + REQUIRE(f.get_num_entries() == n); + size_t positives = 0; + for (size_t i = 0; i < n; ++i) { + if (f.query(i)) ++positives; + } + REQUIRE(positives == n); + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives; + REQUIRE(positives == 0); +} + +TEST_CASE("many more entries no expansion", "[quotient_filter]") { + quotient_filter f(16, 16); + const size_t n = 30000; + for (size_t i = 0; i < n; ++i) f.update(i); +// 
std::cout << f.to_string(true); + REQUIRE(f.get_num_expansions() == 0); + REQUIRE(f.get_num_entries() > n * 0.999); // allow a few hash collisions + + // query the same keys + size_t positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i)) ++positives; + REQUIRE(positives == n); + + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives; + REQUIRE(positives < 2); +} + +TEST_CASE("small expansion", "[quotient_filter]") { + quotient_filter f(5, 12); + const size_t n = 30; + for (size_t i = 0; i < n; ++i) f.update(i); + std::cout << f.to_string(true); + REQUIRE(f.get_num_expansions() == 1); + REQUIRE(f.get_num_entries() == n); + + // query the same keys + size_t positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i)) ++positives; + REQUIRE(positives == n); + + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives; + REQUIRE(positives < 2); +} + +TEST_CASE("expansion", "[quotient_filter]") { + quotient_filter f(16, 16); + const size_t n = 60000; + for (size_t i = 0; i < n; ++i) f.update(i); + std::cout << f.to_string(); +// std::cout << f.to_string(true); + REQUIRE(f.get_num_expansions() == 1); + REQUIRE(f.get_num_entries() > n * 0.999); // allow a few hash collisions + + // query the same keys + size_t positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i)) ++positives; + REQUIRE(positives == n); + + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives; + REQUIRE(positives < 7); +} + +TEST_CASE("merge empty", "[quotient_filter]") { + quotient_filter qf1(4, 3); + quotient_filter qf2(4, 3); + qf1.merge(qf2); + REQUIRE(qf1.get_lg_q() == 4); + REQUIRE(qf1.get_num_bits_in_value() == 3); + REQUIRE(qf1.get_num_entries() == 0); +} + +TEST_CASE("merge", "[quotient_filter]") { + quotient_filter qf1(16, 12); + quotient_filter qf2(16, 12); + const size_t n = 50000; + for (size_t i = 0; i < n / 2; ++i) { + 
qf1.update(i); + qf2.update(i + n / 2); + } + qf1.merge(qf2); + REQUIRE(qf1.get_num_expansions() == 0); + REQUIRE(qf1.get_num_entries() > n * 0.9999); // allow a few hash collisions + + // query the same keys + size_t positives = 0; + for (size_t i = 0; i < n; ++i) if (qf1.query(i)) ++positives; + REQUIRE(positives == n); + + // query novel keys + positives = 0; + for (size_t i = 0; i < n; ++i) if (qf1.query(i + n)) ++positives; + REQUIRE(positives < 6); +} + +TEST_CASE("merge different configuration", "[quotient_filter]") { + quotient_filter qf1(6, 5); + quotient_filter qf2(5, 6); + for (int i = 0; i < 10; ++i) { + qf1.update(i); + qf2.update(i); + } + qf1.merge(qf2); + REQUIRE(qf1.get_num_entries() == 10); +} + +TEST_CASE("merge incompatible", "[quotient_filter]") { + quotient_filter qf1(6, 5); + quotient_filter qf2(6, 6); + REQUIRE_THROWS_AS(qf1.merge(qf2), std::invalid_argument); +} + +TEST_CASE("serialize", "[quotient_filter]") { + quotient_filter f(4, 9); + for (int i = 0; i < 12; ++i) f.update(i); + + std::ofstream os("quotient_filter_4_9_cpp.sk", std::ios::binary); + f.serialize(os); +} + +// inverse golden ratio (0.618... 
of max uint64_t) +static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL; + +TEST_CASE("pack unpack bits") { + for (uint8_t num_bits = 1; num_bits <= 63; ++num_bits) { + int n = 8; + const uint64_t mask = (1ULL << num_bits) - 1; + std::vector input(n, 0); + const uint64_t igolden64 = IGOLDEN64; + uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + for (int i = 0; i < n; ++i) { + input[i] = value & mask; + value += igolden64; + } + std::vector bytes(n * sizeof(uint64_t), 0); + for (int i = 0; i < n; ++i) { + const size_t bit_index = i * num_bits; + const size_t byte_offset = bit_index >> 3; + const uint8_t bit_offset = bit_index & 7; + put_bits(input[i], num_bits, bytes.data() + byte_offset, bit_offset); + } + + std::vector output(n, 0); + for (int i = 0; i < n; ++i) { + const size_t bit_index = i * num_bits; + const size_t byte_offset = bit_index >> 3; + const uint8_t bit_offset = bit_index & 7; + output[i] = get_bits(num_bits, bytes.data() + byte_offset, bit_offset); + } + for (int i = 0; i < n; ++i) { + REQUIRE(input[i] == output[i]); + } + } +} + +} /* namespace datasketches */