From c38f798abcf5d35ccb08aebc7b8d2bc64e07d1f6 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Fri, 10 Jan 2025 11:11:42 -0800 Subject: [PATCH 01/75] release process: setting development target to 5.3.0 --- version.cfg.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.cfg.in b/version.cfg.in index ad2e7b11..5ffad51a 100644 --- a/version.cfg.in +++ b/version.cfg.in @@ -1 +1 @@ -5.2.@DT@.@HHMM@ +5.3.@DT@.@HHMM@ From eb3200e7095c6fc073051dc09634e05fd1b64566 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 20:50:44 -0800 Subject: [PATCH 02/75] code cleanup and alignment with Java --- theta/include/theta_sketch.hpp | 4 +- theta/include/theta_sketch_impl.hpp | 58 ++++++++++++++--------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp index ad6421e7..4aab4b92 100644 --- a/theta/include/theta_sketch.hpp +++ b/theta/include/theta_sketch.hpp @@ -609,9 +609,11 @@ class wrapped_compact_theta_sketch_alloc::const_iterator { uint32_t index_; uint64_t previous_; bool is_block_mode_; - uint8_t buf_i_; uint8_t offset_; uint64_t buffer_[8]; + + inline void unpack1(); + inline void unpack8(); }; } /* namespace datasketches */ diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp index b6a5d7ee..8f7b1e8d 100644 --- a/theta/include/theta_sketch_impl.hpp +++ b/theta/include/theta_sketch_impl.hpp @@ -817,23 +817,15 @@ num_entries_(num_entries), index_(index), previous_(0), is_block_mode_(num_entries_ >= 8), -buf_i_(0), offset_(0) { if (entry_bits == 64) { // no compression ptr_ = reinterpret_cast(ptr) + index; } else if (index < num_entries) { if (is_block_mode_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + unpack8(); } else { - offset_ = 
unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } @@ -844,35 +836,41 @@ auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++() ptr_ = reinterpret_cast(ptr_) + 1; return *this; } - ++index_; - if (index_ < num_entries_) { + if (++index_ < num_entries_) { if (is_block_mode_) { - ++buf_i_; - if (buf_i_ == 8) { - buf_i_ = 0; - if (index_ + 8 < num_entries_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + if ((index_ & 7) == 0) { + if (num_entries_ - index_ >= 8) { + unpack8(); } else { is_block_mode_ = false; - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } else { - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } return *this; } +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack1() { + const uint32_t i = index_ & 7; + offset_ = unpack_bits(buffer_[i], entry_bits_, reinterpret_cast(ptr_), offset_); + buffer_[i] += previous_; + previous_ = buffer_[i]; +} + +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack8() { + unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); + ptr_ = reinterpret_cast(ptr_) + entry_bits_; + for (int i = 0; i < 8; ++i) { + buffer_[i] += previous_; + previous_ = buffer_[i]; + } +} + template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++(int) -> const_iterator { const_iterator tmp(*this); @@ -895,13 +893,13 @@ bool wrapped_compact_theta_sketch_alloc::const_iterator::operator==(c template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator*() const -> reference { if (entry_bits_ == 64) 
return *reinterpret_cast(ptr_); - return buffer_[buf_i_]; + return buffer_[index_ & 7]; } template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator->() const -> pointer { if (entry_bits_ == 64) return reinterpret_cast(ptr_); - return buffer_ + buf_i_; + return buffer_ + (index_ & 7); } } /* namespace datasketches */ From f82217d472f0d122b848cd379b231e7ac8616cf2 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 20:55:40 -0800 Subject: [PATCH 03/75] test equivalence of packing and unpacking single values and blocks --- theta/test/bit_packing_test.cpp | 50 +++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp index b39f8996..0e0cf015 100644 --- a/theta/test/bit_packing_test.cpp +++ b/theta/test/bit_packing_test.cpp @@ -80,4 +80,54 @@ TEST_CASE("pack unpack blocks") { } } +TEST_CASE("pack bits unpack blocks") { + uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + uint8_t offset = 0; + uint8_t* ptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = pack_bits(input[i], bits, ptr, offset); + } + std::vector output(8, 0); + unpack_bits_block8(output.data(), bytes.data(), bits); + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + +TEST_CASE("pack blocks unpack bits") { + uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + 
pack_bits_block8(input.data(), bytes.data(), bits); + std::vector output(8, 0); + uint8_t offset = 0; + const uint8_t* cptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = unpack_bits(output[i], bits, cptr, offset); + } + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + } /* namespace datasketches */ From dea8d481cab8461f981e4edb5ab292936a87abff Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 21:02:34 -0800 Subject: [PATCH 04/75] different starting points for pseudo-random sequences for more coverage --- theta/test/bit_packing_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp index 0e0cf015..0094f9fd 100644 --- a/theta/test/bit_packing_test.cpp +++ b/theta/test/bit_packing_test.cpp @@ -81,7 +81,7 @@ TEST_CASE("pack unpack blocks") { } TEST_CASE("pack bits unpack blocks") { - uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + uint64_t value = 0; // arbitrary starting value for (int m = 0; m < 10000; ++m) { for (uint8_t bits = 1; bits <= 63; ++bits) { const uint64_t mask = (1ULL << bits) - 1; @@ -106,7 +106,7 @@ TEST_CASE("pack bits unpack blocks") { } TEST_CASE("pack blocks unpack bits") { - uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + uint64_t value = 111; // arbitrary starting value for (int m = 0; m < 10000; ++m) { for (uint8_t bits = 1; bits <= 63; ++bits) { const uint64_t mask = (1ULL << bits) - 1; From 27d988f0eda27b36c0afac31a51bf39073ea3e17 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 18 Feb 2025 18:40:10 -0800 Subject: [PATCH 05/75] enable branch protection --- .asf.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index 45d974b1..f15ed263 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -2,3 +2,11 @@ github: homepage: https://datasketches.apache.org ghp_branch: gh-pages ghp_path: /docs + + protected_branches: + 
master: + required_pull_request_reviews: + dismiss_stale_reviews: false + required_approving_review_count: 1 + required_signatures: false + required_conversation_resolution: false From 0a6218ce37b4d4f5c2328d9ba0ad1d2942c7fec9 Mon Sep 17 00:00:00 2001 From: geonove Date: Sun, 25 May 2025 14:41:30 +0200 Subject: [PATCH 06/75] Use REQUIRE_THROWS_WITH to check for error message --- count/test/count_min_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/count/test/count_min_test.cpp b/count/test/count_min_test.cpp index 143be1b8..8b7ae0a7 100644 --- a/count/test/count_min_test.cpp +++ b/count/test/count_min_test.cpp @@ -55,7 +55,7 @@ TEST_CASE("CM init") { TEST_CASE("CM parameter suggestions", "[error parameters]") { // Bucket suggestions - REQUIRE_THROWS(count_min_sketch::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_buckets(-1.0), "Relative error must be at least 0."); REQUIRE(count_min_sketch::suggest_num_buckets(0.2) == 14); REQUIRE(count_min_sketch::suggest_num_buckets(0.1) == 28); REQUIRE(count_min_sketch::suggest_num_buckets(0.05) == 55); @@ -69,8 +69,8 @@ TEST_CASE("CM parameter suggestions", "[error parameters]") { REQUIRE(count_min_sketch(n_hashes, 272).get_relative_error() <= 0.01); // Hash suggestions - REQUIRE_THROWS(count_min_sketch::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ); - REQUIRE_THROWS(count_min_sketch::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." 
); REQUIRE(count_min_sketch::suggest_num_hashes(0.682689492) == 2); // 1 STDDEV REQUIRE(count_min_sketch::suggest_num_hashes(0.954499736) == 4); // 2 STDDEV REQUIRE(count_min_sketch::suggest_num_hashes(0.997300204) == 6); // 3 STDDEV @@ -161,9 +161,9 @@ TEST_CASE("CM merge - reject", "[reject cases]") { std::vector> sketches = {s1, s2, s3}; // Fail cases - REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." ); + REQUIRE_THROWS_WITH(s.merge(s), "Cannot merge a sketch with itself." ); for (count_min_sketch sk : sketches) { - REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." ); + REQUIRE_THROWS_WITH(s.merge(sk), "Incompatible sketch configuration." ); } } From 75edfbb3b59b047bf8cdf7fb5a5d46798ea8bf08 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 27 May 2025 23:42:03 -0700 Subject: [PATCH 07/75] ds-java main branch requires 21 --- .github/workflows/serde_compat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml index 33c31801..084d1385 100644 --- a/.github/workflows/serde_compat.yml +++ b/.github/workflows/serde_compat.yml @@ -21,7 +21,7 @@ jobs: - name: Setup Java uses: actions/setup-java@v4 with: - java-version: '17' + java-version: '21' distribution: 'temurin' - name: Run Java run: cd java && mvn test -P generate-java-files From 82630e554e35d702ec6358b1d1ec5e1f186e7447 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Sat, 7 Jun 2025 11:28:16 +0530 Subject: [PATCH 08/75] Provide get_centroids implementation --- tdigest/include/tdigest.hpp | 5 +++++ tdigest/include/tdigest_impl.hpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index d33084ed..21cf47a2 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -143,6 +143,11 @@ class tdigest { */ uint64_t get_total_weight() const; + /** + * @return centroids + */ + vector_centroid get_centroids() const; + /** 
* Returns an instance of the allocator for this t-Digest. * @return allocator diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 6e3ae1a0..73429f6d 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -85,6 +85,11 @@ uint64_t tdigest::get_total_weight() const { return centroids_weight_ + buffer_.size(); } +template +auto tdigest::get_centroids() const -> vector_centroid{ + return centroids_; +} + template A tdigest::get_allocator() const { return buffer_.get_allocator(); From 866f6d036a7fe91153d01d9648f9127755e5af77 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Mon, 9 Jun 2025 19:54:59 +0530 Subject: [PATCH 09/75] Introduced const_iterator for tdigest --- tdigest/include/tdigest.hpp | 42 ++++++++++++++++++--- tdigest/include/tdigest_impl.hpp | 64 +++++++++++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 14 +++++++ 3 files changed, 110 insertions(+), 10 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 21cf47a2..e821e4c0 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -143,11 +143,6 @@ class tdigest { */ uint64_t get_total_weight() const; - /** - * @return centroids - */ - vector_centroid get_centroids() const; - /** * Returns an instance of the allocator for this t-Digest. * @return allocator @@ -262,6 +257,21 @@ class tdigest { */ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator()); + class const_iterator; + + /** + * Iterator pointing to the first centroid in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first centroid in the sketch + */ + const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end centroid in the sketch. + * It does not point to any centroid, and must not be dereferenced or incremented. 
+ * @return iterator pointing to the past-the-end centroid in the sketch + */ + const_iterator end() const; private: bool reverse_merge_; uint16_t k_; @@ -302,6 +312,28 @@ class tdigest { static inline void check_split_points(const T* values, uint32_t size); }; +template +class tdigest::const_iterator { +public: + using iterator_category = std::input_iterator_tag; + using value_type = std::pair; + using difference_type = void; + using pointer = const return_value_holder; + using reference = const value_type; + + const_iterator(const tdigest &tdigest_, bool is_end); + + const_iterator& operator++(); + const_iterator& operator++(int); + bool operator==(const const_iterator& other) const; + bool operator!=(const const_iterator& other) const; + reference operator*() const; + pointer operator->() const; +private: + friend class tdigest; + uint32_t index_; + vector_centroid centroids_; +}; } /* namespace datasketches */ #include "tdigest_impl.hpp" diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 73429f6d..49fd98a5 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -85,11 +85,6 @@ uint64_t tdigest::get_total_weight() const { return centroids_weight_ + buffer_.size(); } -template -auto tdigest::get_centroids() const -> vector_centroid{ - return centroids_; -} - template A tdigest::get_allocator() const { return buffer_.get_allocator(); @@ -632,6 +627,65 @@ void tdigest::check_split_points(const T* values, uint32_t size) { } } +template +typename tdigest::const_iterator tdigest::begin() const { + return tdigest::const_iterator(*this, false); +} + +template + typename tdigest::const_iterator tdigest::end() const { + return tdigest::const_iterator(*this, true); +} + +template +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): + centroids_() +{ + // Create a copy of the tdigest to generate the centroids after processing the buffered values + tdigest tmp(tdigest_); + 
tmp.compress(); + centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); + + if (is_end) { + index_ = centroids_.size(); + } else { + index_ = 0; + } +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++() { + ++index_; + return *this; +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++(int) { + const_iterator tmp(*this); + operator++(); + return tmp; +} + +template +bool tdigest::const_iterator::operator==(const const_iterator& other) const { + return index_ == other.index_; +} + +template +bool tdigest::const_iterator::operator!=(const const_iterator& other) const { + return !operator==(other); +} + +template +auto tdigest::const_iterator::operator*() const -> reference { + return value_type(centroids_[index_].get_mean(), centroids_[index_].get_weight()); +} + +template +auto tdigest::const_iterator::operator->() const -> pointer { + return **this; +} + } /* namespace datasketches */ #endif // _TDIGEST_IMPL_HPP_ diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index fc3f5d1c..41b00943 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -453,4 +453,18 @@ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") REQUIRE(td.get_rank(n) == 1); } +TEST_CASE("iterate centroids", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; i++) { + td.update(i); + } + + auto centroid_count = 0; + for (const auto ¢roid: td) { + centroid_count++; + } + // Ensure that centroids are retrieved for a case where there is buffered values + REQUIRE(centroid_count == 10); +} + } /* namespace datasketches */ From 27cb7b8940659924cca7434136c537ff930716bc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 9 Jun 2025 13:34:21 -0700 Subject: [PATCH 10/75] Fix typos --- CMakeLists.txt | 2 +- CODE_OF_CONDUCT.md | 2 +- filters/include/bloom_filter.hpp | 2 +- filters/test/bloom_filter_test.cpp | 2 +- 
hll/include/CubicInterpolation-internal.hpp | 2 +- hll/include/HllArray-internal.hpp | 2 +- hll/test/HllSketchTest.cpp | 6 +++--- hll/test/HllUnionTest.cpp | 2 +- quantiles/include/quantiles_sketch.hpp | 4 ++-- quantiles/include/quantiles_sketch_impl.hpp | 2 +- sampling/include/var_opt_sketch_impl.hpp | 6 +++--- sampling/include/var_opt_union_impl.hpp | 2 +- sampling/test/ebpps_allocation_test.cpp | 2 +- sampling/test/ebpps_sketch_test.cpp | 4 ++-- sampling/test/var_opt_allocation_test.cpp | 2 +- 15 files changed, 21 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 056bb701..c469e456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,7 +59,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT CMAKE_CXX_COMPILER_VERSION VERS add_compile_options(-Wimplicit-fallthrough=3) endif() -# Code generation options, to ensure shaerd libraries work and are portable +# Code generation options, to ensure shared libraries work and are portable set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_C_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index bdce6af9..0bdf0791 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,3 @@ # Code of Conduct -We adhere to the Apache Softare Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct). \ No newline at end of file +We adhere to the Apache Software Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct). 
diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index f3c6a031..fd5816a1 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -624,7 +624,7 @@ class bloom_filter_alloc { uint64_t capacity_bits_; uint64_t num_bits_set_; uint8_t* bit_array_; // data backing bit_array_, regardless of ownership - uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr + uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr }; /** diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 41b63e64..d8bcec8e 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -399,7 +399,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf_writable.query(-1.0)); // not good memory management to do this, but because we wrapped the same bytes as both - // read-only adn writable, that update should ahve changed the read-only version, too + // read-only and writable, that update should have changed the read-only version, too REQUIRE(bf_wrap.query(-1.0)); } diff --git a/hll/include/CubicInterpolation-internal.hpp b/hll/include/CubicInterpolation-internal.hpp index c60ddab9..9677b99d 100644 --- a/hll/include/CubicInterpolation-internal.hpp +++ b/hll/include/CubicInterpolation-internal.hpp @@ -191,7 +191,7 @@ double CubicInterpolation::usingXArrAndYStride(const double xArr[], const int const int xArrLenM1 = xArrLen - 1; if ((xArrLen < 4) || (x < xArr[0]) || (x > xArr[xArrLenM1])) { - throw std::logic_error("impossible values during interpolaiton"); + throw std::logic_error("impossible values during interpolation"); } if (x == xArr[xArrLenM1]) { /* corner case */ diff --git a/hll/include/HllArray-internal.hpp b/hll/include/HllArray-internal.hpp index c3c6b3f8..8986f068 100644 --- a/hll/include/HllArray-internal.hpp +++ b/hll/include/HllArray-internal.hpp @@ -173,7 +173,7 @@ 
HllArray* HllArray::newHll(std::istream& is, const A& allocator) { hll_mode mode = HllSketchImpl::extractCurMode(listHeader[hll_constants::MODE_BYTE]); if (mode != HLL) { - throw std::invalid_argument("Calling HLL construtor with non-HLL mode data"); + throw std::invalid_argument("Calling HLL constructor with non-HLL mode data"); } const target_hll_type tgtHllType = HllSketchImpl::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]); diff --git a/hll/test/HllSketchTest.cpp b/hll/test/HllSketchTest.cpp index 1ce21bbe..91197f13 100644 --- a/hll/test/HllSketchTest.cpp +++ b/hll/test/HllSketchTest.cpp @@ -298,7 +298,7 @@ TEST_CASE("hll sketch: deserialize list mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range); REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -325,7 +325,7 @@ TEST_CASE("hll sketch: deserialize set mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range); REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -355,7 +355,7 @@ TEST_CASE("hll sketch: deserialize HLL mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 16420, 0), std::out_of_range); // before aux table REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; 
ss.exceptions(std::ios::failbit | std::ios::badbit); diff --git a/hll/test/HllUnionTest.cpp b/hll/test/HllUnionTest.cpp index 41443786..ceaef12f 100644 --- a/hll/test/HllUnionTest.cpp +++ b/hll/test/HllUnionTest.cpp @@ -58,7 +58,7 @@ static void basicUnion(uint64_t n1, uint64_t n2, hll_sketch result = u.get_result(resultType); - // ensure we check a direct union estimate, without first caling get_result() + // ensure we check a direct union estimate, without first calling get_result() u.reset(); u.update(std::move(h1)); u.update(h2); diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp index ab493c99..b1e2e3c1 100644 --- a/quantiles/include/quantiles_sketch.hpp +++ b/quantiles/include/quantiles_sketch.hpp @@ -537,10 +537,10 @@ class quantiles_sketch { static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator); template - static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); + static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator); template - static std::pair deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); + static std::pair deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator); static void check_k(uint16_t k); static void check_serial_version(uint8_t serial_version); diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp index 558c13c5..50c82c18 100644 --- a/quantiles/include/quantiles_sketch_impl.hpp +++ b/quantiles/include/quantiles_sketch_impl.hpp @@ -581,7 +581,7 @@ auto quantiles_sketch::deserialize_array(const void* bytes, size_t size // serde did not throw, enable destructors 
items.get_deleter().set_destroy(true); - // succesfully read, now put into a Level + // successfully read, now put into a Level Level level(allocator); level.reserve(capacity); level.insert(level.begin(), diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp index 7bf40958..36ee3fc8 100644 --- a/sampling/include/var_opt_sketch_impl.hpp +++ b/sampling/include/var_opt_sketch_impl.hpp @@ -1029,7 +1029,7 @@ void var_opt_sketch::transition_from_warmup() { total_wt_r_ = weights_[k_]; // only one item, known location weights_[k_] = -1.0; - // The two lightest items are ncessarily downsample-able to one item, + // The two lightest items are necessarily downsample-able to one item, // and are therefore a valid initial candidate set grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2); } @@ -1065,7 +1065,7 @@ void var_opt_sketch::restore_towards_leaves(uint32_t slot_in) { while (child <= last_slot) { uint32_t child2 = child + 1; // might also be invalid if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) { - // siwtch to other child if it's both valid and smaller + // switch to other child if it's both valid and smaller child = child2; } @@ -1221,7 +1221,7 @@ uint32_t var_opt_sketch::choose_delete_slot(double wt_cands, uint32_t num_ if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) { return pick_random_slot_in_r(); // keep item in M } else { - return h_; // indext of item in M + return h_; // index of item in M } } else { // general case diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp index 1d252245..d04be0cb 100644 --- a/sampling/include/var_opt_union_impl.hpp +++ b/sampling/include/var_opt_union_impl.hpp @@ -590,7 +590,7 @@ void var_opt_union::migrate_marked_items_by_decreasing_k(var_opt_sketch Date: Tue, 10 Jun 2025 00:28:48 +0530 Subject: [PATCH 11/75] Addressing review comments --- tdigest/include/tdigest.hpp | 8 ++++++-- 
tdigest/include/tdigest_impl.hpp | 24 +++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 3 +++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index e821e4c0..2ad410f5 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -106,6 +106,12 @@ class tdigest { */ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator()); + /** + * Copy constructor + * @param other sketch to be copied + */ + tdigest(const tdigest& other); + /** * Update this t-Digest with the given value * @param value to update the t-Digest with @@ -275,13 +281,11 @@ class tdigest { private: bool reverse_merge_; uint16_t k_; - uint16_t internal_k_; T min_; T max_; size_t centroids_capacity_; vector_centroid centroids_; uint64_t centroids_weight_; - size_t buffer_capacity_; vector_t buffer_; static const size_t BUFFER_MULTIPLIER = 4; diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 49fd98a5..1dba9eb1 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -597,6 +597,24 @@ bool tdigest::is_single_value() const { return get_total_weight() == 1; } +template +tdigest::tdigest(const tdigest& other): + reverse_merge_(other.reverse_merge_), + k_(other.k_), + min_(other.min_), + max_(other.max_), + centroids_capacity_(other.centroids_capacity_), + centroids_(other.centroids_, other.get_allocator()), + centroids_weight_(other.centroids_weight_), + buffer_(other.buffer_, other.get_allocator()) +{ + if (other.k_ < 10) throw std::invalid_argument("k must be at least 10"); + const size_t fudge = other.k_ < 30 ? 
30 : 10; + centroids_capacity_ = 2 * k_ + fudge; + centroids_.reserve(centroids_capacity_); + buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER); +} + template tdigest::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer): reverse_merge_(reverse_merge), @@ -638,11 +656,11 @@ template } template -tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): - centroids_() +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): + centroids_(tdigest_.get_allocator()) { // Create a copy of the tdigest to generate the centroids after processing the buffered values - tdigest tmp(tdigest_); + tdigest tmp(tdigest_); tmp.compress(); centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 41b00943..9f92094d 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -460,11 +460,14 @@ TEST_CASE("iterate centroids", "[tdigest]") { } auto centroid_count = 0; + uint64_t total_weight = 0; for (const auto ¢roid: td) { centroid_count++; + total_weight += centroid.second; } // Ensure that centroids are retrieved for a case where there is buffered values REQUIRE(centroid_count == 10); + REQUIRE(td.get_total_weight() == total_weight); } } /* namespace datasketches */ From faca5d0262173c96c846d0d36d900cc5bfa48b6d Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Wed, 11 Jun 2025 22:47:10 +0530 Subject: [PATCH 12/75] Retaining the default copy constructor --- tdigest/include/tdigest.hpp | 6 ------ tdigest/include/tdigest_impl.hpp | 18 ------------------ 2 files changed, 24 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 2ad410f5..99e8dfa3 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -106,12 +106,6 @@ class tdigest { */ explicit tdigest(uint16_t k = DEFAULT_K, 
const Allocator& allocator = Allocator()); - /** - * Copy constructor - * @param other sketch to be copied - */ - tdigest(const tdigest& other); - /** * Update this t-Digest with the given value * @param value to update the t-Digest with diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 1dba9eb1..ab4ce9e4 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -597,24 +597,6 @@ bool tdigest::is_single_value() const { return get_total_weight() == 1; } -template -tdigest::tdigest(const tdigest& other): - reverse_merge_(other.reverse_merge_), - k_(other.k_), - min_(other.min_), - max_(other.max_), - centroids_capacity_(other.centroids_capacity_), - centroids_(other.centroids_, other.get_allocator()), - centroids_weight_(other.centroids_weight_), - buffer_(other.buffer_, other.get_allocator()) -{ - if (other.k_ < 10) throw std::invalid_argument("k must be at least 10"); - const size_t fudge = other.k_ < 30 ? 30 : 10; - centroids_capacity_ = 2 * k_ + fudge; - centroids_.reserve(centroids_capacity_); - buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER); -} - template tdigest::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer): reverse_merge_(reverse_merge), From ada87563432eebc989088d6fab3a1fd4d0aabc36 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Thu, 12 Jun 2025 11:32:08 +0530 Subject: [PATCH 13/75] Removing the unnecessary parameters --- tdigest/include/tdigest_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index ab4ce9e4..0f53adc4 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -638,11 +638,11 @@ template } template -tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): 
centroids_(tdigest_.get_allocator()) { // Create a copy of the tdigest to generate the centroids after processing the buffered values - tdigest tmp(tdigest_); + tdigest tmp(tdigest_); tmp.compress(); centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); From 5be04f2561ca2d394f564b78400aa981d98b4c9e Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Fri, 13 Jun 2025 10:50:19 +0530 Subject: [PATCH 14/75] Review comments --- tdigest/include/tdigest.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 99e8dfa3..cc7898e3 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -319,8 +319,6 @@ class tdigest::const_iterator { using pointer = const return_value_holder; using reference = const value_type; - const_iterator(const tdigest &tdigest_, bool is_end); - const_iterator& operator++(); const_iterator& operator++(int); bool operator==(const const_iterator& other) const; @@ -328,9 +326,10 @@ class tdigest::const_iterator { reference operator*() const; pointer operator->() const; private: - friend class tdigest; + friend class tdigest; uint32_t index_; vector_centroid centroids_; + const_iterator(const tdigest& tdigest_, bool is_end); }; } /* namespace datasketches */ From 2e92ea0474a502edb6b6760d962f7f2e47660177 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Mon, 14 Jul 2025 21:58:57 -0700 Subject: [PATCH 15/75] porting bug fix from Java --- tdigest/include/tdigest_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 0f53adc4..b8fab38d 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -193,7 +193,7 @@ T tdigest::get_quantile(double rank) const { } const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0; const double w2 = centroids_.back().get_weight() / 2.0 - w1; - 
return weighted_average(centroids_.back().get_weight(), w1, max_, w2); + return weighted_average(centroids_.back().get_mean(), w1, max_, w2); } template From 0595e4d62422b113b3ee036d8552cf4dd01c4b1c Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Fri, 8 Aug 2025 13:16:03 -0700 Subject: [PATCH 16/75] ds-java main branch requires java 24 --- .github/workflows/serde_compat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml index 084d1385..f3d7ed67 100644 --- a/.github/workflows/serde_compat.yml +++ b/.github/workflows/serde_compat.yml @@ -21,7 +21,7 @@ jobs: - name: Setup Java uses: actions/setup-java@v4 with: - java-version: '21' + java-version: '24' distribution: 'temurin' - name: Run Java run: cd java && mvn test -P generate-java-files From 1db40c997541aea061391afe08a0af15dcbea1e0 Mon Sep 17 00:00:00 2001 From: devillove084 <786537003@qq.com> Date: Mon, 1 Sep 2025 08:21:21 +0000 Subject: [PATCH 17/75] fix: unnecessary virtual specifier on destructor --- hll/include/hll.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp index 9d5f78f1..5fc49629 100644 --- a/hll/include/hll.hpp +++ b/hll/include/hll.hpp @@ -160,7 +160,7 @@ class hll_sketch_alloc final { static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A()); //! 
Class destructor - virtual ~hll_sketch_alloc(); + ~hll_sketch_alloc(); /** * Copy assignment operator From a3bc4e48551a72cfb25bfefd09b6cb22e84e0551 Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 13 Nov 2025 01:16:51 +0900 Subject: [PATCH 18/75] refactor: clean up use get_preamble_longs --- theta/include/theta_sketch_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp index 8f7b1e8d..304ae64c 100644 --- a/theta/include/theta_sketch_impl.hpp +++ b/theta/include/theta_sketch_impl.hpp @@ -376,7 +376,7 @@ size_t compact_theta_sketch_alloc::get_compressed_serialized_size_bytes(uint8 template void compact_theta_sketch_alloc::serialize(std::ostream& os) const { - const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2; + const uint8_t preamble_longs = get_preamble_longs(false); write(os, preamble_longs); write(os, UNCOMPRESSED_SERIAL_VERSION); write(os, SKETCH_TYPE); @@ -459,7 +459,7 @@ uint8_t compact_theta_sketch_alloc::compute_entry_bits() const { template void compact_theta_sketch_alloc::serialize_version_4(std::ostream& os) const { - const uint8_t preamble_longs = this->is_estimation_mode() ? 
2 : 1; + const uint8_t preamble_longs = get_preamble_longs(true); const uint8_t entry_bits = compute_entry_bits(); const uint8_t num_entries_bytes = get_num_entries_bytes(); From 9d1b524a50cfcd3a3b7c6f492fe0853807403e9a Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 12 Nov 2025 14:29:35 -0800 Subject: [PATCH 19/75] ds-java main branch requires Java 25 --- .github/workflows/serde_compat.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml index f3d7ed67..81547ee7 100644 --- a/.github/workflows/serde_compat.yml +++ b/.github/workflows/serde_compat.yml @@ -12,16 +12,16 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Checkout Java - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: apache/datasketches-java path: java - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: - java-version: '24' + java-version: '25' distribution: 'temurin' - name: Run Java run: cd java && mvn test -P generate-java-files From a83254d9eb7933ca21489855c4bc89c0098a3537 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 5 Jan 2026 16:58:21 +0900 Subject: [PATCH 20/75] fix: division by 0 --- common/include/binomial_bounds.hpp | 4 +- common/test/CMakeLists.txt | 1 + common/test/binomial_bounds_test.cpp | 279 +++++++++++++++++++++++++++ 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 common/test/binomial_bounds_test.cpp diff --git a/common/include/binomial_bounds.hpp b/common/include/binomial_bounds.hpp index 3b73535b..ff7cccc9 100644 --- a/common/include/binomial_bounds.hpp +++ b/common/include/binomial_bounds.hpp @@ -441,8 +441,8 @@ class binomial_bounds { } static void check_theta(double theta) { - if (theta < 0 || theta > 1) { - throw std::invalid_argument("theta must be in [0, 1]"); + if (theta <= 0 || theta > 1) { + throw 
std::invalid_argument("theta must be in (0, 1]"); } } diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt index c598c353..7593bd0b 100644 --- a/common/test/CMakeLists.txt +++ b/common/test/CMakeLists.txt @@ -69,6 +69,7 @@ target_sources(common_test PRIVATE quantiles_sorted_view_test.cpp optional_test.cpp + binomial_bounds_test.cpp ) # now the integration test part diff --git a/common/test/binomial_bounds_test.cpp b/common/test/binomial_bounds_test.cpp new file mode 100644 index 00000000..6bde0910 --- /dev/null +++ b/common/test/binomial_bounds_test.cpp @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include + +#include "binomial_bounds.hpp" + +namespace datasketches { + +TEST_CASE("binomial_bounds: get_lower_bound", "[common]") { + + SECTION("num_samples == 0") { + double result = binomial_bounds::get_lower_bound(0, 0.5, 1); + REQUIRE(result == 0.0); + } + + SECTION("theta == 1.0") { + double result = binomial_bounds::get_lower_bound(100, 1.0, 1); + REQUIRE(result == 100.0); + } + + SECTION("num_samples == 1") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples == 1, stddev=2") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples == 1, stddev=3") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120") { + double result = binomial_bounds::get_lower_bound(121, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=2") { + double result = binomial_bounds::get_lower_bound(200, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=3") { + double result = binomial_bounds::get_lower_bound(500, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5)") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 1); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 2); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 3); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta < num_samples/360") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 1); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND 
theta < num_samples/360, stddev=2") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 2); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND theta < num_samples/360, stddev=3") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 3); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta (exact calculation)") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=2") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=3") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("theta=0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(10, 0.0, 1), std::invalid_argument); + } + + SECTION("theta very close to 0") { + double result = binomial_bounds::get_lower_bound(10, 1e-10, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=2 boundary") { + double result = binomial_bounds::get_lower_bound(2, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=120 boundary") { + double result = binomial_bounds::get_lower_bound(120, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("estimate clamping case") { + double result = binomial_bounds::get_lower_bound(10, 0.9, 1); + double estimate = 10.0 / 0.9; + REQUIRE(result <= estimate); + } + + SECTION("invalid theta < 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, -0.1, 1), std::invalid_argument); + } + + SECTION("invalid theta > 1") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 1.1, 1), std::invalid_argument); + } + + SECTION("invalid stddev = 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 0), std::invalid_argument); + } + + SECTION("invalid stddev = 4") { + 
REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 4), std::invalid_argument); + } +} + +TEST_CASE("binomial_bounds: get_upper_bound", "[common]") { + + SECTION("theta == 1.0") { + double result = binomial_bounds::get_upper_bound(100, 1.0, 1); + REQUIRE(result == 100.0); + } + + SECTION("num_samples == 0") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 1); + REQUIRE(result > 0.0); + } + + SECTION("num_samples == 0, stddev=2") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 2); + REQUIRE(result > 0.0); + } + + SECTION("num_samples == 0, stddev=3") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 3); + REQUIRE(result > 0.0); + } + + SECTION("num_samples > 120") { + double result = binomial_bounds::get_upper_bound(121, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=2") { + double result = binomial_bounds::get_upper_bound(200, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=3") { + double result = binomial_bounds::get_upper_bound(500, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5)") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 1); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 2); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 3); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < num_samples/360") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 1); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < num_samples/360, stddev=2") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 2); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < 
num_samples/360, stddev=3") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 3); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta (exact calculation)") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=2") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=3") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("theta=0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(10, 0.0, 1), std::invalid_argument); + } + + SECTION("theta very close to 0") { + double result = binomial_bounds::get_upper_bound(10, 1e-10, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=1 boundary") { + double result = binomial_bounds::get_upper_bound(1, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=120 boundary") { + double result = binomial_bounds::get_upper_bound(120, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("estimate clamping case") { + double result = binomial_bounds::get_upper_bound(10, 0.9, 1); + double estimate = 10.0 / 0.9; + REQUIRE(result >= estimate); + } + + SECTION("invalid theta < 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, -0.1, 1), std::invalid_argument); + } + + SECTION("invalid theta > 1") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 1.1, 1), std::invalid_argument); + } + + SECTION("invalid stddev = 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 0), std::invalid_argument); + } + + SECTION("invalid stddev = 4") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 4), std::invalid_argument); + } +} + +} /* namespace datasketches */ From 59e5f366fe18c54f8d9e4f26742c02f6e7a9164a Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Wed, 7 
Jan 2026 19:24:49 +0900 Subject: [PATCH 21/75] fix: rejecting inf as value --- tdigest/include/tdigest_impl.hpp | 5 +++ tdigest/test/tdigest_test.cpp | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index b8fab38d..75f2d9ee 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -37,6 +37,7 @@ tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::i template void tdigest::update(T value) { if (std::isnan(value)) return; + if (std::isinf(value)) return; if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress(); buffer_.push_back(value); min_ = std::min(min_, value); @@ -94,6 +95,7 @@ template double tdigest::get_rank(T value) const { if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN"); + if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity"); if (value < min_) return 0; if (value > max_) return 1; // one centroid and value == min_ == max_ @@ -621,6 +623,9 @@ void tdigest::check_split_points(const T* values, uint32_t size) { if (std::isnan(values[i])) { throw std::invalid_argument("Values must not be NaN"); } + if (std::isinf(values[i])) { + throw std::invalid_argument("Values must not be infinity"); + } if ((i < (size - 1)) && !(values[i] < values[i + 1])) { throw std::invalid_argument("Values must be unique and monotonically increasing"); } diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 9f92094d..45c10822 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -470,4 +470,57 @@ TEST_CASE("iterate centroids", "[tdigest]") { REQUIRE(td.get_total_weight() == total_weight); } +TEST_CASE("update rejects positive infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + 
td.update(2.0); + td.update(std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_max_value() == 2.0); +} + +TEST_CASE("update rejects negative infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(-std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_min_value() == 1.0); +} + +TEST_CASE("get_rank rejects positive infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits::infinity()), std::invalid_argument); +} + +TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); +} + +TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[2] = {50.0, std::numeric_limits::infinity()}; + REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); +} + +TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[2] = {-std::numeric_limits::infinity(), 50.0}; + REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); +} + +TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 100; ++i) td.update(i); + const double split_points[1] = {std::numeric_limits::infinity()}; + REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument); +} + } /* namespace datasketches */ From 588fd73c09b09740a0ebd493ef336a02fab2eb0f Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 00:40:31 +0900 Subject: [PATCH 22/75] fix: check invalid inputs on deserialization --- tdigest/include/tdigest.hpp | 2 + 
tdigest/include/tdigest_impl.hpp | 91 +++++++++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 97 ++++++++++++++++++++++++++++---- 3 files changed, 171 insertions(+), 19 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index cc7898e3..7d060ec1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -108,6 +108,7 @@ class tdigest { /** * Update this t-Digest with the given value + * NaN and infinity values are ignored * @param value to update the t-Digest with */ void update(T value); @@ -153,6 +154,7 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. + *

NaN and infinity values throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 75f2d9ee..294dab88 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -29,6 +29,24 @@ namespace datasketches { +template +inline void check_not_nan(T value, const char* name) { + if (std::isnan(value)) { + std::ostringstream oss; + oss << name << " must not be NaN"; + throw std::invalid_argument(oss.str()); + } +} + +template +inline void check_not_infinite(T value, const char* name) { + if (std::isinf(value)) { + std::ostringstream oss; + oss << name << " must not be infinite"; + throw std::invalid_argument(oss.str()); + } +} + template tdigest::tdigest(uint16_t k, const A& allocator): tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::infinity(), vector_centroid(allocator), 0, vector_t(allocator)) @@ -402,6 +420,8 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE); if (is_single_value) { const T value = read(is); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -410,12 +430,24 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const T min = read(is); const T max = read(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) 
weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -453,6 +485,8 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ensure_minimum_memory(end_ptr - ptr, sizeof(T)); T value; ptr += copy_from_mem(ptr, value); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -467,12 +501,24 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ptr += copy_from_mem(ptr, min); T max; ptr += copy_from_mem(ptr, max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -489,13 +535,22 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc if (type == COMPAT_DOUBLE) { // 
compatibility with asBytes() const auto min = read_big_endian(is); const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); const auto num_centroids = read_big_endian(is); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_double = read_big_endian(is); + check_not_nan(weight_double, "centroid weight"); + check_not_infinite(weight_double, "centroid weight"); const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_double); c = centroid(mean, weight); total_weight += weight; } @@ -504,6 +559,10 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc // COMPAT_FLOAT: compatibility with asSmallBytes() const auto min = read_big_endian(is); // reference implementation uses doubles for min and max const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); // reference implementation stores capacities of the array of centroids and the buffer as shorts // they can be derived from k in the constructor @@ -512,8 +571,13 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_float = read_big_endian(is); + check_not_nan(weight_float, "centroid weight"); + check_not_infinite(weight_float, "centroid weight"); const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + 
check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_float); c = centroid(mean, weight); total_weight += weight; } @@ -540,6 +604,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); double k_double; ptr += copy_from_mem(ptr, k_double); const uint16_t k = static_cast(byteswap(k_double)); @@ -556,6 +624,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -569,6 +641,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); float k_float; ptr += copy_from_mem(ptr, k_float); const uint16_t k = static_cast(byteswap(k_float)); @@ -588,6 +664,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, float mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -623,9 +703,6 @@ void tdigest::check_split_points(const T* values, uint32_t size) { if (std::isnan(values[i])) { throw std::invalid_argument("Values must not be NaN"); } - if (std::isinf(values[i])) { - throw std::invalid_argument("Values must not be 
infinity"); - } if ((i < (size - 1)) && !(values[i] < values[i + 1])) { throw std::invalid_argument("Values must be unique and monotonically increasing"); } diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 45c10822..0019b936 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -18,13 +18,35 @@ */ #include +#include #include #include +#include #include "tdigest.hpp" namespace datasketches { +namespace { +constexpr size_t kHeaderSize = 8; +constexpr size_t kCountsSize = 8; +constexpr size_t kMinOffset = kHeaderSize + kCountsSize; +constexpr size_t kMaxOffset = kMinOffset + sizeof(double); +constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2; +constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset; +constexpr size_t kSingleValueOffset = kHeaderSize; + +template +void write_bytes(std::vector& bytes, size_t offset, T value) { + std::memcpy(bytes.data() + offset, &value, sizeof(T)); +} + +template +void write_bytes(std::string& data, size_t offset, T value) { + std::memcpy(&data[offset], &value, sizeof(T)); +} +} // namespace + TEST_CASE("empty", "[tdigest]") { tdigest_double td(10); // std::cout << td.to_string(); @@ -502,25 +524,76 @@ TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); } -TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, kSingleValueOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(3.0); + auto bytes = td.serialize(); + 
std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, kMinOffset, std::numeric_limits::infinity()); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") { tdigest_double td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[2] = {50.0, std::numeric_limits::infinity()}; - REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, kSingleValueOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, kMaxOffset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } -TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") { tdigest_double 
td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[2] = {-std::numeric_limits::infinity(), 50.0}; - REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, kMaxOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } -TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") { +TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { tdigest_double td(100); - for (int i = 0; i < 100; ++i) td.update(i); - const double split_points[1] = {std::numeric_limits::infinity()}; - REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } } /* namespace datasketches */ From b8489fd7327721fa4c1a16ff2a93565e7b077e5e Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 01:04:40 +0900 Subject: [PATCH 23/75] perf: remove ostringstream --- tdigest/include/tdigest_impl.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 294dab88..0be1a486 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -22,7 +22,6 @@ #include #include -#include #include "common_defs.hpp" #include "memory_operations.hpp" @@ -32,18 +31,14 @@ namespace datasketches { template inline void check_not_nan(T value, const char* name) { if (std::isnan(value)) { - std::ostringstream oss; - oss << name << " must not be NaN"; - throw std::invalid_argument(oss.str()); + throw std::invalid_argument(std::string(name) + " must not be NaN"); } } template inline void 
check_not_infinite(T value, const char* name) { if (std::isinf(value)) { - std::ostringstream oss; - oss << name << " must not be infinite"; - throw std::invalid_argument(oss.str()); + throw std::invalid_argument(std::string(name) + " must not be infinite"); } } From c680a81c9fd690971de14cff4da3116fb04903cf Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 02:08:48 +0900 Subject: [PATCH 24/75] style: follow local convention --- tdigest/test/tdigest_test.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 0019b936..fd0a71c1 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -28,13 +28,13 @@ namespace datasketches { namespace { -constexpr size_t kHeaderSize = 8; -constexpr size_t kCountsSize = 8; -constexpr size_t kMinOffset = kHeaderSize + kCountsSize; -constexpr size_t kMaxOffset = kMinOffset + sizeof(double); -constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2; -constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset; -constexpr size_t kSingleValueOffset = kHeaderSize; +constexpr size_t header_size = 8; +constexpr size_t counts_size = 8; +constexpr size_t min_offset = header_size + counts_size; +constexpr size_t max_offset = min_offset + sizeof(double); +constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2; +constexpr size_t first_buffered_value_offset = first_centroid_mean_offset; +constexpr size_t single_value_offset = header_size; template void write_bytes(std::vector& bytes, size_t offset, T value) { @@ -528,7 +528,7 @@ TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); auto bytes = td.serialize(); - write_bytes(bytes, kSingleValueOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, single_value_offset, std::numeric_limits::quiet_NaN()); 
REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -539,7 +539,7 @@ TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") { td.update(3.0); auto bytes = td.serialize(); std::string data(reinterpret_cast(bytes.data()), bytes.size()); - write_bytes(data, kMinOffset, std::numeric_limits::infinity()); + write_bytes(data, min_offset, std::numeric_limits::infinity()); std::istringstream is(data, std::ios::binary); REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); } @@ -548,7 +548,7 @@ TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") { tdigest_double td(100); for (int i = 0; i < 10; ++i) td.update(i); auto bytes = td.serialize(); - write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, first_centroid_mean_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -557,7 +557,7 @@ TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(0, true); - write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -565,7 +565,7 @@ TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); auto bytes = td.serialize(); - write_bytes(bytes, kSingleValueOffset, std::numeric_limits::infinity()); + write_bytes(bytes, single_value_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -574,7 +574,7 @@ TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(); 
- write_bytes(bytes, kMaxOffset, std::numeric_limits::quiet_NaN()); + write_bytes(bytes, max_offset, std::numeric_limits::quiet_NaN()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -583,7 +583,7 @@ TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(); - write_bytes(bytes, kMaxOffset, std::numeric_limits::infinity()); + write_bytes(bytes, max_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } @@ -592,7 +592,7 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { td.update(1.0); td.update(2.0); auto bytes = td.serialize(0, true); - write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits::infinity()); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::infinity()); REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } From 99d06bfd2b3c668911720ee3e2598a6fce7cc917 Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 13 Jan 2026 02:10:05 +0900 Subject: [PATCH 25/75] fix: add missing dependency --- tdigest/include/tdigest_impl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 0be1a486..043c7bab 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include #include "common_defs.hpp" #include "memory_operations.hpp" From 662aef37c3912b4b2c6cbf3cc6ab0dbf5d40a0df Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 15:20:44 +0900 Subject: [PATCH 26/75] fix: allow inf for get_rank --- tdigest/include/tdigest.hpp | 1 - tdigest/include/tdigest_impl.hpp | 1 - tdigest/test/tdigest_test.cpp | 14 -------------- 3 files changed, 16 deletions(-) diff --git a/tdigest/include/tdigest.hpp 
b/tdigest/include/tdigest.hpp index 7d060ec1..7ce87dd1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -154,7 +154,6 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. - *

NaN and infinity values throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 043c7bab..e6904f20 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -109,7 +109,6 @@ template double tdigest::get_rank(T value) const { if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN"); - if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity"); if (value < min_) return 0; if (value > max_) return 1; // one centroid and value == min_ == max_ diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index fd0a71c1..8dd62132 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -510,20 +510,6 @@ TEST_CASE("update rejects negative infinity", "[tdigest]") { REQUIRE(td.get_min_value() == 1.0); } -TEST_CASE("get_rank rejects positive infinity", "[tdigest]") { - tdigest_double td(100); - td.update(1.0); - td.update(2.0); - REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits::infinity()), std::invalid_argument); -} - -TEST_CASE("get_rank rejects negative infinity", "[tdigest]") { - tdigest_double td(100); - td.update(1.0); - td.update(2.0); - REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits::infinity()), std::invalid_argument); -} - TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { tdigest_double td(100); td.update(1.0); From bded7aa1eb09c13daa742ce901443388e1a8994a Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 16:15:30 +0900 Subject: [PATCH 27/75] fix: check weight is zero --- tdigest/include/tdigest_impl.hpp | 15 +++++++++++++++ tdigest/test/tdigest_test.cpp | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tdigest/include/tdigest_impl.hpp 
b/tdigest/include/tdigest_impl.hpp index e6904f20..065e3ef1 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "common_defs.hpp" #include "memory_operations.hpp" @@ -43,6 +44,14 @@ inline void check_not_infinite(T value, const char* name) { } } +template +inline void check_non_zero(T value, const char* name) { + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (value == 0) { + throw std::invalid_argument(std::string(name) + " must not be zero"); + } +} + template tdigest::tdigest(uint16_t k, const A& allocator): tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::infinity(), vector_centroid(allocator), 0, vector_t(allocator)) @@ -437,6 +446,8 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { for (const auto& c: centroids) { check_not_nan(c.get_mean(), "centroid mean"); check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + weight += c.get_weight(); } for (const auto& value: buffer) { @@ -508,6 +519,8 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A for (const auto& c: centroids) { check_not_nan(c.get_mean(), "centroid mean"); check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + weight += c.get_weight(); } for (const auto& value: buffer) { @@ -542,6 +555,8 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc const auto weight_double = read_big_endian(is); check_not_nan(weight_double, "centroid weight"); check_not_infinite(weight_double, "centroid weight"); + check_non_zero(weight_double, "centroid weight"); + const auto mean = read_big_endian(is); check_not_nan(mean, "centroid mean"); check_not_infinite(mean, "centroid mean"); diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 8dd62132..07d6185f 100644 --- 
a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -33,6 +33,7 @@ constexpr size_t counts_size = 8; constexpr size_t min_offset = header_size + counts_size; constexpr size_t max_offset = min_offset + sizeof(double); constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2; +constexpr size_t first_centroid_weight_offset = first_centroid_mean_offset + sizeof(double); constexpr size_t first_buffered_value_offset = first_centroid_mean_offset; constexpr size_t single_value_offset = header_size; @@ -582,4 +583,22 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); } +TEST_CASE("deserialize bytes rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, first_centroid_weight_offset, static_cast(0)); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, first_centroid_weight_offset, static_cast(0)); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + } /* namespace datasketches */ From 19798344aad67441f12f0e356ffc78b6fbd3078e Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 13 Jan 2026 16:22:12 +0900 Subject: [PATCH 28/75] doc: update throw NaN for get_rank --- tdigest/include/tdigest.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 7ce87dd1..2d3620b1 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -154,6 
+154,7 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. + *

NaN values throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) From 21362396a54fd142abf3481f93df8a8058b3e00f Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 13 Jan 2026 16:12:02 +0800 Subject: [PATCH 29/75] Refactor README for clarity and consistency Updated README to improve clarity and formatting. --- README.md | 139 ++++++++++++++++++++++++++---------------------------- 1 file changed, 66 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 4b216167..57125686 100644 --- a/README.md +++ b/README.md @@ -1,106 +1,99 @@ # Apache DataSketches Core C++ Library Component -This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications. -This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL. +This is the core C++ component of the Apache DataSketches library. It contains all the key sketching algorithms from the Java implementation and can be accessed directly by user applications. -Note that we have a parallel core component for [Java]((https://github.com/apache/datasketches-java) and [Python]((https://github.com/apache/datasketches-python) implementations of the same sketch algorithms. +This component is also a dependency of other library components that create adaptors for target systems, such as [PostgreSQL](https://github.com/apache/datasketches-postgresql).
+ +Note that we have parallel core library components for Java, Python, and GO implementations of many of the same sketch algorithms: + +- [datasketches-java](https://github.com/apache/datasketches-java) +- [datasketches-python](https://github.com/apache/datasketches-python) +- [datasketches-go](https://github.com/apache/datasketches-go) Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information. -If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. +If you are interested in making contributions to this site, please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. --- This code requires C++11. -This library is header-only. The build process provided is only for building unit tests. +This library is header-only. The provided build process is only for unit tests. -Building the unit tests requires cmake 3.12.0 or higher. +Building the unit tests requires CMake 3.12.0 or higher. -Installing the latest cmake on OSX: brew install cmake +Installing the latest CMake on OSX: `brew install cmake`. -Building and running unit tests using cmake for OSX and Linux: +Building and running unit tests using CMake for OSX and Linux: -``` - $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release - $ cmake --build build/Release -t all test +```shell +cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release +cmake --build build/Release -t all test ``` -Building and running unit tests using cmake for Windows from the command line: +Building and running unit tests using CMake for Windows from the command line: -``` - $ cd build - $ cmake .. - $ cd .. - $ cmake --build build --config Release - $ cmake --build build --config Release --target RUN_TESTS +```shell +cd build +cmake .. +cd .. 
+cmake --build build --config Release +cmake --build build --config Release --target RUN_TESTS ``` -To install a local distribution (OSX and Linux), use the following command. The -CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it -defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below, -the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include, -/tmp/install/DataSketches/lib, etc) +To install a local distribution (OSX and Linux), use the following command. The `CMAKE_INSTALL_PREFIX` variable controls the destination. If not specified, it defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below, the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include, /tmp/install/DataSketches/lib, etc). -``` - $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches - $ cmake --build build/Release -t install +```shell +cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches +cmake --build build/Release -t install ``` -To generate an installable package using cmake's built in cpack packaging tool, -use the following command. The type of packaging is controlled by the CPACK_GENERATOR -variable (semi-colon separated list). Cmake usually supports packaging types such as RPM, -DEB, STGZ, TGZ, TZ, ZIP, etc. +To generate an installable package using CMake's built-in cpack packaging tool, use the following command. The type of packaging is controlled by the `CPACK_GENERATOR` variable (semi-colon separated list). CMake usually supports packaging formats such as RPM, DEB, STGZ, TGZ, TZ, and ZIP. -``` - $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" - $ cmake3 --build build/Release -t package +```shell +cmake -S . 
-B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" +cmake --build build/Release -t package ``` The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways. -If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some -way, then CMake's `find_package` command can be used like this: -``` - find_package(DataSketches 3.2 REQUIRED) - target_link_library(my_dependent_target PUBLIC ${DATASKETCHES_LIB}) +If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some other way), then CMake's `find_package` command can be used like this: + +```cmake +find_package(DataSketches 3.2 REQUIRED) +target_link_libraries(my_dependent_target PUBLIC ${DATASKETCHES_LIB}) ``` When used with find_package, DataSketches exports several variables, including - - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported. - - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files. - Because cmake automatically includes the interface directories for included target libraries when - using `target_link_library`, under normal circumstances there will be no need to include this directly. - - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling - in DataSketches should reference this with `target_link_library` in order to set up all the correct dependencies - and include paths. - -If you don't have DataSketches installed locally, dependent projects can pull it directly -from GitHub using CMake's `ExternalProject` module.
The code would look something like this: - -``` - cmake_policy(SET CMP0097 NEW) - include(ExternalProject) - ExternalProject_Add(datasketches - GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git - GIT_TAG 3.2.0 - GIT_SHALLOW true - GIT_SUBMODULES "" - INSTALL_DIR /tmp/datasketches-prefix - CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix - - # Override the install command to add DESTDIR - # This is necessary to work around an oddity in the RPM (but not other) package - # generation, as CMake otherwise picks up the Datasketch files when building - # an RPM for a dependent package. (RPM scans the directory for files in addition to installing - # those files referenced in an "install" rule in the cmake file) - INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install - ) - ExternalProject_Get_property(datasketches INSTALL_DIR) - set(datasketches_INSTALL_DIR ${INSTALL_DIR}) - message("Source dir of datasketches = ${datasketches_INSTALL_DIR}") - target_include_directories(my_dependent_target - PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches) - add_dependencies(my_dependent_target datasketches) +- `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported. +- `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files. Because CMake automatically includes the interface directories for included target libraries when using `target_link_libraries`, under normal circumstances, there will be no need to include this directly. +- `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling in DataSketches should reference this with `target_link_libraries` in order to set up all the correct dependencies and include paths. + +If you don't have DataSketches installed locally, dependent projects can pull it directly from GitHub using CMake's `ExternalProject` module.
The code would look something like this: + +```cmake +cmake_policy(SET CMP0097 NEW) +include(ExternalProject) +ExternalProject_Add(datasketches + GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git + GIT_TAG 3.2.0 + GIT_SHALLOW true + GIT_SUBMODULES "" + INSTALL_DIR /tmp/datasketches-prefix + CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix + + # Override the install command to add DESTDIR + # This is necessary to work around an oddity in the RPM (but not other) package + # generation, as CMake otherwise picks up the Datasketch files when building + # an RPM for a dependent package. (RPM scans the directory for files in addition to installing + # those files referenced in an "install" rule in the cmake file) + INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install +) +ExternalProject_Get_property(datasketches INSTALL_DIR) +set(datasketches_INSTALL_DIR ${INSTALL_DIR}) +message("Source dir of datasketches = ${datasketches_INSTALL_DIR}") +target_include_directories(my_dependent_target + PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches) +add_dependencies(my_dependent_target datasketches) ``` From da95fd28419a982d8c6d802ab153362f724c470f Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:19:38 +0900 Subject: [PATCH 30/75] feat: add utf8cpp --- NOTICE | 3 + common/CMakeLists.txt | 11 + common/include/third_party/utf8cpp/LICENSE | 23 + common/include/third_party/utf8cpp/utf8.h | 46 ++ .../third_party/utf8cpp/utf8/checked.h | 359 +++++++++++++ .../include/third_party/utf8cpp/utf8/core.h | 500 ++++++++++++++++++ .../include/third_party/utf8cpp/utf8/cpp11.h | 70 +++ .../include/third_party/utf8cpp/utf8/cpp17.h | 96 ++++ .../include/third_party/utf8cpp/utf8/cpp20.h | 124 +++++ .../third_party/utf8cpp/utf8/unchecked.h | 286 ++++++++++ 10 files changed, 1518 insertions(+) create mode 100644 common/include/third_party/utf8cpp/LICENSE create mode 100644 
common/include/third_party/utf8cpp/utf8.h create mode 100644 common/include/third_party/utf8cpp/utf8/checked.h create mode 100644 common/include/third_party/utf8cpp/utf8/core.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp11.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp17.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp20.h create mode 100644 common/include/third_party/utf8cpp/utf8/unchecked.h diff --git a/NOTICE b/NOTICE index 11ba6f6c..6a2376d9 100644 --- a/NOTICE +++ b/NOTICE @@ -10,3 +10,6 @@ The Apache Software Foundation (http://www.apache.org/). Prior to moving to ASF, the software for this project was developed at Yahoo Inc. (https://developer.yahoo.com). + +This product includes utf8cpp (https://github.com/nemtrif/utfcpp), +licensed under the Boost Software License, Version 1.0. diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 8514433b..2d5c7330 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,3 +51,14 @@ install(FILES include/serde.hpp include/xxhash64.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") + +install(FILES + include/third_party/utf8cpp/utf8.h + include/third_party/utf8cpp/utf8/checked.h + include/third_party/utf8cpp/utf8/core.h + include/third_party/utf8cpp/utf8/cpp11.h + include/third_party/utf8cpp/utf8/cpp17.h + include/third_party/utf8cpp/utf8/cpp20.h + include/third_party/utf8cpp/utf8/unchecked.h + include/third_party/utf8cpp/LICENSE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp") diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE new file mode 100644 index 00000000..36b7cd93 --- /dev/null +++ b/common/include/third_party/utf8cpp/LICENSE @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by 
+this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h new file mode 100644 index 00000000..b5135309 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8.h @@ -0,0 +1,46 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. 
+ +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h new file mode 100644 index 00000000..96ceb4d5 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/checked.h @@ -0,0 +1,359 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + utfchar32_t cp; + public: + invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } + utfchar32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + utfchar8_t u8; + public: + invalid_utf8 (utfchar8_t u) : u8(u) {} + invalid_utf8 (char c) : u8(static_cast(c)) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } + utfchar8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + utfchar16_t u16; + public: + invalid_utf16 (utfchar16_t u) : u16(u) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } + utfchar16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append(cp, result); + } + + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + + 
template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it, octet_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + 
throw invalid_utf8(static_cast(*it)); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + utfchar32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = static_cast(utf8::internal::mask16(*start++)); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + const utfchar32_t trail_surrogate = static_cast(utf8::internal::mask16(*start++)); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + 
internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + const utfchar32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, 
range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#include "cpp17.h" +#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later + +#endif //header guard + diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h new file mode 100644 index 00000000..8e128c18 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/core.h @@ -0,0 +1,500 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, 
unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include +#include +#include + +// Determine the C++ standard version. +// If the user defines UTF_CPP_CPLUSPLUS, use that. +// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept + #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert"); +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() + // Not worth simulating static_assert: + #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition); +#endif // C++ 11 or later + + +namespace utf8 +{ +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later + +// Helper code - not 
intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline utfchar8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + + template + inline utfchar16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + inline bool is_lead_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); + } + + inline bool is_trail_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_code_point_valid(utfchar32_t cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + + template + int sequence_length(octet_iterator lead_it) + { + const utfchar8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + 
inline bool is_overlong_sequence(utfchar32_t cp, int length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, const octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, 
octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + utfchar32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + utfchar32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + + template + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + // Make sure the iterator dereferences a large enough type + typedef typename std::iterator_traits::value_type word_type; + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + // Check the edge case: + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + + // Internal implementation of both checked and unchecked append() function + // This function will be invoked by the overloads below, as they will know + // the octet_type. 
+ template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + // One of the following overloads will be invoked from the API calls + + // A simple (but dangerous) case: the caller appends byte(s) to a char array + inline char* append(utfchar32_t cp, char* result) { + return append(cp, result); + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append(cp, std::back_inserter(str)); + template + std::back_insert_iterator append + (utfchar32_t cp, std::back_insert_iterator result) { + return append, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine octet_type + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. + template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. 
+ template + word_iterator append16(utfchar32_t cp, word_iterator result) { + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h new file mode 100644 index 00000000..691633c8 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp11.h @@ -0,0 +1,70 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" + +namespace utf8 +{ + inline void append16(utfchar32_t cp, std::u16string& s) + { + append16(cp, std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h new file mode 100644 index 00000000..07587300 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp17.h @@ -0,0 +1,96 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and 
transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp11.h" + +namespace utf8 +{ + inline std::string utf16to8(std::u16string_view s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(std::u32string_view s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(std::string_view s) + { + std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(std::string_view s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(std::string_view s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h new file mode 100644 index 00000000..07b61d0f --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby 
granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h new file mode 100644 index 00000000..173d0302 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/unchecked.h @@ -0,0 +1,286 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + return internal::append(cp, result); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append(replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append(replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, 
output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it) + { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { + case 1: + break; + case 2: + ++it; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp = static_cast(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + } + ++it; + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + + template + utfchar32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + template + void advance(octet_iterator& it, distance_type n) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for 
(distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } + } + + template + typename std::iterator_traits::difference_type + distance(octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start == end) + return result; + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + utfchar32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + 
iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + +#endif // header guard + From 2b48f475ed20ed576342f68f7a7e3afa401bf12c Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:21:02 +0900 Subject: [PATCH 31/75] feat: add aos tuple sketch --- tuple/CMakeLists.txt | 2 + tuple/include/array_of_strings_sketch.hpp | 150 +++++++++ .../include/array_of_strings_sketch_impl.hpp | 284 ++++++++++++++++++ tuple/test/CMakeLists.txt | 5 +- .../aos_sketch_deserialize_from_java_test.cpp | 172 +++++++++++ tuple/test/aos_sketch_serialize_for_java.cpp | 155 ++++++++++ tuple/test/array_of_strings_sketch_test.cpp | 243 +++++++++++++++ ...uple_sketch_deserialize_from_java_test.cpp | 2 +- 8 files changed, 1011 insertions(+), 2 deletions(-) create mode 100644 tuple/include/array_of_strings_sketch.hpp create mode 100644 tuple/include/array_of_strings_sketch_impl.hpp create mode 100644 tuple/test/aos_sketch_deserialize_from_java_test.cpp create mode 100644 tuple/test/aos_sketch_serialize_for_java.cpp create mode 100644 tuple/test/array_of_strings_sketch_test.cpp diff --git 
a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt index 4b0a48c7..54df11ee 100644 --- a/tuple/CMakeLists.txt +++ b/tuple/CMakeLists.txt @@ -54,4 +54,6 @@ install(FILES include/array_tuple_intersection_impl.hpp include/array_tuple_a_not_b.hpp include/array_tuple_a_not_b_impl.hpp + include/array_of_strings_sketch.hpp + include/array_of_strings_sketch_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp new file mode 100644 index 00000000..a3f8ddd7 --- /dev/null +++ b/tuple/include/array_of_strings_sketch.hpp @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_HPP_ + +#include +#include + +#include "array_tuple_sketch.hpp" +#include "xxhash64.h" + +namespace datasketches { + +// default update policy for an array of strings +template> +class default_array_of_strings_update_policy { +public: + using array_of_strings = array; + + explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); + + array_of_strings create() const; + + void update(array_of_strings& array, const array_of_strings& input) const; + + void update(array_of_strings& array, const array_of_strings* input) const; + +private: + Allocator allocator_; +}; + +// serializer/deserializer for an array of strings +// Requirements: all strings must be valid UTF-8 and array size must be <= 127. +template> +struct array_of_strings_serde { + using array_of_strings = array; + + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; + size_t size_of_item(const array_of_strings& item) const; + +private: + static void check_num_nodes(uint8_t num_nodes); + static uint32_t compute_total_bytes(const array_of_strings& item); + static void check_utf8(const std::string& value); +}; + +/** + * Extended class of compact_tuple_sketch for array of strings + * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
+ */ +template> +class compact_array_of_strings_tuple_sketch: + public compact_tuple_sketch< + array, + typename std::allocator_traits::template rebind_alloc> + > { +public: + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + using Base = compact_tuple_sketch; + using vector_bytes = typename Base::vector_bytes; + + template + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); + + void serialize(std::ostream& os) const; + vector_bytes serialize(unsigned header_size_bytes = 0) const; + + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, + const Allocator& allocator = Allocator()); + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, + const Allocator& allocator = Allocator()); + +private: + explicit compact_array_of_strings_tuple_sketch(Base&& base); +}; + +/** + * Extended class of update_tuple_sketch for array of strings + * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
+ */ +template> +class update_array_of_strings_tuple_sketch: + public update_tuple_sketch< + array, + array, + default_array_of_strings_update_policy, + typename std::allocator_traits::template rebind_alloc> + > { +public: + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + using policy_type = default_array_of_strings_update_policy; + using Base = update_tuple_sketch< + array_of_strings, + array_of_strings, + policy_type, + summary_allocator + >; + using resize_factor = typename Base::resize_factor; + class builder; + using Base::update; + + void update(const array_of_strings& key, const array_of_strings& value); + compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; + +private: + update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, + uint64_t seed, const policy_type& policy, const summary_allocator& allocator); + + // Matches Java Util.PRIME for ArrayOfStrings key hashing. + static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; + + static uint64_t hash_key(const array_of_strings& key); +}; + +template +class update_array_of_strings_tuple_sketch::builder: + public tuple_base_builder { +public: + builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); + + update_array_of_strings_tuple_sketch build() const; +}; + +} /* namespace datasketches */ + +#include "array_of_strings_sketch_impl.hpp" + +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp new file mode 100644 index 00000000..264f79bf --- /dev/null +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ + +#include + +#include "common_defs.hpp" +#include "third_party/utf8cpp/utf8.h" + +namespace datasketches { + +/* Builds the policy and keeps a copy of the allocator that every summary array it creates will use. */ +template +default_array_of_strings_update_policy::default_array_of_strings_update_policy(const Allocator& allocator): + allocator_(allocator) {} + +/* Returns the initial summary: an array constructed with size 0, fill value "" and the stored allocator. */ +template +auto default_array_of_strings_update_policy::create() const -> array_of_strings { + return array_of_strings(0, "", allocator_); +} + +/* Overwrites the summary: rebuilds `array` with input.size() elements and copies each string from `input`, so the previous contents are discarded rather than merged. */ +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings& input +) const { + const auto length = input.size(); + array = array_of_strings(length, "", allocator_); + for (uint8_t i = 0; i < length; ++i) array[i] = input[i]; +} + +/* Pointer overload: a null `input` resets the summary to a size-0 array; otherwise the pointed-to strings are copied exactly as in the reference overload. */ +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings* input +) const { + if (input == nullptr) { + array = array_of_strings(0, "", allocator_); + return; + } + const auto length = input->size(); + array = array_of_strings(length, "", allocator_); + for (uint8_t i = 0; i < length; ++i) array[i] = (*input)[i]; +} + +/* Sketch constructor: passes every argument straight through to the Base constructor and does nothing else. */ +template +update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( + uint8_t lg_cur_size, uint8_t lg_nom_size, 
resize_factor rf, float p, uint64_t theta, + uint64_t seed, const policy_type& policy, const summary_allocator& allocator +): +Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} + +/* Array-keyed update: reduces `key` to a 64-bit hash with hash_key and passes that hash together with `value` to Base::update. */ +template +void update_array_of_strings_tuple_sketch::update( + const array_of_strings& key, const array_of_strings& value +) { + const uint64_t hash = hash_key(key); + Base::update(hash, value); +} + +/* Hashes the key with an XXHash64 seeded by STRING_ARR_HASH_SEED: each string's bytes are added in order, with one "," byte added between consecutive strings (none after the last). NOTE(review): if XXHash64::add simply appends to one running stream, the keys {"a","b"} and {"a,b"} would hash the same - confirm this is the intended Java-compatible behaviour. */ +template +uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { + XXHash64 hasher(STRING_ARR_HASH_SEED); + const auto size = static_cast(key.size()); + for (size_t i = 0; i < size; ++i) { + const auto& entry = key[static_cast(i)]; + hasher.add(entry.data(), entry.size()); + if (i + 1 < size) hasher.add(",", 1); + } + return hasher.hash(); +} + +/* Returns a compact sketch constructed from this sketch; `ordered` is passed through to that constructor. */ +template +compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { + return compact_array_of_strings_tuple_sketch(*this, ordered); +} + +// builder + +/* Forwards the policy and allocator to tuple_base_builder. */ +template +update_array_of_strings_tuple_sketch::builder::builder( + const policy_type& policy, const summary_allocator& allocator +): +tuple_base_builder(policy, allocator) {} + +/* Creates the update sketch from the builder's current settings (starting size, lg_k_, rf_, p_, starting theta, seed_, policy_, allocator_). */ +template +auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { + return update_array_of_strings_tuple_sketch( + this->starting_lg_size(), + this->lg_k_, + this->rf_, + this->p_, + this->starting_theta(), + this->seed_, + this->policy_, + this->allocator_ + ); +} + +/* Converting constructor: hands `sketch` and `ordered` to the Base constructor. */ +template +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + const Sketch& sketch, bool ordered +): Base(sketch, ordered) {} + +/* Wraps an already-built Base by moving it into this object. */ +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + Base&& base +): Base(std::move(base)) {} + +/* Stream serialization: delegates to Base::serialize with a default-constructed array_of_strings_serde. */ +template +void compact_array_of_strings_tuple_sketch::serialize(std::ostream& os) const { + Base::serialize(os, array_of_strings_serde()); +} + +template +auto 
compact_array_of_strings_tuple_sketch::serialize(unsigned header_size_bytes) const -> vector_bytes { + /* Byte-vector form: delegates to Base::serialize with a default-constructed array_of_strings_serde. */ + return Base::serialize(header_size_bytes, array_of_strings_serde()); +} + +/* Stream deserialization: converts the caller's allocator to summary_allocator, lets Base::deserialize parse the stream with array_of_strings_serde, then moves the result into the derived sketch type. */ +template +auto compact_array_of_strings_tuple_sketch::deserialize( + std::istream& is, uint64_t seed, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(is, seed, array_of_strings_serde(), alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +/* Same as the stream overload, but parses from a memory buffer of `size` bytes starting at `bytes`. */ +template +auto compact_array_of_strings_tuple_sketch::deserialize( + const void* bytes, size_t size, uint64_t seed, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(bytes, size, seed, array_of_strings_serde(), alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +/* Writes `num` items to the stream. Per item: total_bytes (uint32_t, from compute_total_bytes), num_nodes (uint8_t), then for each string its length (uint32_t) followed by its raw bytes. Each string is checked with check_utf8 just before it is written, i.e. after the item header is already in the stream. */ +template +void array_of_strings_serde::serialize( + std::ostream& os, const array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_utf8(data[j]); + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } + } +} + +/* Reads `num` items from the stream, constructing each one in place in `items` with placement new. The leading total_bytes field is read and discarded; num_nodes must pass check_num_nodes and every string must pass check_utf8. NOTE(review): no stream-state check is visible here and `length` sizes the string exactly as read - confirm whether truncated or hostile input is handled by the caller. */ +template +void array_of_strings_serde::deserialize( + std::istream& is, array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + read(is); // total_bytes + const uint8_t num_nodes = read(is); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, "", Allocator()); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + std::string value(length, '\0'); + is.read(&value[0], length); + check_utf8(value); + array[j] = std::move(value); 
+ } + new (&items[i]) array_of_strings(std::move(array)); + } +} + +/* Writes `num` items into the buffer at `ptr`. Per item: total_bytes (uint32_t), num_nodes (uint8_t), then for each string its length (uint32_t) followed by its raw bytes. check_memory_size rejects an item whose total_bytes would not fit in `capacity` before any byte of that item is written; each string is checked with check_utf8 before it is copied. Returns the number of bytes written. */ +template +size_t array_of_strings_serde::serialize( + void* ptr, size_t capacity, const array_of_strings* items, unsigned num +) const { + uint8_t* ptr8 = static_cast(ptr); + size_t bytes_written = 0; + + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + check_memory_size(bytes_written + total_bytes, capacity); + bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); + bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_utf8(data[j]); + const uint32_t length = static_cast(data[j].size()); + + bytes_written += copy_to_mem(length, ptr8 + bytes_written); + bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length); + } + } + return bytes_written; +} + +/* Reads `num` items from the buffer at `ptr`, constructing each one in place in `items` with placement new, and returns the number of bytes consumed. The buffer is treated as untrusted: after confirming that the declared total_bytes fits in `capacity`, the node count and every length and payload read are checked against the end of the current item, so a corrupt length cannot trigger a read past the item or the buffer, or size an allocation from garbage. A failed bounds check is reported by check_memory_size; a bad node count or invalid UTF-8 throws std::runtime_error. */ +template +size_t array_of_strings_serde::deserialize( + const void* ptr, size_t capacity, array_of_strings* items, unsigned num +) const { + const uint8_t* ptr8 = static_cast(ptr); + size_t bytes_read = 0; + + for (unsigned i = 0; i < num; ++i) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); + const size_t item_start = bytes_read; + uint32_t total_bytes; + bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); + check_memory_size(item_start + total_bytes, capacity); + /* item_end is within capacity (checked above); every later read of this item must stay below it */ + const size_t item_end = item_start + total_bytes; + check_memory_size(bytes_read + sizeof(uint8_t), item_end); + uint8_t num_nodes; + bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, "", Allocator()); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_memory_size(bytes_read + sizeof(uint32_t), item_end); + uint32_t length; + bytes_read += copy_from_mem(ptr8 + bytes_read, length); + /* validate the untrusted length before it sizes the string and the copy */ + check_memory_size(bytes_read + length, item_end); + std::string value(length, '\0'); + bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + check_utf8(value); + array[j] = std::move(value); + } + new (&items[i]) array_of_strings(std::move(array)); + } + return bytes_read; +} + 
+/* Serialized size of one item in bytes; this is the value compute_total_bytes returns. */ +template +size_t array_of_strings_serde::size_of_item(const array_of_strings& item) const { + return compute_total_bytes(item); +} + +/* Throws std::runtime_error when an array holds more than 127 strings. */ +template +void array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { + if (num_nodes > 127) { + throw std::runtime_error("array_of_strings size exceeds 127"); + } +} + +/* Size of the serialized item: one uint32_t for the total, one uint8_t for the node count, one uint32_t length per string, plus the bytes of every string. Throws std::runtime_error if the node count fails check_num_nodes or the sum is above the numeric limit tested below. */ +template +uint32_t array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { + const auto count = item.size(); + check_num_nodes(static_cast(count)); + size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); + const std::string* data = item.data(); + for (uint32_t j = 0; j < count; ++j) { + total += data[j].size(); + } + if (total > std::numeric_limits::max()) { + throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max"); + } + return static_cast(total); +} + +/* Throws std::runtime_error unless utf8::is_valid accepts the whole string. */ +template +void array_of_strings_serde::check_utf8(const std::string& value) { + if (!utf8::is_valid(value.begin(), value.end())) { + throw std::runtime_error("array_of_strings contains invalid UTF-8"); + } +} + +} /* namespace datasketches */ + +#endif diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..8c561745 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES CXX_STANDARD_REQUIRED YES ) -file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH) +file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH) string(APPEND THETA_TEST_BINARY_PATH "/") target_compile_definitions(tuple_test PRIVATE @@ -44,6 +44,7 @@ target_sources(tuple_test tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp array_of_doubles_sketch_test.cpp + array_of_strings_sketch_test.cpp engagement_test.cpp ) @@ -52,6 +53,7 @@ target_sources(tuple_test PRIVATE aod_sketch_deserialize_from_java_test.cpp tuple_sketch_deserialize_from_java_test.cpp + aos_sketch_deserialize_from_java_test.cpp ) endif() @@ -60,5 +62,6 @@ 
target_sources(tuple_test PRIVATE aod_sketch_serialize_for_java.cpp tuple_sketch_serialize_for_java.cpp + aos_sketch_serialize_for_java.cpp ) endif() diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp new file mode 100644 index 00000000..15e9d6dd --- /dev/null +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -0,0 +1,172 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + // assume the binary sketches for this test have been generated by datasketches-java code + // in the subdirectory called "java" in the root directory of this project + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; + + TEST_CASE("aos sketch one value", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + + TEST_CASE("aos sketch three values", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + } + + TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", 
std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + + TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + + TEST_CASE("aos sketch unicode strings", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = 
true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + } + + TEST_CASE("aos sketch empty strings", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + } +} diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp new file mode 100644 index 00000000..3a154132 --- /dev/null +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using aos_sketch = update_array_of_strings_tuple_sketch<>; +using array_of_strings = aos_sketch::array_of_strings; + +static array_of_strings make_array(std::initializer_list items) { + array_of_strings array(static_cast(items.size()), ""); + size_t i = 0; + for (const auto& item: items) { + array[static_cast(i)] = item; + ++i; + } + return array; +} + +TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = 
"c" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { + auto sketch = aos_sketch::builder() + .set_lg_k(12) + .set_resize_factor(resize_factor::X8) + .set_p(0.01f) + .build(); + array_of_strings key(1, ""); + key[0] = "key1"; + array_of_strings value(1, ""); + value[0] = "value1"; + sketch.update(key, value); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + make_array({u8"키", u8"열쇠"}), + make_array({u8"밸류", u8"값"}) + ); + sketch.update( + make_array({u8"🔑", u8"🗝️"}), + make_array({u8"📦", u8"🎁"}) + ); + sketch.update( + make_array({u8"ключ1", u8"ключ2"}), + make_array({u8"ценить1", u8"ценить2"}) + ); + REQUIRE_FALSE(sketch.is_empty()); + 
REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + make_array({""}), + make_array({"empty_key_value"}) + ); + sketch.update( + make_array({"empty_value_key"}), + make_array({""}) + ); + sketch.update( + make_array({"", ""}), + make_array({"", ""}) + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +} /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp new file mode 100644 index 00000000..45e554bd --- /dev/null +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using array_of_strings = array; + +TEST_CASE("aos update policy", "[tuple_sketch]") { + default_array_of_strings_update_policy<> policy; + + SECTION("create empty") { + auto values = policy.create(); + REQUIRE(values.size() == 0); + } + + SECTION("replace array") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "alpha"; + input[1] = "beta"; + policy.update(values, input); + REQUIRE(values.size() == 2); + REQUIRE(values[0] == "alpha"); + REQUIRE(values[1] == "beta"); + input[0] = "changed"; + REQUIRE(values[0] == "alpha"); + + array_of_strings input2(1, "", std::allocator()); + input2[0] = "gamma"; + policy.update(values, input2); + REQUIRE(values.size() == 1); + REQUIRE(values[0] == "gamma"); + } + + SECTION("nullptr clears") { + array_of_strings values(2, "", std::allocator()); + values[0] = "one"; + values[1] = "two"; + + policy.update(values, static_cast(nullptr)); + REQUIRE(values.size() == 0); + } + + SECTION("pointer input copies") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "first"; + input[1] = "second"; + policy.update(values, &input); + REQUIRE(values.size() == 2); + REQUIRE(values[1] == "second"); + input[1] = "changed"; + REQUIRE(values[1] == "second"); + } +} + +TEST_CASE("aos sketch update", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto* entry: entries) array[i++] = entry; + return array; + }; + + SECTION("same key replaces summary") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({"alpha", "beta"}), make_array({"first"})); + sketch.update(make_array({"alpha", "beta"}), make_array({"second", 
"third"})); + + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 2); + REQUIRE(it->second[0] == "second"); + REQUIRE(it->second[1] == "third"); + } + + SECTION("distinct keys retain multiple entries") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({"a", "bc"}), make_array({"one"})); + sketch.update(make_array({"ab", "c"}), make_array({"two"})); + + REQUIRE(sketch.get_num_retained() == 2); + + bool saw_one = false; + bool saw_two = false; + for (const auto& entry: sketch) { + REQUIRE(entry.second.size() == 1); + if (entry.second[0] == "one") saw_one = true; + if (entry.second[0] == "two") saw_two = true; + } + REQUIRE(saw_one); + REQUIRE(saw_two); + } + + SECTION("empty key") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({}), make_array({"value"})); + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 1); + REQUIRE(it->second[0] == "value"); + } +} + +TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto& entry: entries) array[i++] = entry; + return array; + }; + + auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + typedef std::pair entry_type; + std::vector entries; + for (const auto& entry: sketch) entries.push_back(entry); + struct entry_less { + bool operator()(const entry_type& lhs, const entry_type& rhs) const { + return lhs.first < rhs.first; + } + }; + std::sort(entries.begin(), entries.end(), entry_less()); + return entries; + }; + + auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { + std::stringstream ss; + 
ss.exceptions(std::ios::failbit | std::ios::badbit); + compact_sketch.serialize(ss); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss); + + auto bytes = compact_sketch.serialize(); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size()); + + const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { + &deserialized_stream, + &deserialized_bytes + }; + for (int list_index = 0; list_index < 2; ++list_index) { + const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index]; + REQUIRE(compact_sketch.is_empty() == deserialized->is_empty()); + REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode()); + REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered()); + REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained()); + REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); + REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10)); + REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10)); + REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10)); + + auto original_entries = collect_entries(compact_sketch); + auto round_trip_entries = collect_entries(*deserialized); + REQUIRE(original_entries.size() == round_trip_entries.size()); + for (size_t i = 0; i < original_entries.size(); ++i) { + REQUIRE(original_entries[i].first == round_trip_entries[i].first); + REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size()); + for (size_t j = 0; j < original_entries[i].second.size(); ++j) { + REQUIRE(original_entries[i].second[static_cast(j)] == + round_trip_entries[i].second[static_cast(j)]); + } + } + } + }; + + auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = 
sketch.compact(true); + auto unordered = sketch.compact(false); + check_round_trip(ordered); + check_round_trip(unordered); + }; + + SECTION("empty sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + exercise_ordering(sketch); + } + + SECTION("single entry sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + sketch.update(make_array({"key"}), make_array({"value"})); + exercise_ordering(sketch); + } + + SECTION("multiple entries exact mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); + for (int i = 0; i < 50; ++i) { + sketch.update( + make_array({std::string("key-") + std::to_string(i)}), + make_array({std::string("value-") + std::to_string(i), "extra"}) + ); + } + REQUIRE_FALSE(sketch.is_estimation_mode()); + exercise_ordering(sketch); + } + + SECTION("multiple entries estimation mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + for (int i = 0; i < 10000; ++i) { + sketch.update( + make_array({std::string("key-") + std::to_string(i)}), + make_array({std::string("value-") + std::to_string(i)}) + ); + } + REQUIRE(sketch.is_estimation_mode()); + exercise_ordering(sketch); + } +} + +} /* namespace datasketches */ diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp index 408223f9..cf589cd0 100644 --- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp @@ -25,7 +25,7 @@ namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project -static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; +static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; TEST_CASE("tuple sketch int", "[serde_compat]") { 
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; From 307fe02179bf4a1776897ffff7bbee1913125402 Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:22:16 +0900 Subject: [PATCH 32/75] test: rollback test file path --- tuple/test/tuple_sketch_deserialize_from_java_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp index cf589cd0..408223f9 100644 --- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp @@ -25,7 +25,7 @@ namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project -static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; +static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; TEST_CASE("tuple sketch int", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; From a1e24c80e6336bccacc02a75a256643b5958d3be Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:23:42 +0900 Subject: [PATCH 33/75] chore: rollback test directory --- tuple/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 8c561745..3d7ccca3 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES CXX_STANDARD_REQUIRED YES ) -file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH) +file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH) string(APPEND THETA_TEST_BINARY_PATH "/") target_compile_definitions(tuple_test PRIVATE From 4b87a2d569e94006e3c986a96ba17eee431ba292 Mon Sep 17 00:00:00 2001 From: Mahesh Pai Date: Sat, 24 Jan 
2026 17:43:17 +0530 Subject: [PATCH 34/75] Bugfix: tdigest const_iterator returns dangling reference causing incorrect values --- tdigest/include/tdigest.hpp | 2 +- tdigest/test/CMakeLists.txt | 1 + tdigest/test/tdigest_iterator_test.cpp | 274 +++++++++++++++++++++++++ 3 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 tdigest/test/tdigest_iterator_test.cpp diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 2d3620b1..095752e9 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -316,7 +316,7 @@ template class tdigest::const_iterator { public: using iterator_category = std::input_iterator_tag; - using value_type = std::pair; + using value_type = std::pair; using difference_type = void; using pointer = const return_value_holder; using reference = const value_type; diff --git a/tdigest/test/CMakeLists.txt b/tdigest/test/CMakeLists.txt index 18bf3599..8dcfb4f0 100644 --- a/tdigest/test/CMakeLists.txt +++ b/tdigest/test/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(tdigest_test PRIVATE tdigest_test.cpp tdigest_custom_allocator_test.cpp + tdigest_iterator_test.cpp ) if (SERDE_COMPAT) diff --git a/tdigest/test/tdigest_iterator_test.cpp b/tdigest/test/tdigest_iterator_test.cpp new file mode 100644 index 00000000..e7c03205 --- /dev/null +++ b/tdigest/test/tdigest_iterator_test.cpp @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include "tdigest.hpp" + +namespace datasketches { + +TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") { + tdigest_double td(100); + + // Insert 10 distinct values + for (int i = 0; i < 10; i++) { + td.update(static_cast(i)); + } + + // Collect all centroids via iteration + std::map centroids; + for (const auto&& centroid : td) { + centroids[centroid.first] = centroid.second; + } + + // Should have collected all 10 distinct values + REQUIRE(centroids.size() == 10); + + // Verify each value was captured correctly + for (int i = 0; i < 10; i++) { + REQUIRE(centroids.count(static_cast(i)) == 1); + REQUIRE(centroids[static_cast(i)] == 1); + } +} + +TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") { + // This test reproduces the bug scenario found in ClickHouse + std::unique_ptr td(new tdigest_double(100)); + + // Insert distinct values + for (int i = 0; i < 10; i++) { + td->update(static_cast(i)); + } + + // Use explicit begin/end iterators + auto it = td->begin(); + auto end_it = td->end(); + + std::vector means; + std::vector weights; + + while (it != end_it) { + // Before the fix, accessing it->first would return garbage or same value repeatedly + double mean = it->first; + uint64_t weight = it->second; + means.push_back(mean); + weights.push_back(weight); + ++it; + } + + // Should have collected 10 centroids + REQUIRE(means.size() == 10); + REQUIRE(weights.size() == 10); + + // All means should be distinct (not all zeros or garbage) + std::set 
unique_means(means.begin(), means.end()); + REQUIRE(unique_means.size() == 10); + + // Verify all expected values are present + for (int i = 0; i < 10; i++) { + REQUIRE(unique_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") { + tdigest_double td(100); + + for (int i = 0; i < 5; i++) { + td.update(static_cast(i * 10)); + } + + std::vector> collected; + + // Test structured bindings + for (auto it = td.begin(); it != td.end(); ++it) { + const auto& centroid = *it; + collected.emplace_back(centroid.first, centroid.second); + } + + REQUIRE(collected.size() == 5); + + // Verify distinct values were collected + std::set means; + for (const auto& pair : collected) { + means.insert(pair.first); + REQUIRE(pair.second == 1); // Each value inserted once + } + + REQUIRE(means.size() == 5); + for (int i = 0; i < 5; i++) { + REQUIRE(means.count(static_cast(i * 10)) == 1); + } +} + +TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (int i = 1; i <= 10; i++) { + td.update(static_cast(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 + } + + // Access via operator-> + std::map centroids; + auto end_it = td.end(); + for (auto it = td.begin(); it != end_it; ++it) { + // operator-> should return valid values + centroids[it->first] = it->second; + } + + REQUIRE(centroids.size() == 10); + + // Verify the squared values + for (int i = 1; i <= 10; i++) { + double expected = static_cast(i * i); + REQUIRE(centroids.count(expected) == 1); + } +} + +TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (double d = 0.0; d < 10.0; d += 1.0) { + td.update(d); + } + + size_t count = 0; + std::set seen_means; + + // This pattern was working in simple tests but failing in optimized builds + for (const auto&& centroid : td) { + seen_means.insert(centroid.first); + count++; + } + + REQUIRE(count == 
10); + REQUIRE(seen_means.size() == 10); + + // Verify all values from 0 to 9 are present + for (int i = 0; i < 10; i++) { + REQUIRE(seen_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") { + tdigest_double td(100); + + td.update(1.0); + td.update(2.0); + td.update(3.0); + + auto it = td.begin(); + + // Store the pair + auto pair1 = *it; + double mean1 = pair1.first; + + ++it; + + // Store another pair + auto pair2 = *it; + double mean2 = pair2.first; + + ++it; + + auto pair3 = *it; + double mean3 = pair3.first; + + // All three means should be distinct + REQUIRE(mean1 != mean2); + REQUIRE(mean2 != mean3); + REQUIRE(mean1 != mean3); + + // And they should match our input values + std::set means = {mean1, mean2, mean3}; + REQUIRE(means.count(1.0) == 1); + REQUIRE(means.count(2.0) == 1); + REQUIRE(means.count(3.0) == 1); +} + +TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") { + tdigest_double td(100); + + // Empty sketch should have begin() == end() + REQUIRE(td.begin() == td.end()); + + // Range-based for should not execute + size_t count = 0; + for (const auto&& centroid : td) { + (void)centroid; // Silence unused warning + count++; + } + REQUIRE(count == 0); +} + +TEST_CASE("tdigest iterator: single value", "[tdigest]") { + tdigest_double td(100); + td.update(42.0); + + size_t count = 0; + double captured_mean = 0.0; + uint64_t captured_weight = 0; + + for (const auto&& centroid : td) { + captured_mean = centroid.first; + captured_weight = centroid.second; + count++; + } + + REQUIRE(count == 1); + REQUIRE(captured_mean == 42.0); + REQUIRE(captured_weight == 1); +} + +TEST_CASE("tdigest iterator: large dataset", "[tdigest]") { + tdigest_double td(100); + + // Insert 1000 distinct values + for (int i = 0; i < 1000; i++) { + td.update(static_cast(i)); + } + + // Iterator should provide compressed centroids (not all 1000) + size_t centroid_count = 0; + std::set unique_means; + uint64_t 
total_weight = 0; + + for (const auto&& centroid : td) { + unique_means.insert(centroid.first); + total_weight += centroid.second; + centroid_count++; + } + + // Should have fewer centroids than input values due to compression + REQUIRE(centroid_count < 1000); + REQUIRE(centroid_count > 0); + + // Total weight should equal number of input values + REQUIRE(total_weight == 1000); + + // All means should be unique (no duplicates) + REQUIRE(unique_means.size() == centroid_count); +} + +} // namespace datasketches From 9381dcd227a5a06f8e18803dd21aa6c877b46b25 Mon Sep 17 00:00:00 2001 From: proost Date: Sun, 25 Jan 2026 22:14:47 +0900 Subject: [PATCH 35/75] fix: empty string handling --- tuple/include/array_of_strings_sketch.hpp | 66 +++- .../include/array_of_strings_sketch_impl.hpp | 101 +++--- .../aos_sketch_deserialize_from_java_test.cpp | 311 ++++++++++++------ tuple/test/aos_sketch_serialize_for_java.cpp | 12 +- tuple/test/array_of_strings_sketch_test.cpp | 16 +- 5 files changed, 334 insertions(+), 172 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index a3f8ddd7..4442fd64 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -49,8 +49,11 @@ class default_array_of_strings_update_policy { // serializer/deserializer for an array of strings // Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
template> -struct array_of_strings_serde { +struct default_array_of_strings_serde { using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; @@ -59,6 +62,8 @@ struct array_of_strings_serde { size_t size_of_item(const array_of_strings& item) const; private: + Allocator allocator_; + summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); static void check_utf8(const std::string& value); @@ -79,17 +84,41 @@ class compact_array_of_strings_tuple_sketch: using summary_allocator = typename std::allocator_traits::template rebind_alloc; using Base = compact_tuple_sketch; using vector_bytes = typename Base::vector_bytes; - + using Base::serialize; + + /** + * Copy constructor. + * Constructs a compact sketch from another sketch (update or compact) + * @param other sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ template compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); - void serialize(std::ostream& os) const; - vector_bytes serialize(unsigned header_size_bytes = 0) const; - + /** + * This method deserializes a sketch from a given stream. 
+ * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, - const Allocator& allocator = Allocator()); + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + + /** + * This method deserializes a sketch from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, - const Allocator& allocator = Allocator()); + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); private: explicit compact_array_of_strings_tuple_sketch(Base&& base); @@ -97,20 +126,20 @@ class compact_array_of_strings_tuple_sketch: /** * Extended class of update_tuple_sketch for array of strings - * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
*/ -template> +template class Policy = default_array_of_strings_update_policy, + typename Allocator = std::allocator> class update_array_of_strings_tuple_sketch: public update_tuple_sketch< array, array, - default_array_of_strings_update_policy, + Policy, typename std::allocator_traits::template rebind_alloc> > { public: using array_of_strings = array; using summary_allocator = typename std::allocator_traits::template rebind_alloc; - using policy_type = default_array_of_strings_update_policy; + using policy_type = Policy; using Base = update_tuple_sketch< array_of_strings, array_of_strings, @@ -121,7 +150,18 @@ class update_array_of_strings_tuple_sketch: class builder; using Base::update; + /** + * Updates the sketch with string array for both key and value. + * @param key the given string array key + * @param value the given string array value + */ void update(const array_of_strings& key, const array_of_strings& value); + + /** + * Converts this sketch to a compact sketch (ordered or unordered). 
+ * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; private: @@ -134,8 +174,8 @@ class update_array_of_strings_tuple_sketch: static uint64_t hash_key(const array_of_strings& key); }; -template -class update_array_of_strings_tuple_sketch::builder: +template class Policy, typename Allocator> +class update_array_of_strings_tuple_sketch::builder: public tuple_base_builder { public: builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 264f79bf..b95987a0 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -40,9 +40,9 @@ template void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings& input ) const { - const auto length = input.size(); - array = array_of_strings(length, "", allocator_); - for (uint8_t i = 0; i < length; ++i) array[i] = input[i]; + const auto length = static_cast(input.size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = input[i]; } template @@ -53,53 +53,53 @@ void default_array_of_strings_update_policy::update( array = array_of_strings(0, "", allocator_); return; } - const auto length = input->size(); - array = array_of_strings(length, "", allocator_); - for (uint8_t i = 0; i < length; ++i) array[i] = (*input)[i]; + const auto length = static_cast(input->size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } -template -update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( +template class Policy, typename Allocator> 
+update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const policy_type& policy, const summary_allocator& allocator ): Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} -template -void update_array_of_strings_tuple_sketch::update( +template class Policy, typename Allocator> +void update_array_of_strings_tuple_sketch::update( const array_of_strings& key, const array_of_strings& value ) { const uint64_t hash = hash_key(key); Base::update(hash, value); } -template -uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { +template class Policy, typename Allocator> +uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { XXHash64 hasher(STRING_ARR_HASH_SEED); const auto size = static_cast(key.size()); for (size_t i = 0; i < size; ++i) { - const auto& entry = key[static_cast(i)]; + const auto& entry = key[i]; hasher.add(entry.data(), entry.size()); if (i + 1 < size) hasher.add(",", 1); } return hasher.hash(); } -template -compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { +template class Policy, typename Allocator> +compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { return compact_array_of_strings_tuple_sketch(*this, ordered); } // builder -template -update_array_of_strings_tuple_sketch::builder::builder( +template class Policy, typename Allocator> +update_array_of_strings_tuple_sketch::builder::builder( const policy_type& policy, const summary_allocator& allocator ): tuple_base_builder(policy, allocator) {} -template -auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { +template class Policy, typename Allocator> +auto update_array_of_strings_tuple_sketch::builder::build() const -> 
update_array_of_strings_tuple_sketch { return update_array_of_strings_tuple_sketch( this->starting_lg_size(), this->lg_k_, @@ -124,35 +124,32 @@ compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple ): Base(std::move(base)) {} template -void compact_array_of_strings_tuple_sketch::serialize(std::ostream& os) const { - Base::serialize(os, array_of_strings_serde()); -} - -template -auto compact_array_of_strings_tuple_sketch::serialize(unsigned header_size_bytes) const -> vector_bytes { - return Base::serialize(header_size_bytes, array_of_strings_serde()); -} - -template +template auto compact_array_of_strings_tuple_sketch::deserialize( - std::istream& is, uint64_t seed, const Allocator& allocator + std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { summary_allocator alloc(allocator); - auto base = Base::deserialize(is, seed, array_of_strings_serde(), alloc); + auto base = Base::deserialize(is, seed, sd, alloc); return compact_array_of_strings_tuple_sketch(std::move(base)); } template +template auto compact_array_of_strings_tuple_sketch::deserialize( - const void* bytes, size_t size, uint64_t seed, const Allocator& allocator + const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { summary_allocator alloc(allocator); - auto base = Base::deserialize(bytes, size, seed, array_of_strings_serde(), alloc); + auto base = Base::deserialize(bytes, size, seed, sd, alloc); return compact_array_of_strings_tuple_sketch(std::move(base)); } template -void array_of_strings_serde::serialize( +default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): + allocator_(allocator), + summary_allocator_(allocator) {} + +template +void default_array_of_strings_serde::serialize( std::ostream& os, const array_of_strings* items, unsigned num ) const { for (unsigned i = 0; i < num; ++i) { @@ -171,27 
+168,34 @@ void array_of_strings_serde::serialize( } template -void array_of_strings_serde::deserialize( +void default_array_of_strings_serde::deserialize( std::istream& is, array_of_strings* items, unsigned num ) const { for (unsigned i = 0; i < num; ++i) { read(is); // total_bytes + if (!is) throw std::runtime_error("array_of_strings stream read failed"); const uint8_t num_nodes = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", Allocator()); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < num_nodes; ++j) { const uint32_t length = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); std::string value(length, '\0'); - is.read(&value[0], length); + if (length != 0) { + is.read(value.data(), length); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + } check_utf8(value); array[j] = std::move(value); } - new (&items[i]) array_of_strings(std::move(array)); + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } } template -size_t array_of_strings_serde::serialize( +size_t default_array_of_strings_serde::serialize( void* ptr, size_t capacity, const array_of_strings* items, unsigned num ) const { uint8_t* ptr8 = static_cast(ptr); @@ -216,7 +220,7 @@ size_t array_of_strings_serde::serialize( } template -size_t array_of_strings_serde::deserialize( +size_t default_array_of_strings_serde::deserialize( const void* ptr, size_t capacity, array_of_strings* items, unsigned num ) const { const uint8_t* ptr8 = static_cast(ptr); @@ -231,34 +235,37 @@ size_t array_of_strings_serde::deserialize( uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", Allocator()); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < 
num_nodes; ++j) { uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); std::string value(length, '\0'); - bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + if (length != 0) { + bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length); + } check_utf8(value); array[j] = std::move(value); } - new (&items[i]) array_of_strings(std::move(array)); + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } return bytes_read; } template -size_t array_of_strings_serde::size_of_item(const array_of_strings& item) const { +size_t default_array_of_strings_serde::size_of_item(const array_of_strings& item) const { return compute_total_bytes(item); } template -void array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { +void default_array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { if (num_nodes > 127) { throw std::runtime_error("array_of_strings size exceeds 127"); } } template -uint32_t array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { +uint32_t default_array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { const auto count = item.size(); check_num_nodes(static_cast(count)); size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); @@ -273,7 +280,7 @@ uint32_t array_of_strings_serde::compute_total_bytes(const array_of_s } template -void array_of_strings_serde::check_utf8(const std::string& value) { +void default_array_of_strings_serde::check_utf8(const std::string& value) { if (!utf8::is_valid(value.begin(), value.end())) { throw std::runtime_error("array_of_strings contains invalid UTF-8"); } diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp index 15e9d6dd..af37d6c2 100644 --- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -26,21 +26,53 @@ 
namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project - static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + + static std::vector read_binary_file(const std::string& path) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + is.seekg(0, std::ios::end); + const auto size = static_cast(is.tellg()); + is.seekg(0, std::ios::beg); + std::vector bytes(size); + if (size != 0) { + is.read(reinterpret_cast(bytes.data()), size); + } + return bytes; + } TEST_CASE("aos sketch one value", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 1); + const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& 
entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } } } } @@ -48,125 +80,204 @@ namespace datasketches { TEST_CASE("aos sketch three values", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 3); + const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + 
} + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } } } } TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE(sketch.get_num_retained() == 0); + const auto path = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } } TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary); - 
const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 1); + const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } } } } TEST_CASE("aos sketch unicode strings", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE_FALSE(sketch.is_estimation_mode()); - REQUIRE(sketch.get_num_retained() == 3); + const auto path = testBinaryInputPath + 
"aos_unicode_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); - const std::vector> expected_values = { - {"밸류", "값"}, - {"📦", "🎁"}, - {"ценить1", "ценить2"} - }; - std::vector matched(expected_values.size(), false); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 2); + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); - bool found = false; - for (size_t i = 0; i < expected_values.size(); ++i) { - if (matched[i]) continue; - const auto& expected = expected_values[i]; - if (entry.second.size() != expected.size()) continue; - bool equal = true; - for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { - equal = false; + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; break; } } - if (equal) { - matched[i] = true; - found = true; - break; - } + REQUIRE(found); } - REQUIRE(found); + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes 
= read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); } - for (bool found: matched) REQUIRE(found); } TEST_CASE("aos sketch empty strings", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE_FALSE(sketch.is_estimation_mode()); - REQUIRE(sketch.get_num_retained() == 3); - const std::vector> expected_values = { - {"empty_key_value"}, - {""}, - {"", ""} - }; - std::vector matched(expected_values.size(), false); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); + const auto path = testBinaryInputPath + "aos_empty_strings_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); - bool found = false; - for (size_t i = 0; i < expected_values.size(); ++i) { - if (matched[i]) continue; - const auto& expected = expected_values[i]; - if (entry.second.size() != expected.size()) continue; - bool equal = true; - for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { - equal = false; + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) 
{ + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; break; } } - if (equal) { - matched[i] = true; - found = true; - break; - } + REQUIRE(found); } - REQUIRE(found); + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); } - for (bool found: matched) REQUIRE(found); } } diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index 3a154132..db506825 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -52,7 +52,7 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -72,7 +72,7 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -90,7 +90,7 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); 
REQUIRE(sketch.get_num_retained() == 0); std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { @@ -108,7 +108,7 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -129,7 +129,7 @@ TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { @@ -149,7 +149,7 @@ TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 45e554bd..3e3673aa 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -65,7 +65,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { values[0] = "one"; values[1] = "two"; - policy.update(values, static_cast(nullptr)); + policy.update(values, nullptr); REQUIRE(values.size() == 0); } @@ -162,11 +162,15 @@ TEST_CASE("aos sketch: serialize deserialize", 
"[tuple_sketch]") { auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); - compact_sketch.serialize(ss); - auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss); - - auto bytes = compact_sketch.serialize(); - auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size()); + compact_sketch.serialize(ss, default_array_of_strings_serde<>()); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize( + ss, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>()); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { &deserialized_stream, From 46c945d5c9f59a0bfd1e7afe6c8256fdd3e8d4f4 Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 27 Jan 2026 01:11:18 +0900 Subject: [PATCH 36/75] refactor: remove update sketch --- tuple/include/array_of_strings_sketch.hpp | 79 ++++++------------- .../include/array_of_strings_sketch_impl.hpp | 55 +++---------- tuple/test/aos_sketch_serialize_for_java.cpp | 43 ++++------ tuple/test/array_of_strings_sketch_test.cpp | 43 ++++++---- 4 files changed, 79 insertions(+), 141 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index 4442fd64..db147723 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -69,6 +69,12 @@ struct default_array_of_strings_serde { static void check_utf8(const std::string& value); }; +/** + * Hashes an array of strings using ArrayOfStrings-compatible hashing. 
+ */ +template> +uint64_t hash_array_of_strings_key(const array& key); + /** * Extended class of compact_tuple_sketch for array of strings * Requirements: all strings must be valid UTF-8 and array size must be <= 127. @@ -125,63 +131,26 @@ class compact_array_of_strings_tuple_sketch: }; /** - * Extended class of update_tuple_sketch for array of strings + * Convenience alias for update_tuple_sketch for array of strings */ -template class Policy = default_array_of_strings_update_policy, - typename Allocator = std::allocator> -class update_array_of_strings_tuple_sketch: - public update_tuple_sketch< - array, - array, - Policy, - typename std::allocator_traits::template rebind_alloc> - > { -public: - using array_of_strings = array; - using summary_allocator = typename std::allocator_traits::template rebind_alloc; - using policy_type = Policy; - using Base = update_tuple_sketch< - array_of_strings, - array_of_strings, - policy_type, - summary_allocator - >; - using resize_factor = typename Base::resize_factor; - class builder; - using Base::update; - - /** - * Updates the sketch with string array for both key and value. - * @param key the given string array key - * @param value the given string array value - */ - void update(const array_of_strings& key, const array_of_strings& value); +template, + typename Policy = default_array_of_strings_update_policy> +using update_array_of_strings_tuple_sketch = update_tuple_sketch< + array, + array, + Policy, + typename std::allocator_traits::template rebind_alloc> +>; - /** - * Converts this sketch to a compact sketch (ordered or unordered). 
- * @param ordered optional flag to specify if an ordered sketch should be produced - * @return compact array of strings sketch - */ - compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; - -private: - update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, - uint64_t seed, const policy_type& policy, const summary_allocator& allocator); - - // Matches Java Util.PRIME for ArrayOfStrings key hashing. - static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; - - static uint64_t hash_key(const array_of_strings& key); -}; - -template class Policy, typename Allocator> -class update_array_of_strings_tuple_sketch::builder: - public tuple_base_builder { -public: - builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); - - update_array_of_strings_tuple_sketch build() const; -}; +/** + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered). 
+ * @param sketch input sketch + * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ +template, typename Policy = default_array_of_strings_update_policy> +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); } /* namespace datasketches */ diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index b95987a0..01a3daba 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -58,23 +58,10 @@ void default_array_of_strings_update_policy::update( for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } -template class Policy, typename Allocator> -update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( - uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, - uint64_t seed, const policy_type& policy, const summary_allocator& allocator -): -Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} - -template class Policy, typename Allocator> -void update_array_of_strings_tuple_sketch::update( - const array_of_strings& key, const array_of_strings& value -) { - const uint64_t hash = hash_key(key); - Base::update(hash, value); -} - -template class Policy, typename Allocator> -uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { +template +uint64_t hash_array_of_strings_key(const array& key) { + // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
+ static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; XXHash64 hasher(STRING_ARR_HASH_SEED); const auto size = static_cast(key.size()); for (size_t i = 0; i < size; ++i) { @@ -85,31 +72,11 @@ uint64_t update_array_of_strings_tuple_sketch::hash_key(const return hasher.hash(); } -template class Policy, typename Allocator> -compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { - return compact_array_of_strings_tuple_sketch(*this, ordered); -} - -// builder - -template class Policy, typename Allocator> -update_array_of_strings_tuple_sketch::builder::builder( - const policy_type& policy, const summary_allocator& allocator -): -tuple_base_builder(policy, allocator) {} - -template class Policy, typename Allocator> -auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { - return update_array_of_strings_tuple_sketch( - this->starting_lg_size(), - this->lg_k_, - this->rf_, - this->p_, - this->starting_theta(), - this->seed_, - this->policy_, - this->allocator_ - ); +template +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered +) { + return compact_array_of_strings_tuple_sketch(sketch, ordered); } template @@ -183,7 +150,7 @@ void default_array_of_strings_serde::deserialize( if (!is) throw std::runtime_error("array_of_strings stream read failed"); std::string value(length, '\0'); if (length != 0) { - is.read(value.data(), length); + is.read(&value[0], length); if (!is) throw std::runtime_error("array_of_strings stream read failed"); } check_utf8(value); @@ -241,7 +208,7 @@ size_t default_array_of_strings_serde::deserialize( bytes_read += copy_from_mem(ptr8 + bytes_read, length); std::string value(length, '\0'); if (length != 0) { - bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length); + bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } 
check_utf8(value); array[j] = std::move(value); diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index db506825..c6eb0dfc 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -26,7 +26,7 @@ namespace datasketches { using aos_sketch = update_array_of_strings_tuple_sketch<>; -using array_of_strings = aos_sketch::array_of_strings; +using array_of_strings = array; static array_of_strings make_array(std::initializer_list items) { array_of_strings array(static_cast(items.size()), ""); @@ -47,12 +47,12 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { key[0] = std::to_string(i); array_of_strings value(1, ""); value[0] = "value" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } @@ -67,12 +67,12 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { value[0] = "a" + std::to_string(i); value[1] = "b" + std::to_string(i); value[2] = "c" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } @@ -86,11 +86,11 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { key[0] = "key1"; array_of_strings value(1, 
""); value[0] = "value1"; - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 0); std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { @@ -103,53 +103,44 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { key[1] = "subkey" + std::to_string(i % 10); array_of_strings value(1, ""); value[0] = "value" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { auto sketch = aos_sketch::builder().build(); sketch.update( - make_array({u8"키", u8"열쇠"}), + hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})), make_array({u8"밸류", u8"값"}) ); sketch.update( - make_array({u8"🔑", u8"🗝️"}), + hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})), make_array({u8"📦", u8"🎁"}) ); sketch.update( - make_array({u8"ключ1", u8"ключ2"}), + hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})), make_array({u8"ценить1", u8"ценить2"}) ); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, 
default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { auto sketch = aos_sketch::builder().build(); - sketch.update( - make_array({""}), - make_array({"empty_key_value"}) - ); - sketch.update( - make_array({"empty_value_key"}), - make_array({""}) - ); - sketch.update( - make_array({"", ""}), - make_array({"", ""}) - ); + sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"})); + sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""})); + sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""})); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 3e3673aa..59cc04ca 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -18,7 +18,6 @@ */ #include -#include #include #include #include @@ -94,8 +93,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("same key replaces summary") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"alpha", "beta"}), make_array({"first"})); - sketch.update(make_array({"alpha", "beta"}), make_array({"second", "third"})); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"first"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"second", "third"}) + ); REQUIRE(sketch.get_num_retained() == 1); @@ -109,8 +114,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("distinct keys retain 
multiple entries") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"a", "bc"}), make_array({"one"})); - sketch.update(make_array({"ab", "c"}), make_array({"two"})); + sketch.update( + hash_array_of_strings_key(make_array({"a", "bc"})), + make_array({"one"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"ab", "c"})), + make_array({"two"}) + ); REQUIRE(sketch.get_num_retained() == 2); @@ -128,7 +139,7 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("empty key") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({}), make_array({"value"})); + sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"})); REQUIRE(sketch.get_num_retained() == 1); auto it = sketch.begin(); @@ -201,46 +212,46 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { } }; - auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) { - auto ordered = sketch.compact(true); - auto unordered = sketch.compact(false); + auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = compact_array_of_strings_sketch(sketch, true); + auto unordered = compact_array_of_strings_sketch(sketch, false); check_round_trip(ordered); check_round_trip(unordered); }; SECTION("empty sketch") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - exercise_ordering(sketch); + run_tests(sketch); } SECTION("single entry sketch") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"key"}), make_array({"value"})); - exercise_ordering(sketch); + sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"})); + run_tests(sketch); } SECTION("multiple entries exact mode") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); for (int i = 0; i < 50; ++i) { sketch.update( - 
make_array({std::string("key-") + std::to_string(i)}), + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), make_array({std::string("value-") + std::to_string(i), "extra"}) ); } REQUIRE_FALSE(sketch.is_estimation_mode()); - exercise_ordering(sketch); + run_tests(sketch); } SECTION("multiple entries estimation mode") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); for (int i = 0; i < 10000; ++i) { sketch.update( - make_array({std::string("key-") + std::to_string(i)}), + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), make_array({std::string("value-") + std::to_string(i)}) ); } REQUIRE(sketch.is_estimation_mode()); - exercise_ordering(sketch); + run_tests(sketch); } } From 1b91666377a34e49548097caf9f482a45a5b4e93 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Tue, 27 Jan 2026 13:23:09 +0530 Subject: [PATCH 37/75] BugFix: SIGABRT in quantiles_sketch::deserialize(): dereferencing empty std::optional (libc++ verbose_abort) --- kll/include/kll_sketch_impl.hpp | 37 +++++++++++++-------- quantiles/include/quantiles_sketch_impl.hpp | 37 +++++++++++++-------- req/include/req_sketch_impl.hpp | 37 +++++++++++++-------- sampling/include/ebpps_sample_impl.hpp | 25 +++++++++----- 4 files changed, 86 insertions(+), 50 deletions(-) diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index fde0a314..44fe6a15 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "conditional_forward.hpp" #include "count_zeros.hpp" @@ -481,18 +482,22 @@ kll_sketch kll_sketch::deserialize(std::istream& is, const Ser read(is, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. 
+ // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; @@ -565,18 +570,22 @@ kll_sketch kll_sketch::deserialize(const void* bytes, size_t s ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp index 50c82c18..2dacf21e 100644 --- a/quantiles/include/quantiles_sketch_impl.hpp +++ b/quantiles/include/quantiles_sketch_impl.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "count_zeros.hpp" #include "conditional_forward.hpp" @@ -393,18 +394,22 @@ auto quantiles_sketch::deserialize(std::istream &is, const SerDe& serde const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - serde.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - serde.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { read(is); // no longer used @@ -507,18 +512,22 @@ auto quantiles_sketch::deserialize(const void* bytes, size_t size, cons const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { uint64_t unused_long; diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp index a28e74e2..3c1c2fc1 100755 --- a/req/include/req_sketch_impl.hpp +++ b/req/include/req_sketch_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace datasketches { @@ -461,7 +462,6 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -472,14 +472,19 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser uint64_t n = 1; if (num_levels > 1) { n = read(is); - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { @@ -537,7 +542,6 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -549,14 +553,19 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s if (num_levels > 1) { ensure_minimum_memory(end_ptr - ptr, sizeof(n)); ptr += copy_from_mem(ptr, n); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { diff --git a/sampling/include/ebpps_sample_impl.hpp b/sampling/include/ebpps_sample_impl.hpp index 88a86ae0..c48b32aa 100644 --- a/sampling/include/ebpps_sample_impl.hpp +++ b/sampling/include/ebpps_sample_impl.hpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace datasketches { @@ -365,11 +366,15 @@ std::pair, size_t> ebpps_sample::deserialize(const uint optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } return std::pair, size_t>( @@ -400,11 +405,15 @@ ebpps_sample ebpps_sample::deserialize(std::istream& is, const SerDe optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - sd.deserialize(is, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. 
+ typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } if (!is.good()) throw std::runtime_error("error reading from std::istream"); From 342248f294020a362a35c34d9b40ea7224d27438 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:37:41 +0900 Subject: [PATCH 38/75] fix: control array and element life cycle --- tuple/include/array_tuple_sketch.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 547b240c..54a000e8 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -34,17 +34,22 @@ class array { public: using value_type = T; using allocator_type = Allocator; + using alloc_traits = std::allocator_traits; - explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()): - allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - std::fill(array_, array_ + size_, value); + explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): + allocator_(allocator), size_(size), array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } } array(const array& other): allocator_(other.allocator_), size_(other.size_), - array_(allocator_.allocate(size_)) + array_(size_ == 0 ? 
nullptr : allocator_.allocate(size_)) { - std::copy(other.array_, other.array_ + size_, array_); + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -52,9 +57,15 @@ class array { array_(other.array_) { other.array_ = nullptr; + other.size_ = 0; } ~array() { - if (array_ != nullptr) allocator_.deallocate(array_, size_); + if (array_ != nullptr) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + allocator_.deallocate(array_, size_); + } } array& operator=(const array& other) { array copy(other); From 2c712e99ff17d8c6eff3c2c1dc53db5b10e0613d Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:48:12 +0900 Subject: [PATCH 39/75] fix: null ptr to empty array --- tuple/include/array_tuple_sketch.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 54a000e8..416816e0 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -37,7 +37,7 @@ class array { using alloc_traits = std::allocator_traits; explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): - allocator_(allocator), size_(size), array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) { + allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { for (uint8_t i = 0; i < size_; ++i) { alloc_traits::construct(allocator_, array_ + i, value); } @@ -45,7 +45,7 @@ class array { array(const array& other): allocator_(other.allocator_), size_(other.size_), - array_(size_ == 0 ? 
nullptr : allocator_.allocate(size_)) + array_(allocator_.allocate(size_)) { for (uint8_t i = 0; i < size_; ++i) { alloc_traits::construct(allocator_, array_ + i, other.array_[i]); From d463bfb159cd338fd88094ba600a50cfcc0e34d2 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:54:32 +0900 Subject: [PATCH 40/75] test: serde validation cases --- .../include/array_of_strings_sketch_impl.hpp | 3 --- tuple/test/array_of_strings_sketch_test.cpp | 26 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 01a3daba..f5fa0652 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -240,9 +240,6 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar for (uint32_t j = 0; j < count; ++j) { total += data[j].size(); } - if (total > std::numeric_limits::max()) { - throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max"); - } return static_cast(total); } diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 59cc04ca..74b225b0 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -255,4 +255,30 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { } } +TEST_CASE("aos serde validation", "[tuple_sketch]") { + default_array_of_strings_serde<> serde; + + SECTION("invalid utf8 rejected") { + array_of_strings array(1, "", std::allocator()); + const std::string invalid_utf8("\xC3\x28", 2); + array[0] = invalid_utf8; + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("invalid UTF-8") + ); + } + + SECTION("too many nodes rejected") { + array_of_strings array(128, "", std::allocator()); + std::stringstream ss; + 
ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("size exceeds 127") + ); + } +} + } /* namespace datasketches */ From 3b3a13de52445e663472f630bd41080810977649 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 03:32:39 +0900 Subject: [PATCH 41/75] perf: avoid allocation if data type is primitive --- tuple/include/array_tuple_sketch.hpp | 38 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 416816e0..9baa2010 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -22,6 +22,9 @@ #include #include +#include +#include +#include #include "serde.hpp" #include "tuple_sketch.hpp" @@ -38,18 +41,14 @@ class array { explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - for (uint8_t i = 0; i < size_; ++i) { - alloc_traits::construct(allocator_, array_ + i, value); - } + init_values(value, std::is_trivially_copyable()); } array(const array& other): allocator_(other.allocator_), size_(other.size_), array_(allocator_.allocate(size_)) { - for (uint8_t i = 0; i < size_; ++i) { - alloc_traits::construct(allocator_, array_ + i, other.array_[i]); - } + copy_from(other, std::is_trivially_copyable()); } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -61,9 +60,7 @@ class array { } ~array() { if (array_ != nullptr) { - for (uint8_t i = 0; i < size_; ++i) { - alloc_traits::destroy(allocator_, array_ + i); - } + destroy_values(std::is_trivially_destructible()); allocator_.deallocate(array_, size_); } } @@ -90,6 +87,29 @@ class array { return true; } private: + void init_values(const T& value, std::true_type) { + std::fill(array_, array_ + size_, value); + } + void init_values(const T& value, 
std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } + } + void copy_from(const array& other, std::true_type) { + std::copy(other.array_, other.array_ + size_, array_); + } + void copy_from(const array& other, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } + } + void destroy_values(std::true_type) {} + void destroy_values(std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + } + Allocator allocator_; uint8_t size_; T* array_; From 189d22de11b53ff48bb279d066586abbd5af71b9 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 03:33:57 +0900 Subject: [PATCH 42/75] chore: remove unused header --- tuple/include/array_tuple_sketch.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 9baa2010..03761ff4 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "serde.hpp" From ba2aa6909aa5269af7fc3aa67ad9ced4f75a2938 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Tue, 27 Jan 2026 18:54:09 +0530 Subject: [PATCH 43/75] Added testcases --- .github/workflows/hardening.yml | 59 +++++++ common/test/CMakeLists.txt | 69 +++++++- common/test/deserialize_hardening_test.cpp | 188 +++++++++++++++++++++ 3 files changed, 310 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/hardening.yml create mode 100644 common/test/deserialize_hardening_test.cpp diff --git a/.github/workflows/hardening.yml b/.github/workflows/hardening.yml new file mode 100644 index 00000000..e264ebd9 --- /dev/null +++ b/.github/workflows/hardening.yml @@ -0,0 +1,59 @@ +name: libc++ Hardening Tests + +on: + push: + branches: + - master + pull_request: + branches: + - master + workflow_dispatch: + 
+env: + BUILD_TYPE: Debug + +jobs: + hardening-test: + name: C++17 with libc++ Hardening Mode + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + + - name: Install LLVM and libc++ + run: | + # Install LLVM/Clang with libc++ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + sudo add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + sudo apt-get update + sudo apt-get install -y clang-18 libc++-18-dev libc++abi-18-dev + + - name: Configure with C++17 and libc++ hardening + env: + CC: clang-18 + CXX: clang++-18 + run: | + cmake -B build -S . \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS="-stdlib=libc++ -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG" \ + -DCMAKE_EXE_LINKER_FLAGS="-stdlib=libc++ -lc++abi" + + - name: Build hardening tests + run: cmake --build build --target hardening_test --config ${{ env.BUILD_TYPE }} + + - name: Run hardening tests + run: | + cd build + ./common/test/hardening_test "[deserialize_hardening]" + + - name: Report results + if: always() + run: | + echo "✅ Tests passed with libc++ hardening enabled!" + echo "This verifies the fix for issue #477 prevents SIGABRT." 
diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt index 7593bd0b..d190b628 100644 --- a/common/test/CMakeLists.txt +++ b/common/test/CMakeLists.txt @@ -75,12 +75,28 @@ target_sources(common_test # now the integration test part add_executable(integration_test) -target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple common_test_lib) - -set_target_properties(integration_test PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED YES -) +target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib) + +# Use C++17 if CMAKE_CXX_STANDARD is set to 17+, otherwise C++11 +# This allows hardening tests to use std::optional with libc++ hardening +if(DEFINED CMAKE_CXX_STANDARD) + if(CMAKE_CXX_STANDARD MATCHES "17|20|23") + set_target_properties(integration_test PROPERTIES + CXX_STANDARD ${CMAKE_CXX_STANDARD} + CXX_STANDARD_REQUIRED YES + ) + else() + set_target_properties(integration_test PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED YES + ) + endif() +else() + set_target_properties(integration_test PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED YES + ) +endif() add_test( NAME integration_test @@ -91,3 +107,44 @@ target_sources(integration_test PRIVATE integration_test.cpp ) + +# Separate hardening test executable (header-only, no pre-compiled libs) +# This ensures the sketch code is compiled with C++17 + hardening +# Always build this target - it will use CMAKE_CXX_STANDARD if set, otherwise C++17 +message(STATUS "CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}") + +add_executable(hardening_test) +target_link_libraries(hardening_test common common_test_lib) + +# Include directories for header-only sketch implementations +target_include_directories(hardening_test PRIVATE + ${CMAKE_SOURCE_DIR}/quantiles/include + ${CMAKE_SOURCE_DIR}/kll/include + ${CMAKE_SOURCE_DIR}/req/include + ${CMAKE_SOURCE_DIR}/common/include +) + +# Use C++17 minimum for hardening tests 
+if(CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17) + set_target_properties(hardening_test PROPERTIES + CXX_STANDARD ${CMAKE_CXX_STANDARD} + CXX_STANDARD_REQUIRED YES + ) + message(STATUS "hardening_test will use C++${CMAKE_CXX_STANDARD}") +else() + set_target_properties(hardening_test PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED YES + ) + message(STATUS "hardening_test will use C++17 (default)") +endif() + +add_test( + NAME hardening_test + COMMAND hardening_test "[deserialize_hardening]" +) + +target_sources(hardening_test + PRIVATE + deserialize_hardening_test.cpp +) diff --git a/common/test/deserialize_hardening_test.cpp b/common/test/deserialize_hardening_test.cpp new file mode 100644 index 00000000..64e654b4 --- /dev/null +++ b/common/test/deserialize_hardening_test.cpp @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +// Include all affected sketch types +#include +#include +#include + +namespace datasketches { + +/** + * Test for fix of issue #477: + * BUG: SIGABRT in deserialize(): dereferencing empty std::optional (libc++ verbose_abort) + * + * These tests exercise the actual deserialization code path that contained the bug. 
+ * With buggy code (&*tmp on empty optional) and hardening enabled, these will SIGABRT. + * With fixed code (aligned_storage), these pass normally. + * + * IMPORTANT: These tests actually call deserialize() on multi-item sketches, which + * exercises the buggy code path where min/max are deserialized. + */ + +TEST_CASE("quantiles_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + // Create sketch with multiple items (so min/max are stored in serialization) + quantiles_sketch sketch(128); + for (int i = 0; i < 1000; i++) { + sketch.update(static_cast(i)); + } + + // Serialize + auto bytes = sketch.serialize(); + + // Deserialize - WITH BUGGY CODE AND HARDENING, THIS WILL SIGABRT HERE + // The bug is: sd.deserialize(is, &*tmp, 1) where tmp is empty optional + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + + // Verify deserialization worked correctly + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); + REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5)); +} + +TEST_CASE("quantiles_sketch: deserialize from stream", "[deserialize_hardening]") { + quantiles_sketch sketch(256); + for (int i = 0; i < 2000; i++) { + sketch.update(static_cast(i) * 0.5f); + } + + // Serialize to stream + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream - exercises the buggy code path + auto sketch2 = quantiles_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + kll_sketch sketch(200); + for (int i = 0; i < 1500; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy &*tmp code path + auto sketch2 = 
kll_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize from stream", "[deserialize_hardening]") { + kll_sketch sketch(400); + for (int i = 0; i < 3000; i++) { + sketch.update(i); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = kll_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize multi-level sketch", "[deserialize_hardening]") { + // REQ sketch only has the bug when num_levels > 1 + // We need to add enough items to trigger multiple levels + req_sketch sketch(12); + for (int i = 0; i < 10000; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy code path when num_levels > 1 + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize from stream", "[deserialize_hardening]") { + req_sketch sketch(20); + for (int i = 0; i < 15000; i++) { + sketch.update(static_cast(i) * 0.1); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = req_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("multiple sketch types: stress test", "[deserialize_hardening]") { + SECTION("quantiles with various sizes") { + for (int k : {64, 128, 256}) { + quantiles_sketch 
sketch(k); + for (int i = 0; i < 5000; i++) { + sketch.update(i); + } + auto bytes = sketch.serialize(); + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("kll with various sizes") { + for (int k : {100, 200, 400}) { + kll_sketch sketch(k); + for (int i = 0; i < 4000; i++) { + sketch.update(static_cast(i) / 10.0); + } + auto bytes = sketch.serialize(); + auto sketch2 = kll_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("req with various sizes") { + for (int k : {12, 20}) { + req_sketch sketch(k); + for (int i = 0; i < 8000; i++) { + sketch.update(static_cast(i)); + } + auto bytes = sketch.serialize(); + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } +} + +} // namespace datasketches From 4894e5e7b156f1ce6a909ef84321ab081078c16c Mon Sep 17 00:00:00 2001 From: proost Date: Sun, 1 Feb 2026 20:08:24 +0900 Subject: [PATCH 44/75] fix: allocation handling for string in deserialize --- tuple/include/array_of_strings_sketch.hpp | 50 +++++++++++----- .../include/array_of_strings_sketch_impl.hpp | 32 +++++----- .../aos_sketch_deserialize_from_java_test.cpp | 11 +++- tuple/test/aos_sketch_serialize_for_java.cpp | 58 ++++++++++++------- tuple/test/array_of_strings_sketch_test.cpp | 35 +++++++---- 5 files changed, 124 insertions(+), 62 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index db147723..4ee3bc9c 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -28,11 +28,22 @@ namespace datasketches { +template +struct array_of_strings_types { + using string_allocator = typename std::allocator_traits::template rebind_alloc; + using string_type = std::basic_string, string_allocator>; + using array_allocator = typename 
std::allocator_traits::template rebind_alloc; + using array_of_strings = array; +}; + // default update policy for an array of strings -template> +template> class default_array_of_strings_update_policy { public: - using array_of_strings = array; + using string_allocator = typename array_of_strings_types::string_allocator; + using string_type = typename array_of_strings_types::string_type; + using array_allocator = typename array_of_strings_types::array_allocator; + using array_of_strings = typename array_of_strings_types::array_of_strings; explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); @@ -48,9 +59,12 @@ class default_array_of_strings_update_policy { // serializer/deserializer for an array of strings // Requirements: all strings must be valid UTF-8 and array size must be <= 127. -template> +template> struct default_array_of_strings_serde { - using array_of_strings = array; + using string_allocator = typename array_of_strings_types::string_allocator; + using string_type = typename array_of_strings_types::string_type; + using array_allocator = typename array_of_strings_types::array_allocator; + using array_of_strings = typename array_of_strings_types::array_of_strings; using summary_allocator = typename std::allocator_traits::template rebind_alloc; explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); @@ -66,27 +80,29 @@ struct default_array_of_strings_serde { summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); - static void check_utf8(const std::string& value); + static void check_utf8(const string_type& value); }; /** * Hashes an array of strings using ArrayOfStrings-compatible hashing. 
*/ -template> -uint64_t hash_array_of_strings_key(const array& key); +template> +uint64_t hash_array_of_strings_key(const typename array_of_strings_types::array_of_strings& key); /** * Extended class of compact_tuple_sketch for array of strings * Requirements: all strings must be valid UTF-8 and array size must be <= 127. */ -template> +template> class compact_array_of_strings_tuple_sketch: public compact_tuple_sketch< - array, - typename std::allocator_traits::template rebind_alloc> + typename array_of_strings_types::array_of_strings, + typename std::allocator_traits::template rebind_alloc< + typename array_of_strings_types::array_of_strings + > > { public: - using array_of_strings = array; + using array_of_strings = typename array_of_strings_types::array_of_strings; using summary_allocator = typename std::allocator_traits::template rebind_alloc; using Base = compact_tuple_sketch; using vector_bytes = typename Base::vector_bytes; @@ -133,13 +149,15 @@ class compact_array_of_strings_tuple_sketch: /** * Convenience alias for update_tuple_sketch for array of strings */ -template, +template, typename Policy = default_array_of_strings_update_policy> using update_array_of_strings_tuple_sketch = update_tuple_sketch< - array, - array, + typename array_of_strings_types::array_of_strings, + typename array_of_strings_types::array_of_strings, Policy, - typename std::allocator_traits::template rebind_alloc> + typename std::allocator_traits::template rebind_alloc< + typename array_of_strings_types::array_of_strings + > >; /** @@ -148,7 +166,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch< * @param ordered optional flag to specify if an ordered sketch should be produced * @return compact array of strings sketch */ -template, typename Policy = default_array_of_strings_update_policy> +template, typename Policy = default_array_of_strings_update_policy> compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( const 
update_array_of_strings_tuple_sketch& sketch, bool ordered = true); diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index f5fa0652..e8725c55 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -33,7 +33,8 @@ default_array_of_strings_update_policy::default_array_of_strings_upda template auto default_array_of_strings_update_policy::create() const -> array_of_strings { - return array_of_strings(0, "", allocator_); + const string_type empty{string_allocator(allocator_)}; + return array_of_strings(0, empty, array_allocator(allocator_)); } template @@ -41,7 +42,8 @@ void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings& input ) const { const auto length = static_cast(input.size()); - array = array_of_strings(static_cast(length), "", allocator_); + const string_type empty{string_allocator(allocator_)}; + array = array_of_strings(static_cast(length), empty, array_allocator(allocator_)); for (size_t i = 0; i < length; ++i) array[i] = input[i]; } @@ -50,16 +52,18 @@ void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings* input ) const { if (input == nullptr) { - array = array_of_strings(0, "", allocator_); + const string_type empty{string_allocator(allocator_)}; + array = array_of_strings(0, empty, array_allocator(allocator_)); return; } const auto length = static_cast(input->size()); - array = array_of_strings(static_cast(length), "", allocator_); + const string_type empty{string_allocator(allocator_)}; + array = array_of_strings(static_cast(length), empty, array_allocator(allocator_)); for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } template -uint64_t hash_array_of_strings_key(const array& key) { +uint64_t hash_array_of_strings_key(const typename array_of_strings_types::array_of_strings& key) { // Matches Java Util.PRIME for ArrayOfStrings key 
hashing. static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; XXHash64 hasher(STRING_ARR_HASH_SEED); @@ -124,7 +128,7 @@ void default_array_of_strings_serde::serialize( const uint8_t num_nodes = static_cast(items[i].size()); write(os, total_bytes); write(os, num_nodes); - const std::string* data = items[i].data(); + const string_type* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); @@ -144,11 +148,12 @@ void default_array_of_strings_serde::deserialize( const uint8_t num_nodes = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", allocator_); + const string_type empty{string_allocator(allocator_)}; + array_of_strings array(num_nodes, empty, array_allocator(allocator_)); for (uint8_t j = 0; j < num_nodes; ++j) { const uint32_t length = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); - std::string value(length, '\0'); + string_type value(length, '\0', string_allocator(allocator_)); if (length != 0) { is.read(&value[0], length); if (!is) throw std::runtime_error("array_of_strings stream read failed"); @@ -174,7 +179,7 @@ size_t default_array_of_strings_serde::serialize( check_memory_size(bytes_written + total_bytes, capacity); bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); - const std::string* data = items[i].data(); + const string_type* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); @@ -202,11 +207,12 @@ size_t default_array_of_strings_serde::deserialize( uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", allocator_); + const string_type 
empty{string_allocator(allocator_)}; + array_of_strings array(num_nodes, empty, array_allocator(allocator_)); for (uint8_t j = 0; j < num_nodes; ++j) { uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); - std::string value(length, '\0'); + string_type value(length, '\0', string_allocator(allocator_)); if (length != 0) { bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } @@ -236,7 +242,7 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar const auto count = item.size(); check_num_nodes(static_cast(count)); size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); - const std::string* data = item.data(); + const string_type* data = item.data(); for (uint32_t j = 0; j < count; ++j) { total += data[j].size(); } @@ -244,7 +250,7 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar } template -void default_array_of_strings_serde::check_utf8(const std::string& value) { +void default_array_of_strings_serde::check_utf8(const string_type& value) { if (!utf8::is_valid(value.begin(), value.end())) { throw std::runtime_error("array_of_strings contains invalid UTF-8"); } diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp index af37d6c2..a623f618 100644 --- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -17,6 +17,7 @@ * under the License. 
*/ +#include #include #include #include @@ -24,6 +25,12 @@ #include "array_of_strings_sketch.hpp" namespace datasketches { + using types = array_of_strings_types>; + using string_type = types::string_type; + + static bool equals_string(const string_type& lhs, const std::string& rhs) { + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); + } // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; @@ -193,7 +200,7 @@ namespace datasketches { if (entry.second.size() != expected.size()) continue; bool equal = true; for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { + if (!equals_string(entry.second[j], expected[j])) { equal = false; break; } @@ -248,7 +255,7 @@ namespace datasketches { if (entry.second.size() != expected.size()) continue; bool equal = true; for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { + if (!equals_string(entry.second[j], expected[j])) { equal = false; break; } diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index c6eb0dfc..ab5fd53b 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -26,13 +26,18 @@ namespace datasketches { using aos_sketch = update_array_of_strings_tuple_sketch<>; -using array_of_strings = array; +using types = array_of_strings_types>; +using array_of_strings = types::array_of_strings; +using string_allocator = types::string_allocator; +using string_type = types::string_type; +using array_allocator = types::array_allocator; static array_of_strings make_array(std::initializer_list items) { - array_of_strings array(static_cast(items.size()), ""); + const string_type empty{string_allocator()}; + array_of_strings 
array(static_cast(items.size()), empty, array_allocator()); size_t i = 0; for (const auto& item: items) { - array[static_cast(i)] = item; + array[static_cast(i)] = string_type(item.data(), item.size(), string_allocator()); ++i; } return array; @@ -43,10 +48,13 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - array_of_strings key(1, ""); - key[0] = std::to_string(i); - array_of_strings value(1, ""); - value[0] = "value" + std::to_string(i); + const string_type empty{string_allocator()}; + array_of_strings key(1, empty, array_allocator()); + const std::string key_value = std::to_string(i); + key[0] = string_type(key_value.data(), key_value.size(), string_allocator()); + array_of_strings value(1, empty, array_allocator()); + const std::string value_str = "value" + std::to_string(i); + value[0] = string_type(value_str.data(), value_str.size(), string_allocator()); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); @@ -61,12 +69,17 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - array_of_strings key(1, ""); - key[0] = std::to_string(i); - array_of_strings value(3, ""); - value[0] = "a" + std::to_string(i); - value[1] = "b" + std::to_string(i); - value[2] = "c" + std::to_string(i); + const string_type empty{string_allocator()}; + array_of_strings key(1, empty, array_allocator()); + const std::string key_value = std::to_string(i); + key[0] = string_type(key_value.data(), key_value.size(), string_allocator()); + array_of_strings value(3, empty, array_allocator()); + const std::string value_a = "a" + std::to_string(i); + const std::string value_b = "b" + std::to_string(i); + const std::string value_c = "c" + std::to_string(i); + value[0] = 
string_type(value_a.data(), value_a.size(), string_allocator()); + value[1] = string_type(value_b.data(), value_b.size(), string_allocator()); + value[2] = string_type(value_c.data(), value_c.size(), string_allocator()); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); @@ -82,9 +95,10 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { .set_resize_factor(resize_factor::X8) .set_p(0.01f) .build(); - array_of_strings key(1, ""); + const string_type empty{string_allocator()}; + array_of_strings key(1, empty, array_allocator()); key[0] = "key1"; - array_of_strings value(1, ""); + array_of_strings value(1, empty, array_allocator()); value[0] = "value1"; sketch.update(hash_array_of_strings_key(key), value); REQUIRE_FALSE(sketch.is_empty()); @@ -98,11 +112,15 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - array_of_strings key(2, ""); - key[0] = "key" + std::to_string(i); - key[1] = "subkey" + std::to_string(i % 10); - array_of_strings value(1, ""); - value[0] = "value" + std::to_string(i); + const string_type empty{string_allocator()}; + array_of_strings key(2, empty, array_allocator()); + const std::string key0 = "key" + std::to_string(i); + const std::string key1 = "subkey" + std::to_string(i % 10); + key[0] = string_type(key0.data(), key0.size(), string_allocator()); + key[1] = string_type(key1.data(), key1.size(), string_allocator()); + array_of_strings value(1, empty, array_allocator()); + const std::string value_str = "value" + std::to_string(i); + value[0] = string_type(value_str.data(), value_str.size(), string_allocator()); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 
74b225b0..8e1f1582 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -29,7 +29,11 @@ namespace datasketches { -using array_of_strings = array; +using types = array_of_strings_types>; +using array_of_strings = types::array_of_strings; +using string_allocator = types::string_allocator; +using string_type = types::string_type; +using array_allocator = types::array_allocator; TEST_CASE("aos update policy", "[tuple_sketch]") { default_array_of_strings_update_policy<> policy; @@ -42,7 +46,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { SECTION("replace array") { auto values = policy.create(); - array_of_strings input(2, "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings input(2, empty, array_allocator()); input[0] = "alpha"; input[1] = "beta"; policy.update(values, input); @@ -52,7 +57,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { input[0] = "changed"; REQUIRE(values[0] == "alpha"); - array_of_strings input2(1, "", std::allocator()); + array_of_strings input2(1, empty, array_allocator()); input2[0] = "gamma"; policy.update(values, input2); REQUIRE(values.size() == 1); @@ -60,7 +65,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { } SECTION("nullptr clears") { - array_of_strings values(2, "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings values(2, empty, array_allocator()); values[0] = "one"; values[1] = "two"; @@ -71,7 +77,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { SECTION("pointer input copies") { auto values = policy.create(); - array_of_strings input(2, "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings input(2, empty, array_allocator()); input[0] = "first"; input[1] = "second"; policy.update(values, &input); @@ -84,7 +91,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { TEST_CASE("aos sketch update", "[tuple_sketch]") { auto make_array = 
[](std::initializer_list entries) { - array_of_strings array(static_cast(entries.size()), "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings array(static_cast(entries.size()), empty, array_allocator()); uint8_t i = 0; for (const auto* entry: entries) array[i++] = entry; return array; @@ -151,9 +159,12 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { auto make_array = [](std::initializer_list entries) { - array_of_strings array(static_cast(entries.size()), "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings array(static_cast(entries.size()), empty, array_allocator()); uint8_t i = 0; - for (const auto& entry: entries) array[i++] = entry; + for (const auto& entry: entries) { + array[i++] = string_type(entry.data(), entry.size(), string_allocator()); + } return array; }; @@ -259,8 +270,9 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") { default_array_of_strings_serde<> serde; SECTION("invalid utf8 rejected") { - array_of_strings array(1, "", std::allocator()); - const std::string invalid_utf8("\xC3\x28", 2); + const string_type empty{string_allocator()}; + array_of_strings array(1, empty, array_allocator()); + const string_type invalid_utf8("\xC3\x28", 2, string_allocator()); array[0] = invalid_utf8; std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -271,7 +283,8 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") { } SECTION("too many nodes rejected") { - array_of_strings array(128, "", std::allocator()); + const string_type empty{string_allocator()}; + array_of_strings array(128, empty, array_allocator()); std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); REQUIRE_THROWS_WITH( From 852b26bfd180fe48aafab307a5391232b011012b Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 3 Feb 2026 01:29:47 +0900 Subject: [PATCH 45/75] test: add missing kll long sketch 
compatibility cases --- .../kll_sketch_deserialize_from_java_test.cpp | 24 +++++++++++++++++++ kll/test/kll_sketch_serialize_for_java.cpp | 10 ++++++++ 2 files changed, 34 insertions(+) diff --git a/kll/test/kll_sketch_deserialize_from_java_test.cpp b/kll/test/kll_sketch_deserialize_from_java_test.cpp index 795486ae..65efc3e5 100644 --- a/kll/test/kll_sketch_deserialize_from_java_test.cpp +++ b/kll/test/kll_sketch_deserialize_from_java_test.cpp @@ -100,4 +100,28 @@ TEST_CASE("kll string", "[serde_compat]") { } } +TEST_CASE("kll long", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "kll_long_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = kll_sketch::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K)); + REQUIRE(sketch.get_n() == n); + if (n > 0) { + REQUIRE(sketch.get_min_item() == 1); + REQUIRE(sketch.get_max_item() == static_cast(n)); + uint64_t weight = 0; + for (const auto pair: sketch) { + REQUIRE(pair.first >= sketch.get_min_item()); + REQUIRE(pair.first <= sketch.get_max_item()); + weight += pair.second; + } + REQUIRE(weight == sketch.get_n()); + } + } +} + } /* namespace datasketches */ diff --git a/kll/test/kll_sketch_serialize_for_java.cpp b/kll/test/kll_sketch_serialize_for_java.cpp index 00b8913d..22b75774 100644 --- a/kll/test/kll_sketch_serialize_for_java.cpp +++ b/kll/test/kll_sketch_serialize_for_java.cpp @@ -43,6 +43,16 @@ TEST_CASE("kll sketch double generate", "[serialize_for_java]") { } } +TEST_CASE("kll sketch long generate", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + kll_sketch sketch; + for (unsigned i = 1; i <= n; ++i) sketch.update(i); + std::ofstream 
os("kll_long_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.serialize(os); + } +} + struct compare_as_number { bool operator()(const std::string& a, const std::string& b) const { return std::stoi(a) < std::stoi(b); From f5fb9d9d9142aed46295fb98888ee6b9b414e73f Mon Sep 17 00:00:00 2001 From: Mahesh Pai Date: Wed, 4 Feb 2026 14:55:58 +0530 Subject: [PATCH 46/75] Review comments --- common/test/CMakeLists.txt | 48 +++++++++++++------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt index d190b628..c3e937a2 100644 --- a/common/test/CMakeLists.txt +++ b/common/test/CMakeLists.txt @@ -77,26 +77,15 @@ add_executable(integration_test) target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib) -# Use C++17 if CMAKE_CXX_STANDARD is set to 17+, otherwise C++11 -# This allows hardening tests to use std::optional with libc++ hardening +# Use CMAKE_CXX_STANDARD if defined, otherwise C++11 +set(_integration_cxx_standard 11) if(DEFINED CMAKE_CXX_STANDARD) - if(CMAKE_CXX_STANDARD MATCHES "17|20|23") - set_target_properties(integration_test PROPERTIES - CXX_STANDARD ${CMAKE_CXX_STANDARD} - CXX_STANDARD_REQUIRED YES - ) - else() - set_target_properties(integration_test PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED YES - ) - endif() -else() - set_target_properties(integration_test PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED YES - ) + set(_integration_cxx_standard ${CMAKE_CXX_STANDARD}) endif() +set_target_properties(integration_test PROPERTIES + CXX_STANDARD ${_integration_cxx_standard} + CXX_STANDARD_REQUIRED YES +) add_test( NAME integration_test @@ -110,8 +99,7 @@ target_sources(integration_test # Separate hardening test executable (header-only, no pre-compiled libs) # This ensures the sketch code is compiled with C++17 + hardening -# Always build this target - it will use CMAKE_CXX_STANDARD if set, 
otherwise C++17 -message(STATUS "CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}") +# Always build this target - it will use CMAKE_CXX_STANDARD if set (and >= 17), otherwise C++17 add_executable(hardening_test) target_link_libraries(hardening_test common common_test_lib) @@ -125,19 +113,15 @@ target_include_directories(hardening_test PRIVATE ) # Use C++17 minimum for hardening tests -if(CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17) - set_target_properties(hardening_test PROPERTIES - CXX_STANDARD ${CMAKE_CXX_STANDARD} - CXX_STANDARD_REQUIRED YES - ) - message(STATUS "hardening_test will use C++${CMAKE_CXX_STANDARD}") -else() - set_target_properties(hardening_test PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES - ) - message(STATUS "hardening_test will use C++17 (default)") +set(_hardening_cxx_standard 17) +if(DEFINED CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17) + set(_hardening_cxx_standard ${CMAKE_CXX_STANDARD}) endif() +set_target_properties(hardening_test PROPERTIES + CXX_STANDARD ${_hardening_cxx_standard} + CXX_STANDARD_REQUIRED YES +) +message(STATUS "hardening_test will use C++${_hardening_cxx_standard}") add_test( NAME hardening_test From fccb2385f3a66416fc34c0d7bd0513696721ecd9 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 5 Feb 2026 21:44:05 +0800 Subject: [PATCH 47/75] Fix error message for empty window data in compressor (#482) --- cpc/include/cpc_compressor_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpc/include/cpc_compressor_impl.hpp b/cpc/include/cpc_compressor_impl.hpp index 062e2e0e..0cc24b19 100644 --- a/cpc/include/cpc_compressor_impl.hpp +++ b/cpc/include/cpc_compressor_impl.hpp @@ -157,7 +157,7 @@ void cpc_compressor::compress(const cpc_sketch_alloc& source, compressed_s break; case cpc_sketch_alloc::flavor::PINNED: compress_pinned_flavor(source, result); - if (result.window_data.size() == 0) throw std::logic_error("window is not expected"); + if (result.window_data.size() == 0) 
throw std::logic_error("window is expected"); break; case cpc_sketch_alloc::flavor::SLIDING: compress_sliding_flavor(source, result); From 79cb75cc0e873922ab80cc6f32f84220710afe10 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Fri, 6 Feb 2026 12:22:43 +0900 Subject: [PATCH 48/75] fix: check length for equal --- tuple/include/array_tuple_sketch.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 03761ff4..d331f8b1 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -82,6 +82,7 @@ class array { T* data() { return array_; } const T* data() const { return array_; } bool operator==(const array& other) const { + if (size_ != other.size_) return false; for (uint8_t i = 0; i < size_; ++i) if (array_[i] != other.array_[i]) return false; return true; } From 2a59f114871027c9b068bbfca39350c9cf3f2da8 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Fri, 6 Feb 2026 23:32:55 +0900 Subject: [PATCH 49/75] Revert "fix: allocation handling for string in deserialize" This reverts commit 4894e5e7b156f1ce6a909ef84321ab081078c16c. 
--- tuple/include/array_of_strings_sketch.hpp | 50 +++++----------- .../include/array_of_strings_sketch_impl.hpp | 32 +++++----- .../aos_sketch_deserialize_from_java_test.cpp | 11 +--- tuple/test/aos_sketch_serialize_for_java.cpp | 58 +++++++------------ tuple/test/array_of_strings_sketch_test.cpp | 35 ++++------- 5 files changed, 62 insertions(+), 124 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index 4ee3bc9c..db147723 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -28,22 +28,11 @@ namespace datasketches { -template -struct array_of_strings_types { - using string_allocator = typename std::allocator_traits::template rebind_alloc; - using string_type = std::basic_string, string_allocator>; - using array_allocator = typename std::allocator_traits::template rebind_alloc; - using array_of_strings = array; -}; - // default update policy for an array of strings -template> +template> class default_array_of_strings_update_policy { public: - using string_allocator = typename array_of_strings_types::string_allocator; - using string_type = typename array_of_strings_types::string_type; - using array_allocator = typename array_of_strings_types::array_allocator; - using array_of_strings = typename array_of_strings_types::array_of_strings; + using array_of_strings = array; explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); @@ -59,12 +48,9 @@ class default_array_of_strings_update_policy { // serializer/deserializer for an array of strings // Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
-template> +template> struct default_array_of_strings_serde { - using string_allocator = typename array_of_strings_types::string_allocator; - using string_type = typename array_of_strings_types::string_type; - using array_allocator = typename array_of_strings_types::array_allocator; - using array_of_strings = typename array_of_strings_types::array_of_strings; + using array_of_strings = array; using summary_allocator = typename std::allocator_traits::template rebind_alloc; explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); @@ -80,29 +66,27 @@ struct default_array_of_strings_serde { summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); - static void check_utf8(const string_type& value); + static void check_utf8(const std::string& value); }; /** * Hashes an array of strings using ArrayOfStrings-compatible hashing. */ -template> -uint64_t hash_array_of_strings_key(const typename array_of_strings_types::array_of_strings& key); +template> +uint64_t hash_array_of_strings_key(const array& key); /** * Extended class of compact_tuple_sketch for array of strings * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
*/ -template> +template> class compact_array_of_strings_tuple_sketch: public compact_tuple_sketch< - typename array_of_strings_types::array_of_strings, - typename std::allocator_traits::template rebind_alloc< - typename array_of_strings_types::array_of_strings - > + array, + typename std::allocator_traits::template rebind_alloc> > { public: - using array_of_strings = typename array_of_strings_types::array_of_strings; + using array_of_strings = array; using summary_allocator = typename std::allocator_traits::template rebind_alloc; using Base = compact_tuple_sketch; using vector_bytes = typename Base::vector_bytes; @@ -149,15 +133,13 @@ class compact_array_of_strings_tuple_sketch: /** * Convenience alias for update_tuple_sketch for array of strings */ -template, +template, typename Policy = default_array_of_strings_update_policy> using update_array_of_strings_tuple_sketch = update_tuple_sketch< - typename array_of_strings_types::array_of_strings, - typename array_of_strings_types::array_of_strings, + array, + array, Policy, - typename std::allocator_traits::template rebind_alloc< - typename array_of_strings_types::array_of_strings - > + typename std::allocator_traits::template rebind_alloc> >; /** @@ -166,7 +148,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch< * @param ordered optional flag to specify if an ordered sketch should be produced * @return compact array of strings sketch */ -template, typename Policy = default_array_of_strings_update_policy> +template, typename Policy = default_array_of_strings_update_policy> compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index e8725c55..f5fa0652 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -33,8 +33,7 @@ 
default_array_of_strings_update_policy::default_array_of_strings_upda template auto default_array_of_strings_update_policy::create() const -> array_of_strings { - const string_type empty{string_allocator(allocator_)}; - return array_of_strings(0, empty, array_allocator(allocator_)); + return array_of_strings(0, "", allocator_); } template @@ -42,8 +41,7 @@ void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings& input ) const { const auto length = static_cast(input.size()); - const string_type empty{string_allocator(allocator_)}; - array = array_of_strings(static_cast(length), empty, array_allocator(allocator_)); + array = array_of_strings(static_cast(length), "", allocator_); for (size_t i = 0; i < length; ++i) array[i] = input[i]; } @@ -52,18 +50,16 @@ void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings* input ) const { if (input == nullptr) { - const string_type empty{string_allocator(allocator_)}; - array = array_of_strings(0, empty, array_allocator(allocator_)); + array = array_of_strings(0, "", allocator_); return; } const auto length = static_cast(input->size()); - const string_type empty{string_allocator(allocator_)}; - array = array_of_strings(static_cast(length), empty, array_allocator(allocator_)); + array = array_of_strings(static_cast(length), "", allocator_); for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } template -uint64_t hash_array_of_strings_key(const typename array_of_strings_types::array_of_strings& key) { +uint64_t hash_array_of_strings_key(const array& key) { // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; XXHash64 hasher(STRING_ARR_HASH_SEED); @@ -128,7 +124,7 @@ void default_array_of_strings_serde::serialize( const uint8_t num_nodes = static_cast(items[i].size()); write(os, total_bytes); write(os, num_nodes); - const string_type* data = items[i].data(); + const std::string* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); @@ -148,12 +144,11 @@ void default_array_of_strings_serde::deserialize( const uint8_t num_nodes = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); check_num_nodes(num_nodes); - const string_type empty{string_allocator(allocator_)}; - array_of_strings array(num_nodes, empty, array_allocator(allocator_)); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < num_nodes; ++j) { const uint32_t length = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); - string_type value(length, '\0', string_allocator(allocator_)); + std::string value(length, '\0'); if (length != 0) { is.read(&value[0], length); if (!is) throw std::runtime_error("array_of_strings stream read failed"); @@ -179,7 +174,7 @@ size_t default_array_of_strings_serde::serialize( check_memory_size(bytes_written + total_bytes, capacity); bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); - const string_type* data = items[i].data(); + const std::string* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); @@ -207,12 +202,11 @@ size_t default_array_of_strings_serde::deserialize( uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); - const string_type empty{string_allocator(allocator_)}; - array_of_strings array(num_nodes, empty, 
array_allocator(allocator_)); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < num_nodes; ++j) { uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); - string_type value(length, '\0', string_allocator(allocator_)); + std::string value(length, '\0'); if (length != 0) { bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } @@ -242,7 +236,7 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar const auto count = item.size(); check_num_nodes(static_cast(count)); size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); - const string_type* data = item.data(); + const std::string* data = item.data(); for (uint32_t j = 0; j < count; ++j) { total += data[j].size(); } @@ -250,7 +244,7 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar } template -void default_array_of_strings_serde::check_utf8(const string_type& value) { +void default_array_of_strings_serde::check_utf8(const std::string& value) { if (!utf8::is_valid(value.begin(), value.end())) { throw std::runtime_error("array_of_strings contains invalid UTF-8"); } diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp index a623f618..af37d6c2 100644 --- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -17,7 +17,6 @@ * under the License. 
*/ -#include #include #include #include @@ -25,12 +24,6 @@ #include "array_of_strings_sketch.hpp" namespace datasketches { - using types = array_of_strings_types>; - using string_type = types::string_type; - - static bool equals_string(const string_type& lhs, const std::string& rhs) { - return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); - } // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; @@ -200,7 +193,7 @@ namespace datasketches { if (entry.second.size() != expected.size()) continue; bool equal = true; for (size_t j = 0; j < expected.size(); ++j) { - if (!equals_string(entry.second[j], expected[j])) { + if (entry.second[j] != expected[j]) { equal = false; break; } @@ -255,7 +248,7 @@ namespace datasketches { if (entry.second.size() != expected.size()) continue; bool equal = true; for (size_t j = 0; j < expected.size(); ++j) { - if (!equals_string(entry.second[j], expected[j])) { + if (entry.second[j] != expected[j]) { equal = false; break; } diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index ab5fd53b..c6eb0dfc 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -26,18 +26,13 @@ namespace datasketches { using aos_sketch = update_array_of_strings_tuple_sketch<>; -using types = array_of_strings_types>; -using array_of_strings = types::array_of_strings; -using string_allocator = types::string_allocator; -using string_type = types::string_type; -using array_allocator = types::array_allocator; +using array_of_strings = array; static array_of_strings make_array(std::initializer_list items) { - const string_type empty{string_allocator()}; - array_of_strings array(static_cast(items.size()), empty, 
array_allocator()); + array_of_strings array(static_cast(items.size()), ""); size_t i = 0; for (const auto& item: items) { - array[static_cast(i)] = string_type(item.data(), item.size(), string_allocator()); + array[static_cast(i)] = item; ++i; } return array; @@ -48,13 +43,10 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - const string_type empty{string_allocator()}; - array_of_strings key(1, empty, array_allocator()); - const std::string key_value = std::to_string(i); - key[0] = string_type(key_value.data(), key_value.size(), string_allocator()); - array_of_strings value(1, empty, array_allocator()); - const std::string value_str = "value" + std::to_string(i); - value[0] = string_type(value_str.data(), value_str.size(), string_allocator()); + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); @@ -69,17 +61,12 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - const string_type empty{string_allocator()}; - array_of_strings key(1, empty, array_allocator()); - const std::string key_value = std::to_string(i); - key[0] = string_type(key_value.data(), key_value.size(), string_allocator()); - array_of_strings value(3, empty, array_allocator()); - const std::string value_a = "a" + std::to_string(i); - const std::string value_b = "b" + std::to_string(i); - const std::string value_c = "c" + std::to_string(i); - value[0] = string_type(value_a.data(), value_a.size(), string_allocator()); - value[1] = string_type(value_b.data(), value_b.size(), string_allocator()); - value[2] = string_type(value_c.data(), value_c.size(), 
string_allocator()); + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = "c" + std::to_string(i); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); @@ -95,10 +82,9 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { .set_resize_factor(resize_factor::X8) .set_p(0.01f) .build(); - const string_type empty{string_allocator()}; - array_of_strings key(1, empty, array_allocator()); + array_of_strings key(1, ""); key[0] = "key1"; - array_of_strings value(1, empty, array_allocator()); + array_of_strings value(1, ""); value[0] = "value1"; sketch.update(hash_array_of_strings_key(key), value); REQUIRE_FALSE(sketch.is_empty()); @@ -112,15 +98,11 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { for (const unsigned n: n_arr) { auto sketch = aos_sketch::builder().build(); for (unsigned i = 0; i < n; ++i) { - const string_type empty{string_allocator()}; - array_of_strings key(2, empty, array_allocator()); - const std::string key0 = "key" + std::to_string(i); - const std::string key1 = "subkey" + std::to_string(i % 10); - key[0] = string_type(key0.data(), key0.size(), string_allocator()); - key[1] = string_type(key1.data(), key1.size(), string_allocator()); - array_of_strings value(1, empty, array_allocator()); - const std::string value_str = "value" + std::to_string(i); - value[0] = string_type(value_str.data(), value_str.size(), string_allocator()); + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 
8e1f1582..74b225b0 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -29,11 +29,7 @@ namespace datasketches { -using types = array_of_strings_types>; -using array_of_strings = types::array_of_strings; -using string_allocator = types::string_allocator; -using string_type = types::string_type; -using array_allocator = types::array_allocator; +using array_of_strings = array; TEST_CASE("aos update policy", "[tuple_sketch]") { default_array_of_strings_update_policy<> policy; @@ -46,8 +42,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { SECTION("replace array") { auto values = policy.create(); - const string_type empty{string_allocator()}; - array_of_strings input(2, empty, array_allocator()); + array_of_strings input(2, "", std::allocator()); input[0] = "alpha"; input[1] = "beta"; policy.update(values, input); @@ -57,7 +52,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { input[0] = "changed"; REQUIRE(values[0] == "alpha"); - array_of_strings input2(1, empty, array_allocator()); + array_of_strings input2(1, "", std::allocator()); input2[0] = "gamma"; policy.update(values, input2); REQUIRE(values.size() == 1); @@ -65,8 +60,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { } SECTION("nullptr clears") { - const string_type empty{string_allocator()}; - array_of_strings values(2, empty, array_allocator()); + array_of_strings values(2, "", std::allocator()); values[0] = "one"; values[1] = "two"; @@ -77,8 +71,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { SECTION("pointer input copies") { auto values = policy.create(); - const string_type empty{string_allocator()}; - array_of_strings input(2, empty, array_allocator()); + array_of_strings input(2, "", std::allocator()); input[0] = "first"; input[1] = "second"; policy.update(values, &input); @@ -91,8 +84,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { TEST_CASE("aos sketch update", "[tuple_sketch]") { auto make_array = 
[](std::initializer_list entries) { - const string_type empty{string_allocator()}; - array_of_strings array(static_cast(entries.size()), empty, array_allocator()); + array_of_strings array(static_cast(entries.size()), "", std::allocator()); uint8_t i = 0; for (const auto* entry: entries) array[i++] = entry; return array; @@ -159,12 +151,9 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { auto make_array = [](std::initializer_list entries) { - const string_type empty{string_allocator()}; - array_of_strings array(static_cast(entries.size()), empty, array_allocator()); + array_of_strings array(static_cast(entries.size()), "", std::allocator()); uint8_t i = 0; - for (const auto& entry: entries) { - array[i++] = string_type(entry.data(), entry.size(), string_allocator()); - } + for (const auto& entry: entries) array[i++] = entry; return array; }; @@ -270,9 +259,8 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") { default_array_of_strings_serde<> serde; SECTION("invalid utf8 rejected") { - const string_type empty{string_allocator()}; - array_of_strings array(1, empty, array_allocator()); - const string_type invalid_utf8("\xC3\x28", 2, string_allocator()); + array_of_strings array(1, "", std::allocator()); + const std::string invalid_utf8("\xC3\x28", 2); array[0] = invalid_utf8; std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -283,8 +271,7 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") { } SECTION("too many nodes rejected") { - const string_type empty{string_allocator()}; - array_of_strings array(128, empty, array_allocator()); + array_of_strings array(128, "", std::allocator()); std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); REQUIRE_THROWS_WITH( From 25ce65cfd2c20995e99b2e8dd0cea99b308925f3 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sat, 7 Feb 2026 00:57:47 +0900 Subject: [PATCH 50/75] refactor: change allocator only for 
array_of_strings --- tuple/include/array_of_strings_sketch.hpp | 42 +++++++------------ .../include/array_of_strings_sketch_impl.hpp | 38 +++++++---------- tuple/test/array_of_strings_sketch_test.cpp | 4 +- 3 files changed, 30 insertions(+), 54 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index db147723..60c43ca9 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -28,29 +28,24 @@ namespace datasketches { +using array_of_strings = array; + // default update policy for an array of strings -template> class default_array_of_strings_update_policy { public: - using array_of_strings = array; - - explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); + default_array_of_strings_update_policy() = default; array_of_strings create() const; void update(array_of_strings& array, const array_of_strings& input) const; void update(array_of_strings& array, const array_of_strings* input) const; - -private: - Allocator allocator_; }; // serializer/deserializer for an array of strings // Requirements: all strings must be valid UTF-8 and array size must be <= 127. -template> +template> struct default_array_of_strings_serde { - using array_of_strings = array; using summary_allocator = typename std::allocator_traits::template rebind_alloc; explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); @@ -62,7 +57,6 @@ struct default_array_of_strings_serde { size_t size_of_item(const array_of_strings& item) const; private: - Allocator allocator_; summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); @@ -72,23 +66,17 @@ struct default_array_of_strings_serde { /** * Hashes an array of strings using ArrayOfStrings-compatible hashing. 
*/ -template> -uint64_t hash_array_of_strings_key(const array& key); +uint64_t hash_array_of_strings_key(const array_of_strings& key); /** * Extended class of compact_tuple_sketch for array of strings * Requirements: all strings must be valid UTF-8 and array size must be <= 127. */ -template> +template> class compact_array_of_strings_tuple_sketch: - public compact_tuple_sketch< - array, - typename std::allocator_traits::template rebind_alloc> - > { + public compact_tuple_sketch { public: - using array_of_strings = array; - using summary_allocator = typename std::allocator_traits::template rebind_alloc; - using Base = compact_tuple_sketch; + using Base = compact_tuple_sketch; using vector_bytes = typename Base::vector_bytes; using Base::serialize; @@ -133,13 +121,13 @@ class compact_array_of_strings_tuple_sketch: /** * Convenience alias for update_tuple_sketch for array of strings */ -template, - typename Policy = default_array_of_strings_update_policy> +template, + typename Policy = default_array_of_strings_update_policy> using update_array_of_strings_tuple_sketch = update_tuple_sketch< - array, - array, + array_of_strings, + array_of_strings, Policy, - typename std::allocator_traits::template rebind_alloc> + Allocator >; /** @@ -148,7 +136,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch< * @param ordered optional flag to specify if an ordered sketch should be produced * @return compact array of strings sketch */ -template, typename Policy = default_array_of_strings_update_policy> +template, typename Policy = default_array_of_strings_update_policy> compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); @@ -156,4 +144,4 @@ compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch #include "array_of_strings_sketch_impl.hpp" -#endif +#endif \ No newline at end of file diff --git a/tuple/include/array_of_strings_sketch_impl.hpp 
b/tuple/include/array_of_strings_sketch_impl.hpp index f5fa0652..78b683d7 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -22,44 +22,37 @@ #include +#include "array_of_strings_sketch.hpp" #include "common_defs.hpp" #include "third_party/utf8cpp/utf8.h" namespace datasketches { -template -default_array_of_strings_update_policy::default_array_of_strings_update_policy(const Allocator& allocator): - allocator_(allocator) {} - -template -auto default_array_of_strings_update_policy::create() const -> array_of_strings { - return array_of_strings(0, "", allocator_); +inline array_of_strings default_array_of_strings_update_policy::create() const { + return array_of_strings(0, ""); } -template -void default_array_of_strings_update_policy::update( +inline void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings& input ) const { const auto length = static_cast(input.size()); - array = array_of_strings(static_cast(length), "", allocator_); + array = array_of_strings(static_cast(length), ""); for (size_t i = 0; i < length; ++i) array[i] = input[i]; } -template -void default_array_of_strings_update_policy::update( +inline void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings* input ) const { if (input == nullptr) { - array = array_of_strings(0, "", allocator_); + array = array_of_strings(0, ""); return; } const auto length = static_cast(input->size()); - array = array_of_strings(static_cast(length), "", allocator_); + array = array_of_strings(static_cast(length), ""); for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } -template -uint64_t hash_array_of_strings_key(const array& key) { +inline uint64_t hash_array_of_strings_key(const array_of_strings& key) { // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; XXHash64 hasher(STRING_ARR_HASH_SEED); @@ -95,8 +88,7 @@ template auto compact_array_of_strings_tuple_sketch::deserialize( std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { - summary_allocator alloc(allocator); - auto base = Base::deserialize(is, seed, sd, alloc); + auto base = Base::deserialize(is, seed, sd, allocator); return compact_array_of_strings_tuple_sketch(std::move(base)); } @@ -105,14 +97,12 @@ template auto compact_array_of_strings_tuple_sketch::deserialize( const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { - summary_allocator alloc(allocator); - auto base = Base::deserialize(bytes, size, seed, sd, alloc); + auto base = Base::deserialize(bytes, size, seed, sd, allocator); return compact_array_of_strings_tuple_sketch(std::move(base)); } template default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): - allocator_(allocator), summary_allocator_(allocator) {} template @@ -144,7 +134,7 @@ void default_array_of_strings_serde::deserialize( const uint8_t num_nodes = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", allocator_); + array_of_strings array(num_nodes, ""); for (uint8_t j = 0; j < num_nodes; ++j) { const uint32_t length = read(is); if (!is) throw std::runtime_error("array_of_strings stream read failed"); @@ -202,7 +192,7 @@ size_t default_array_of_strings_serde::deserialize( uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", allocator_); + array_of_strings array(num_nodes, ""); for (uint8_t j = 0; j < num_nodes; ++j) { uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); @@ -252,4 
+242,4 @@ void default_array_of_strings_serde::check_utf8(const std::string& va } /* namespace datasketches */ -#endif +#endif \ No newline at end of file diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 74b225b0..dc21aceb 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -29,10 +29,8 @@ namespace datasketches { -using array_of_strings = array; - TEST_CASE("aos update policy", "[tuple_sketch]") { - default_array_of_strings_update_policy<> policy; + default_array_of_strings_update_policy policy; SECTION("create empty") { auto values = policy.create(); From 7f05c0305b62fd9988aee70dffaf631c79565846 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sat, 7 Feb 2026 00:59:18 +0900 Subject: [PATCH 51/75] style: add new line end of files --- tuple/include/array_of_strings_sketch.hpp | 2 +- tuple/include/array_of_strings_sketch_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index 60c43ca9..ac49fd5b 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -144,4 +144,4 @@ compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch #include "array_of_strings_sketch_impl.hpp" -#endif \ No newline at end of file +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 78b683d7..81045472 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -242,4 +242,4 @@ void default_array_of_strings_serde::check_utf8(const std::string& va } /* namespace datasketches */ -#endif \ No newline at end of file +#endif From 2956f150933438368d539ab42b19bf8e4c8665e1 Mon Sep 17 00:00:00 2001 From: yaojun <940334249@qq.com> Date: Tue, 10 Feb 2026 15:19:52 +0800 Subject: [PATCH 52/75] Add 
clang-tidy and check script and fix the warnings under count directory --- .clang-tidy | 36 ++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 29 +++++++++++++++++++++++++ count/include/count_min_impl.hpp | 5 +++-- 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 .clang-tidy create mode 100644 .pre-commit-config.yaml diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..93e3edeb --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+--- +Checks: | + clang-diagnostic-*, + clang-analyzer-*, + -clang-analyzer-alpha*, + google-*, + modernize-*, + -modernize-avoid-c-arrays, + -modernize-use-trailing-return-type, + -modernize-use-nodiscard, + +CheckOptions: + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..262fd02e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# To use this, install the python package `pre-commit` and +# run once `pre-commit install`. This will setup a git pre-commit-hook +# that is executed on each commit and will report the linting problems. 
+# To run all hooks on all files use `pre-commit run -a` + +repos: + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-tidy + args: ['--quiet', '-p=build/compile_commands.json', '--config-file=.clang-tidy'] + types_or: [c++, c] \ No newline at end of file diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 45376e7b..99b0a41e 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -74,7 +74,7 @@ uint64_t count_min_sketch::get_seed() const { template double count_min_sketch::get_relative_error() const { - return exp(1.0) / double(_num_buckets); + return exp(1.0) / static_cast(_num_buckets); } template @@ -449,8 +449,9 @@ string count_min_sketch::to_string() const { // count the number of used entries in the sketch uint64_t num_nonzero = 0; for (const auto entry: _sketch_array) { - if (entry != static_cast(0.0)) + if (entry != static_cast(0.0)){ ++num_nonzero; + } } // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. 
From 9bf45794d3030e21c9d9f6910a2469a163feced7 Mon Sep 17 00:00:00 2001 From: yaojun <940334249@qq.com> Date: Sat, 31 Jan 2026 13:48:52 +0800 Subject: [PATCH 53/75] fix: Add the missing brackets and support one line statement --- .clang-tidy | 2 +- count/include/count_min_impl.hpp | 32 ++++++++++++++------------------ 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 93e3edeb..d0cdc6e9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -27,7 +27,7 @@ Checks: | CheckOptions: - key: google-readability-braces-around-statements.ShortStatementLines - value: '1' + value: '0' - key: google-readability-function-size.StatementThreshold value: '800' - key: google-readability-namespace-comments.ShortNamespaceLines diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 99b0a41e..2f2629fc 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -39,7 +39,9 @@ _num_buckets(num_buckets), _sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator), _seed(seed), _total_weight(0) { - if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1."); + if (num_buckets < 3) { + throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1."); + } // This check is to ensure later compatibility with a Java implementation whose maximum size can only // be 2^31-1. We check only against 2^30 for simplicity. @@ -147,7 +149,7 @@ W count_min_sketch::get_estimate(int64_t item) const {return get_estimate(& template W count_min_sketch::get_estimate(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. 
return get_estimate(item.c_str(), item.length()); } @@ -176,7 +178,7 @@ void count_min_sketch::update(int64_t item, W weight) { template void count_min_sketch::update(const std::string& item, W weight) { - if (item.empty()) return; + if (item.empty()) { return; } update(item.c_str(), item.length(), weight); } @@ -201,7 +203,7 @@ W count_min_sketch::get_upper_bound(int64_t item) const {return get_upper_b template W count_min_sketch::get_upper_bound(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. return get_upper_bound(item.c_str(), item.length()); } @@ -218,7 +220,7 @@ W count_min_sketch::get_lower_bound(int64_t item) const {return get_lower_b template W count_min_sketch::get_lower_bound(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. return get_lower_bound(item.c_str(), item.length()); } @@ -232,17 +234,13 @@ void count_min_sketch::merge(const count_min_sketch &other_sketch) { /* * Merges this sketch into other_sketch sketch by elementwise summing of buckets */ - if (this == &other_sketch) { - throw std::invalid_argument( "Cannot merge a sketch with itself." ); - } + if (this == &other_sketch) { throw std::invalid_argument( "Cannot merge a sketch with itself." ); } bool acceptable_config = (get_num_hashes() == other_sketch.get_num_hashes()) && (get_num_buckets() == other_sketch.get_num_buckets()) && (get_seed() == other_sketch.get_seed()); - if (!acceptable_config) { - throw std::invalid_argument( "Incompatible sketch configuration." ); - } + if (!acceptable_config) { throw std::invalid_argument( "Incompatible sketch configuration." 
); } // Merge step - iterate over the other vector and add the weights to this sketch auto it = _sketch_array.begin(); // This is a std::vector iterator. @@ -290,7 +288,7 @@ void count_min_sketch::serialize(std::ostream& os) const { write(os, nhashes); write(os, seed_hash); write(os, unused8); - if (is_empty()) return; // sketch is empty, no need to write further bytes. + if (is_empty()) { return; } // sketch is empty, no need to write further bytes. // Long 2 write(os, _total_weight); @@ -327,7 +325,7 @@ auto count_min_sketch::deserialize(std::istream& is, uint64_t seed, const A } count_min_sketch c(nhashes, nbuckets, seed, allocator); const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; - if (is_empty == 1) return c; // sketch is empty, no need to read further. + if (is_empty == 1) { return c; } // sketch is empty, no need to read further. // Set the sketch weight and read in the sketch values const auto weight = read(is); @@ -373,7 +371,7 @@ auto count_min_sketch::serialize(unsigned header_size_bytes) const -> vecto ptr += copy_to_mem(nhashes, ptr); ptr += copy_to_mem(seed_hash, ptr); ptr += copy_to_mem(null_characters_8, ptr); - if (is_empty()) return bytes; // sketch is empty, no need to write further bytes. + if (is_empty()) { return bytes; } // sketch is empty, no need to write further bytes. // Long 2 const W t_weight = _total_weight; @@ -423,7 +421,7 @@ auto count_min_sketch::deserialize(const void* bytes, size_t size, uint64_t } count_min_sketch c(nhashes, nbuckets, seed, allocator); const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; - if (is_empty) return c; // sketch is empty, no need to read further. + if (is_empty) { return c; } // sketch is empty, no need to read further. 
ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes)); @@ -449,9 +447,7 @@ string count_min_sketch::to_string() const { // count the number of used entries in the sketch uint64_t num_nonzero = 0; for (const auto entry: _sketch_array) { - if (entry != static_cast(0.0)){ - ++num_nonzero; - } + if (entry != static_cast(0.0)) { ++num_nonzero; } } // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. From 13bb3a922457f15e6d15e389b61291a9651f6b06 Mon Sep 17 00:00:00 2001 From: syaojun Date: Mon, 23 Feb 2026 14:07:27 +0800 Subject: [PATCH 54/75] perf: Replace push_back with emplace_back to optimize object construction --- theta/include/theta_set_difference_base_impl.hpp | 2 +- tuple/include/array_tuple_sketch_impl.hpp | 4 ++-- tuple/include/tuple_sketch_impl.hpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/theta/include/theta_set_difference_base_impl.hpp b/theta/include/theta_set_difference_base_impl.hpp index 02317816..40f94a2f 100644 --- a/theta/include/theta_set_difference_base_impl.hpp +++ b/theta/include/theta_set_difference_base_impl.hpp @@ -69,7 +69,7 @@ CS theta_set_difference_base::compute(FwdSketch&& a, const Sketch const uint64_t hash = EK()(entry); if (hash < theta) { auto result = table.find(hash); - if (!result.second) entries.push_back(conditional_forward(entry)); + if (!result.second) entries.emplace_back(conditional_forward(entry)); } else if (a.is_ordered()) { break; // early stop } diff --git a/tuple/include/array_tuple_sketch_impl.hpp b/tuple/include/array_tuple_sketch_impl.hpp index 42b39216..ad0c999c 100644 --- a/tuple/include/array_tuple_sketch_impl.hpp +++ b/tuple/include/array_tuple_sketch_impl.hpp @@ -166,7 +166,7 @@ compact_array_tuple_sketch compact_array_tuple_sketch compact_array_tuple_sketch compact_tuple_sketch::deserialize(std::istream& for (size_t i = 0; i < num_entries; ++i) { const auto key = read(is); sd.deserialize(is, summary.get(), 1); - 
entries.push_back(Entry(key, std::move(*summary))); + entries.emplace_back(key, std::move(*summary)); (*summary).~S(); } } @@ -585,7 +585,7 @@ compact_tuple_sketch compact_tuple_sketch::deserialize(const void* b uint64_t key; ptr += copy_from_mem(ptr, key); ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1); - entries.push_back(Entry(key, std::move(*summary))); + entries.emplace_back(key, std::move(*summary)); (*summary).~S(); } } From c764d901ed9ce53ad41a28f60331411ce3445707 Mon Sep 17 00:00:00 2001 From: syaojun Date: Mon, 23 Feb 2026 14:31:11 +0800 Subject: [PATCH 55/75] style(kll): Add braces to single-line if statements for consistency --- kll/include/kll_helper_impl.hpp | 20 ++++++++++---------- kll/include/kll_sketch_impl.hpp | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kll/include/kll_helper_impl.hpp b/kll/include/kll_helper_impl.hpp index bb92bdc7..31534d9a 100644 --- a/kll/include/kll_helper_impl.hpp +++ b/kll/include/kll_helper_impl.hpp @@ -36,17 +36,17 @@ bool kll_helper::is_odd(uint32_t value) { } uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) { - if (denom > numer) return 0; + if (denom > numer) { return 0; } uint8_t count = 0; while (true) { denom <<= 1; - if (denom > numer) return count; + if (denom > numer) { return count; } count++; } } uint8_t kll_helper::ub_on_num_levels(uint64_t n) { - if (n == 0) return 1; + if (n == 0) { return 1; } return 1 + floor_of_log2_of_fraction(n, 1); } @@ -65,8 +65,8 @@ uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t heigh } uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) { - if (depth > 60) throw std::invalid_argument("depth > 60"); - if (depth <= 30) return int_cap_aux_aux(k, depth); + if (depth > 60) { throw std::invalid_argument("depth > 60"); } + if (depth <= 30) { return int_cap_aux_aux(k, depth); } const uint8_t half = depth / 2; const uint8_t rest = depth - half; const uint16_t tmp = 
int_cap_aux_aux(k, half); @@ -74,11 +74,11 @@ uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) { } uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) { - if (depth > 30) throw std::invalid_argument("depth > 30"); + if (depth > 30) { throw std::invalid_argument("depth > 30"); } const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2 const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]); const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2 - if (result > k) throw std::logic_error("result > k"); + if (result > k) { throw std::logic_error("result > k"); } return static_cast(result); } @@ -94,7 +94,7 @@ uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* template void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) { - if (!is_even(length)) throw std::invalid_argument("length must be even"); + if (!is_even(length)) { throw std::invalid_argument("length must be even"); } const uint32_t half_length = length / 2; #ifdef KLL_VALIDATION const uint32_t offset = deterministic_offset(); @@ -110,7 +110,7 @@ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) { template void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) { - if (!is_even(length)) throw std::invalid_argument("length must be even"); + if (!is_even(length)) { throw std::invalid_argument("length must be even"); } const uint32_t half_length = length / 2; #ifdef KLL_VALIDATION const uint32_t offset = deterministic_offset(); @@ -206,7 +206,7 @@ template kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items, uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted) { - if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed + if (num_levels_in == 0) { throw 
std::invalid_argument("num_levels_in == 0"); } // things are too weird if zero levels are allowed const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0]; uint8_t current_num_levels = num_levels_in; uint32_t current_item_count = starting_item_count; // decreases with each compaction diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index 44fe6a15..b12a39c8 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -199,7 +199,7 @@ void kll_sketch::update_min_max(const T& item) { template uint32_t kll_sketch::internal_update() { - if (levels_[0] == 0) compress_while_updating(); + if (levels_[0] == 0) { compress_while_updating(); } n_++; is_level_zero_sorted_ = false; return --levels_[0]; @@ -208,7 +208,7 @@ uint32_t kll_sketch::internal_update() { template template void kll_sketch::merge(FwdSk&& other) { - if (other.is_empty()) return; + if (other.is_empty()) { return; } if (m_ != other.m_) { throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_)); } @@ -224,9 +224,9 @@ void kll_sketch::merge(FwdSk&& other) { const uint32_t index = internal_update(); new (&items_[index]) T(conditional_forward(other.items_[i])); } - if (other.num_levels_ >= 2) merge_higher_levels(other, final_n); + if (other.num_levels_ >= 2) { merge_higher_levels(other, final_n); } n_ = final_n; - if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_); + if (other.is_estimation_mode()) { min_k_ = std::min(min_k_, other.min_k_); } assert_correct_total_weight(); reset_sorted_view(); } @@ -258,13 +258,13 @@ bool kll_sketch::is_estimation_mode() const { template T kll_sketch::get_min_item() const { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } return *min_item_; } template T kll_sketch::get_max_item() const { - if (is_empty()) 
throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } return *max_item_; } @@ -280,28 +280,28 @@ A kll_sketch::get_allocator() const { template double kll_sketch::get_rank(const T& item, bool inclusive) const { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_rank(item, inclusive); } template auto kll_sketch::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_PMF(split_points, size, inclusive); } template auto kll_sketch::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_CDF(split_points, size, inclusive); } template auto kll_sketch::get_quantile(double rank, bool inclusive) const -> quantile_return_type { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } if ((rank < 0.0) || (rank > 1.0)) { throw std::invalid_argument("normalized rank cannot be less than zero or greater than 1.0"); } From a46fc2f00e83fc152b7aeb6b7a27c7cb15a9494d Mon Sep 17 00:00:00 2001 From: syaojun Date: Mon, 23 Feb 2026 14:55:18 +0800 Subject: [PATCH 56/75] style(hll): Add braces to single-line if statements for consistency --- 
hll/include/CouponHashSet-internal.hpp | 3 +-- hll/include/CouponList-internal.hpp | 3 +-- hll/include/CubicInterpolation-internal.hpp | 6 +++--- hll/include/Hll4Array-internal.hpp | 2 +- hll/include/HllArray-internal.hpp | 16 ++++++++-------- hll/include/HllUnion-internal.hpp | 16 ++++++++++------ hll/include/coupon_iterator-internal.hpp | 4 ++-- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/hll/include/CouponHashSet-internal.hpp b/hll/include/CouponHashSet-internal.hpp index 7474cf2c..2ec4d6a8 100644 --- a/hll/include/CouponHashSet-internal.hpp +++ b/hll/include/CouponHashSet-internal.hpp @@ -176,8 +176,7 @@ CouponHashSet* CouponHashSet::newSet(std::istream& is, const A& allocator) read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t)); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return ptr.release(); } diff --git a/hll/include/CouponList-internal.hpp b/hll/include/CouponList-internal.hpp index a240a000..c92820e2 100644 --- a/hll/include/CouponList-internal.hpp +++ b/hll/include/CouponList-internal.hpp @@ -162,8 +162,7 @@ CouponList* CouponList::newList(std::istream& is, const A& allocator) { read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t)); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return ptr.release(); } diff --git a/hll/include/CubicInterpolation-internal.hpp b/hll/include/CubicInterpolation-internal.hpp index 9677b99d..fb74c402 100644 --- a/hll/include/CubicInterpolation-internal.hpp +++ b/hll/include/CubicInterpolation-internal.hpp @@ -165,10 +165,10 @@ static int recursiveFindStraddle(const double xArr[], const int l, const int r, throw std::logic_error("target value invariant violated in search"); } - if (l+1 == r) return (l); + if (l+1 == r) { return (l); } m 
= l + ((r-l)/2); - if (xArr[m] <= x) return (recursiveFindStraddle(xArr, m, r, x)); - else return (recursiveFindStraddle(xArr, l, m, x)); + if (xArr[m] <= x) { return (recursiveFindStraddle(xArr, m, r, x)); } + else { return (recursiveFindStraddle(xArr, l, m, x)); } } diff --git a/hll/include/Hll4Array-internal.hpp b/hll/include/Hll4Array-internal.hpp index 9d22006b..082f168f 100644 --- a/hll/include/Hll4Array-internal.hpp +++ b/hll/include/Hll4Array-internal.hpp @@ -131,7 +131,7 @@ uint8_t Hll4Array::getSlot(uint32_t slotNo) const { template uint8_t Hll4Array::adjustRawValue(uint32_t slot, uint8_t value) const { - if (value != hll_constants::AUX_TOKEN) return value + this->curMin_; + if (value != hll_constants::AUX_TOKEN) { return value + this->curMin_; } return auxHashMap_->mustFindValueFor(slot); } diff --git a/hll/include/HllArray-internal.hpp b/hll/include/HllArray-internal.hpp index 8986f068..62ea7f78 100644 --- a/hll/include/HllArray-internal.hpp +++ b/hll/include/HllArray-internal.hpp @@ -142,15 +142,16 @@ HllArray* HllArray::newHll(const void* bytes, size_t len, const A& allocat HllArray* sketch = HllSketchImplFactory::newHll(lgK, tgtHllType, startFullSizeFlag, allocator); sketch->putCurMin(curMin); sketch->putOutOfOrderFlag(oooFlag); - if (!oooFlag) sketch->putHipAccum(hip); + if (!oooFlag) { sketch->putHipAccum(hip); } sketch->putKxQ0(kxq0); sketch->putKxQ1(kxq1); sketch->putNumAtCurMin(numAtCurMin); std::memcpy(sketch->hllByteArr_.data(), data + hll_constants::HLL_BYTE_ARR_START, arrayBytes); - if (auxHashMap != nullptr) + if (auxHashMap != nullptr) { ((Hll4Array*)sketch)->putAuxHashMap(auxHashMap); + } aux_ptr.release(); return sketch; @@ -193,7 +194,7 @@ HllArray* HllArray::newHll(std::istream& is, const A& allocator) { const auto hip = read(is); const auto kxq0 = read(is); const auto kxq1 = read(is); - if (!oooFlag) sketch->putHipAccum(hip); + if (!oooFlag) { sketch->putHipAccum(hip); } sketch->putKxQ0(kxq0); sketch->putKxQ1(kxq1); @@ -209,8 +210,7 
@@ HllArray* HllArray::newHll(std::istream& is, const A& allocator) { ((Hll4Array*)sketch)->putAuxHashMap(auxHashMap); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return sketch_ptr.release(); } @@ -545,7 +545,7 @@ template void HllArray::hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue) { const uint32_t configK = 1 << this->getLgConfigK(); // update hip BEFORE updating kxq - if (!oooFlag_) hipAccum_ += configK / (kxq0_ + kxq1_); + if (!oooFlag_) { hipAccum_ += configK / (kxq0_ + kxq1_); } // update kxq0 and kxq1; subtract first, then add if (oldValue < 32) { kxq0_ -= INVERSE_POWERS_OF_2[oldValue]; } else { kxq1_ -= INVERSE_POWERS_OF_2[oldValue]; } @@ -648,7 +648,7 @@ array_(array), array_size_(array_size), index_(index), hll_type_(hll_type), exce { while (index_ < array_size_) { value_ = get_value(array_, index_, hll_type_, exceptions_, offset_); - if (all_ || value_ != hll_constants::EMPTY) break; + if (all_ || value_ != hll_constants::EMPTY) { break; } ++index_; } } @@ -657,7 +657,7 @@ template typename HllArray::const_iterator& HllArray::const_iterator::operator++() { while (++index_ < array_size_) { value_ = get_value(array_, index_, hll_type_, exceptions_, offset_); - if (all_ || value_ != hll_constants::EMPTY) break; + if (all_ || value_ != hll_constants::EMPTY) { break; } } return *this; } diff --git a/hll/include/HllUnion-internal.hpp b/hll/include/HllUnion-internal.hpp index 3a5a926c..27adab74 100644 --- a/hll/include/HllUnion-internal.hpp +++ b/hll/include/HllUnion-internal.hpp @@ -44,13 +44,13 @@ hll_sketch_alloc hll_union_alloc::get_result(target_hll_type target_type) template void hll_union_alloc::update(const hll_sketch_alloc& sketch) { - if (sketch.is_empty()) return; + if (sketch.is_empty()) { return; } union_impl(sketch, lg_max_k_); } template void hll_union_alloc::update(hll_sketch_alloc&& sketch) { - if 
(sketch.is_empty()) return; + if (sketch.is_empty()) { return; } if (gadget_.is_empty() && sketch.get_target_type() == HLL_8 && sketch.get_lg_config_k() <= lg_max_k_) { if (sketch.get_current_mode() == HLL || sketch.get_lg_config_k() == lg_max_k_) { gadget_ = std::move(sketch); @@ -131,29 +131,33 @@ void hll_union_alloc::coupon_update(uint32_t coupon) { template double hll_union_alloc::get_estimate() const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_estimate(); } template double hll_union_alloc::get_composite_estimate() const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_composite_estimate(); } template double hll_union_alloc::get_lower_bound(uint8_t num_std_dev) const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_lower_bound(num_std_dev); } template double hll_union_alloc::get_upper_bound(uint8_t num_std_dev) const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_upper_bound(num_std_dev); } diff --git a/hll/include/coupon_iterator-internal.hpp b/hll/include/coupon_iterator-internal.hpp index 84133ffb..356517ec 100644 --- a/hll/include/coupon_iterator-internal.hpp +++ b/hll/include/coupon_iterator-internal.hpp @@ -28,7 +28,7 @@ template coupon_iterator::coupon_iterator(const uint32_t* array, size_t array_size, size_t index, bool all): array_(array), array_size_(array_size), index_(index), all_(all) { while (index_ < array_size_) { 
- if (all_ || array_[index_] != hll_constants::EMPTY) break; + if (all_ || array_[index_] != hll_constants::EMPTY) { break; } ++index_; } } @@ -36,7 +36,7 @@ array_(array), array_size_(array_size), index_(index), all_(all) { template coupon_iterator& coupon_iterator::operator++() { while (++index_ < array_size_) { - if (all_ || array_[index_] != hll_constants::EMPTY) break; + if (all_ || array_[index_] != hll_constants::EMPTY) { break; } } return *this; } From b444a2ad2db9d70c4782e081cb4d92eb9b6cb8b8 Mon Sep 17 00:00:00 2001 From: syaojun Date: Mon, 23 Feb 2026 15:00:40 +0800 Subject: [PATCH 57/75] style(fi): Add braces to single-line if statements for consistency --- fi/include/frequent_items_sketch_impl.hpp | 17 ++++++++--------- fi/include/reverse_purge_hash_map_impl.hpp | 10 +++++----- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fi/include/frequent_items_sketch_impl.hpp b/fi/include/frequent_items_sketch_impl.hpp index acbd2ee1..3eba188b 100644 --- a/fi/include/frequent_items_sketch_impl.hpp +++ b/fi/include/frequent_items_sketch_impl.hpp @@ -45,13 +45,13 @@ map( allocator ) { - if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size"); + if (lg_start_map_size > lg_max_map_size) { throw std::invalid_argument("starting size must not be greater than maximum size"); } } template void frequent_items_sketch::update(const T& item, W weight) { check_weight(weight); - if (weight == 0) return; + if (weight == 0) { return; } total_weight += weight; offset += map.adjust_or_insert(item, weight); } @@ -59,14 +59,14 @@ void frequent_items_sketch::update(const T& item, W weight) { template void frequent_items_sketch::update(T&& item, W weight) { check_weight(weight); - if (weight == 0) return; + if (weight == 0) { return; } total_weight += weight; offset += map.adjust_or_insert(std::move(item), weight); } template void frequent_items_sketch::merge(const frequent_items_sketch& other) { - if 
(other.is_empty()) return; + if (other.is_empty()) { return; } const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end for (auto it: other.map) { update(it.first, it.second); @@ -77,7 +77,7 @@ void frequent_items_sketch::merge(const frequent_items_sketch& ot template void frequent_items_sketch::merge(frequent_items_sketch&& other) { - if (other.is_empty()) return; + if (other.is_empty()) { return; } const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end for (auto it: other.map) { update(std::move(it.first), it.second); @@ -105,7 +105,7 @@ template W frequent_items_sketch::get_estimate(const T& item) const { // if item is tracked estimate = weight + offset, otherwise 0 const W weight = map.get(item); - if (weight > 0) return weight + offset; + if (weight > 0) { return weight + offset; } return 0; } @@ -210,7 +210,7 @@ void frequent_items_sketch::serialize(std::ostream& os, const Ser template template size_t frequent_items_sketch::get_serialized_size_bytes(const SerDe& sd) const { - if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t); + if (is_empty()) { return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t); } size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W); for (auto it: map) size += sd.size_of_item(it.first); return size; @@ -328,8 +328,7 @@ frequent_items_sketch frequent_items_sketch::deser sketch.total_weight = total_weight; sketch.offset = offset; } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return sketch; } diff --git a/fi/include/reverse_purge_hash_map_impl.hpp b/fi/include/reverse_purge_hash_map_impl.hpp index fa2ad824..63909cf3 100644 --- a/fi/include/reverse_purge_hash_map_impl.hpp +++ b/fi/include/reverse_purge_hash_map_impl.hpp @@ -74,7 +74,7 @@ states_(nullptr) if (other.states_[i] > 0) { new 
(&keys_[i]) K(other.keys_[i]); values_[i] = other.values_[i]; - if (--num == 0) break; + if (--num == 0) { break; } } } } @@ -105,7 +105,7 @@ reverse_purge_hash_map::~reverse_purge_hash_map() { for (uint32_t i = 0; i < size; i++) { if (is_active(i)) { keys_[i].~K(); - if (--num_active_ == 0) break; + if (--num_active_ == 0) { break; } } } } @@ -166,7 +166,7 @@ V reverse_purge_hash_map::get(const K& key) const { const uint32_t mask = (1 << lg_cur_size_) - 1; uint32_t probe = fmix64(H()(key)) & mask; while (is_active(probe)) { - if (E()(keys_[probe], key)) return values_[probe]; + if (E()(keys_[probe], key)) { return values_[probe]; } probe = (probe + 1) & mask; } return 0; @@ -271,7 +271,7 @@ void reverse_purge_hash_map::hash_delete(uint32_t delete_index) { probe = (probe + 1) & mask; drift++; // only used for theoretical analysis - if (drift >= DRIFT_LIMIT) throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT"); + if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT"); } } } @@ -289,7 +289,7 @@ uint32_t reverse_purge_hash_map::internal_adjust_or_insert(const index = (index + 1) & mask; drift++; // only used for theoretical analysis - if (drift >= DRIFT_LIMIT) throw std::logic_error("drift limit reached"); + if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift limit reached"); } } // adding the key and value to the table if (num_active_ > get_capacity()) { From 4e92e0bd71db0d00af93e3be6fc8a3f9152173e1 Mon Sep 17 00:00:00 2001 From: syaojun Date: Mon, 23 Feb 2026 15:13:51 +0800 Subject: [PATCH 58/75] style(cpc): Fix missing braces in if statements in cpc/include --- cpc/include/cpc_sketch_impl.hpp | 69 +++++++++++++++++---------------- cpc/include/cpc_union_impl.hpp | 54 +++++++++++++------------- cpc/include/cpc_util.hpp | 14 +++---- cpc/include/icon_estimator.hpp | 10 ++--- cpc/include/u32_table_impl.hpp | 32 +++++++-------- 5 files changed, 90 insertions(+), 89 deletions(-) diff --git 
a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp index 84709cdc..80f111f1 100644 --- a/cpc/include/cpc_sketch_impl.hpp +++ b/cpc/include/cpc_sketch_impl.hpp @@ -73,7 +73,7 @@ bool cpc_sketch_alloc::is_empty() const { template double cpc_sketch_alloc::get_estimate() const { - if (!was_merged) return get_hip_estimate(); + if (!was_merged) { return get_hip_estimate(); } return get_icon_estimate(); } @@ -92,7 +92,7 @@ double cpc_sketch_alloc::get_lower_bound(unsigned kappa) const { if (kappa < 1 || kappa > 3) { throw std::invalid_argument("kappa must be 1, 2 or 3"); } - if (!was_merged) return get_hip_confidence_lb(*this, kappa); + if (!was_merged) { return get_hip_confidence_lb(*this, kappa); } return get_icon_confidence_lb(*this, kappa); } @@ -101,13 +101,13 @@ double cpc_sketch_alloc::get_upper_bound(unsigned kappa) const { if (kappa < 1 || kappa > 3) { throw std::invalid_argument("kappa must be 1, 2 or 3"); } - if (!was_merged) return get_hip_confidence_ub(*this, kappa); + if (!was_merged) { return get_hip_confidence_ub(*this, kappa); } return get_icon_confidence_ub(*this, kappa); } template void cpc_sketch_alloc::update(const std::string& value) { - if (value.empty()) return; + if (value.empty()) { return; } update(value.c_str(), value.length()); } @@ -173,15 +173,15 @@ void cpc_sketch_alloc::update(float value) { } static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) { - if (lg_k > 26) throw std::logic_error("lg_k > 26"); + if (lg_k > 26) { throw std::logic_error("lg_k > 26"); } const uint32_t k = 1 << lg_k; uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64 - if (col > 63) col = 63; // clip so that 0 <= col <= 63 + if (col > 63) { col = 63; } // clip so that 0 <= col <= 63 const uint32_t row = hash0 & (k - 1); uint32_t row_col = (row << 6) | col; // To avoid the hash table's "empty" value, we change the row of the following pair. 
// This case is extremely unlikely, but we might as well handle it. - if (row_col == UINT32_MAX) row_col ^= 1 << 6; + if (row_col == UINT32_MAX) { row_col ^= 1 << 6; } return row_col; } @@ -195,7 +195,7 @@ void cpc_sketch_alloc::update(const void* value, size_t size) { template void cpc_sketch_alloc::row_col_update(uint32_t row_col) { const uint8_t col = row_col & 63; - if (col < first_interesting_column) return; // important speed optimization + if (col < first_interesting_column) { return; } // important speed optimization // window size is 0 until sketch is promoted from sparse to windowed if (sliding_window.size() == 0) { update_sparse(row_col); @@ -208,26 +208,26 @@ template void cpc_sketch_alloc::update_sparse(uint32_t row_col) { const uint32_t k = 1 << lg_k; const uint64_t c32pre = static_cast(num_coupons) << 5; - if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE + if (c32pre >= 3 * k) { throw std::logic_error("c32pre >= 3 * k"); } // C < 3K/32, in other words flavor == SPARSE bool is_novel = surprising_value_table.maybe_insert(row_col); if (is_novel) { num_coupons++; update_hip(row_col); const uint64_t c32post = static_cast(num_coupons) << 5; - if (c32post >= 3 * k) promote_sparse_to_windowed(); // C >= 3K/32 + if (c32post >= 3 * k) { promote_sparse_to_windowed(); } // C >= 3K/32 } } // the flavor is HYBRID, PINNED, or SLIDING template void cpc_sketch_alloc::update_windowed(uint32_t row_col) { - if (window_offset > 56) throw std::logic_error("wrong window offset"); + if (window_offset > 56) { throw std::logic_error("wrong window offset"); } const uint32_t k = 1 << lg_k; const uint64_t c32pre = static_cast(num_coupons) << 5; - if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID + if (c32pre < 3 * k) { throw std::logic_error("c32pre < 3 * k"); } // C < 3K/32, in other words flavor >= HYBRID const uint64_t c8pre = static_cast(num_coupons) << 3; 
const uint64_t w8pre = static_cast(window_offset) << 3; - if (c8pre >= (27 + w8pre) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset) + if (c8pre >= (27 + w8pre) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + (K * window_offset) bool is_novel = false; const uint8_t col = row_col & 63; @@ -235,7 +235,7 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { if (col < window_offset) { // track the surprising 0's "before" the window is_novel = surprising_value_table.maybe_delete(row_col); // inverted logic } else if (col < window_offset + 8) { // track the 8 bits inside the window - if (col < window_offset) throw std::logic_error("col < window_offset"); + if (col < window_offset) { throw std::logic_error("col < window_offset"); } const uint32_t row = row_col >> 6; const uint8_t old_bits = sliding_window[row]; const uint8_t new_bits = old_bits | (1 << (col - window_offset)); @@ -244,7 +244,7 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { is_novel = true; } } else { // track the surprising 1's "after" the window - if (col < window_offset + 8) throw std::logic_error("col < window_offset + 8"); + if (col < window_offset + 8) { throw std::logic_error("col < window_offset + 8"); } is_novel = surprising_value_table.maybe_insert(row_col); // normal logic } @@ -254,9 +254,9 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { const uint64_t c8post = static_cast(num_coupons) << 3; if (c8post >= (27 + w8pre) * k) { move_window(); - if (window_offset < 1 || window_offset > 56) throw std::logic_error("wrong window offset"); + if (window_offset < 1 || window_offset > 56) { throw std::logic_error("wrong window offset"); } const uint64_t w8post = static_cast(window_offset) << 3; - if (c8post >= (27 + w8post) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset) + if (c8post >= (27 + w8post) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + 
(K * window_offset) } } } @@ -276,7 +276,7 @@ template void cpc_sketch_alloc::promote_sparse_to_windowed() { const uint32_t k = 1 << lg_k; const uint64_t c32 = static_cast(num_coupons) << 5; - if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32"); + if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) { throw std::logic_error("wrong c32"); } sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it) @@ -285,7 +285,7 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { const uint32_t* old_slots = surprising_value_table.get_slots(); const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size(); - if (window_offset != 0) throw std::logic_error("window_offset != 0"); + if (window_offset != 0) { throw std::logic_error("window_offset != 0"); } for (uint32_t i = 0; i < old_num_slots; i++) { const uint32_t row_col = old_slots[i]; @@ -297,7 +297,7 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { } else { // cannot use u32_table::must_insert(), because it doesn't provide for growth const bool is_novel = new_table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } } @@ -308,17 +308,17 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { template void cpc_sketch_alloc::move_window() { const uint8_t new_offset = window_offset + 1; - if (new_offset > 56) throw std::logic_error("new_offset > 56"); - if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong"); + if (new_offset > 56) { throw std::logic_error("new_offset > 56"); } + if (new_offset != determine_correct_offset(lg_k, num_coupons)) { throw std::logic_error("new_offset is wrong"); } - if (sliding_window.size() == 0) throw std::logic_error("no sliding window"); + if (sliding_window.size() == 0) { throw std::logic_error("no sliding window"); } const uint32_t k = 1 << lg_k; // Construct the 
full-sized bit matrix that corresponds to the sketch vector_u64 bit_matrix = build_bit_matrix(); // refresh the KXP register on every 8th window shift. - if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data()); + if ((new_offset & 0x7) == 0) { refresh_kxp(bit_matrix.data()); } surprising_value_table.clear(); // the new number of surprises will be about the same @@ -339,14 +339,14 @@ void cpc_sketch_alloc::move_window() { pattern = pattern ^ (static_cast(1) << col); // erase the 1 const uint32_t row_col = (i << 6) | col; const bool is_novel = surprising_value_table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } window_offset = new_offset; first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored); - if (first_interesting_column > new_offset) first_interesting_column = new_offset; // corner case + if (first_interesting_column > new_offset) { first_interesting_column = new_offset; } // corner case } // The KXP register is a double with roughly 50 bits of precision, but @@ -438,7 +438,7 @@ void cpc_sketch_alloc::serialize(std::ostream& os) const { write(os, compressed.table_num_entries); // HIP values can be in two different places in the sequence of fields // this is the first HIP decision point - if (has_hip) write_hip(os); + if (has_hip) { write_hip(os); } } if (has_table) { write(os, compressed.table_data_words); @@ -447,7 +447,7 @@ void cpc_sketch_alloc::serialize(std::ostream& os) const { write(os, compressed.window_data_words); } // this is the second HIP decision point - if (has_hip && !(has_table && has_window)) write_hip(os); + if (has_hip && !(has_table && has_window)) { write_hip(os); } if (has_window) { write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t)); } @@ -494,7 +494,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += 
copy_to_mem(compressed.table_num_entries, ptr); // HIP values can be in two different places in the sequence of fields // this is the first HIP decision point - if (has_hip) ptr += copy_hip_to_mem(ptr); + if (has_hip) { ptr += copy_hip_to_mem(ptr); } } if (has_table) { ptr += copy_to_mem(compressed.table_data_words, ptr); @@ -503,7 +503,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += copy_to_mem(compressed.window_data_words, ptr); } // this is the second HIP decision point - if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr); + if (has_hip && !(has_table && has_window)) { ptr += copy_hip_to_mem(ptr); } if (has_window) { ptr += copy_to_mem(compressed.window_data.data(), ptr, compressed.window_data_words * sizeof(uint32_t)); } @@ -511,7 +511,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += copy_to_mem(compressed.table_data.data(), ptr, compressed.table_data_words * sizeof(uint32_t)); } } - if (ptr != bytes.data() + size) throw std::logic_error("serialized size mismatch"); + if (ptr != bytes.data() + size) { throw std::logic_error("serialized size mismatch"); } return bytes; } @@ -561,7 +561,7 @@ cpc_sketch_alloc cpc_sketch_alloc::deserialize(std::istream& is, uint64_t compressed.table_data.resize(compressed.table_data_words); read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t)); } - if (!has_window) compressed.table_num_entries = num_coupons; + if (!has_window) { compressed.table_num_entries = num_coupons; } } uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window); @@ -583,8 +583,9 @@ cpc_sketch_alloc cpc_sketch_alloc::deserialize(std::istream& is, uint64_t } uncompressed_state uncompressed(allocator); get_compressor().uncompress(compressed, uncompressed, lg_k, num_coupons); - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { + throw 
std::runtime_error("error reading from std::istream"); + } return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table), std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed); } diff --git a/cpc/include/cpc_union_impl.hpp b/cpc/include/cpc_union_impl.hpp index f277107f..673aa7a4 100644 --- a/cpc/include/cpc_union_impl.hpp +++ b/cpc/include/cpc_union_impl.hpp @@ -109,15 +109,15 @@ void cpc_union_alloc::internal_update(S&& sketch) { + std::to_string(seed_hash_sketch)); } const auto src_flavor = sketch.determine_flavor(); - if (cpc_sketch_alloc::flavor::EMPTY == src_flavor) return; + if (cpc_sketch_alloc::flavor::EMPTY == src_flavor) { return; } - if (sketch.get_lg_k() < lg_k) reduce_k(sketch.get_lg_k()); - if (sketch.get_lg_k() < lg_k) throw std::logic_error("sketch lg_k < union lg_k"); + if (sketch.get_lg_k() < lg_k) { reduce_k(sketch.get_lg_k()); } + if (sketch.get_lg_k() < lg_k) { throw std::logic_error("sketch lg_k < union lg_k"); } - if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit matrix are absent"); + if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit matrix are absent"); } if (cpc_sketch_alloc::flavor::SPARSE == src_flavor && accumulator != nullptr) { // Case A - if (bit_matrix.size() > 0) throw std::logic_error("union bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("union bit_matrix is not expected"); } const auto initial_dest_flavor = accumulator->determine_flavor(); if (cpc_sketch_alloc::flavor::EMPTY != initial_dest_flavor && cpc_sketch_alloc::flavor::SPARSE != initial_dest_flavor) throw std::logic_error("wrong flavor"); @@ -138,24 +138,24 @@ void cpc_union_alloc::internal_update(S&& sketch) { } if (cpc_sketch_alloc::flavor::SPARSE == src_flavor && bit_matrix.size() > 0) { // Case B - if (accumulator != nullptr) throw std::logic_error("union accumulator != 
null"); + if (accumulator != nullptr) { throw std::logic_error("union accumulator != null"); } or_table_into_matrix(sketch.surprising_value_table); return; } if (cpc_sketch_alloc::flavor::HYBRID != src_flavor && cpc_sketch_alloc::flavor::PINNED != src_flavor - && cpc_sketch_alloc::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); + && cpc_sketch_alloc::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); } // source is past SPARSE mode, so make sure that dest is a bit matrix if (accumulator != nullptr) { - if (bit_matrix.size() > 0) throw std::logic_error("union bit matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("union bit matrix is not expected"); } const auto dst_flavor = accumulator->determine_flavor(); if (cpc_sketch_alloc::flavor::EMPTY != dst_flavor && cpc_sketch_alloc::flavor::SPARSE != dst_flavor) { throw std::logic_error("wrong flavor"); } switch_to_bit_matrix(); } - if (bit_matrix.size() == 0) throw std::logic_error("union bit_matrix is expected"); + if (bit_matrix.size() == 0) { throw std::logic_error("union bit_matrix is expected"); } if (cpc_sketch_alloc::flavor::HYBRID == src_flavor || cpc_sketch_alloc::flavor::PINNED == src_flavor) { // Case C or_window_into_matrix(sketch.sliding_window, sketch.window_offset, sketch.get_lg_k()); @@ -165,7 +165,7 @@ void cpc_union_alloc::internal_update(S&& sketch) { // SLIDING mode involves inverted logic, so we can't just walk the source sketch. // Instead, we convert it to a bitMatrix that can be OR'ed into the destination. 
- if (cpc_sketch_alloc::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D + if (cpc_sketch_alloc::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); } // Case D vector_u64 src_matrix = sketch.build_bit_matrix(); or_matrix_into_matrix(src_matrix, sketch.get_lg_k()); } @@ -173,20 +173,20 @@ void cpc_union_alloc::internal_update(S&& sketch) { template cpc_sketch_alloc cpc_union_alloc::get_result() const { if (accumulator != nullptr) { - if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); } return get_result_from_accumulator(); } - if (bit_matrix.size() == 0) throw std::logic_error("bit_matrix is expected"); + if (bit_matrix.size() == 0) { throw std::logic_error("bit_matrix is expected"); } return get_result_from_bit_matrix(); } template cpc_sketch_alloc cpc_union_alloc::get_result_from_accumulator() const { - if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k"); + if (lg_k != accumulator->get_lg_k()) { throw std::logic_error("lg_k != accumulator->lg_k"); } if (accumulator->get_num_coupons() == 0) { return cpc_sketch_alloc(lg_k, seed, accumulator->get_allocator()); } - if (accumulator->determine_flavor() != cpc_sketch_alloc::flavor::SPARSE) throw std::logic_error("wrong flavor"); + if (accumulator->determine_flavor() != cpc_sketch_alloc::flavor::SPARSE) { throw std::logic_error("wrong flavor"); } cpc_sketch_alloc copy(*accumulator); copy.was_merged = true; return copy; @@ -199,7 +199,7 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { const auto flavor = cpc_sketch_alloc::determine_flavor(lg_k, num_coupons); if (flavor != cpc_sketch_alloc::flavor::HYBRID && flavor != cpc_sketch_alloc::flavor::PINNED - && flavor != cpc_sketch_alloc::flavor::SLIDING) throw std::logic_error("wrong flavor"); + && flavor != cpc_sketch_alloc::flavor::SLIDING) { throw 
std::logic_error("wrong flavor"); } const uint8_t offset = cpc_sketch_alloc::determine_correct_offset(lg_k, num_coupons); @@ -208,7 +208,7 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { // dynamically growing caused snowplow effect uint8_t table_lg_size = lg_k - 4; // K/16; in some cases this will end up being oversized - if (table_lg_size < 2) table_lg_size = 2; + if (table_lg_size < 2) { table_lg_size = 2; } u32_table table(table_lg_size, 6 + lg_k, bit_matrix.get_allocator()); // the following should work even when the offset is zero @@ -229,14 +229,14 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { pattern = pattern ^ (static_cast(1) << col); // erase the 1 const uint32_t row_col = (i << 6) | col; bool is_novel = table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } // at this point we could shrink an oversized hash table, but the relative waste isn't very big uint8_t first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored); - if (first_interesting_column > offset) first_interesting_column = offset; // corner case + if (first_interesting_column > offset) { first_interesting_column = offset; } // corner case // HIP-related fields will contain zeros, and that is okay return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(table), std::move(sliding_window), false, 0, 0, seed); @@ -260,9 +260,9 @@ void cpc_union_alloc::walk_table_updating_sketch(const u32_table& table) { // Using a golden ratio stride fixes the snowplow effect. 
const double golden = 0.6180339887498949025; uint32_t stride = static_cast(golden * static_cast(num_slots)); - if (stride < 2) throw std::logic_error("stride < 2"); - if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd - if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range"); + if (stride < 2) { throw std::logic_error("stride < 2"); } + if (stride == ((stride >> 1) << 1)) { stride += 1; } // force the stride to be odd + if (stride < 3 || stride >= num_slots) { throw std::out_of_range("stride out of range"); } for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) { j &= num_slots - 1; @@ -290,7 +290,7 @@ void cpc_union_alloc::or_table_into_matrix(const u32_table& table) { template void cpc_union_alloc::or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k) { - if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK"); + if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); } const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK const uint32_t src_k = 1 << src_lg_k; for (uint32_t src_row = 0; src_row < src_k; src_row++) { @@ -300,7 +300,7 @@ void cpc_union_alloc::or_window_into_matrix(const vector_bytes& sliding_windo template void cpc_union_alloc::or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k) { - if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK"); + if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); } const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK const uint32_t src_k = 1 << src_lg_k; for (uint32_t src_row = 0; src_row < src_k; src_row++) { @@ -310,11 +310,11 @@ void cpc_union_alloc::or_matrix_into_matrix(const vector_u64& src_matrix, uin template void cpc_union_alloc::reduce_k(uint8_t new_lg_k) { - if (new_lg_k >= lg_k) throw std::logic_error("new LgK >= union lgK"); - if (accumulator == nullptr && bit_matrix.size() == 0) throw 
std::logic_error("both accumulator and bit_matrix are absent"); + if (new_lg_k >= lg_k) { throw std::logic_error("new LgK >= union lgK"); } + if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit_matrix are absent"); } if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix - if (accumulator != nullptr) throw std::logic_error("accumulator is not null"); + if (accumulator != nullptr) { throw std::logic_error("accumulator is not null"); } vector_u64 old_matrix = std::move(bit_matrix); const uint8_t old_lg_k = lg_k; const uint32_t new_k = 1 << new_lg_k; @@ -325,7 +325,7 @@ void cpc_union_alloc::reduce_k(uint8_t new_lg_k) { } if (accumulator != nullptr) { // downsample the unioner's sketch - if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); } if (!accumulator->is_empty()) { cpc_sketch_alloc old_accumulator(*accumulator); *accumulator = cpc_sketch_alloc(new_lg_k, seed, old_accumulator.get_allocator()); diff --git a/cpc/include/cpc_util.hpp b/cpc/include/cpc_util.hpp index e5664951..c9da8ab7 100644 --- a/cpc/include/cpc_util.hpp +++ b/cpc/include/cpc_util.hpp @@ -25,19 +25,19 @@ namespace datasketches { static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) { - if (y == 0) throw std::invalid_argument("divide_longs_rounding_up: bad argument"); + if (y == 0) { throw std::invalid_argument("divide_longs_rounding_up: bad argument"); } const uint64_t quotient = x / y; - if (quotient * y == x) return (quotient); - else return quotient + 1; + if (quotient * y == x) { return (quotient); } + else { return quotient + 1; } } static inline uint8_t floor_log2_of_long(uint64_t x) { - if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument"); + if (x < 1) { throw std::invalid_argument("floor_log2_of_long: bad argument"); } uint8_t p = 0; uint64_t y = 1; while (true) { - if (y 
== x) return p; - if (y > x) return p - 1; + if (y == x) { return p; } + if (y > x) { return p - 1; } p += 1; y <<= 1; } @@ -98,7 +98,7 @@ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, ui } static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) { - if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8"); + if ((length & 0x7) != 0) { throw std::invalid_argument("the length of the array must be a multiple of 8"); } uint32_t total = 0; uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights; fours = twos = ones = 0; diff --git a/cpc/include/icon_estimator.hpp b/cpc/include/icon_estimator.hpp index fb3c0c60..ade787e5 100644 --- a/cpc/include/icon_estimator.hpp +++ b/cpc/include/icon_estimator.hpp @@ -246,14 +246,14 @@ static inline double icon_exponential_approximation(double k, double c) { } static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) { - if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range"); - if (c < 2) return ((c == 0) ? 0.0 : 1.0); + if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) { throw std::out_of_range("lg_k out of range"); } + if (c < 2) { return ((c == 0) ? 0.0 : 1.0); } const uint32_t k = 1 << lg_k; const double double_k = static_cast(k); const double double_c = static_cast(c); // Differing thresholds ensure that the approximated estimator is monotonically increasing. const double threshold_factor = ((lg_k < 14) ? 
5.7 : 5.6); - if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c); + if (double_c > (threshold_factor * double_k)) { return icon_exponential_approximation(double_k, double_c); } const double factor = evaluate_polynomial( ICON_POLYNOMIAL_COEFFICIENTS, ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K), @@ -265,8 +265,8 @@ static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) { // The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS const double term = 1.0 + (ratio * ratio * ratio / 66.774757); const double result = double_c * factor * term; - if (result >= double_c) return result; - else return double_c; + if (result >= double_c) { return result; } + else { return double_c; } } } /* namespace datasketches */ diff --git a/cpc/include/u32_table_impl.hpp b/cpc/include/u32_table_impl.hpp index 62cd7dac..85797bcf 100644 --- a/cpc/include/u32_table_impl.hpp +++ b/cpc/include/u32_table_impl.hpp @@ -43,8 +43,8 @@ num_valid_bits(num_valid_bits), num_items(0), slots(1ULL << lg_size, UINT32_MAX, allocator) { - if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2"); - if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32"); + if (lg_size < 2) { throw std::invalid_argument("lg_size must be >= 2"); } + if (num_valid_bits < 1 || num_valid_bits > 32) { throw std::invalid_argument("num_valid_bits must be between 1 and 32"); } } template @@ -71,8 +71,8 @@ void u32_table::clear() { template bool u32_table::maybe_insert(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == item) return false; - if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert"); + if (slots[index] == item) { return false; } + if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); } slots[index] = item; num_items++; if (U32_TABLE_UPSIZE_DENOM * num_items > 
U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) { @@ -84,9 +84,9 @@ bool u32_table::maybe_insert(uint32_t item) { template bool u32_table::maybe_delete(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == UINT32_MAX) return false; - if (slots[index] != item) throw std::logic_error("item does not exist"); - if (num_items == 0) throw std::logic_error("delete error"); + if (slots[index] == UINT32_MAX) { return false; } + if (slots[index] != item) { throw std::logic_error("item does not exist"); } + if (num_items == 0) { throw std::logic_error("delete error"); } // delete the item slots[index] = UINT32_MAX; num_items--; @@ -129,7 +129,7 @@ uint32_t u32_table::lookup(uint32_t item) const { const uint32_t mask = size - 1; const uint8_t shift = num_valid_bits - lg_size; uint32_t probe = item >> shift; - if (probe > mask) throw std::logic_error("probe out of range"); + if (probe > mask) { throw std::logic_error("probe out of range"); } while (slots[probe] != item && slots[probe] != UINT32_MAX) { probe = (probe + 1) & mask; } @@ -140,17 +140,17 @@ uint32_t u32_table::lookup(uint32_t item) const { template void u32_table::must_insert(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == item) throw std::logic_error("item exists"); - if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert"); + if (slots[index] == item) { throw std::logic_error("item exists"); } + if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); } slots[index] = item; } template void u32_table::rebuild(uint8_t new_lg_size) { - if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2"); + if (new_lg_size < 2) { throw std::logic_error("lg_size must be >= 2"); } const uint32_t old_size = 1 << lg_size; const uint32_t new_size = 1 << new_lg_size; - if (new_size <= num_items) throw std::logic_error("new_size <= num_items"); + if (new_size <= num_items) { throw std::logic_error("new_size <= num_items"); } vector_u32 old_slots 
= std::move(slots); slots = vector_u32(new_size, UINT32_MAX, old_slots.get_allocator()); lg_size = new_lg_size; @@ -169,7 +169,7 @@ void u32_table::rebuild(uint8_t new_lg_size) { // The result is nearly sorted, so make sure to use an efficient sort for that case template auto u32_table::unwrapping_get_items() const -> vector_u32 { - if (num_items == 0) return vector_u32(slots.get_allocator()); + if (num_items == 0) { return vector_u32(slots.get_allocator()); } const uint32_t table_size = 1 << lg_size; vector_u32 result(num_items, 0, slots.get_allocator()); size_t i = 0; @@ -187,9 +187,9 @@ auto u32_table::unwrapping_get_items() const -> vector_u32 { // the rest of the table is processed normally while (i < table_size) { const uint32_t item = slots[i++]; - if (item != UINT32_MAX) result[l++] = item; + if (item != UINT32_MAX) { result[l++] = item; } } - if (l != r + 1) throw std::logic_error("unwrapping error"); + if (l != r + 1) { throw std::logic_error("unwrapping error"); } return result; } @@ -213,7 +213,7 @@ void u32_table::merge( else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; } else { arr_c[c] = arr_b[b++]; } } - if (a != lim_a || b != lim_b) throw std::logic_error("merging error"); + if (a != lim_a || b != lim_b) { throw std::logic_error("merging error"); } } // In applications where the input array is already nearly sorted, From c9bf1e88a08f2d0fe58fddee5ed79a7ee703039b Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sun, 8 Mar 2026 22:44:06 +0900 Subject: [PATCH 59/75] refactor: rollback utf8 validation --- NOTICE | 3 - common/CMakeLists.txt | 11 - common/include/third_party/utf8cpp/LICENSE | 23 - common/include/third_party/utf8cpp/utf8.h | 46 -- .../third_party/utf8cpp/utf8/checked.h | 359 ------------- .../include/third_party/utf8cpp/utf8/core.h | 500 ------------------ .../include/third_party/utf8cpp/utf8/cpp11.h | 70 --- .../include/third_party/utf8cpp/utf8/cpp17.h | 96 ---- .../include/third_party/utf8cpp/utf8/cpp20.h | 124 ----- 
.../third_party/utf8cpp/utf8/unchecked.h | 286 ---------- tuple/include/array_of_strings_sketch.hpp | 27 +- .../include/array_of_strings_sketch_impl.hpp | 13 - tuple/test/array_of_strings_sketch_test.cpp | 12 - 13 files changed, 22 insertions(+), 1548 deletions(-) delete mode 100644 common/include/third_party/utf8cpp/LICENSE delete mode 100644 common/include/third_party/utf8cpp/utf8.h delete mode 100644 common/include/third_party/utf8cpp/utf8/checked.h delete mode 100644 common/include/third_party/utf8cpp/utf8/core.h delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp11.h delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp17.h delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp20.h delete mode 100644 common/include/third_party/utf8cpp/utf8/unchecked.h diff --git a/NOTICE b/NOTICE index 6a2376d9..11ba6f6c 100644 --- a/NOTICE +++ b/NOTICE @@ -10,6 +10,3 @@ The Apache Software Foundation (http://www.apache.org/). Prior to moving to ASF, the software for this project was developed at Yahoo Inc. (https://developer.yahoo.com). - -This product includes utf8cpp (https://github.com/nemtrif/utfcpp), -licensed under the Boost Software License, Version 1.0. 
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 2d5c7330..8514433b 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,14 +51,3 @@ install(FILES include/serde.hpp include/xxhash64.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") - -install(FILES - include/third_party/utf8cpp/utf8.h - include/third_party/utf8cpp/utf8/checked.h - include/third_party/utf8cpp/utf8/core.h - include/third_party/utf8cpp/utf8/cpp11.h - include/third_party/utf8cpp/utf8/cpp17.h - include/third_party/utf8cpp/utf8/cpp20.h - include/third_party/utf8cpp/utf8/unchecked.h - include/third_party/utf8cpp/LICENSE - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp") diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE deleted file mode 100644 index 36b7cd93..00000000 --- a/common/include/third_party/utf8cpp/LICENSE +++ /dev/null @@ -1,23 +0,0 @@ -Boost Software License - Version 1.0 - August 17th, 2003 - -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h deleted file mode 100644 index b5135309..00000000 --- a/common/include/third_party/utf8cpp/utf8.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -/* -To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro -and set it to one of the values used by the __cplusplus predefined macro. - -For instance, - #define UTF_CPP_CPLUSPLUS 199711L -will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. -Some library features will be disabled. - -If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. -*/ - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h deleted file mode 100644 index 96ceb4d5..00000000 --- a/common/include/third_party/utf8cpp/utf8/checked.h +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2006-2016 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of 
machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - utfchar32_t cp; - public: - invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } - utfchar32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - utfchar8_t u8; - public: - invalid_utf8 (utfchar8_t u) : u8(u) {} - invalid_utf8 (char c) : u8(static_cast(c)) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } - utfchar8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - utfchar16_t u16; - public: - invalid_utf16 (utfchar16_t u) : u16(u) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } - utfchar16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } - }; - - /// The library API - 
functions intended to be called by the users - - template - octet_iterator append(utfchar32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - return internal::append(cp, result); - } - - inline void append(utfchar32_t cp, std::string& s) - { - append(cp, std::back_inserter(s)); - } - - template - word_iterator append16(utfchar32_t cp, word_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - return internal::append16(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline 
std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - template - utfchar32_t next(octet_iterator& it, octet_iterator end) - { - utfchar32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(static_cast(*it)); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - utfchar32_t next16(word_iterator& it, word_iterator end) - { - utfchar32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); - if (err_code == internal::NOT_ENOUGH_ROOM) - throw not_enough_room(); - return cp; - } - - template - utfchar32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - utfchar32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - template - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::prior(it, end); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::next(it, end); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 
0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - utfchar32_t cp = static_cast(utf8::internal::mask16(*start++)); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - const utfchar32_t trail_surrogate = static_cast(utf8::internal::mask16(*start++)); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - const utfchar32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - typedef utfchar32_t value_type; - typedef utfchar32_t* pointer; - typedef utfchar32_t& reference; - typedef std::ptrdiff_t 
difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& rangestart, - const octet_iterator& rangeend) : - it(octet_it), range_start(rangestart), range_end(rangeend) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - utfchar32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later -#include "cpp20.h" -#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later -#include "cpp17.h" -#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later -#include "cpp11.h" -#endif // C++ 11 or later - -#endif //header guard - diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h deleted file mode 100644 index 8e128c18..00000000 --- a/common/include/third_party/utf8cpp/utf8/core.h +++ /dev/null @@ -1,500 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the 
software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include -#include -#include - -// Determine the C++ standard version. -// If the user defines UTF_CPP_CPLUSPLUS, use that. 
-// Otherwise, trust the unreliable predefined macro __cplusplus - -#if !defined UTF_CPP_CPLUSPLUS - #define UTF_CPP_CPLUSPLUS __cplusplus -#endif - -#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later - #define UTF_CPP_OVERRIDE override - #define UTF_CPP_NOEXCEPT noexcept - #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert"); -#else // C++ 98/03 - #define UTF_CPP_OVERRIDE - #define UTF_CPP_NOEXCEPT throw() - // Not worth simulating static_assert: - #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition); -#endif // C++ 11 or later - - -namespace utf8 -{ -// The typedefs for 8-bit, 16-bit and 32-bit code units -#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later - #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later - typedef char8_t utfchar8_t; - #else // C++ 11/14/17 - typedef unsigned char utfchar8_t; - #endif - typedef char16_t utfchar16_t; - typedef char32_t utfchar32_t; -#else // C++ 98/03 - typedef unsigned char utfchar8_t; - typedef unsigned short utfchar16_t; - typedef unsigned int utfchar32_t; -#endif // C++ 11 or later - -// Helper code - not intended to be directly called by the library users. 
May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; - const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; - const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - - // Maximum valid value for a Unicode code point - const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline utfchar8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - - template - inline utfchar16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - inline bool is_lead_surrogate(utfchar32_t cp) - { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); - } - - inline bool is_trail_surrogate(utfchar32_t cp) - { - return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); - } - - inline bool is_surrogate(utfchar32_t cp) - { - return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); - } - - inline bool is_code_point_valid(utfchar32_t cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - inline bool is_in_bmp(utfchar32_t cp) - { - return cp < utfchar32_t(0x10000); - } - - template - int sequence_length(octet_iterator lead_it) - { - const utfchar8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - inline bool is_overlong_sequence(utfchar32_t cp, int 
length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, const octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = static_cast(utf8::internal::mask8(*it)); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = static_cast(utf8::internal::mask8(*it)); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = static_cast(utf8::internal::mask8(*it)); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = static_cast(code_point + ((*it) & 0x3f)); - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) - { - if (it == 
end) - return NOT_ENOUGH_ROOM; - - code_point = static_cast(utf8::internal::mask8(*it)); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = static_cast(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = static_cast(code_point + ((*it) & 0x3f)); - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - utfchar32_t cp = 0; - // Determine the sequence length based on the lead octet - const int length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. 
- code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - utfchar32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - - template - utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) - { - // Make sure the iterator dereferences a large enough type - typedef typename std::iterator_traits::value_type word_type; - UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); - // Check the edge case: - if (it == end) - return NOT_ENOUGH_ROOM; - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - word_iterator original_it = it; - - utf_error err = UTF8_OK; - - const utfchar16_t first_word = *it++; - if (!is_surrogate(first_word)) { - code_point = first_word; - return UTF8_OK; - } - else { - if (it == end) - err = NOT_ENOUGH_ROOM; - else if (is_lead_surrogate(first_word)) { - const utfchar16_t second_word = *it++; - if (is_trail_surrogate(static_cast(second_word))) { - code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; - return UTF8_OK; - } else - err = INCOMPLETE_SEQUENCE; - - } else { - err = INVALID_LEAD; - } - } - // error branch - it = original_it; - return err; - } - - // Internal implementation of both checked and unchecked append() function - // This function will be invoked by the overloads below, as they will know - // the octet_type. 
- template - octet_iterator append(utfchar32_t cp, octet_iterator result) { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - // One of the following overloads will be invoked from the API calls - - // A simple (but dangerous) case: the caller appends byte(s) to a char array - inline char* append(utfchar32_t cp, char* result) { - return append(cp, result); - } - - // Hopefully, most common case: the caller uses back_inserter - // i.e. append(cp, std::back_inserter(str)); - template - std::back_insert_iterator append - (utfchar32_t cp, std::back_insert_iterator result) { - return append, - typename container_type::value_type>(cp, result); - } - - // The caller uses some other kind of output operator - not covered above - // Note that in this case we are not able to determine octet_type - // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. - template - octet_iterator append(utfchar32_t cp, octet_iterator result) { - return append(cp, result); - } - - // Internal implementation of both checked and unchecked append16() function - // This function will be invoked by the overloads below, as they will know - // the word_type. 
- template - word_iterator append16(utfchar32_t cp, word_iterator result) { - UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); - if (is_in_bmp(cp)) - *(result++) = static_cast(cp); - else { - // Code points from the supplementary planes are encoded via surrogate pairs - *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); - *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); - } - return result; - } - - // Hopefully, most common case: the caller uses back_inserter - // i.e. append16(cp, std::back_inserter(str)); - template - std::back_insert_iterator append16 - (utfchar32_t cp, std::back_insert_iterator result) { - return append16, - typename container_type::value_type>(cp, result); - } - - // The caller uses some other kind of output operator - not covered above - // Note that in this case we are not able to determine word_type - // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. - template - word_iterator append16(utfchar32_t cp, word_iterator result) { - return append16(cp, result); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - inline const char* find_invalid(const char* str) - { - const char* end = str + std::strlen(str); - return find_invalid(str, end); - } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - inline bool is_valid(const char* str) - { - return (*(utf8::find_invalid(str)) == '\0'); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } -} // namespace utf8 - -#endif // header guard - diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h deleted file mode 100644 index 691633c8..00000000 --- a/common/include/third_party/utf8cpp/utf8/cpp11.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 -#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 - -#include "checked.h" - -namespace utf8 -{ - inline void append16(utfchar32_t cp, std::u16string& s) - { - append16(cp, std::back_inserter(s)); - } - - inline std::string utf16to8(const std::u16string& s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::string& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(const std::u32string& s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::string& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } -} // namespace utf8 - -#endif // header guard - diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h deleted file mode 100644 index 07587300..00000000 --- a/common/include/third_party/utf8cpp/utf8/cpp17.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and 
transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
-*/ - - -#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 -#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 - -#include "cpp11.h" - -namespace utf8 -{ - inline std::string utf16to8(std::u16string_view s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(std::string_view s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(std::u32string_view s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(std::string_view s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(std::string_view s) - { - std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(std::string_view s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(std::string_view s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(std::string_view s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(std::string_view s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h deleted file mode 100644 index 07b61d0f..00000000 --- a/common/include/third_party/utf8cpp/utf8/cpp20.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2022 Nemanja Trifunovic - -/* -Permission is hereby 
granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
-*/ - - -#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 -#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 - -#include "cpp17.h" - -namespace utf8 -{ - inline std::u8string utf16tou8(const std::u16string& s) - { - std::u8string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u8string utf16tou8(std::u16string_view s) - { - std::u8string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::u8string& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::u8string_view& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u8string utf32tou8(const std::u32string& s) - { - std::u8string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u8string utf32tou8(const std::u32string_view& s) - { - std::u8string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::u8string& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::u8string_view& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(const std::u8string& s) - { - std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? 
std::string_view::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(const std::u8string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) - { - std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::u8string replace_invalid(const std::u8string& s) - { - std::u8string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::u8string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h deleted file mode 100644 index 173d0302..00000000 --- a/common/include/third_party/utf8cpp/utf8/unchecked.h +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(utfchar32_t cp, octet_iterator result) - { - return internal::append(cp, result); - } - - template - word_iterator append16(utfchar32_t cp, word_iterator result) - { - return internal::append16(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append(replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::unchecked::append(replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append(replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, 
output_iterator out) - { - static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); - return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); - } - - inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - template - utfchar32_t next(octet_iterator& it) - { - utfchar32_t cp = utf8::internal::mask8(*it); - switch (utf8::internal::sequence_length(it)) { - case 1: - break; - case 2: - ++it; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp = static_cast(cp + ((*it) & 0x3f)); - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp = static_cast(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); - ++it; - cp = static_cast(cp + ((*it) & 0x3f)); - break; - } - ++it; - return cp; - } - - template - utfchar32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - utfchar32_t next16(word_iterator& it) - { - utfchar32_t cp = utf8::internal::mask16(*it++); - if (utf8::internal::is_lead_surrogate(cp)) - return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; - return cp; - } - - template - utfchar32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - template - void advance(octet_iterator& it, distance_type n) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::unchecked::prior(it); - } else { - // forward - for 
(distance_type i = zero; i < n; ++i) - utf8::unchecked::next(it); - } - } - - template - typename std::iterator_traits::difference_type - distance(octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return dist; - } - - template - octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - utfchar32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start == end) - return result; - utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - utfchar32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - public: - typedef utfchar32_t value_type; - typedef utfchar32_t* pointer; - typedef utfchar32_t& reference; - typedef std::ptrdiff_t difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - 
iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - utfchar32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - -#endif // header guard - diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index ac49fd5b..296c0a87 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -42,8 +42,16 @@ class default_array_of_strings_update_policy { void update(array_of_strings& array, const array_of_strings* input) const; }; -// serializer/deserializer for an array of strings -// Requirements: all strings must be valid UTF-8 and array size must be <= 127. +/** + * Serializer/deserializer for an array of strings. + * + * Requirements: + * - Array size must be <= 127. + * + * This serde does not perform UTF-8 validation. Callers must ensure strings + * are valid UTF-8 before serialization to guarantee interoperability with + * Java, Go, and Rust implementations. 
+ */ template> struct default_array_of_strings_serde { using summary_allocator = typename std::allocator_traits::template rebind_alloc; @@ -60,7 +68,6 @@ struct default_array_of_strings_serde { summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); - static void check_utf8(const std::string& value); }; /** @@ -69,8 +76,18 @@ struct default_array_of_strings_serde { uint64_t hash_array_of_strings_key(const array_of_strings& key); /** - * Extended class of compact_tuple_sketch for array of strings - * Requirements: all strings must be valid UTF-8 and array size must be <= 127. + * Extended class of compact_tuple_sketch for array of strings. + * + * Requirements: + * - Array size must be <= 127. + * + * UTF-8 compatibility: + * Serialized sketches are intended to be language and platform independent. + * Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings. + * This C++ implementation does not validate UTF-8; it is the caller's + * responsibility to ensure all strings are valid UTF-8 before calling update(). + * Non-UTF-8 strings may serialize successfully but will fail to deserialize + * in other language implementations. 
*/ template> class compact_array_of_strings_tuple_sketch: diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 81045472..26751d66 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -22,9 +22,7 @@ #include -#include "array_of_strings_sketch.hpp" #include "common_defs.hpp" -#include "third_party/utf8cpp/utf8.h" namespace datasketches { @@ -116,7 +114,6 @@ void default_array_of_strings_serde::serialize( write(os, num_nodes); const std::string* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { - check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); write(os, length); os.write(data[j].data(), length); @@ -143,7 +140,6 @@ void default_array_of_strings_serde::deserialize( is.read(&value[0], length); if (!is) throw std::runtime_error("array_of_strings stream read failed"); } - check_utf8(value); array[j] = std::move(value); } summary_allocator alloc(summary_allocator_); @@ -166,7 +162,6 @@ size_t default_array_of_strings_serde::serialize( bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); const std::string* data = items[i].data(); for (uint8_t j = 0; j < num_nodes; ++j) { - check_utf8(data[j]); const uint32_t length = static_cast(data[j].size()); bytes_written += copy_to_mem(length, ptr8 + bytes_written); @@ -200,7 +195,6 @@ size_t default_array_of_strings_serde::deserialize( if (length != 0) { bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } - check_utf8(value); array[j] = std::move(value); } summary_allocator alloc(summary_allocator_); @@ -233,13 +227,6 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar return static_cast(total); } -template -void default_array_of_strings_serde::check_utf8(const std::string& value) { - if (!utf8::is_valid(value.begin(), value.end())) { - throw std::runtime_error("array_of_strings contains invalid UTF-8"); - } -} - } /* 
namespace datasketches */ #endif diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index dc21aceb..5507c071 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -256,18 +256,6 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { TEST_CASE("aos serde validation", "[tuple_sketch]") { default_array_of_strings_serde<> serde; - SECTION("invalid utf8 rejected") { - array_of_strings array(1, "", std::allocator()); - const std::string invalid_utf8("\xC3\x28", 2); - array[0] = invalid_utf8; - std::stringstream ss; - ss.exceptions(std::ios::failbit | std::ios::badbit); - REQUIRE_THROWS_WITH( - serde.serialize(ss, &array, 1), - Catch::Matchers::Contains("invalid UTF-8") - ); - } - SECTION("too many nodes rejected") { array_of_strings array(128, "", std::allocator()); std::stringstream ss; From bc447d2307cd6501119dfdfd6889946e35d709c2 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sat, 14 Mar 2026 00:42:35 +0900 Subject: [PATCH 60/75] fix: destroy in the failure after partial success --- .../include/array_of_strings_sketch_impl.hpp | 93 ++++++++++++------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 26751d66..7884c5e5 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -125,25 +125,39 @@ template void default_array_of_strings_serde::deserialize( std::istream& is, array_of_strings* items, unsigned num ) const { - for (unsigned i = 0; i < num; ++i) { - read(is); // total_bytes - if (!is) throw std::runtime_error("array_of_strings stream read failed"); - const uint8_t num_nodes = read(is); - if (!is) throw std::runtime_error("array_of_strings stream read failed"); - check_num_nodes(num_nodes); - array_of_strings array(num_nodes, ""); - for (uint8_t j = 0; j < num_nodes; 
++j) { - const uint32_t length = read(is); - if (!is) throw std::runtime_error("array_of_strings stream read failed"); - std::string value(length, '\0'); - if (length != 0) { - is.read(&value[0], length); - if (!is) throw std::runtime_error("array_of_strings stream read failed"); + unsigned i = 0; + bool failure = false; + try { + for (; i < num; ++i) { + read(is); // total_bytes + if (!is) { failure = true; break; } + const uint8_t num_nodes = read(is); + if (!is) { failure = true; break; } + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, ""); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + if (!is) { failure = true; break; } + std::string value(length, '\0'); + if (length != 0) { + is.read(&value[0], length); + if (!is) { failure = true; break; } + } + array[j] = std::move(value); } - array[j] = std::move(value); + if (failure) break; + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } + } catch (std::istream::failure&) { + failure = true; + } + if (failure) { summary_allocator alloc(summary_allocator_); - std::allocator_traits::construct(alloc, &items[i], std::move(array)); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i)); } } @@ -177,28 +191,37 @@ size_t default_array_of_strings_serde::deserialize( ) const { const uint8_t* ptr8 = static_cast(ptr); size_t bytes_read = 0; - - for (unsigned i = 0; i < num; ++i) { - check_memory_size(bytes_read + sizeof(uint32_t), capacity); - const size_t item_start = bytes_read; - uint32_t total_bytes; - bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); - check_memory_size(item_start + total_bytes, capacity); - uint8_t num_nodes; - bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); - check_num_nodes(num_nodes); - array_of_strings array(num_nodes, ""); - for 
(uint8_t j = 0; j < num_nodes; ++j) { - uint32_t length; - bytes_read += copy_from_mem(ptr8 + bytes_read, length); - std::string value(length, '\0'); - if (length != 0) { - bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + unsigned i = 0; + + try { + for (; i < num; ++i) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); + const size_t item_start = bytes_read; + uint32_t total_bytes; + bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); + check_memory_size(item_start + total_bytes, capacity); + uint8_t num_nodes; + bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, ""); + for (uint8_t j = 0; j < num_nodes; ++j) { + uint32_t length; + bytes_read += copy_from_mem(ptr8 + bytes_read, length); + std::string value(length, '\0'); + if (length != 0) { + bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + } + array[j] = std::move(value); } - array[j] = std::move(value); + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } + } catch (...) 
{ summary_allocator alloc(summary_allocator_); - std::allocator_traits::construct(alloc, &items[i], std::move(array)); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + throw; } return bytes_read; } From 12a5116abe1065f57c1858b4b5a3362103ebe018 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sat, 14 Mar 2026 01:13:20 +0900 Subject: [PATCH 61/75] fix: more stricter check --- .../include/array_of_strings_sketch_impl.hpp | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 7884c5e5..f38dc1fb 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -107,17 +107,23 @@ template void default_array_of_strings_serde::serialize( std::ostream& os, const array_of_strings* items, unsigned num ) const { - for (unsigned i = 0; i < num; ++i) { - const uint32_t total_bytes = compute_total_bytes(items[i]); - const uint8_t num_nodes = static_cast(items[i].size()); - write(os, total_bytes); - write(os, num_nodes); - const std::string* data = items[i].data(); - for (uint8_t j = 0; j < num_nodes; ++j) { - const uint32_t length = static_cast(data[j].size()); - write(os, length); - os.write(data[j].data(), length); + unsigned i = 0; + try { + for (; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } } + } catch (std::runtime_error& e) { + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings stream write failed at item " + std::to_string(i)); } } 
@@ -149,7 +155,7 @@ void default_array_of_strings_serde::deserialize( summary_allocator alloc(summary_allocator_); std::allocator_traits::construct(alloc, &items[i], std::move(array)); } - } catch (std::istream::failure&) { + } catch (...) { failure = true; } if (failure) { @@ -191,6 +197,7 @@ size_t default_array_of_strings_serde::deserialize( ) const { const uint8_t* ptr8 = static_cast(ptr); size_t bytes_read = 0; + unsigned i = 0; try { @@ -200,15 +207,21 @@ size_t default_array_of_strings_serde::deserialize( uint32_t total_bytes; bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); check_memory_size(item_start + total_bytes, capacity); + + check_memory_size(bytes_read + sizeof(uint8_t), capacity); uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); + array_of_strings array(num_nodes, ""); for (uint8_t j = 0; j < num_nodes; ++j) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); + std::string value(length, '\0'); if (length != 0) { + check_memory_size(bytes_read + length, capacity); bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } array[j] = std::move(value); @@ -216,12 +229,13 @@ size_t default_array_of_strings_serde::deserialize( summary_allocator alloc(summary_allocator_); std::allocator_traits::construct(alloc, &items[i], std::move(array)); } - } catch (...) 
{ + } catch (std::exception& e) { summary_allocator alloc(summary_allocator_); for (unsigned j = 0; j < i; ++j) { std::allocator_traits::destroy(alloc, &items[j]); } - throw; + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings bytes read failed at item " + std::to_string(i)); } return bytes_read; } From 7617df45a1f2d6e8cad54b31aa5b77b007214874 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Sat, 14 Mar 2026 01:33:23 +0900 Subject: [PATCH 62/75] refactor: change code for consistency --- tuple/include/array_of_strings_sketch_impl.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index f38dc1fb..400df477 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -155,8 +155,13 @@ void default_array_of_strings_serde::deserialize( summary_allocator alloc(summary_allocator_); std::allocator_traits::construct(alloc, &items[i], std::move(array)); } - } catch (...) 
{ - failure = true; + } catch (std::exception& e) { + summary_allocator alloc(summary_allocator_); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i)); } if (failure) { summary_allocator alloc(summary_allocator_); @@ -219,9 +224,9 @@ size_t default_array_of_strings_serde::deserialize( uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); + check_memory_size(bytes_read + length, capacity); std::string value(length, '\0'); if (length != 0) { - check_memory_size(bytes_read + length, capacity); bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } array[j] = std::move(value); From c65472084a7f314163c12fe2b4476f0b6dcb7a9a Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 16 Mar 2026 15:21:27 +0900 Subject: [PATCH 63/75] doc: update utf8 compatibility about serde --- common/include/serde.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common/include/serde.hpp b/common/include/serde.hpp index ad20fe63..c4e46d7d 100644 --- a/common/include/serde.hpp +++ b/common/include/serde.hpp @@ -132,6 +132,11 @@ struct serde::value>::type> { /// ItemsSketch with ArrayOfStringsSerDe in Java. /// The length of each string is stored as a 32-bit integer (historically), /// which may be too wasteful. Treat this as an example. +/// +/// This implementation treats std::string as an arbitrary byte container. +/// It does not check whether string contents are valid UTF-8. +/// +/// Use a UTF-8-validating SerDe when cross-language portability is required. 
template<> struct serde { /// @copydoc serde::serialize From 04104c04dffa15f47f07f65d476f2ee78b531b42 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 16 Mar 2026 15:23:11 +0900 Subject: [PATCH 64/75] doc: add comments about utf8 compatibility for tuple sketch --- tuple/include/tuple_sketch.hpp | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index cbfd9f11..7b636a78 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -46,6 +46,11 @@ struct pair_extract_key { /** * Base class for Tuple sketch. * This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key. + * + * Summary that may retain string values. + * For Summary containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. */ template< typename Summary, @@ -253,6 +258,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given string. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key string to update the sketch with * @param value to update the sketch with */ @@ -261,6 +269,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 64-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key uint64_t to update the sketch with * @param value to update the sketch with */ @@ -269,6 +280,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 64-bit integer. 
+ * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key int64_t to update the sketch with * @param value to update the sketch with */ @@ -277,6 +291,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint32_t to update the sketch with * @param value to update the sketch with @@ -286,6 +303,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int32_t to update the sketch with * @param value to update the sketch with @@ -295,6 +315,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint16_t to update the sketch with * @param value to update the sketch with @@ -304,6 +327,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. 
* @param key int16_t to update the sketch with * @param value to update the sketch with @@ -313,6 +339,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint8_t to update the sketch with * @param value to update the sketch with @@ -322,6 +351,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int8_t to update the sketch with * @param value to update the sketch with @@ -331,6 +363,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given double-precision floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key double to update the sketch with * @param value to update the sketch with @@ -340,6 +375,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. 
* @param key float to update the sketch with * @param value to update the sketch with @@ -357,6 +395,9 @@ class update_tuple_sketch: public tuple_sketch { * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key pointer to the data * @param length of the data in bytes * @param value to update the sketch with From 1cfe24520492e331a732d6f640cd7ab705d93583 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 16 Mar 2026 15:23:20 +0900 Subject: [PATCH 65/75] doc: add comments about utf8 compatibility for sampling sketches --- sampling/include/ebpps_sketch.hpp | 13 +++++++++++++ sampling/include/var_opt_sketch.hpp | 9 +++++++++ sampling/include/var_opt_union.hpp | 6 +++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sampling/include/ebpps_sketch.hpp b/sampling/include/ebpps_sketch.hpp index 038b5a30..615d37b8 100644 --- a/sampling/include/ebpps_sketch.hpp +++ b/sampling/include/ebpps_sketch.hpp @@ -50,6 +50,11 @@ namespace ebpps_constants { * The sample may be smaller than k and the resulting size of the sample potentially includes * a probabilistic component, meaning the resulting sample size is not always constant. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * * @author Jon Malkin */ template< @@ -71,6 +76,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. 
+ * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -79,6 +86,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -87,6 +96,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(const ebpps_sketch& sketch); @@ -94,6 +105,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(ebpps_sketch&& sketch); diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 1324883c..6b157caa 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -57,6 +57,11 @@ namespace var_opt_constants { * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for * subset sum estimation. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. 
+ * * author Kevin Lang * author Jon Malkin */ @@ -111,6 +116,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -119,6 +126,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp index 0e4f76d8..68d1ac4b 100644 --- a/sampling/include/var_opt_union.hpp +++ b/sampling/include/var_opt_union.hpp @@ -65,13 +65,17 @@ class var_opt_union { /** * Updates this union with the given sketch * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sk a sketch to add to the union */ void update(const var_opt_sketch& sk); - + /** * Updates this union with the given sketch * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. 
* @param sk a sketch to add to the union */ void update(var_opt_sketch&& sk); From 14c20a636404874b858b374311b470bc5d64dcff Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 16 Mar 2026 15:23:35 +0900 Subject: [PATCH 66/75] doc: add comments about utf8 compatibility for frequency sketch --- fi/include/frequent_items_sketch.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp index 0aa9514c..87ee174e 100644 --- a/fi/include/frequent_items_sketch.hpp +++ b/fi/include/frequent_items_sketch.hpp @@ -44,6 +44,11 @@ enum frequent_items_error_type { * Based on Java implementation here: * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java * @author Alexander Saydakov + * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. */ template< typename T, @@ -74,6 +79,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item for which the weight should be increased (lvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. @@ -82,6 +89,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). 
* @param item for which the weight should be increased (rvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. @@ -91,6 +100,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (lvalue) */ void merge(const frequent_items_sketch& other); @@ -98,6 +109,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (rvalue) */ void merge(frequent_items_sketch&& other); From a9b42755072b079fd90b29b9851adc121015c58e Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Mon, 16 Mar 2026 15:24:16 +0900 Subject: [PATCH 67/75] doc: add comments about utf8 compatibility for quantiels sketches --- kll/include/kll_sketch.hpp | 11 ++++++++++- quantiles/include/quantiles_sketch.hpp | 9 +++++++++ req/include/req_sketch.hpp | 9 +++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index 904587a1..d672c419 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -46,6 +46,11 @@ namespace kll_constants { * and nearly optimal accuracy per retained item. * See Optimal Quantile Approximation in Streams. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

This is a stochastic streaming sketch that enables near real-time analysis of the * approximate distribution of items from a very large stream in a single pass, requiring only * that the items are comparable. @@ -56,7 +61,7 @@ namespace kll_constants { *

As of May 2020, this implementation produces serialized sketches which are binary-compatible * with the equivalent Java implementation only when template parameter T = float * (32-bit single precision values). - * + * *

Given an input stream of N items, the natural rank of any specific * item is defined as its index (1 to N) in inclusive mode * or (0 to N-1) in exclusive mode @@ -225,6 +230,8 @@ class kll_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -232,6 +239,8 @@ class kll_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp index b1e2e3c1..e995e3e3 100644 --- a/quantiles/include/quantiles_sketch.hpp +++ b/quantiles/include/quantiles_sketch.hpp @@ -47,6 +47,11 @@ namespace quantiles_constants { * The analysis is obtained using get_rank() and get_quantile() functions, * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF(). * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Consider a large stream of one million values such as packet sizes coming into a network node. * The natural rank of any specific size value is its index in the hypothetical sorted * array of values. @@ -206,6 +211,8 @@ class quantiles_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -213,6 +220,8 @@ class quantiles_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp index 21ccac0c..52295bd2 100755 --- a/req/include/req_sketch.hpp +++ b/req/include/req_sketch.hpp @@ -35,6 +35,11 @@ namespace datasketches { * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty, * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Reference: https://arxiv.org/abs/2004.01668

* *

This implementation differs from the algorithm described in the paper in the following:

@@ -179,6 +184,8 @@ class req_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -186,6 +193,8 @@ class req_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template From bda16fd2287cc523f6422d4781a51da2429e70c4 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 21 Mar 2026 23:06:03 -0700 Subject: [PATCH 68/75] Update GHA Code Coverage workflow (#493) --- .github/workflows/code_coverage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml index 060242fa..09a8dbc9 100644 --- a/.github/workflows/code_coverage.yml +++ b/.github/workflows/code_coverage.yml @@ -37,7 +37,7 @@ jobs: - name: Generate coverage .info run: cmake --build build --target coverage_report - name: Post to Coveralls - uses: coverallsapp/github-action@master + uses: coverallsapp/github-action@v2 with: github-token: ${{ secrets.GITHUB_TOKEN }} path-to-lcov: build/lcov.info From 5e20ad04e53b3bdcf7dc4ff6f964818731da09a5 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Tue, 24 Mar 2026 14:25:25 +0900 Subject: [PATCH 69/75] fix: allow positive weight only --- sampling/include/var_opt_sketch.hpp | 2 +- sampling/include/var_opt_sketch_impl.hpp | 7 +++---- sampling/test/var_opt_sketch_test.cpp | 14 ++++++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 6b157caa..df080c6e 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -272,7 +272,7 @@ class var_opt_sketch { typedef typename 
std::allocator_traits::template rebind_alloc AllocDouble; typedef typename std::allocator_traits::template rebind_alloc AllocBool; - static const uint32_t MIN_LG_ARR_ITEMS = 3; + static const uint32_t MIN_LG_ARR_ITEMS = 4; static const uint8_t PREAMBLE_LONGS_EMPTY = 1; static const uint8_t PREAMBLE_LONGS_WARMUP = 3; diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp index 36ee3fc8..30d526af 100644 --- a/sampling/include/var_opt_sketch_impl.hpp +++ b/sampling/include/var_opt_sketch_impl.hpp @@ -772,12 +772,11 @@ string var_opt_sketch::items_to_string(bool print_gap) const { template template void var_opt_sketch::update(O&& item, double weight, bool mark) { - if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) { - throw std::invalid_argument("Item weights must be nonnegative and finite. Found: " + if (weight <= 0.0 || std::isnan(weight) || std::isinf(weight)) { + throw std::invalid_argument("Item weights must be positive and finite. Found: " + std::to_string(weight)); - } else if (weight == 0.0) { - return; } + ++n_; if (r_ == 0) { diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp index 71d16e91..179d7016 100644 --- a/sampling/test/var_opt_sketch_test.cpp +++ b/sampling/test/var_opt_sketch_test.cpp @@ -178,11 +178,17 @@ TEST_CASE("varopt sketch: non-empty degenerate sketch", "[var_opt_sketch]") { TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") { var_opt_sketch sk(100, resize_factor::X2); - REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument); - // should not throw but sketch should still be empty - sk.update("zero weight", 0.0); - REQUIRE(sk.is_empty()); + // Negative + REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument); + // Zero + REQUIRE_THROWS_AS(sk.update("zero_weight", 0.0), std::invalid_argument); + // NaN + REQUIRE_THROWS_AS(sk.update("NaN_weight", std::numeric_limits::quiet_NaN()), 
std::invalid_argument); + // +Inf + REQUIRE_THROWS_AS(sk.update("positive_infinity", std::numeric_limits::infinity()), std::invalid_argument); + // -Inf + REQUIRE_THROWS_AS(sk.update("negative_infinity", -std::numeric_limits::infinity()), std::invalid_argument); } TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") { From de35ce73d52f7c4b13d87129892a50cf50a6f948 Mon Sep 17 00:00:00 2001 From: lani_karrot Date: Wed, 25 Mar 2026 09:55:12 +0900 Subject: [PATCH 70/75] ci: upload coverage report directly --- .github/workflows/code_coverage.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml index 09a8dbc9..69fa94ec 100644 --- a/.github/workflows/code_coverage.yml +++ b/.github/workflows/code_coverage.yml @@ -37,7 +37,8 @@ jobs: - name: Generate coverage .info run: cmake --build build --target coverage_report - name: Post to Coveralls - uses: coverallsapp/github-action@v2 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: build/lcov.info + run: | + curl -sL https://coveralls.io/coveralls-linux.tar.gz | tar -xz + ./coveralls report build/lcov.info + env: + COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 53588892848771e4a90238ca07a32ec990e605d3 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 27 Mar 2026 10:35:46 -0700 Subject: [PATCH 71/75] fix get_RSE() --- req/include/req_sketch_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp index 3c1c2fc1..7f0b4557 100755 --- a/req/include/req_sketch_impl.hpp +++ b/req/include/req_sketch_impl.hpp @@ -293,7 +293,7 @@ double req_sketch::get_rank_upper_bound(double rank, uint8_t num_std_de template double req_sketch::get_RSE(uint16_t k, double rank, bool hra, uint64_t n) { - return get_rank_lb(k, 2, rank, 1, n, hra); + return get_rank_ub(k, 2, rank, 1, n, hra) - rank; } template From 
9130a0751ee48882680a4c80284372c20c901a3d Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 27 Mar 2026 16:19:16 -0700 Subject: [PATCH 72/75] add get_RSE() test --- req/test/req_sketch_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp index 2a338b8a..d9c9a16e 100755 --- a/req/test/req_sketch_test.cpp +++ b/req/test/req_sketch_test.cpp @@ -43,6 +43,7 @@ TEST_CASE("req sketch: empty", "[req_sketch]") { REQUIRE_FALSE(sketch.is_estimation_mode()); REQUIRE(sketch.get_n() == 0); REQUIRE(sketch.get_num_retained() == 0); + REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, true, 0) == 0); REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error); REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error); REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error); @@ -61,6 +62,7 @@ TEST_CASE("req sketch: single value, lra", "[req_sketch]") { REQUIRE_FALSE(sketch.is_estimation_mode()); REQUIRE(sketch.get_n() == 1); REQUIRE(sketch.get_num_retained() == 1); + REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, false, sketch.get_n()) == 0); REQUIRE(sketch.get_rank(1.0f, false) == 0); REQUIRE(sketch.get_rank(1.0f) == 1); REQUIRE(sketch.get_rank(1.1f, false) == 1); From 0a885718cad4f32e7cc240dba288d0a12b6b09b0 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 11 Apr 2026 17:44:14 +0900 Subject: [PATCH 73/75] fix: missing header when compile using gcc 15 --- common/include/serde.hpp | 1 + fi/include/reverse_purge_hash_map.hpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/include/serde.hpp b/common/include/serde.hpp index c4e46d7d..02c2fc16 100644 --- a/common/include/serde.hpp +++ b/common/include/serde.hpp @@ -20,6 +20,7 @@ #ifndef DATASKETCHES_SERDE_HPP_ #define DATASKETCHES_SERDE_HPP_ +#include #include #include #include diff --git a/fi/include/reverse_purge_hash_map.hpp b/fi/include/reverse_purge_hash_map.hpp index b75abc43..5d59c187 100644 --- a/fi/include/reverse_purge_hash_map.hpp 
+++ b/fi/include/reverse_purge_hash_map.hpp @@ -20,8 +20,9 @@ #ifndef REVERSE_PURGE_HASH_MAP_HPP_ #define REVERSE_PURGE_HASH_MAP_HPP_ -#include +#include #include +#include namespace datasketches { From 44e6fb33d23e86cbef437ba54c7d2bfa1b06e7f1 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 11 Apr 2026 17:44:32 +0900 Subject: [PATCH 74/75] ci: change build coverage --- .github/workflows/build_cmake.yml | 178 +++++++++++++++++++++++++++--- 1 file changed, 165 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index aee7ec3d..6687f8e7 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -6,7 +6,7 @@ env: BUILD_TYPE: Release jobs: - build: + build-native: name: ${{ matrix.config.name }} runs-on: ${{ matrix.config.os }} strategy: @@ -14,23 +14,16 @@ jobs: matrix: config: - { - name: "MacOS Latest, Clang", - os: macos-latest, + name: "macOS 15, Clang", + os: macos-15, test_target: test, cc: "clang", cxx: "clang++" } - { - name: "Ubuntu Latest, GCC", - os: ubuntu-latest, - test_target: test, - cc: "gcc", cxx: "g++" - } - - { - name: "Windows Latest, MSVC", - os: windows-latest, + name: "Windows 2022, MSVC", + os: windows-2022, test_target: RUN_TESTS, - cc: "cl", cxx: "cl", - environment_script: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/Auxiliary/Build/vcvars64.bat" + cc: "cl", cxx: "cl" } #- { # name: "Windows Latest, MinGW+gcc", @@ -52,3 +45,162 @@ jobs: run: cmake --build build --config Release --target ${{ matrix.config.test_target }} - name: Install headers run: cmake --build build -t install + + build-ubuntu-gcc: + name: Compiler / ${{ matrix.config.name }} + runs-on: ubuntu-24.04 + container: + image: ${{ matrix.config.image }} + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + config: + - { + name: "Ubuntu 24.04, GCC 9", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-9", cxx: "g++-9", + packages: "gcc-9 
g++-9", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 10", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-10", cxx: "g++-10", + packages: "gcc-10 g++-10", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 11", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-11", cxx: "g++-11", + packages: "gcc-11 g++-11", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 12", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-12", cxx: "g++-12", + packages: "gcc-12 g++-12", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 13", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-13", cxx: "g++-13", + packages: "gcc-13 g++-13", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 14", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-14", cxx: "g++-14", + packages: "gcc-14 g++-14", + cxx_standard: "11" + } + - { + name: "Ubuntu 25.10, GCC 15", + image: "ubuntu:25.10", + test_target: test, + cc: "gcc-15", cxx: "g++-15", + packages: "gcc-15 g++-15", + cxx_standard: "11" + } + steps: + - name: Install build dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt-get update + apt-get install -y --no-install-recommends \ + ca-certificates \ + cmake \ + git \ + make \ + ${{ matrix.config.packages }} + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + - name: Configure + env: + CC: ${{ matrix.config.cc }} + CXX: ${{ matrix.config.cxx }} + run: cmake -B build -S . 
-DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test + - name: Build C++ unit tests + run: cmake --build build --config Release + - name: Run C++ tests + run: cmake --build build --config Release --target ${{ matrix.config.test_target }} + - name: Install headers + run: cmake --build build -t install + + build-ubuntu-std: + name: Standard / Ubuntu 25.10, GCC 15, C++${{ matrix.config.cxx_standard }} + runs-on: ubuntu-24.04 + container: + image: ubuntu:25.10 + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + config: + - { + cxx_standard: "11", + test_target: test + } + - { + cxx_standard: "14", + test_target: test + } + - { + cxx_standard: "17", + test_target: test + } + - { + cxx_standard: "20", + test_target: test + } + - { + cxx_standard: "23", + test_target: test, + } + steps: + - name: Install build dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt-get update + apt-get install -y --no-install-recommends \ + ca-certificates \ + cmake \ + gcc-15 \ + g++-15 \ + git \ + make + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + - name: Configure + env: + CC: gcc-15 + CXX: g++-15 + run: cmake -B build -S . 
-DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test + - name: Build C++ unit tests + run: cmake --build build --config Release + - name: Run C++ tests + run: cmake --build build --config Release --target ${{ matrix.config.test_target }} + - name: Install headers + run: cmake --build build -t install From bbd13d27958383c66df4fd3cefe18b2cdd66826f Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 11 Apr 2026 23:43:25 +0900 Subject: [PATCH 75/75] fix: pinning with windows 2025 --- .github/workflows/build_cmake.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index 6687f8e7..d8a53900 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -20,8 +20,8 @@ jobs: cc: "clang", cxx: "clang++" } - { - name: "Windows 2022, MSVC", - os: windows-2022, + name: "Windows 2025, MSVC", + os: windows-2025, test_target: RUN_TESTS, cc: "cl", cxx: "cl" }