From c38f798abcf5d35ccb08aebc7b8d2bc64e07d1f6 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Fri, 10 Jan 2025 11:11:42 -0800
Subject: [PATCH 01/75] release process: setting development target to 5.3.0

---
 version.cfg.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/version.cfg.in b/version.cfg.in
index ad2e7b11..5ffad51a 100644
--- a/version.cfg.in
+++ b/version.cfg.in
@@ -1 +1 @@
-5.2.@DT@.@HHMM@
+5.3.@DT@.@HHMM@

From eb3200e7095c6fc073051dc09634e05fd1b64566 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Sun, 26 Jan 2025 20:50:44 -0800
Subject: [PATCH 02/75] code cleanup and alignment with Java

---
 theta/include/theta_sketch.hpp      |  4 +-
 theta/include/theta_sketch_impl.hpp | 58 ++++++++++++++---------------
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index ad6421e7..4aab4b92 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -609,9 +609,11 @@ class wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator {
   uint32_t index_;
   uint64_t previous_;
   bool is_block_mode_;
-  uint8_t buf_i_;
   uint8_t offset_;
   uint64_t buffer_[8];
+
+  inline void unpack1();
+  inline void unpack8();
 };
 
 } /* namespace datasketches */
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index b6a5d7ee..8f7b1e8d 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -817,23 +817,15 @@ num_entries_(num_entries),
 index_(index),
 previous_(0),
 is_block_mode_(num_entries_ >= 8),
-buf_i_(0),
 offset_(0)
 {
   if (entry_bits == 64) { // no compression
     ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
   } else if (index < num_entries) {
     if (is_block_mode_) {
-      unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
-      ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
-      for (int i = 0; i < 8; ++i) {
-        buffer_[i] += previous_;
-        previous_ = buffer_[i];
-      }
+      unpack8();
     } else {
-      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
-      buffer_[0] += previous_;
-      previous_ = buffer_[0];
+      unpack1();
     }
   }
 }
@@ -844,35 +836,41 @@ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++()
     ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
     return *this;
   }
-  ++index_;
-  if (index_ < num_entries_) {
+  if (++index_ < num_entries_) {
     if (is_block_mode_) {
-      ++buf_i_;
-      if (buf_i_ == 8) {
-        buf_i_ = 0;
-        if (index_ + 8 < num_entries_) {
-          unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
-          ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
-          for (int i = 0; i < 8; ++i) {
-            buffer_[i] += previous_;
-            previous_ = buffer_[i];
-          }
+      if ((index_ & 7) == 0) {
+        if (num_entries_ - index_ >= 8) {
+          unpack8();
         } else {
           is_block_mode_ = false;
-          offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
-          buffer_[0] += previous_;
-          previous_ = buffer_[0];
+          unpack1();
         }
       }
     } else {
-      offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
-      buffer_[0] += previous_;
-      previous_ = buffer_[0];
+      unpack1();
     }
   }
   return *this;
 }
 
+template<typename Allocator>
+void wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::unpack1() {
+  const uint32_t i = index_ & 7;
+  offset_ = unpack_bits(buffer_[i], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
+  buffer_[i] += previous_;
+  previous_ = buffer_[i];
+}
+
+template<typename Allocator>
+void wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::unpack8() {
+  unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
+  ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
+  for (int i = 0; i < 8; ++i) {
+    buffer_[i] += previous_;
+    previous_ = buffer_[i];
+  }
+}
+
 template<typename Allocator>
 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
   const_iterator tmp(*this);
@@ -895,13 +893,13 @@ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(c
 template<typename Allocator>
 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const -> reference {
   if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
-  return buffer_[buf_i_];
+  return buffer_[index_ & 7];
 }
 
 template<typename Allocator>
 auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const -> pointer {
   if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
-  return buffer_ + buf_i_;
+  return buffer_ + (index_ & 7);
 }
 
 } /* namespace datasketches */

From f82217d472f0d122b848cd379b231e7ac8616cf2 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Sun, 26 Jan 2025 20:55:40 -0800
Subject: [PATCH 03/75] test equivalence of packing and unpacking single values
 and blocks

---
 theta/test/bit_packing_test.cpp | 50 +++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp
index b39f8996..0e0cf015 100644
--- a/theta/test/bit_packing_test.cpp
+++ b/theta/test/bit_packing_test.cpp
@@ -80,4 +80,54 @@ TEST_CASE("pack unpack blocks") {
   }
 }
 
+TEST_CASE("pack bits unpack blocks") {
+  uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
+  for (int m = 0; m < 10000; ++m) {
+    for (uint8_t bits = 1; bits <= 63; ++bits) {
+      const uint64_t mask = (1ULL << bits) - 1;
+      std::vector<uint64_t> input(8, 0);
+      for (int i = 0; i < 8; ++i) {
+        input[i] = value & mask;
+        value += IGOLDEN64;
+      }
+      std::vector<uint8_t> bytes(bits, 0);
+      uint8_t offset = 0;
+      uint8_t* ptr = bytes.data();
+      for (int i = 0; i < 8; ++i) {
+        offset = pack_bits(input[i], bits, ptr, offset);
+      }
+      std::vector<uint64_t> output(8, 0);
+      unpack_bits_block8(output.data(), bytes.data(), bits);
+      for (int i = 0; i < 8; ++i) {
+        REQUIRE(input[i] == output[i]);
+      }
+    }
+  }
+}
+
+TEST_CASE("pack blocks unpack bits") {
+  uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
+  for (int m = 0; m < 10000; ++m) {
+    for (uint8_t bits = 1; bits <= 63; ++bits) {
+      const uint64_t mask = (1ULL << bits) - 1;
+      std::vector<uint64_t> input(8, 0);
+      for (int i = 0; i < 8; ++i) {
+        input[i] = value & mask;
+        value += IGOLDEN64;
+      }
+      std::vector<uint8_t> bytes(bits, 0);
+      pack_bits_block8(input.data(), bytes.data(), bits);
+      std::vector<uint64_t> output(8, 0);
+      uint8_t offset = 0;
+      const uint8_t* cptr = bytes.data();
+      for (int i = 0; i < 8; ++i) {
+        offset = unpack_bits(output[i], bits, cptr, offset);
+      }
+      for (int i = 0; i < 8; ++i) {
+        REQUIRE(input[i] == output[i]);
+      }
+    }
+  }
+}
+
 } /* namespace datasketches */

From dea8d481cab8461f981e4edb5ab292936a87abff Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Sun, 26 Jan 2025 21:02:34 -0800
Subject: [PATCH 04/75] different starting points for pseudo-random sequences
 for more coverage

---
 theta/test/bit_packing_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp
index 0e0cf015..0094f9fd 100644
--- a/theta/test/bit_packing_test.cpp
+++ b/theta/test/bit_packing_test.cpp
@@ -81,7 +81,7 @@ TEST_CASE("pack unpack blocks") {
 }
 
 TEST_CASE("pack bits unpack blocks") {
-  uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
+  uint64_t value = 0; // arbitrary starting value
   for (int m = 0; m < 10000; ++m) {
     for (uint8_t bits = 1; bits <= 63; ++bits) {
       const uint64_t mask = (1ULL << bits) - 1;
@@ -106,7 +106,7 @@ TEST_CASE("pack bits unpack blocks") {
 }
 
 TEST_CASE("pack blocks unpack bits") {
-  uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
+  uint64_t value = 111; // arbitrary starting value
   for (int m = 0; m < 10000; ++m) {
     for (uint8_t bits = 1; bits <= 63; ++bits) {
       const uint64_t mask = (1ULL << bits) - 1;

From 27d988f0eda27b36c0afac31a51bf39073ea3e17 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Tue, 18 Feb 2025 18:40:10 -0800
Subject: [PATCH 05/75] enable branch protection

---
 .asf.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.asf.yaml b/.asf.yaml
index 45d974b1..f15ed263 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -2,3 +2,11 @@ github:
   homepage: https://datasketches.apache.org
   ghp_branch: gh-pages
   ghp_path: /docs
+
+  protected_branches:
+    master:
+      required_pull_request_reviews:
+        dismiss_stale_reviews: false
+        required_approving_review_count: 1
+      required_signatures: false
+      required_conversation_resolution: false

From 0a6218ce37b4d4f5c2328d9ba0ad1d2942c7fec9 Mon Sep 17 00:00:00 2001
From: geonove <andrea.novellini@outlook.it>
Date: Sun, 25 May 2025 14:41:30 +0200
Subject: [PATCH 06/75] Use REQUIRE_THROWS_WITH to check for error message

---
 count/test/count_min_test.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/count/test/count_min_test.cpp b/count/test/count_min_test.cpp
index 143be1b8..8b7ae0a7 100644
--- a/count/test/count_min_test.cpp
+++ b/count/test/count_min_test.cpp
@@ -55,7 +55,7 @@ TEST_CASE("CM init") {
 TEST_CASE("CM parameter suggestions", "[error parameters]") {
 
     // Bucket suggestions
-    REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
+    REQUIRE_THROWS_WITH(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Relative error must be at least 0.");
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14);
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28);
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55);
@@ -69,8 +69,8 @@ TEST_CASE("CM parameter suggestions", "[error parameters]") {
     REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01);
 
     // Hash suggestions
-    REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." );
-    REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
+    REQUIRE_THROWS_WITH(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." );
+    REQUIRE_THROWS_WITH(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2); // 1 STDDEV
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4); // 2 STDDEV
     REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6); // 3 STDDEV
@@ -161,9 +161,9 @@ TEST_CASE("CM merge - reject", "[reject cases]") {
     std::vector<count_min_sketch<uint64_t>> sketches = {s1, s2, s3};
 
     // Fail cases
-    REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." );
+    REQUIRE_THROWS_WITH(s.merge(s), "Cannot merge a sketch with itself." );
     for (count_min_sketch<uint64_t> sk : sketches) {
-      REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." );
+      REQUIRE_THROWS_WITH(s.merge(sk), "Incompatible sketch configuration." );
     }
 }
 

From 75edfbb3b59b047bf8cdf7fb5a5d46798ea8bf08 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Tue, 27 May 2025 23:42:03 -0700
Subject: [PATCH 07/75] ds-java main branch requires Java 21

---
 .github/workflows/serde_compat.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml
index 33c31801..084d1385 100644
--- a/.github/workflows/serde_compat.yml
+++ b/.github/workflows/serde_compat.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Setup Java
         uses: actions/setup-java@v4
         with:
-          java-version: '17'
+          java-version: '21'
           distribution: 'temurin'
       - name: Run Java
         run: cd java && mvn test -P generate-java-files

From 82630e554e35d702ec6358b1d1ec5e1f186e7447 Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Sat, 7 Jun 2025 11:28:16 +0530
Subject: [PATCH 08/75] Provide get_centroids implementation

---
 tdigest/include/tdigest.hpp      | 5 +++++
 tdigest/include/tdigest_impl.hpp | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index d33084ed..21cf47a2 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -143,6 +143,11 @@ class tdigest {
    */
   uint64_t get_total_weight() const;
 
+  /**
+   * @return centroids
+   */
+  vector_centroid get_centroids() const;
+
   /**
    * Returns an instance of the allocator for this t-Digest.
    * @return allocator
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 6e3ae1a0..73429f6d 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -85,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
   return centroids_weight_ + buffer_.size();
 }
 
+template<typename T, typename A>
+auto tdigest<T, A>::get_centroids() const -> vector_centroid{
+  return centroids_;
+}
+
 template<typename T, typename A>
 A tdigest<T, A>::get_allocator() const {
   return buffer_.get_allocator();

From 866f6d036a7fe91153d01d9648f9127755e5af77 Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Mon, 9 Jun 2025 19:54:59 +0530
Subject: [PATCH 09/75] Introduced const_iterator for tdigest

---
 tdigest/include/tdigest.hpp      | 42 ++++++++++++++++++---
 tdigest/include/tdigest_impl.hpp | 64 +++++++++++++++++++++++++++++---
 tdigest/test/tdigest_test.cpp    | 14 +++++++
 3 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 21cf47a2..e821e4c0 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -143,11 +143,6 @@ class tdigest {
    */
   uint64_t get_total_weight() const;
 
-  /**
-   * @return centroids
-   */
-  vector_centroid get_centroids() const;
-
   /**
    * Returns an instance of the allocator for this t-Digest.
    * @return allocator
@@ -262,6 +257,21 @@ class tdigest {
    */
   static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
 
+  class const_iterator;
+
+  /**
+   * Iterator pointing to the first centroid in the sketch.
+   * If the sketch is empty, the returned iterator must not be dereferenced or incremented.
+   * @return iterator pointing to the first centroid in the sketch
+   */
+  const_iterator begin() const;
+
+  /**
+   * Iterator pointing to the past-the-end centroid in the sketch.
+   * It does not point to any centroid, and must not be dereferenced or incremented.
+   * @return iterator pointing to the past-the-end centroid in the sketch
+   */
+  const_iterator end() const;
 private:
   bool reverse_merge_;
   uint16_t k_;
@@ -302,6 +312,28 @@ class tdigest {
   static inline void check_split_points(const T* values, uint32_t size);
 };
 
+template<typename T, typename A>
+class tdigest<T, A>::const_iterator {
+public:
+  using iterator_category = std::input_iterator_tag;
+  using value_type = std::pair<const T&, const W>;
+  using difference_type = void;
+  using pointer = const return_value_holder<value_type>;
+  using reference = const value_type;
+
+  const_iterator(const tdigest<T, A> &tdigest_, bool is_end);
+
+  const_iterator& operator++();
+  const_iterator operator++(int); // by value: the result is a copy taken before advancing
+  bool operator==(const const_iterator& other) const;
+  bool operator!=(const const_iterator& other) const;
+  reference operator*() const;
+  pointer operator->() const;
+private:
+  friend class tdigest<T, A>;
+  uint32_t index_;
+  vector_centroid centroids_;
+};
 } /* namespace datasketches */
 
 #include "tdigest_impl.hpp"
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 73429f6d..49fd98a5 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -85,11 +85,6 @@ uint64_t tdigest<T, A>::get_total_weight() const {
   return centroids_weight_ + buffer_.size();
 }
 
-template<typename T, typename A>
-auto tdigest<T, A>::get_centroids() const -> vector_centroid{
-  return centroids_;
-}
-
 template<typename T, typename A>
 A tdigest<T, A>::get_allocator() const {
   return buffer_.get_allocator();
@@ -632,6 +627,65 @@ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
   }
 }
 
+template <typename T, typename A>
+typename tdigest<T, A>::const_iterator tdigest<T, A>::begin() const {
+  return tdigest<T, A>::const_iterator(*this, false);
+}
+
+template <typename T, typename A>
+  typename tdigest<T, A>::const_iterator tdigest<T, A>::end() const {
+  return tdigest::const_iterator(*this, true);
+}
+
+template<typename T, typename A>
+tdigest<T, A>::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end):
+  centroids_()
+{
+  // Create a copy of the tdigest to generate the centroids after processing the buffered values
+  tdigest tmp(tdigest_);
+  tmp.compress();
+  centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end());
+
+  if (is_end) {
+    index_ = centroids_.size();
+  } else {
+    index_ = 0;
+  }
+}
+
+template<typename T, typename A>
+typename tdigest<T, A>::const_iterator& tdigest<T, A>::const_iterator::operator++() {
+  ++index_;
+  return *this;
+}
+
+template<typename T, typename A>
+typename tdigest<T, A>::const_iterator tdigest<T, A>::const_iterator::operator++(int) {
+  const_iterator tmp(*this); // state before the increment; handed back to the caller as a copy
+  operator++();
+  return tmp;
+}
+
+template<typename T, typename A>
+bool tdigest<T, A>::const_iterator::operator==(const const_iterator& other) const {
+  return index_ == other.index_;
+}
+
+template<typename T, typename A>
+bool tdigest<T, A>::const_iterator::operator!=(const const_iterator& other) const {
+  return !operator==(other);
+}
+
+template<typename T, typename A>
+auto tdigest<T, A>::const_iterator::operator*() const -> reference {
+  return value_type(centroids_[index_].get_mean(), centroids_[index_].get_weight());
+}
+
+template<typename T, typename A>
+auto tdigest<T, A>::const_iterator::operator->() const -> pointer {
+  return **this;
+}
+
 } /* namespace datasketches */
 
 #endif // _TDIGEST_IMPL_HPP_
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index fc3f5d1c..41b00943 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -453,4 +453,18 @@ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]")
   REQUIRE(td.get_rank(n) == 1);
 }
 
+TEST_CASE("iterate centroids", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 10; i++) {
+    td.update(i);
+  }
+
+  auto centroid_count = 0;
+  for (const auto &centroid: td) {
+    centroid_count++;
+  }
+  // Ensure that centroids are retrieved for a case where there is buffered values
+  REQUIRE(centroid_count == 10);
+}
+
 } /* namespace datasketches */

From 27cb7b8940659924cca7434136c537ff930716bc Mon Sep 17 00:00:00 2001
From: Andrew Kane <andrew@ankane.org>
Date: Mon, 9 Jun 2025 13:34:21 -0700
Subject: [PATCH 10/75] Fix typos

---
 CMakeLists.txt                              | 2 +-
 CODE_OF_CONDUCT.md                          | 2 +-
 filters/include/bloom_filter.hpp            | 2 +-
 filters/test/bloom_filter_test.cpp          | 2 +-
 hll/include/CubicInterpolation-internal.hpp | 2 +-
 hll/include/HllArray-internal.hpp           | 2 +-
 hll/test/HllSketchTest.cpp                  | 6 +++---
 hll/test/HllUnionTest.cpp                   | 2 +-
 quantiles/include/quantiles_sketch.hpp      | 4 ++--
 quantiles/include/quantiles_sketch_impl.hpp | 2 +-
 sampling/include/var_opt_sketch_impl.hpp    | 6 +++---
 sampling/include/var_opt_union_impl.hpp     | 2 +-
 sampling/test/ebpps_allocation_test.cpp     | 2 +-
 sampling/test/ebpps_sketch_test.cpp         | 4 ++--
 sampling/test/var_opt_allocation_test.cpp   | 2 +-
 15 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 056bb701..c469e456 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,7 +59,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT CMAKE_CXX_COMPILER_VERSION VERS
   add_compile_options(-Wimplicit-fallthrough=3)
 endif()
 
-# Code generation options, to ensure shaerd libraries work and are portable
+# Code generation options, to ensure shared libraries work and are portable
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_C_EXTENSIONS OFF)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index bdce6af9..0bdf0791 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -1,3 +1,3 @@
 # Code of Conduct
 
-We adhere to the Apache Softare Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct).
\ No newline at end of file
+We adhere to the Apache Software Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct).
diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp
index f3c6a031..fd5816a1 100644
--- a/filters/include/bloom_filter.hpp
+++ b/filters/include/bloom_filter.hpp
@@ -624,7 +624,7 @@ class bloom_filter_alloc {
   uint64_t capacity_bits_;
   uint64_t num_bits_set_;
   uint8_t* bit_array_;  // data backing bit_array_, regardless of ownership
-  uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr
+  uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr
 };
 
 /**
diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp
index 41b63e64..d8bcec8e 100644
--- a/filters/test/bloom_filter_test.cpp
+++ b/filters/test/bloom_filter_test.cpp
@@ -399,7 +399,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
   REQUIRE(bf_writable.query(-1.0));
 
   // not good memory management to do this, but because we wrapped the same bytes as both
-  // read-only adn writable, that update should ahve changed the read-only version, too
+  // read-only and writable, that update should have changed the read-only version, too
   REQUIRE(bf_wrap.query(-1.0));
 }
 
diff --git a/hll/include/CubicInterpolation-internal.hpp b/hll/include/CubicInterpolation-internal.hpp
index c60ddab9..9677b99d 100644
--- a/hll/include/CubicInterpolation-internal.hpp
+++ b/hll/include/CubicInterpolation-internal.hpp
@@ -191,7 +191,7 @@ double CubicInterpolation<A>::usingXArrAndYStride(const double xArr[], const int
   const int xArrLenM1 = xArrLen - 1;
 
   if ((xArrLen < 4) || (x < xArr[0]) || (x > xArr[xArrLenM1])) {
-    throw std::logic_error("impossible values during interpolaiton");
+    throw std::logic_error("impossible values during interpolation");
   }
 
   if (x ==  xArr[xArrLenM1]) { /* corner case */
diff --git a/hll/include/HllArray-internal.hpp b/hll/include/HllArray-internal.hpp
index c3c6b3f8..8986f068 100644
--- a/hll/include/HllArray-internal.hpp
+++ b/hll/include/HllArray-internal.hpp
@@ -173,7 +173,7 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
 
   hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[hll_constants::MODE_BYTE]);
   if (mode != HLL) {
-    throw std::invalid_argument("Calling HLL construtor with non-HLL mode data");
+    throw std::invalid_argument("Calling HLL constructor with non-HLL mode data");
   }
 
   const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]);
diff --git a/hll/test/HllSketchTest.cpp b/hll/test/HllSketchTest.cpp
index 1ce21bbe..91197f13 100644
--- a/hll/test/HllSketchTest.cpp
+++ b/hll/test/HllSketchTest.cpp
@@ -298,7 +298,7 @@ TEST_CASE("hll sketch: deserialize list mode buffer overrun", "[hll_sketch]") {
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range);
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range);
 
-    // ckeck for leaks on stream exceptions
+    // check for leaks on stream exceptions
     {
       std::stringstream ss;
       ss.exceptions(std::ios::failbit | std::ios::badbit);
@@ -325,7 +325,7 @@ TEST_CASE("hll sketch: deserialize set mode buffer overrun", "[hll_sketch]") {
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range);
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range);
 
-    // ckeck for leaks on stream exceptions
+    // check for leaks on stream exceptions
     {
       std::stringstream ss;
       ss.exceptions(std::ios::failbit | std::ios::badbit);
@@ -355,7 +355,7 @@ TEST_CASE("hll sketch: deserialize HLL mode buffer overrun", "[hll_sketch]") {
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 16420, 0), std::out_of_range); // before aux table
     REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range);
 
-    // ckeck for leaks on stream exceptions
+    // check for leaks on stream exceptions
     {
       std::stringstream ss;
       ss.exceptions(std::ios::failbit | std::ios::badbit);
diff --git a/hll/test/HllUnionTest.cpp b/hll/test/HllUnionTest.cpp
index 41443786..ceaef12f 100644
--- a/hll/test/HllUnionTest.cpp
+++ b/hll/test/HllUnionTest.cpp
@@ -58,7 +58,7 @@ static void basicUnion(uint64_t n1, uint64_t n2,
 
   hll_sketch result = u.get_result(resultType);
 
-  // ensure we check a direct union estimate, without first caling get_result()
+  // ensure we check a direct union estimate, without first calling get_result()
   u.reset();
   u.update(std::move(h1));
   u.update(h2);
diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index ab493c99..b1e2e3c1 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -537,10 +537,10 @@ class quantiles_sketch {
   static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
 
   template<typename SerDe>
-  static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
+  static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator);
   
   template<typename SerDe>
-  static std::pair<Level, size_t> deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
+  static std::pair<Level, size_t> deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator);
 
   static void check_k(uint16_t k);
   static void check_serial_version(uint8_t serial_version);
diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index 558c13c5..50c82c18 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -581,7 +581,7 @@ auto quantiles_sketch<T, C, A>::deserialize_array(const void* bytes, size_t size
   // serde did not throw, enable destructors
   items.get_deleter().set_destroy(true);
   
-  // succesfully read, now put into a Level
+  // successfully read, now put into a Level
   Level level(allocator);
   level.reserve(capacity);
   level.insert(level.begin(),
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 7bf40958..36ee3fc8 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -1029,7 +1029,7 @@ void var_opt_sketch<T, A>::transition_from_warmup() {
   total_wt_r_ = weights_[k_]; // only one item, known location
   weights_[k_] = -1.0;
 
-  // The two lightest items are ncessarily downsample-able to one item,
+  // The two lightest items are necessarily downsample-able to one item,
   // and are therefore a valid initial candidate set
   grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2);
 }
@@ -1065,7 +1065,7 @@ void var_opt_sketch<T, A>::restore_towards_leaves(uint32_t slot_in) {
   while (child <= last_slot) {
     uint32_t child2 = child + 1; // might also be invalid
     if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) {
-      // siwtch to other child if it's both valid and smaller
+      // switch to other child if it's both valid and smaller
       child = child2;
     }
 
@@ -1221,7 +1221,7 @@ uint32_t var_opt_sketch<T, A>::choose_delete_slot(double wt_cands, uint32_t num_
     if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) {
       return pick_random_slot_in_r(); // keep item in M
     } else {
-      return h_; // indext of item in M
+      return h_; // index of item in M
     }
   } else {
     // general case
diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp
index 1d252245..d04be0cb 100644
--- a/sampling/include/var_opt_union_impl.hpp
+++ b/sampling/include/var_opt_union_impl.hpp
@@ -590,7 +590,7 @@ void var_opt_union<T, A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,
 
   // if non-full and pseudo-exact, change k so that gcopy is full
   if ((r_count == 0) && (h_count < k)) {
-    gcopy.k_ = h_count; // may leve extra space allocated but that's ok
+    gcopy.k_ = h_count; // may leave extra space allocated but that's ok
   }
 
   // Now k equals the number of samples, so reducing k will increase tau.
diff --git a/sampling/test/ebpps_allocation_test.cpp b/sampling/test/ebpps_allocation_test.cpp
index 67969c98..e249ac1f 100644
--- a/sampling/test/ebpps_allocation_test.cpp
+++ b/sampling/test/ebpps_allocation_test.cpp
@@ -63,7 +63,7 @@ TEST_CASE( "ebpps merge", "[ebpps_sketch][test_type]") {
     ebpps_test_sketch sk1(k, 0);
     ebpps_test_sketch sk2(k, 0);
 
-    // move udpates
+    // move updates
     for (int i = 0; i < (int) n; ++i) {
       sk1.update(i);
       sk2.update(-i);
diff --git a/sampling/test/ebpps_sketch_test.cpp b/sampling/test/ebpps_sketch_test.cpp
index 22eb4d5e..25c1a237 100644
--- a/sampling/test/ebpps_sketch_test.cpp
+++ b/sampling/test/ebpps_sketch_test.cpp
@@ -222,7 +222,7 @@ TEST_CASE("ebpps sketch: merge large into small", "[ebpps_sketch]") {
   sk1 = create_unweighted_sketch(k / 2, 0);
   sk1.update(-1, k / 4.0);
   sk1.update(-2, k / 8.0);
-  // sk2 should have been unchaged
+  // sk2 should have been unchanged
   REQUIRE(sk2.get_n() == k);
   REQUIRE(sk2.get_c() == Approx(k).margin(EPS));
 
@@ -250,7 +250,7 @@ TEST_CASE("ebpps sketch: merge small into large", "[ebpps_sketch]") {
   
   // rvalue merge
   sk1 = create_unweighted_sketch(k, 3 * k / 2);
-  // sk2 should have been unchaged
+  // sk2 should have been unchanged
   REQUIRE(sk2.get_n() == 1);
   REQUIRE(sk2.get_c() == 1.0);
   sk2.update(-2, k / 10.0);
diff --git a/sampling/test/var_opt_allocation_test.cpp b/sampling/test/var_opt_allocation_test.cpp
index 7df1b6f9..e821ae0f 100644
--- a/sampling/test/var_opt_allocation_test.cpp
+++ b/sampling/test/var_opt_allocation_test.cpp
@@ -67,7 +67,7 @@ TEST_CASE( "varopt union: move", "[var_opt_union][test_type]") {
     var_opt_test_sketch sk1(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
     var_opt_test_sketch sk2(k, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
 
-    // move udpates
+    // move updates
     for (int i = 0; i < (int) n; ++i) {
       sk1.update(i);
       sk2.update(-i);

From 315e50b290387ac0af2b6dc032bf09715d2a8deb Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Tue, 10 Jun 2025 00:28:48 +0530
Subject: [PATCH 11/75] Addressing review comments

---
 tdigest/include/tdigest.hpp      |  8 ++++++--
 tdigest/include/tdigest_impl.hpp | 24 +++++++++++++++++++++---
 tdigest/test/tdigest_test.cpp    |  3 +++
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index e821e4c0..2ad410f5 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -106,6 +106,12 @@ class tdigest {
    */
   explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
 
+  /**
+ * Copy constructor
+ * @param other sketch to be copied
+ */
+  tdigest(const tdigest& other);
+
   /**
    * Update this t-Digest with the given value
    * @param value to update the t-Digest with
@@ -275,13 +281,11 @@ class tdigest {
 private:
   bool reverse_merge_;
   uint16_t k_;
-  uint16_t internal_k_;
   T min_;
   T max_;
   size_t centroids_capacity_;
   vector_centroid centroids_;
   uint64_t centroids_weight_;
-  size_t buffer_capacity_;
   vector_t buffer_;
 
   static const size_t BUFFER_MULTIPLIER = 4;
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 49fd98a5..1dba9eb1 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -597,6 +597,24 @@ bool tdigest<T, A>::is_single_value() const {
   return get_total_weight() == 1;
 }
 
+template<typename T, typename A>
+tdigest<T, A>::tdigest(const tdigest<T, A>& other):
+  reverse_merge_(other.reverse_merge_),
+  k_(other.k_),
+  min_(other.min_),
+  max_(other.max_),
+  centroids_capacity_(other.centroids_capacity_),
+  centroids_(other.centroids_, other.get_allocator()),
+  centroids_weight_(other.centroids_weight_),
+  buffer_(other.buffer_, other.get_allocator())
+{
+  if (other.k_ < 10) throw std::invalid_argument("k must be at least 10");
+  const size_t fudge = other.k_ < 30 ? 30 : 10;
+  centroids_capacity_ = 2 * k_ + fudge;
+  centroids_.reserve(centroids_capacity_);
+  buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
+}
+
 template<typename T, typename A>
 tdigest<T, A>::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer):
 reverse_merge_(reverse_merge),
@@ -638,11 +656,11 @@ template <typename T, typename A>
 }
 
 template<typename T, typename A>
-tdigest<T, A>::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end):
-  centroids_()
+tdigest<T, A>::const_iterator::const_iterator(const tdigest<T, A>& tdigest_, const bool is_end):
+  centroids_(tdigest_.get_allocator())
 {
   // Create a copy of the tdigest to generate the centroids after processing the buffered values
-  tdigest tmp(tdigest_);
+  tdigest<T, A> tmp(tdigest_);
   tmp.compress();
   centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end());
 
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index 41b00943..9f92094d 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -460,11 +460,14 @@ TEST_CASE("iterate centroids", "[tdigest]") {
   }
 
   auto centroid_count = 0;
+  uint64_t total_weight = 0;
   for (const auto &centroid: td) {
     centroid_count++;
+    total_weight += centroid.second;
   }
   // Ensure that centroids are retrieved for a case where there is buffered values
   REQUIRE(centroid_count == 10);
+  REQUIRE(td.get_total_weight() == total_weight);
 }
 
 } /* namespace datasketches */

From faca5d0262173c96c846d0d36d900cc5bfa48b6d Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Wed, 11 Jun 2025 22:47:10 +0530
Subject: [PATCH 12/75] Retaining the default copy constructor

---
 tdigest/include/tdigest.hpp      |  6 ------
 tdigest/include/tdigest_impl.hpp | 18 ------------------
 2 files changed, 24 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 2ad410f5..99e8dfa3 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -106,12 +106,6 @@ class tdigest {
    */
   explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
 
-  /**
- * Copy constructor
- * @param other sketch to be copied
- */
-  tdigest(const tdigest& other);
-
   /**
    * Update this t-Digest with the given value
    * @param value to update the t-Digest with
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 1dba9eb1..ab4ce9e4 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -597,24 +597,6 @@ bool tdigest<T, A>::is_single_value() const {
   return get_total_weight() == 1;
 }
 
-template<typename T, typename A>
-tdigest<T, A>::tdigest(const tdigest<T, A>& other):
-  reverse_merge_(other.reverse_merge_),
-  k_(other.k_),
-  min_(other.min_),
-  max_(other.max_),
-  centroids_capacity_(other.centroids_capacity_),
-  centroids_(other.centroids_, other.get_allocator()),
-  centroids_weight_(other.centroids_weight_),
-  buffer_(other.buffer_, other.get_allocator())
-{
-  if (other.k_ < 10) throw std::invalid_argument("k must be at least 10");
-  const size_t fudge = other.k_ < 30 ? 30 : 10;
-  centroids_capacity_ = 2 * k_ + fudge;
-  centroids_.reserve(centroids_capacity_);
-  buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
-}
-
 template<typename T, typename A>
 tdigest<T, A>::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer):
 reverse_merge_(reverse_merge),

From ada87563432eebc989088d6fab3a1fd4d0aabc36 Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Thu, 12 Jun 2025 11:32:08 +0530
Subject: [PATCH 13/75] Removing the unnecessary parameters

---
 tdigest/include/tdigest_impl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index ab4ce9e4..0f53adc4 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -638,11 +638,11 @@ template <typename T, typename A>
 }
 
 template<typename T, typename A>
-tdigest<T, A>::const_iterator::const_iterator(const tdigest<T, A>& tdigest_, const bool is_end):
+tdigest<T, A>::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end):
   centroids_(tdigest_.get_allocator())
 {
   // Create a copy of the tdigest to generate the centroids after processing the buffered values
-  tdigest<T, A> tmp(tdigest_);
+  tdigest tmp(tdigest_);
   tmp.compress();
   centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end());
 

From 5be04f2561ca2d394f564b78400aa981d98b4c9e Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Fri, 13 Jun 2025 10:50:19 +0530
Subject: [PATCH 14/75] Review comments

---
 tdigest/include/tdigest.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 99e8dfa3..cc7898e3 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -319,8 +319,6 @@ class tdigest<T, A>::const_iterator {
   using pointer = const return_value_holder<value_type>;
   using reference = const value_type;
 
-  const_iterator(const tdigest<T, A> &tdigest_, bool is_end);
-
   const_iterator& operator++();
   const_iterator& operator++(int);
   bool operator==(const const_iterator& other) const;
@@ -328,9 +326,10 @@ class tdigest<T, A>::const_iterator {
   reference operator*() const;
   pointer operator->() const;
 private:
-  friend class tdigest<T, A>;
+  friend class tdigest;
   uint32_t index_;
   vector_centroid centroids_;
+  const_iterator(const tdigest& tdigest_, bool is_end);
 };
 } /* namespace datasketches */
 

From 2e92ea0474a502edb6b6760d962f7f2e47660177 Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Mon, 14 Jul 2025 21:58:57 -0700
Subject: [PATCH 15/75] porting bug fix from Java

---
 tdigest/include/tdigest_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 0f53adc4..b8fab38d 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -193,7 +193,7 @@ T tdigest<T, A>::get_quantile(double rank) const {
   }
   const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0;
   const double w2 = centroids_.back().get_weight() / 2.0 - w1;
-  return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
+  return weighted_average(centroids_.back().get_mean(), w1, max_, w2);
 }
 
 template<typename T, typename A>

From 0595e4d62422b113b3ee036d8552cf4dd01c4b1c Mon Sep 17 00:00:00 2001
From: AlexanderSaydakov <AlexanderSaydakov@users.noreply.github.com>
Date: Fri, 8 Aug 2025 13:16:03 -0700
Subject: [PATCH 16/75] ds-java main branch requires Java 24

---
 .github/workflows/serde_compat.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml
index 084d1385..f3d7ed67 100644
--- a/.github/workflows/serde_compat.yml
+++ b/.github/workflows/serde_compat.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Setup Java
         uses: actions/setup-java@v4
         with:
-          java-version: '21'
+          java-version: '24'
           distribution: 'temurin'
       - name: Run Java
         run: cd java && mvn test -P generate-java-files

From 1db40c997541aea061391afe08a0af15dcbea1e0 Mon Sep 17 00:00:00 2001
From: devillove084 <786537003@qq.com>
Date: Mon, 1 Sep 2025 08:21:21 +0000
Subject: [PATCH 17/75] fix: unnecessary virtual specifier on destructor

---
 hll/include/hll.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp
index 9d5f78f1..5fc49629 100644
--- a/hll/include/hll.hpp
+++ b/hll/include/hll.hpp
@@ -160,7 +160,7 @@ class hll_sketch_alloc final {
     static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A());
 
     //! Class destructor
-    virtual ~hll_sketch_alloc();
+    ~hll_sketch_alloc();
 
     /**
      *  Copy assignment operator

From a3bc4e48551a72cfb25bfefd09b6cb22e84e0551 Mon Sep 17 00:00:00 2001
From: proost <jwalag87@gmail.com>
Date: Thu, 13 Nov 2025 01:16:51 +0900
Subject: [PATCH 18/75] refactor: clean up by using get_preamble_longs

---
 theta/include/theta_sketch_impl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 8f7b1e8d..304ae64c 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -376,7 +376,7 @@ size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8
 
 template<typename A>
 void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
-  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
+  const uint8_t preamble_longs = get_preamble_longs(false);
   write(os, preamble_longs);
   write(os, UNCOMPRESSED_SERIAL_VERSION);
   write(os, SKETCH_TYPE);
@@ -459,7 +459,7 @@ uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
 
 template<typename A>
 void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
-  const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
+  const uint8_t preamble_longs = get_preamble_longs(true);
   const uint8_t entry_bits = compute_entry_bits();
   const uint8_t num_entries_bytes = get_num_entries_bytes();
 

From 9d1b524a50cfcd3a3b7c6f492fe0853807403e9a Mon Sep 17 00:00:00 2001
From: Lee Rhodes <leerho@gmail.com>
Date: Wed, 12 Nov 2025 14:29:35 -0800
Subject: [PATCH 19/75] ds-java main branch requires Java 25

---
 .github/workflows/serde_compat.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml
index f3d7ed67..81547ee7 100644
--- a/.github/workflows/serde_compat.yml
+++ b/.github/workflows/serde_compat.yml
@@ -12,16 +12,16 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
       - name: Checkout Java
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
           repository: apache/datasketches-java
           path: java
       - name: Setup Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
-          java-version: '24'
+          java-version: '25'
           distribution: 'temurin'
       - name: Run Java
         run: cd java && mvn test -P generate-java-files

From a83254d9eb7933ca21489855c4bc89c0098a3537 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 5 Jan 2026 16:58:21 +0900
Subject: [PATCH 20/75] fix: division by 0

---
 common/include/binomial_bounds.hpp   |   4 +-
 common/test/CMakeLists.txt           |   1 +
 common/test/binomial_bounds_test.cpp | 279 +++++++++++++++++++++++++++
 3 files changed, 282 insertions(+), 2 deletions(-)
 create mode 100644 common/test/binomial_bounds_test.cpp

diff --git a/common/include/binomial_bounds.hpp b/common/include/binomial_bounds.hpp
index 3b73535b..ff7cccc9 100644
--- a/common/include/binomial_bounds.hpp
+++ b/common/include/binomial_bounds.hpp
@@ -441,8 +441,8 @@ class binomial_bounds {
   }
 
   static void check_theta(double theta) {
-    if (theta < 0 || theta > 1) {
-      throw std::invalid_argument("theta must be in [0, 1]");
+    if (theta <= 0 || theta > 1) {
+      throw std::invalid_argument("theta must be in (0, 1]");
     }
   }
 
diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt
index c598c353..7593bd0b 100644
--- a/common/test/CMakeLists.txt
+++ b/common/test/CMakeLists.txt
@@ -69,6 +69,7 @@ target_sources(common_test
   PRIVATE
     quantiles_sorted_view_test.cpp
     optional_test.cpp
+    binomial_bounds_test.cpp
 )
 
 # now the integration test part
diff --git a/common/test/binomial_bounds_test.cpp b/common/test/binomial_bounds_test.cpp
new file mode 100644
index 00000000..6bde0910
--- /dev/null
+++ b/common/test/binomial_bounds_test.cpp
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+
+#include "binomial_bounds.hpp"
+
+namespace datasketches {
+
+TEST_CASE("binomial_bounds: get_lower_bound", "[common]") {
+
+  SECTION("num_samples == 0") {
+    double result = binomial_bounds::get_lower_bound(0, 0.5, 1);
+    REQUIRE(result == 0.0);
+  }
+
+  SECTION("theta == 1.0") {
+    double result = binomial_bounds::get_lower_bound(100, 1.0, 1);
+    REQUIRE(result == 100.0);
+  }
+
+  SECTION("num_samples == 1") {
+    double result = binomial_bounds::get_lower_bound(1, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples == 1, stddev=2") {
+    double result = binomial_bounds::get_lower_bound(1, 0.5, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples == 1, stddev=3") {
+    double result = binomial_bounds::get_lower_bound(1, 0.5, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples > 120") {
+    double result = binomial_bounds::get_lower_bound(121, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples > 120, stddev=2") {
+    double result = binomial_bounds::get_lower_bound(200, 0.5, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples > 120, stddev=3") {
+    double result = binomial_bounds::get_lower_bound(500, 0.5, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5)") {
+    double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 1);
+    REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") {
+    double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 2);
+    REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") {
+    double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 3);
+    REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta < num_samples/360") {
+    double result = binomial_bounds::get_lower_bound(100, 0.001, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta < num_samples/360, stddev=2") {
+    double result = binomial_bounds::get_lower_bound(100, 0.001, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND theta < num_samples/360, stddev=3") {
+    double result = binomial_bounds::get_lower_bound(100, 0.001, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND middle range theta (exact calculation)") {
+    double result = binomial_bounds::get_lower_bound(10, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=2") {
+    double result = binomial_bounds::get_lower_bound(10, 0.5, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=3") {
+    double result = binomial_bounds::get_lower_bound(10, 0.5, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("theta=0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(10, 0.0, 1), std::invalid_argument);
+  }
+
+  SECTION("theta very close to 0") {
+    double result = binomial_bounds::get_lower_bound(10, 1e-10, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples=2 boundary") {
+    double result = binomial_bounds::get_lower_bound(2, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples=120 boundary") {
+    double result = binomial_bounds::get_lower_bound(120, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("estimate clamping case") {
+    double result = binomial_bounds::get_lower_bound(10, 0.9, 1);
+    double estimate = 10.0 / 0.9;
+    REQUIRE(result <= estimate);
+  }
+
+  SECTION("invalid theta < 0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, -0.1, 1), std::invalid_argument);
+  }
+
+  SECTION("invalid theta > 1") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 1.1, 1), std::invalid_argument);
+  }
+
+  SECTION("invalid stddev = 0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 0), std::invalid_argument);
+  }
+
+  SECTION("invalid stddev = 4") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 4), std::invalid_argument);
+  }
+}
+
+TEST_CASE("binomial_bounds: get_upper_bound", "[common]") {
+
+  SECTION("theta == 1.0") {
+    double result = binomial_bounds::get_upper_bound(100, 1.0, 1);
+    REQUIRE(result == 100.0);
+  }
+
+  SECTION("num_samples == 0") {
+    double result = binomial_bounds::get_upper_bound(0, 0.5, 1);
+    REQUIRE(result > 0.0);
+  }
+
+  SECTION("num_samples == 0, stddev=2") {
+    double result = binomial_bounds::get_upper_bound(0, 0.5, 2);
+    REQUIRE(result > 0.0);
+  }
+
+  SECTION("num_samples == 0, stddev=3") {
+    double result = binomial_bounds::get_upper_bound(0, 0.5, 3);
+    REQUIRE(result > 0.0);
+  }
+
+  SECTION("num_samples > 120") {
+    double result = binomial_bounds::get_upper_bound(121, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples > 120, stddev=2") {
+    double result = binomial_bounds::get_upper_bound(200, 0.5, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples > 120, stddev=3") {
+    double result = binomial_bounds::get_upper_bound(500, 0.5, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5)") {
+    double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 1);
+    REQUIRE(result == 51.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") {
+    double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 2);
+    REQUIRE(result == 51.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") {
+    double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 3);
+    REQUIRE(result == 51.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta < num_samples/360") {
+    double result = binomial_bounds::get_upper_bound(100, 0.001, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta < num_samples/360, stddev=2") {
+    double result = binomial_bounds::get_upper_bound(100, 0.001, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND theta < num_samples/360, stddev=3") {
+    double result = binomial_bounds::get_upper_bound(100, 0.001, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND middle range theta (exact calculation)") {
+    double result = binomial_bounds::get_upper_bound(10, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=2") {
+    double result = binomial_bounds::get_upper_bound(10, 0.5, 2);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=3") {
+    double result = binomial_bounds::get_upper_bound(10, 0.5, 3);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("theta=0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(10, 0.0, 1), std::invalid_argument);
+  }
+
+  SECTION("theta very close to 0") {
+    double result = binomial_bounds::get_upper_bound(10, 1e-10, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples=1 boundary") {
+    double result = binomial_bounds::get_upper_bound(1, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("num_samples=120 boundary") {
+    double result = binomial_bounds::get_upper_bound(120, 0.5, 1);
+    REQUIRE(result >= 0.0);
+  }
+
+  SECTION("estimate clamping case") {
+    double result = binomial_bounds::get_upper_bound(10, 0.9, 1);
+    double estimate = 10.0 / 0.9;
+    REQUIRE(result >= estimate);
+  }
+
+  SECTION("invalid theta < 0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, -0.1, 1), std::invalid_argument);
+  }
+
+  SECTION("invalid theta > 1") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 1.1, 1), std::invalid_argument);
+  }
+
+  SECTION("invalid stddev = 0") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 0), std::invalid_argument);
+  }
+
+  SECTION("invalid stddev = 4") {
+    REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 4), std::invalid_argument);
+  }
+}
+
+} /* namespace datasketches */

From 59e5f366fe18c54f8d9e4f26742c02f6e7a9164a Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Wed, 7 Jan 2026 19:24:49 +0900
Subject: [PATCH 21/75] fix: rejecting inf as value

---
 tdigest/include/tdigest_impl.hpp |  5 +++
 tdigest/test/tdigest_test.cpp    | 53 ++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index b8fab38d..75f2d9ee 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -37,6 +37,7 @@ tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::i
 template<typename T, typename A>
 void tdigest<T, A>::update(T value) {
   if (std::isnan(value)) return;
+  if (std::isinf(value)) return;
   if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress();
   buffer_.push_back(value);
   min_ = std::min(min_, value);
@@ -94,6 +95,7 @@ template<typename T, typename A>
 double tdigest<T, A>::get_rank(T value) const {
   if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
   if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN");
+  if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity");
   if (value < min_) return 0;
   if (value > max_) return 1;
   // one centroid and value == min_ == max_
@@ -621,6 +623,9 @@ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
     if (std::isnan(values[i])) {
       throw std::invalid_argument("Values must not be NaN");
     }
+    if (std::isinf(values[i])) {
+      throw std::invalid_argument("Values must not be infinity");
+    }
     if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
       throw std::invalid_argument("Values must be unique and monotonically increasing");
     }
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index 9f92094d..45c10822 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -470,4 +470,57 @@ TEST_CASE("iterate centroids", "[tdigest]") {
   REQUIRE(td.get_total_weight() == total_weight);
 }
 
+TEST_CASE("update rejects positive infinity", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  td.update(std::numeric_limits<double>::infinity());
+  REQUIRE(td.get_total_weight() == 2);
+  REQUIRE(td.get_max_value() == 2.0);
+}
+
+TEST_CASE("update rejects negative infinity", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  td.update(-std::numeric_limits<double>::infinity());
+  REQUIRE(td.get_total_weight() == 2);
+  REQUIRE(td.get_min_value() == 1.0);
+}
+
+TEST_CASE("get_rank rejects positive infinity", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits<double>::infinity()), std::invalid_argument);
+}
+
+TEST_CASE("get_rank rejects negative infinity", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits<double>::infinity()), std::invalid_argument);
+}
+
+TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 100; ++i) td.update(i);
+  const double split_points[2] = {50.0, std::numeric_limits<double>::infinity()};
+  REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument);
+}
+
+TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 100; ++i) td.update(i);
+  const double split_points[2] = {-std::numeric_limits<double>::infinity(), 50.0};
+  REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument);
+}
+
+TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 100; ++i) td.update(i);
+  const double split_points[1] = {std::numeric_limits<double>::infinity()};
+  REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument);
+}
+
 } /* namespace datasketches */

From 588fd73c09b09740a0ebd493ef336a02fab2eb0f Mon Sep 17 00:00:00 2001
From: proost <jwalag87@gmail.com>
Date: Tue, 13 Jan 2026 00:40:31 +0900
Subject: [PATCH 22/75] fix: check invalid inputs on deserialization

---
 tdigest/include/tdigest.hpp      |  2 +
 tdigest/include/tdigest_impl.hpp | 91 +++++++++++++++++++++++++++---
 tdigest/test/tdigest_test.cpp    | 97 ++++++++++++++++++++++++++++----
 3 files changed, 171 insertions(+), 19 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index cc7898e3..7d060ec1 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -108,6 +108,7 @@ class tdigest {
 
   /**
    * Update this t-Digest with the given value
+   * NaN and infinity values are ignored
    * @param value to update the t-Digest with
    */
   void update(T value);
@@ -153,6 +154,7 @@ class tdigest {
    * Compute approximate normalized rank of the given value.
    *
    * <p>If the sketch is empty this throws std::runtime_error.
+   * <p>NaN and infinity values throw std::invalid_argument.
    *
    * @param value to be ranked
    * @return normalized rank (from 0 to 1 inclusive)
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 75f2d9ee..294dab88 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -29,6 +29,24 @@
 
 namespace datasketches {
 
+template<typename T>
+inline void check_not_nan(T value, const char* name) {
+  if (std::isnan(value)) {
+    std::ostringstream oss;
+    oss << name << " must not be NaN";
+    throw std::invalid_argument(oss.str());
+  }
+}
+
+template<typename T>
+inline void check_not_infinite(T value, const char* name) {
+  if (std::isinf(value)) {
+    std::ostringstream oss;
+    oss << name << " must not be infinite";
+    throw std::invalid_argument(oss.str());
+  }
+}
+
 template<typename T, typename A>
 tdigest<T, A>::tdigest(uint16_t k, const A& allocator):
 tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(), vector_centroid(allocator), 0, vector_t(allocator))
@@ -402,6 +420,8 @@ tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
   const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
   if (is_single_value) {
     const T value = read<T>(is);
+    check_not_nan(value, "single_value");
+    check_not_infinite(value, "single_value");
     return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
   }
 
@@ -410,12 +430,24 @@ tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
 
   const T min = read<T>(is);
   const T max = read<T>(is);
+  check_not_nan(min, "min");
+  check_not_infinite(min, "min");
+  check_not_nan(max, "max");
+  check_not_infinite(max, "max");
   vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
   if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid));
   vector_t buffer(num_buffered, 0, allocator);
   if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T));
   uint64_t weight = 0;
-  for (const auto& c: centroids) weight += c.get_weight();
+  for (const auto& c: centroids) {
+    check_not_nan(c.get_mean(), "centroid mean");
+    check_not_infinite(c.get_mean(), "centroid mean");
+    weight += c.get_weight();
+  }
+  for (const auto& value: buffer) {
+    check_not_nan(value, "buffered_value");
+    check_not_infinite(value, "buffered_value");
+  }
   return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
 }
 
@@ -453,6 +485,8 @@ tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A
     ensure_minimum_memory(end_ptr - ptr, sizeof(T));
     T value;
     ptr += copy_from_mem(ptr, value);
+    check_not_nan(value, "single_value");
+    check_not_infinite(value, "single_value");
     return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
   }
 
@@ -467,12 +501,24 @@ tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A
   ptr += copy_from_mem(ptr, min);
   T max;
   ptr += copy_from_mem(ptr, max);
+  check_not_nan(min, "min");
+  check_not_infinite(min, "min");
+  check_not_nan(max, "max");
+  check_not_infinite(max, "max");
   vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
   if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid));
   vector_t buffer(num_buffered, 0, allocator);
   if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T));
   uint64_t weight = 0;
-  for (const auto& c: centroids) weight += c.get_weight();
+  for (const auto& c: centroids) {
+    check_not_nan(c.get_mean(), "centroid mean");
+    check_not_infinite(c.get_mean(), "centroid mean");
+    weight += c.get_weight();
+  }
+  for (const auto& value: buffer) {
+    check_not_nan(value, "buffered_value");
+    check_not_infinite(value, "buffered_value");
+  }
   return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
 }
 
@@ -489,13 +535,22 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& alloc
   if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
     const auto min = read_big_endian<double>(is);
     const auto max = read_big_endian<double>(is);
+    check_not_nan(min, "min");
+    check_not_infinite(min, "min");
+    check_not_nan(max, "max");
+    check_not_infinite(max, "max");
     const auto k = static_cast<uint16_t>(read_big_endian<double>(is));
     const auto num_centroids = read_big_endian<uint32_t>(is);
     vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
     uint64_t total_weight = 0;
     for (auto& c: centroids) {
-      const W weight = static_cast<W>(read_big_endian<double>(is));
+      const auto weight_double = read_big_endian<double>(is);
+      check_not_nan(weight_double, "centroid weight");
+      check_not_infinite(weight_double, "centroid weight");
       const auto mean = read_big_endian<double>(is);
+      check_not_nan(mean, "centroid mean");
+      check_not_infinite(mean, "centroid mean");
+      const W weight = static_cast<W>(weight_double);
       c = centroid(mean, weight);
       total_weight += weight;
     }
@@ -504,6 +559,10 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& alloc
   // COMPAT_FLOAT: compatibility with asSmallBytes()
   const auto min = read_big_endian<double>(is); // reference implementation uses doubles for min and max
   const auto max = read_big_endian<double>(is);
+  check_not_nan(min, "min");
+  check_not_infinite(min, "min");
+  check_not_nan(max, "max");
+  check_not_infinite(max, "max");
   const auto k = static_cast<uint16_t>(read_big_endian<float>(is));
   // reference implementation stores capacities of the array of centroids and the buffer as shorts
   // they can be derived from k in the constructor
@@ -512,8 +571,13 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& alloc
   vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
   uint64_t total_weight = 0;
   for (auto& c: centroids) {
-    const W weight = static_cast<W>(read_big_endian<float>(is));
+    const auto weight_float = read_big_endian<float>(is);
+    check_not_nan(weight_float, "centroid weight");
+    check_not_infinite(weight_float, "centroid weight");
     const auto mean = read_big_endian<float>(is);
+    check_not_nan(mean, "centroid mean");
+    check_not_infinite(mean, "centroid mean");
+    const W weight = static_cast<W>(weight_float);
     c = centroid(mean, weight);
     total_weight += weight;
   }
@@ -540,6 +604,10 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size,
     double max;
     ptr += copy_from_mem(ptr, max);
     max = byteswap(max);
+    check_not_nan(min, "min");
+    check_not_infinite(min, "min");
+    check_not_nan(max, "max");
+    check_not_infinite(max, "max");
     double k_double;
     ptr += copy_from_mem(ptr, k_double);
     const uint16_t k = static_cast<uint16_t>(byteswap(k_double));
@@ -556,6 +624,10 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size,
       double mean;
       ptr += copy_from_mem(ptr, mean);
       mean = byteswap(mean);
+      check_not_nan(weight, "centroid weight");
+      check_not_infinite(weight, "centroid weight");
+      check_not_nan(mean, "centroid mean");
+      check_not_infinite(mean, "centroid mean");
       c = centroid(mean, static_cast<W>(weight));
       total_weight += static_cast<uint64_t>(weight);
     }
@@ -569,6 +641,10 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size,
   double max;
   ptr += copy_from_mem(ptr, max);
   max = byteswap(max);
+  check_not_nan(min, "min");
+  check_not_infinite(min, "min");
+  check_not_nan(max, "max");
+  check_not_infinite(max, "max");
   float k_float;
   ptr += copy_from_mem(ptr, k_float);
   const uint16_t k = static_cast<uint16_t>(byteswap(k_float));
@@ -588,6 +664,10 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size,
     float mean;
     ptr += copy_from_mem(ptr, mean);
     mean = byteswap(mean);
+    check_not_nan(weight, "centroid weight");
+    check_not_infinite(weight, "centroid weight");
+    check_not_nan(mean, "centroid mean");
+    check_not_infinite(mean, "centroid mean");
     c = centroid(mean, static_cast<W>(weight));
     total_weight += static_cast<uint64_t>(weight);
   }
@@ -623,9 +703,6 @@ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
     if (std::isnan(values[i])) {
       throw std::invalid_argument("Values must not be NaN");
     }
-    if (std::isinf(values[i])) {
-      throw std::invalid_argument("Values must not be infinity");
-    }
     if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
       throw std::invalid_argument("Values must be unique and monotonically increasing");
     }
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index 45c10822..0019b936 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -18,13 +18,35 @@
  */
 
 #include <catch2/catch.hpp>
+#include <cstring>
 #include <iostream>
 #include <fstream>
+#include <sstream>
 
 #include "tdigest.hpp"
 
 namespace datasketches {
 
+namespace {
+constexpr size_t kHeaderSize = 8;
+constexpr size_t kCountsSize = 8;
+constexpr size_t kMinOffset = kHeaderSize + kCountsSize;
+constexpr size_t kMaxOffset = kMinOffset + sizeof(double);
+constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2;
+constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset;
+constexpr size_t kSingleValueOffset = kHeaderSize;
+
+template <typename T>
+void write_bytes(std::vector<uint8_t>& bytes, size_t offset, T value) {
+  std::memcpy(bytes.data() + offset, &value, sizeof(T));
+}
+
+template <typename T>
+void write_bytes(std::string& data, size_t offset, T value) {
+  std::memcpy(&data[offset], &value, sizeof(T));
+}
+} // namespace
+
 TEST_CASE("empty", "[tdigest]") {
   tdigest_double td(10);
 //  std::cout << td.to_string();
@@ -502,25 +524,76 @@ TEST_CASE("get_rank rejects negative infinity", "[tdigest]") {
   REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits<double>::infinity()), std::invalid_argument);
 }
 
-TEST_CASE("get_CDF rejects positive infinity in split points", "[tdigest]") {
+TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  auto bytes = td.serialize();
+  write_bytes(bytes, kSingleValueOffset, std::numeric_limits<double>::quiet_NaN());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
+}
+
+TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  td.update(3.0);
+  auto bytes = td.serialize();
+  std::string data(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+  write_bytes(data, kMinOffset, std::numeric_limits<double>::infinity());
+  std::istringstream is(data, std::ios::binary);
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument);
+}
+
+TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 10; ++i) td.update(i);
+  auto bytes = td.serialize();
+  write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits<double>::quiet_NaN());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
+}
+
+TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") {
   tdigest_double td(100);
-  for (int i = 0; i < 100; ++i) td.update(i);
-  const double split_points[2] = {50.0, std::numeric_limits<double>::infinity()};
-  REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument);
+  td.update(1.0);
+  td.update(2.0);
+  auto bytes = td.serialize(0, true);
+  write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits<double>::quiet_NaN());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
+}
+
+TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  auto bytes = td.serialize();
+  write_bytes(bytes, kSingleValueOffset, std::numeric_limits<double>::infinity());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
+}
+
+TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") {
+  tdigest_double td(100);
+  td.update(1.0);
+  td.update(2.0);
+  auto bytes = td.serialize();
+  write_bytes(bytes, kMaxOffset, std::numeric_limits<double>::quiet_NaN());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
-TEST_CASE("get_CDF rejects negative infinity in split points", "[tdigest]") {
+TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") {
   tdigest_double td(100);
-  for (int i = 0; i < 100; ++i) td.update(i);
-  const double split_points[2] = {-std::numeric_limits<double>::infinity(), 50.0};
-  REQUIRE_THROWS_AS(td.get_CDF(split_points, 2), std::invalid_argument);
+  td.update(1.0);
+  td.update(2.0);
+  auto bytes = td.serialize();
+  write_bytes(bytes, kMaxOffset, std::numeric_limits<double>::infinity());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
-TEST_CASE("get_PMF rejects infinity in split points", "[tdigest]") {
+TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") {
   tdigest_double td(100);
-  for (int i = 0; i < 100; ++i) td.update(i);
-  const double split_points[1] = {std::numeric_limits<double>::infinity()};
-  REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::invalid_argument);
+  td.update(1.0);
+  td.update(2.0);
+  auto bytes = td.serialize(0, true);
+  write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits<double>::infinity());
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
 } /* namespace datasketches */

From b8489fd7327721fa4c1a16ff2a93565e7b077e5e Mon Sep 17 00:00:00 2001
From: proost <jwalag87@gmail.com>
Date: Tue, 13 Jan 2026 01:04:40 +0900
Subject: [PATCH 23/75] perf: remove ostringstream

---
 tdigest/include/tdigest_impl.hpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 294dab88..0be1a486 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -22,7 +22,6 @@
 
 #include <algorithm>
 #include <cmath>
-#include <sstream>
 
 #include "common_defs.hpp"
 #include "memory_operations.hpp"
@@ -32,18 +31,14 @@ namespace datasketches {
 template<typename T>
 inline void check_not_nan(T value, const char* name) {
   if (std::isnan(value)) {
-    std::ostringstream oss;
-    oss << name << " must not be NaN";
-    throw std::invalid_argument(oss.str());
+    throw std::invalid_argument(std::string(name) + " must not be NaN");
   }
 }
 
 template<typename T>
 inline void check_not_infinite(T value, const char* name) {
   if (std::isinf(value)) {
-    std::ostringstream oss;
-    oss << name << " must not be infinite";
-    throw std::invalid_argument(oss.str());
+    throw std::invalid_argument(std::string(name) + " must not be infinite");
   }
 }
 

From c680a81c9fd690971de14cff4da3116fb04903cf Mon Sep 17 00:00:00 2001
From: proost <jwalag87@gmail.com>
Date: Tue, 13 Jan 2026 02:08:48 +0900
Subject: [PATCH 24/75] style: follow local convention

---
 tdigest/test/tdigest_test.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index 0019b936..fd0a71c1 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -28,13 +28,13 @@
 namespace datasketches {
 
 namespace {
-constexpr size_t kHeaderSize = 8;
-constexpr size_t kCountsSize = 8;
-constexpr size_t kMinOffset = kHeaderSize + kCountsSize;
-constexpr size_t kMaxOffset = kMinOffset + sizeof(double);
-constexpr size_t kFirstCentroidMeanOffset = kMinOffset + sizeof(double) * 2;
-constexpr size_t kFirstBufferedValueOffset = kFirstCentroidMeanOffset;
-constexpr size_t kSingleValueOffset = kHeaderSize;
+constexpr size_t header_size = 8;
+constexpr size_t counts_size = 8;
+constexpr size_t min_offset = header_size + counts_size;
+constexpr size_t max_offset = min_offset + sizeof(double);
+constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2;
+constexpr size_t first_buffered_value_offset = first_centroid_mean_offset;
+constexpr size_t single_value_offset = header_size;
 
 template <typename T>
 void write_bytes(std::vector<uint8_t>& bytes, size_t offset, T value) {
@@ -528,7 +528,7 @@ TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") {
   tdigest_double td(100);
   td.update(1.0);
   auto bytes = td.serialize();
-  write_bytes(bytes, kSingleValueOffset, std::numeric_limits<double>::quiet_NaN());
+  write_bytes(bytes, single_value_offset, std::numeric_limits<double>::quiet_NaN());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -539,7 +539,7 @@ TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") {
   td.update(3.0);
   auto bytes = td.serialize();
   std::string data(reinterpret_cast<const char*>(bytes.data()), bytes.size());
-  write_bytes(data, kMinOffset, std::numeric_limits<double>::infinity());
+  write_bytes(data, min_offset, std::numeric_limits<double>::infinity());
   std::istringstream is(data, std::ios::binary);
   REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument);
 }
@@ -548,7 +548,7 @@ TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") {
   tdigest_double td(100);
   for (int i = 0; i < 10; ++i) td.update(i);
   auto bytes = td.serialize();
-  write_bytes(bytes, kFirstCentroidMeanOffset, std::numeric_limits<double>::quiet_NaN());
+  write_bytes(bytes, first_centroid_mean_offset, std::numeric_limits<double>::quiet_NaN());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -557,7 +557,7 @@ TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") {
   td.update(1.0);
   td.update(2.0);
   auto bytes = td.serialize(0, true);
-  write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits<double>::quiet_NaN());
+  write_bytes(bytes, first_buffered_value_offset, std::numeric_limits<double>::quiet_NaN());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -565,7 +565,7 @@ TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") {
   tdigest_double td(100);
   td.update(1.0);
   auto bytes = td.serialize();
-  write_bytes(bytes, kSingleValueOffset, std::numeric_limits<double>::infinity());
+  write_bytes(bytes, single_value_offset, std::numeric_limits<double>::infinity());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -574,7 +574,7 @@ TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") {
   td.update(1.0);
   td.update(2.0);
   auto bytes = td.serialize();
-  write_bytes(bytes, kMaxOffset, std::numeric_limits<double>::quiet_NaN());
+  write_bytes(bytes, max_offset, std::numeric_limits<double>::quiet_NaN());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -583,7 +583,7 @@ TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") {
   td.update(1.0);
   td.update(2.0);
   auto bytes = td.serialize();
-  write_bytes(bytes, kMaxOffset, std::numeric_limits<double>::infinity());
+  write_bytes(bytes, max_offset, std::numeric_limits<double>::infinity());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
@@ -592,7 +592,7 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") {
   td.update(1.0);
   td.update(2.0);
   auto bytes = td.serialize(0, true);
-  write_bytes(bytes, kFirstBufferedValueOffset, std::numeric_limits<double>::infinity());
+  write_bytes(bytes, first_buffered_value_offset, std::numeric_limits<double>::infinity());
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 

From 99d06bfd2b3c668911720ee3e2598a6fce7cc917 Mon Sep 17 00:00:00 2001
From: proost <jwalag87@gmail.com>
Date: Tue, 13 Jan 2026 02:10:05 +0900
Subject: [PATCH 25/75] fix: add missing dependency

---
 tdigest/include/tdigest_impl.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 0be1a486..043c7bab 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -22,6 +22,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <sstream>
 
 #include "common_defs.hpp"
 #include "memory_operations.hpp"

From 662aef37c3912b4b2c6cbf3cc6ab0dbf5d40a0df Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Tue, 13 Jan 2026 15:20:44 +0900
Subject: [PATCH 26/75] fix: allow inf for get_rank

---
 tdigest/include/tdigest.hpp      |  1 -
 tdigest/include/tdigest_impl.hpp |  1 -
 tdigest/test/tdigest_test.cpp    | 14 --------------
 3 files changed, 16 deletions(-)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 7d060ec1..7ce87dd1 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -154,7 +154,6 @@ class tdigest {
    * Compute approximate normalized rank of the given value.
    *
    * <p>If the sketch is empty this throws std::runtime_error.
-   * <p>NaN and infinity values throw std::invalid_argument.
    *
    * @param value to be ranked
    * @return normalized rank (from 0 to 1 inclusive)
diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index 043c7bab..e6904f20 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -109,7 +109,6 @@ template<typename T, typename A>
 double tdigest<T, A>::get_rank(T value) const {
   if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
   if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN");
-  if (std::isinf(value)) throw std::invalid_argument("operation is undefined for infinity");
   if (value < min_) return 0;
   if (value > max_) return 1;
   // one centroid and value == min_ == max_
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index fd0a71c1..8dd62132 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -510,20 +510,6 @@ TEST_CASE("update rejects negative infinity", "[tdigest]") {
   REQUIRE(td.get_min_value() == 1.0);
 }
 
-TEST_CASE("get_rank rejects positive infinity", "[tdigest]") {
-  tdigest_double td(100);
-  td.update(1.0);
-  td.update(2.0);
-  REQUIRE_THROWS_AS(td.get_rank(std::numeric_limits<double>::infinity()), std::invalid_argument);
-}
-
-TEST_CASE("get_rank rejects negative infinity", "[tdigest]") {
-  tdigest_double td(100);
-  td.update(1.0);
-  td.update(2.0);
-  REQUIRE_THROWS_AS(td.get_rank(-std::numeric_limits<double>::infinity()), std::invalid_argument);
-}
-
 TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") {
   tdigest_double td(100);
   td.update(1.0);

From bded7aa1eb09c13daa742ce901443388e1a8994a Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Tue, 13 Jan 2026 16:15:30 +0900
Subject: [PATCH 27/75] fix: check weight is zero

---
 tdigest/include/tdigest_impl.hpp | 15 +++++++++++++++
 tdigest/test/tdigest_test.cpp    | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp
index e6904f20..065e3ef1 100644
--- a/tdigest/include/tdigest_impl.hpp
+++ b/tdigest/include/tdigest_impl.hpp
@@ -23,6 +23,7 @@
 #include <algorithm>
 #include <cmath>
 #include <sstream>
+#include <type_traits>
 
 #include "common_defs.hpp"
 #include "memory_operations.hpp"
@@ -43,6 +44,14 @@ inline void check_not_infinite(T value, const char* name) {
   }
 }
 
+template<typename T>
+inline void check_non_zero(T value, const char* name) {
+  static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
+  if (value == 0) {
+    throw std::invalid_argument(std::string(name) + " must not be zero");
+  }
+}
+
 template<typename T, typename A>
 tdigest<T, A>::tdigest(uint16_t k, const A& allocator):
 tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(), vector_centroid(allocator), 0, vector_t(allocator))
@@ -437,6 +446,8 @@ tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
   for (const auto& c: centroids) {
     check_not_nan(c.get_mean(), "centroid mean");
     check_not_infinite(c.get_mean(), "centroid mean");
+    check_non_zero(c.get_weight(), "centroid weight");
+
     weight += c.get_weight();
   }
   for (const auto& value: buffer) {
@@ -508,6 +519,8 @@ tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A
   for (const auto& c: centroids) {
     check_not_nan(c.get_mean(), "centroid mean");
     check_not_infinite(c.get_mean(), "centroid mean");
+    check_non_zero(c.get_weight(), "centroid weight");
+
     weight += c.get_weight();
   }
   for (const auto& value: buffer) {
@@ -542,6 +555,8 @@ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& alloc
       const auto weight_double = read_big_endian<double>(is);
       check_not_nan(weight_double, "centroid weight");
       check_not_infinite(weight_double, "centroid weight");
+      check_non_zero(weight_double, "centroid weight");
+
       const auto mean = read_big_endian<double>(is);
       check_not_nan(mean, "centroid mean");
       check_not_infinite(mean, "centroid mean");
diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp
index 8dd62132..07d6185f 100644
--- a/tdigest/test/tdigest_test.cpp
+++ b/tdigest/test/tdigest_test.cpp
@@ -33,6 +33,7 @@ constexpr size_t counts_size = 8;
 constexpr size_t min_offset = header_size + counts_size;
 constexpr size_t max_offset = min_offset + sizeof(double);
 constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2;
+constexpr size_t first_centroid_weight_offset = first_centroid_mean_offset + sizeof(double);
 constexpr size_t first_buffered_value_offset = first_centroid_mean_offset;
 constexpr size_t single_value_offset = header_size;
 
@@ -582,4 +583,22 @@ TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") {
   REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
 }
 
+TEST_CASE("deserialize bytes rejects zero centroid weight", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 10; ++i) td.update(i);
+  auto bytes = td.serialize();
+  write_bytes(bytes, first_centroid_weight_offset, static_cast<uint64_t>(0));
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument);
+}
+
+TEST_CASE("deserialize stream rejects zero centroid weight", "[tdigest]") {
+  tdigest_double td(100);
+  for (int i = 0; i < 10; ++i) td.update(i);
+  auto bytes = td.serialize();
+  std::string data(reinterpret_cast<const char*>(bytes.data()), bytes.size());
+  write_bytes(data, first_centroid_weight_offset, static_cast<uint64_t>(0));
+  std::istringstream is(data, std::ios::binary);
+  REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument);
+}
+
 } /* namespace datasketches */

From 19798344aad67441f12f0e356ffc78b6fbd3078e Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Tue, 13 Jan 2026 16:22:12 +0900
Subject: [PATCH 28/75] doc: update throw NaN for get_rank

---
 tdigest/include/tdigest.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 7ce87dd1..2d3620b1 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -154,6 +154,7 @@ class tdigest {
    * Compute approximate normalized rank of the given value.
    *
    * <p>If the sketch is empty this throws std::runtime_error.
+   * <p>A NaN value throws std::invalid_argument.
    *
    * @param value to be ranked
    * @return normalized rank (from 0 to 1 inclusive)

From 21362396a54fd142abf3481f93df8a8058b3e00f Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Tue, 13 Jan 2026 16:12:02 +0800
Subject: [PATCH 29/75] Refactor README for clarity and consistency

Updated README to improve clarity and formatting.
---
 README.md | 139 ++++++++++++++++++++++++++----------------------------
 1 file changed, 66 insertions(+), 73 deletions(-)

diff --git a/README.md b/README.md
index 4b216167..57125686 100644
--- a/README.md
+++ b/README.md
@@ -1,106 +1,99 @@
 # Apache DataSketches Core C++ Library Component
-This is the core C++ component of the Apache DataSketches library.  It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications. 
 
-This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL.
+This is the core C++ component of the Apache DataSketches library.  It contains all the key sketching algorithms from the Java implementation and can be accessed directly by user applications. 
 
-Note that we have a parallel core component for [Java]((https://github.com/apache/datasketches-java) and [Python]((https://github.com/apache/datasketches-python) implementations of the same sketch algorithms.
+This component is also a dependency of other library components that create adaptors for target systems, such as [PostgreSQL](https://github.com/apache/datasketches-postgresql).
+
+Note that we have parallel core library components for Java, Python, and Go implementations of many of the same sketch algorithms:
+
+- [datasketches-java](https://github.com/apache/datasketches-java)
+- [datasketches-python](https://github.com/apache/datasketches-python)
+- [datasketches-go](https://github.com/apache/datasketches-go)
 
 Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information. 
 
-If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
+If you are interested in making contributions to this site, please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us.
 
 ---
 
 This code requires C++11.
 
-This library is header-only. The build process provided is only for building unit tests.
+This library is header-only. The provided build process is only for unit tests.
 
-Building the unit tests requires cmake 3.12.0 or higher.
+Building the unit tests requires CMake 3.12.0 or higher.
 
-Installing the latest cmake on OSX: brew install cmake
+Installing the latest CMake on OSX: `brew install cmake`.
 
-Building and running unit tests using cmake for OSX and Linux:
+Building and running unit tests using CMake for OSX and Linux:
 
-```
-    $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release
-    $ cmake --build build/Release -t all test
+```shell
+cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release
+cmake --build build/Release -t all test
 ```
 
-Building and running unit tests using cmake for Windows from the command line:
+Building and running unit tests using CMake for Windows from the command line:
 
-```
-    $ cd build
-    $ cmake ..
-    $ cd ..
-    $ cmake --build build --config Release
-    $ cmake --build build --config Release --target RUN_TESTS
+```shell
+cd build
+cmake ..
+cd ..
+cmake --build build --config Release
+cmake --build build --config Release --target RUN_TESTS
 ```
 
-To install a local distribution (OSX and Linux), use the following command. The
-CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it 
-defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below,
-the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include,
-/tmp/install/DataSketches/lib, etc)
+To install a local distribution (OSX and Linux), use the following command. The `CMAKE_INSTALL_PREFIX` variable controls the destination. If not specified, it defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below, the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include, /tmp/install/DataSketches/lib, etc).
 
-```
-    $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches
-    $ cmake --build build/Release -t install
+```shell
+cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches
+cmake --build build/Release -t install
 ```
 
-To generate an installable package using cmake's built in cpack packaging tool,
-use the following command. The type of packaging is controlled by the CPACK_GENERATOR
-variable (semi-colon separated list). Cmake usually supports packaging types such as RPM,
-DEB, STGZ, TGZ, TZ, ZIP, etc.
+To generate an installable package using CMake's built-in CPack packaging tool, use the following command. The type of packaging is controlled by the `CPACK_GENERATOR` variable (a semicolon-separated list). CMake usually supports packaging formats such as RPM, DEB, STGZ, TGZ, TZ, and ZIP.
 
-```
-    $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" 
-    $ cmake3 --build build/Release -t package
+```shell
+cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" 
+cmake --build build/Release -t package
 ```
 
 The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways.
-If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some 
-way, then CMake's `find_package` command can be used like this:
 
-```
-    find_package(DataSketches 3.2 REQUIRED)
-    target_link_library(my_dependent_target PUBLIC ${DATASKETCHES_LIB})
+If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some other way), then CMake's `find_package` command can be used like this:
+
+```cmake
+find_package(DataSketches 3.2 REQUIRED)
+target_link_libraries(my_dependent_target PUBLIC ${DATASKETCHES_LIB})
 ```
 
 When used with find_package, DataSketches exports several variables, including
 
-   - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported.
-   - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files.
-   Because cmake automatically includes the interface directories for included target libraries when
-   using `target_link_library`, under normal circumstances there will be no need to include this directly.
-   - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling
-   in DataSketches should reference this with `target_link_library` in order to set up all the correct dependencies 
-   and include paths.
-
-If you don't have DataSketches installed locally, dependent projects can pull it directly
-from GitHub using CMake's `ExternalProject` module. The code would look something like this:
-
-```
-    cmake_policy(SET CMP0097 NEW)
-    include(ExternalProject)
-    ExternalProject_Add(datasketches
-        GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
-        GIT_TAG 3.2.0
-        GIT_SHALLOW true
-        GIT_SUBMODULES ""
-        INSTALL_DIR /tmp/datasketches-prefix
-        CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix
-
-        # Override the install command to add DESTDIR
-        # This is necessary to work around an oddity in the RPM (but not other) package
-        # generation, as CMake otherwise picks up the Datasketch files when building
-        # an RPM for a dependent package. (RPM scans the directory for files in addition to installing
-        # those files referenced in an "install" rule in the cmake file)
-        INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install
-    )
-    ExternalProject_Get_property(datasketches INSTALL_DIR)
-    set(datasketches_INSTALL_DIR ${INSTALL_DIR})
-    message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
-    target_include_directories(my_dependent_target 
-                                PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
-    add_dependencies(my_dependent_target datasketches)
+- `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported.
+- `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files. Because CMake automatically includes the interface directories for included target libraries when using `target_link_libraries`, under normal circumstances there will be no need to include this directly.
+- `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling in DataSketches should reference this with `target_link_libraries` in order to set up all the correct dependencies and include paths.
+
+If you don't have DataSketches installed locally, dependent projects can pull it directly from GitHub using CMake's `ExternalProject` module. The code would look something like this:
+
+```cmake
+cmake_policy(SET CMP0097 NEW)
+include(ExternalProject)
+ExternalProject_Add(datasketches
+    GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
+    GIT_TAG 3.2.0
+    GIT_SHALLOW true
+    GIT_SUBMODULES ""
+    INSTALL_DIR /tmp/datasketches-prefix
+    CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix
+
+    # Override the install command to add DESTDIR
+    # This is necessary to work around an oddity in the RPM (but not other) package
+    # generation, as CMake otherwise picks up the DataSketches files when building
+    # an RPM for a dependent package. (RPM scans the directory for files in addition to installing
+    # those files referenced in an "install" rule in the cmake file)
+    INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install
+)
+ExternalProject_Get_property(datasketches INSTALL_DIR)
+set(datasketches_INSTALL_DIR ${INSTALL_DIR})
+message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
+target_include_directories(my_dependent_target 
+                            PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches)
+add_dependencies(my_dependent_target datasketches)
 ```

From da95fd28419a982d8c6d802ab153362f724c470f Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Thu, 22 Jan 2026 01:19:38 +0900
Subject: [PATCH 30/75] feat: add utf8cpp

---
 NOTICE                                        |   3 +
 common/CMakeLists.txt                         |  11 +
 common/include/third_party/utf8cpp/LICENSE    |  23 +
 common/include/third_party/utf8cpp/utf8.h     |  46 ++
 .../third_party/utf8cpp/utf8/checked.h        | 359 +++++++++++++
 .../include/third_party/utf8cpp/utf8/core.h   | 500 ++++++++++++++++++
 .../include/third_party/utf8cpp/utf8/cpp11.h  |  70 +++
 .../include/third_party/utf8cpp/utf8/cpp17.h  |  96 ++++
 .../include/third_party/utf8cpp/utf8/cpp20.h  | 124 +++++
 .../third_party/utf8cpp/utf8/unchecked.h      | 286 ++++++++++
 10 files changed, 1518 insertions(+)
 create mode 100644 common/include/third_party/utf8cpp/LICENSE
 create mode 100644 common/include/third_party/utf8cpp/utf8.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/checked.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/core.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/cpp11.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/cpp17.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/cpp20.h
 create mode 100644 common/include/third_party/utf8cpp/utf8/unchecked.h

diff --git a/NOTICE b/NOTICE
index 11ba6f6c..6a2376d9 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,3 +10,6 @@ The Apache Software Foundation (http://www.apache.org/).
 
 Prior to moving to ASF, the software for this project was developed at
 Yahoo Inc. (https://developer.yahoo.com).
+
+This product includes utf8cpp (https://github.com/nemtrif/utfcpp),
+licensed under the Boost Software License, Version 1.0.
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 8514433b..2d5c7330 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,3 +51,14 @@ install(FILES
       include/serde.hpp
       include/xxhash64.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
+
+install(FILES
+      include/third_party/utf8cpp/utf8.h
+      include/third_party/utf8cpp/utf8/checked.h
+      include/third_party/utf8cpp/utf8/core.h
+      include/third_party/utf8cpp/utf8/cpp11.h
+      include/third_party/utf8cpp/utf8/cpp17.h
+      include/third_party/utf8cpp/utf8/cpp20.h
+      include/third_party/utf8cpp/utf8/unchecked.h
+      include/third_party/utf8cpp/LICENSE
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp")
diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE
new file mode 100644
index 00000000..36b7cd93
--- /dev/null
+++ b/common/include/third_party/utf8cpp/LICENSE
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h
new file mode 100644
index 00000000..b5135309
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8.h
@@ -0,0 +1,46 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+/*
+To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro
+and set it to one of the values used by the __cplusplus predefined macro.
+
+For instance,
+    #define UTF_CPP_CPLUSPLUS 199711L
+will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard.
+Some library features will be disabled.
+
+If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus.
+*/
+
+#include "utf8/checked.h"
+#include "utf8/unchecked.h"
+
+#endif // header guard
diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h
new file mode 100644
index 00000000..96ceb4d5
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/checked.h
@@ -0,0 +1,359 @@
+// Copyright 2006-2016 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <stdexcept>
+
+namespace utf8
+{
+    // Base for the exceptions that may be thrown from the library
+    class exception : public ::std::exception {
+    };
+
+    // Exceptions that may be thrown from the library functions.
+    class invalid_code_point : public exception {
+        utfchar32_t cp;
+    public:
+        invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {}
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
+        utfchar32_t code_point() const {return cp;}
+    };
+
+    class invalid_utf8 : public exception {
+        utfchar8_t u8;
+    public:
+        invalid_utf8 (utfchar8_t u) : u8(u) {}
+        invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {}
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
+        utfchar8_t utf8_octet() const {return u8;}
+    };
+
+    class invalid_utf16 : public exception {
+        utfchar16_t u16;
+    public:
+        invalid_utf16 (utfchar16_t u) : u16(u) {}
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
+        utfchar16_t utf16_word() const {return u16;}
+    };
+
+    class not_enough_room : public exception {
+    public:
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
+    };
+
+    /// The library API - functions intended to be called by the users
+
+    template <typename octet_iterator>
+    octet_iterator append(utfchar32_t cp, octet_iterator result)
+    {
+        if (!utf8::internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        return internal::append(cp, result);
+    }
+
+    inline void append(utfchar32_t cp, std::string& s)
+    {
+        append(cp, std::back_inserter(s));
+    }
+
+    template <typename word_iterator>
+    word_iterator append16(utfchar32_t cp, word_iterator result)
+    {
+        if (!utf8::internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        return internal::append16(cp, result);
+    }
+
+    template <typename octet_iterator, typename output_iterator>
+    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
+    {
+        while (start != end) {
+            octet_iterator sequence_start = start;
+            internal::utf_error err_code = utf8::internal::validate_next(start, end);
+            switch (err_code) {
+                case internal::UTF8_OK :
+                    for (octet_iterator it = sequence_start; it != start; ++it)
+                        *out++ = *it;
+                    break;
+                case internal::NOT_ENOUGH_ROOM:
+                    out = utf8::append (replacement, out);
+                    start = end;
+                    break;
+                case internal::INVALID_LEAD:
+                    out = utf8::append (replacement, out);
+                    ++start;
+                    break;
+                case internal::INCOMPLETE_SEQUENCE:
+                case internal::OVERLONG_SEQUENCE:
+                case internal::INVALID_CODE_POINT:
+                    out = utf8::append (replacement, out);
+                    ++start;
+                    // just one replacement mark for the sequence
+                    while (start != end && utf8::internal::is_trail(*start))
+                        ++start;
+                    break;
+            }
+        }
+        return out;
+    }
+
+    template <typename octet_iterator, typename output_iterator>
+    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+    {
+        static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
+        return utf8::replace_invalid(start, end, out, replacement_marker);
+    }
+
+    inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+        return result;
+    }
+
+    inline std::string replace_invalid(const std::string& s)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    template <typename octet_iterator>
+    utfchar32_t next(octet_iterator& it, octet_iterator end)
+    {
+        utfchar32_t cp = 0;
+        internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
+        switch (err_code) {
+            case internal::UTF8_OK :
+                break;
+            case internal::NOT_ENOUGH_ROOM :
+                throw not_enough_room();
+            case internal::INVALID_LEAD :
+            case internal::INCOMPLETE_SEQUENCE :
+            case internal::OVERLONG_SEQUENCE :
+                throw invalid_utf8(static_cast<utfchar8_t>(*it));
+            case internal::INVALID_CODE_POINT :
+                throw invalid_code_point(cp);
+        }
+        return cp;
+    }
+
+    template <typename word_iterator>
+    utfchar32_t next16(word_iterator& it, word_iterator end)
+    {
+        utfchar32_t cp = 0;
+        internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp);
+        if (err_code == internal::NOT_ENOUGH_ROOM)
+            throw not_enough_room();
+        return cp;
+    }
+
+    template <typename octet_iterator>
+    utfchar32_t peek_next(octet_iterator it, octet_iterator end)
+    {
+        return utf8::next(it, end);
+    }
+
+    template <typename octet_iterator>
+    utfchar32_t prior(octet_iterator& it, octet_iterator start)
+    {
+        // can't do much if it == start
+        if (it == start)
+            throw not_enough_room();
+
+        octet_iterator end = it;
+        // Go back until we hit either a lead octet or start
+        while (utf8::internal::is_trail(*(--it)))
+            if (it == start)
+                throw invalid_utf8(*it); // error - no lead byte in the sequence
+        return utf8::peek_next(it, end);
+    }
+
+    template <typename octet_iterator, typename distance_type>
+    void advance (octet_iterator& it, distance_type n, octet_iterator end)
+    {
+        const distance_type zero(0);
+        if (n < zero) {
+            // backward
+            for (distance_type i = n; i < zero; ++i)
+                utf8::prior(it, end);
+        } else {
+            // forward
+            for (distance_type i = zero; i < n; ++i)
+                utf8::next(it, end);
+        }
+    }
+
+    template <typename octet_iterator>
+    typename std::iterator_traits<octet_iterator>::difference_type
+    distance (octet_iterator first, octet_iterator last)
+    {
+        typename std::iterator_traits<octet_iterator>::difference_type dist;
+        for (dist = 0; first < last; ++dist)
+            utf8::next(first, last);
+        return dist;
+    }
+
+    template <typename u16bit_iterator, typename octet_iterator>
+    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+    {
+        while (start != end) {
+            utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
+            // Take care of surrogate pairs first
+            if (utf8::internal::is_lead_surrogate(cp)) {
+                if (start != end) {
+                    const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
+                    if (utf8::internal::is_trail_surrogate(trail_surrogate))
+                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+                    else
+                        throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate));
+                }
+                else
+                    throw invalid_utf16(static_cast<utfchar16_t>(cp));
+
+            }
+            // Lone trail surrogate
+            else if (utf8::internal::is_trail_surrogate(cp))
+                throw invalid_utf16(static_cast<utfchar16_t>(cp));
+
+            result = utf8::append(cp, result);
+        }
+        return result;
+    }
+
+    template <typename u16bit_iterator, typename octet_iterator>
+    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+    {
+        while (start < end) {
+            const utfchar32_t cp = utf8::next(start, end);
+            if (cp > 0xffff) { //make a surrogate pair
+                *result++ = static_cast<utfchar16_t>((cp >> 10)   + internal::LEAD_OFFSET);
+                *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+            }
+            else
+                *result++ = static_cast<utfchar16_t>(cp);
+        }
+        return result;
+    }
+
+    template <typename octet_iterator, typename u32bit_iterator>
+    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+    {
+        while (start != end)
+            result = utf8::append(*(start++), result);
+
+        return result;
+    }
+
+    template <typename octet_iterator, typename u32bit_iterator>
+    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+    {
+        while (start < end)
+            (*result++) = utf8::next(start, end);
+
+        return result;
+    }
+
+    // The iterator class
+    template <typename octet_iterator>
+    class iterator {
+      octet_iterator it;
+      octet_iterator range_start;
+      octet_iterator range_end;
+      public:
+      typedef utfchar32_t value_type;
+      typedef utfchar32_t* pointer;
+      typedef utfchar32_t& reference;
+      typedef std::ptrdiff_t difference_type;
+      typedef std::bidirectional_iterator_tag iterator_category;
+      iterator () {}
+      explicit iterator (const octet_iterator& octet_it,
+                         const octet_iterator& rangestart,
+                         const octet_iterator& rangeend) :
+               it(octet_it), range_start(rangestart), range_end(rangeend)
+      {
+          if (it < range_start || it > range_end)
+              throw std::out_of_range("Invalid utf-8 iterator position");
+      }
+      // the default "big three" are OK
+      octet_iterator base () const { return it; }
+      utfchar32_t operator * () const
+      {
+          octet_iterator temp = it;
+          return utf8::next(temp, range_end);
+      }
+      bool operator == (const iterator& rhs) const
+      {
+          if (range_start != rhs.range_start || range_end != rhs.range_end)
+              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+          return (it == rhs.it);
+      }
+      bool operator != (const iterator& rhs) const
+      {
+          return !(operator == (rhs));
+      }
+      iterator& operator ++ ()
+      {
+          utf8::next(it, range_end);
+          return *this;
+      }
+      iterator operator ++ (int)
+      {
+          iterator temp = *this;
+          utf8::next(it, range_end);
+          return temp;
+      }
+      iterator& operator -- ()
+      {
+          utf8::prior(it, range_start);
+          return *this;
+      }
+      iterator operator -- (int)
+      {
+          iterator temp = *this;
+          utf8::prior(it, range_start);
+          return temp;
+      }
+    }; // class iterator
+
+} // namespace utf8
+
+#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
+#include "cpp20.h"
+#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
+#include "cpp17.h"
+#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+#include "cpp11.h"
+#endif // C++ 11 or later
+
+#endif //header guard
+
diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h
new file mode 100644
index 00000000..8e128c18
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/core.h
@@ -0,0 +1,500 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator>
+#include <cstring>
+#include <string>
+
+// Determine the C++ standard version.
+// If the user defines UTF_CPP_CPLUSPLUS, use that.
+// Otherwise, trust the unreliable predefined macro __cplusplus
+
+#if !defined UTF_CPP_CPLUSPLUS
+    #define UTF_CPP_CPLUSPLUS __cplusplus
+#endif
+
+#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+    #define UTF_CPP_OVERRIDE override
+    #define UTF_CPP_NOEXCEPT noexcept
+    #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
+#else // C++ 98/03
+    #define UTF_CPP_OVERRIDE
+    #define UTF_CPP_NOEXCEPT throw()
+    // Not worth simulating static_assert:
+    #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition);
+#endif // C++ 11 or later
+
+
+namespace utf8
+{
+// The typedefs for 8-bit, 16-bit and 32-bit code units
+#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
+    #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
+        typedef char8_t         utfchar8_t;
+    #else // C++ 11/14/17
+        typedef unsigned char   utfchar8_t;
+    #endif
+    typedef char16_t        utfchar16_t;
+    typedef char32_t        utfchar32_t;
+#else // C++ 98/03
+    typedef unsigned char   utfchar8_t;
+    typedef unsigned short  utfchar16_t;
+    typedef unsigned int    utfchar32_t;
+#endif // C++ 11 or later
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+    // Unicode constants
+    // Leading (high) surrogates: 0xd800 - 0xdbff
+    // Trailing (low) surrogates: 0xdc00 - 0xdfff
+    const utfchar16_t LEAD_SURROGATE_MIN  = 0xd800u;
+    const utfchar16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+    const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+    const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+    const utfchar16_t LEAD_OFFSET         = 0xd7c0u;       // LEAD_SURROGATE_MIN - (0x10000 >> 10)
+    const utfchar32_t SURROGATE_OFFSET    = 0xfca02400u;   // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
+
+    // Maximum valid value for a Unicode code point
+    const utfchar32_t CODE_POINT_MAX      = 0x0010ffffu;
+
+    template<typename octet_type>
+    inline utfchar8_t mask8(octet_type oc)
+    {
+        return static_cast<utfchar8_t>(0xff & oc);
+    }
+
+    template<typename u16_type>
+    inline utfchar16_t mask16(u16_type oc)
+    {
+        return static_cast<utfchar16_t>(0xffff & oc);
+    }
+
+    template<typename octet_type>
+    inline bool is_trail(octet_type oc)
+    {
+        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
+    }
+
+    inline bool is_lead_surrogate(utfchar32_t cp)
+    {
+        return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX));
+    }
+
+    inline bool is_trail_surrogate(utfchar32_t cp)
+    {
+        return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
+    }
+
+    inline bool is_surrogate(utfchar32_t cp)
+    {
+        return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
+    }
+
+    inline bool is_code_point_valid(utfchar32_t cp)
+    {
+        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
+    }
+
+    inline bool is_in_bmp(utfchar32_t cp)
+    {
+        return cp < utfchar32_t(0x10000);
+    }
+
+    template <typename octet_iterator>
+    int sequence_length(octet_iterator lead_it)
+    {
+        const utfchar8_t lead = utf8::internal::mask8(*lead_it);
+        if (lead < 0x80)
+            return 1;
+        else if ((lead >> 5) == 0x6)
+            return 2;
+        else if ((lead >> 4) == 0xe)
+            return 3;
+        else if ((lead >> 3) == 0x1e)
+            return 4;
+        else
+            return 0;
+    }
+
+    inline bool is_overlong_sequence(utfchar32_t cp, int length)
+    {
+        if (cp < 0x80) {
+            if (length != 1)
+                return true;
+        }
+        else if (cp < 0x800) {
+            if (length != 2)
+                return true;
+        }
+        else if (cp < 0x10000) {
+            if (length != 3)
+                return true;
+        }
+        return false;
+    }
+
+    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
+
+    /// Helper for get_sequence_x
+    template <typename octet_iterator>
+    utf_error increase_safely(octet_iterator& it, const octet_iterator end)
+    {
+        if (++it == end)
+            return NOT_ENOUGH_ROOM;
+
+        if (!utf8::internal::is_trail(*it))
+            return INCOMPLETE_SEQUENCE;
+
+        return UTF8_OK;
+    }
+
+    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
+
+    /// get_sequence_x functions decode utf-8 sequences of the length x
+    template <typename octet_iterator>
+    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
+
+        return UTF8_OK;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+        return UTF8_OK;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
+
+        return UTF8_OK;
+    }
+
+    template <typename octet_iterator>
+    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+           return NOT_ENOUGH_ROOM;
+
+        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));
+
+        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
+
+        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
+
+        return UTF8_OK;
+    }
+
+    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
+
+    template <typename octet_iterator>
+    utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Save the original value of it so we can go back in case of failure.
+        // Of course, this does not make much sense with e.g. stream iterators
+        octet_iterator original_it = it;
+
+        utfchar32_t cp = 0;
+        // Determine the sequence length based on the lead octet
+        const int length = utf8::internal::sequence_length(it);
+
+        // Get trail octets and calculate the code point
+        utf_error err = UTF8_OK;
+        switch (length) {
+            case 0:                             // it has not been moved yet, nothing to restore
+                return INVALID_LEAD;
+            case 1:
+                err = utf8::internal::get_sequence_1(it, end, cp);
+                break;
+            case 2:
+                err = utf8::internal::get_sequence_2(it, end, cp);
+            break;
+            case 3:
+                err = utf8::internal::get_sequence_3(it, end, cp);
+            break;
+            case 4:
+                err = utf8::internal::get_sequence_4(it, end, cp);
+            break;
+        }
+
+        if (err == UTF8_OK) {
+            // Decoding succeeded. Now, security checks...
+            if (utf8::internal::is_code_point_valid(cp)) {
+                if (!utf8::internal::is_overlong_sequence(cp, length)){
+                    // Passed! Publish the result and step past the last octet of the sequence.
+                    code_point = cp;
+                    ++it;
+                    return UTF8_OK;
+                }
+                else
+                    err = OVERLONG_SEQUENCE;
+            }
+            else
+                err = INVALID_CODE_POINT;
+        }
+
+        // Failure branch - restore the original value of the iterator; code_point is left unmodified
+        it = original_it;
+        return err;
+    }
+
+    template <typename octet_iterator>
+    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
+        utfchar32_t ignored;
+        return utf8::internal::validate_next(it, end, ignored);
+    }
+
+    template <typename word_iterator>
+    utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
+    {
+        // Make sure the iterator dereferences a large enough type
+        typedef typename std::iterator_traits<word_iterator>::value_type word_type;
+        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
+        // Check the edge case: empty input
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+        // Save the original value of it so we can go back in case of failure.
+        // Of course, this does not make much sense with e.g. stream iterators
+        word_iterator original_it = it;
+
+        utf_error err = UTF8_OK;
+
+        const utfchar16_t first_word = *it++;
+        if (!is_surrogate(first_word)) {        // single-word code point; it already points past it
+            code_point = first_word;
+            return UTF8_OK;
+        }
+        else {
+            if (it == end)                      // a surrogate was read but no second word is available
+                err = NOT_ENOUGH_ROOM;
+            else if (is_lead_surrogate(first_word)) {
+                const utfchar16_t second_word = *it++;
+                if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) {
+                    code_point = static_cast<utfchar32_t>(first_word << 10) +  static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET;
+                    return UTF8_OK;
+                } else
+                    err = INCOMPLETE_SEQUENCE;
+
+            } else {                            // first word is a surrogate, but not a lead one
+                err = INVALID_LEAD;
+            }
+        }
+        // error branch - restore the original value of the iterator; code_point is left unmodified
+        it = original_it;
+        return err;
+    }
+
+    // Internal implementation of both checked and unchecked append() function
+    // This function will be invoked by the overloads below, as they will know
+    // the octet_type.
+    template <typename octet_iterator, typename octet_type>
+    octet_iterator append(utfchar32_t cp, octet_iterator result) {
+        if (cp < 0x80)                        // one octet
+            *(result++) = static_cast<octet_type>(cp);
+        else if (cp < 0x800) {                // two octets
+            *(result++) = static_cast<octet_type>((cp >> 6)          | 0xc0);
+            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
+        }
+        else if (cp < 0x10000) {              // three octets
+            *(result++) = static_cast<octet_type>((cp >> 12)         | 0xe0);
+            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
+        }
+        else {                                // four octets
+            *(result++) = static_cast<octet_type>((cp >> 18)         | 0xf0);
+            *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
+            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
+            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
+        }
+        return result;
+    }
+
+    // One of the following overloads will be invoked from the API calls
+
+    // A simple (but dangerous) case: the caller appends byte(s) to a char array
+    inline char* append(utfchar32_t cp, char* result) {
+        return append<char*, char>(cp, result);
+    }
+
+    // Hopefully, most common case: the caller uses back_inserter
+    // i.e. append(cp, std::back_inserter(str));
+    template<typename container_type>
+    std::back_insert_iterator<container_type> append
+            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
+        return append<std::back_insert_iterator<container_type>,
+            typename container_type::value_type>(cp, result);
+    }
+
+    // The caller uses some other kind of output operator - not covered above
+    // Note that in this case we are not able to determine octet_type
+    // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
+    template <typename octet_iterator>
+    octet_iterator append(utfchar32_t cp, octet_iterator result) {
+        return append<octet_iterator, utfchar8_t>(cp, result);
+    }
+
+    // Internal implementation of both checked and unchecked append16() function
+    // This function will be invoked by the overloads below, as they will know
+    // the word_type.
+    template <typename word_iterator, typename word_type>
+    word_iterator append16(utfchar32_t cp, word_iterator result) {
+        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
+        if (is_in_bmp(cp))
+            *(result++) = static_cast<word_type>(cp);
+        else {
+            // Code points from the supplementary planes are encoded via surrogate pairs
+            *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
+            *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
+        }
+        return result;
+    }
+
+    // Hopefully, most common case: the caller uses back_inserter
+    // i.e. append16(cp, std::back_inserter(str));
+    template<typename container_type>
+    std::back_insert_iterator<container_type> append16
+            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
+        return append16<std::back_insert_iterator<container_type>,
+            typename container_type::value_type>(cp, result);
+    }
+
+    // The caller uses some other kind of output operator - not covered above
+    // Note that in this case we are not able to determine word_type
+    // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
+    template <typename word_iterator>
+    word_iterator append16(utfchar32_t cp, word_iterator result) {
+        return append16<word_iterator, utfchar16_t>(cp, result);
+    }
+
+} // namespace internal
+
+    /// The library API - functions intended to be called by the users
+
+    // Byte order mark
+    const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};
+
+    template <typename octet_iterator>
+    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+    {
+        octet_iterator result = start;
+        while (result != end) {
+            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
+            if (err_code != internal::UTF8_OK)
+                return result;
+        }
+        return result;
+    }
+
+    inline const char* find_invalid(const char* str)
+    {
+        const char* end = str + std::strlen(str);
+        return find_invalid(str, end);
+    }
+
+    inline std::size_t find_invalid(const std::string& s)
+    {
+        std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
+        return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
+    }
+
+    template <typename octet_iterator>
+    inline bool is_valid(octet_iterator start, octet_iterator end)
+    {
+        return (utf8::find_invalid(start, end) == end);
+    }
+
+    inline bool is_valid(const char* str)
+    {
+        return (*(utf8::find_invalid(str)) == '\0');
+    }
+
+    inline bool is_valid(const std::string& s)
+    {
+        return is_valid(s.begin(), s.end());
+    }
+
+
+
+    template <typename octet_iterator>
+    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+    {
+        return (
+            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
+            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
+            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
+           );
+    }
+
+    inline bool starts_with_bom(const std::string& s)
+    {
+        return starts_with_bom(s.begin(), s.end());
+    }
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h
new file mode 100644
index 00000000..691633c8
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/cpp11.h
@@ -0,0 +1,70 @@
+// Copyright 2018 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
+
+#include "checked.h"
+
+namespace utf8
+{
+    inline void append16(utfchar32_t cp, std::u16string& s)
+    {
+        append16(cp, std::back_inserter(s));
+    }
+
+    inline std::string utf16to8(const std::u16string& s)
+    {
+        std::string result;
+        utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u16string utf8to16(const std::string& s)
+    {
+        std::u16string result;
+        utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::string utf32to8(const std::u32string& s)
+    {
+        std::string result;
+        utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u32string utf8to32(const std::string& s)
+    {
+        std::u32string result;
+        utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h
new file mode 100644
index 00000000..07587300
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/cpp17.h
@@ -0,0 +1,96 @@
+// Copyright 2018 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
+#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
+
+#include "cpp11.h"
+
+namespace utf8
+{
+    inline std::string utf16to8(std::u16string_view s)
+    {
+        std::string result;
+        utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u16string utf8to16(std::string_view s)
+    {
+        std::u16string result;
+        utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::string utf32to8(std::u32string_view s)
+    {
+        std::string result;
+        utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u32string utf8to32(std::string_view s)
+    {
+        std::u32string result;
+        utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::size_t find_invalid(std::string_view s)
+    {
+        std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
+        return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
+    }
+
+    inline bool is_valid(std::string_view s)
+    {
+        return is_valid(s.begin(), s.end());
+    }
+
+    inline std::string replace_invalid(std::string_view s, char32_t replacement)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+        return result;
+    }
+
+    inline std::string replace_invalid(std::string_view s)
+    {
+        std::string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline bool starts_with_bom(std::string_view s)
+    {
+        return starts_with_bom(s.begin(), s.end());
+    }
+
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h
new file mode 100644
index 00000000..07b61d0f
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/cpp20.h
@@ -0,0 +1,124 @@
+// Copyright 2022 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
+#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
+
+#include "cpp17.h"
+
+namespace utf8
+{
+    inline std::u8string utf16tou8(const std::u16string& s)
+    {
+        std::u8string result;
+        utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u8string utf16tou8(std::u16string_view s)
+    {
+        std::u8string result;
+        utf16to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u16string utf8to16(const std::u8string& s)
+    {
+        std::u16string result;
+        utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u16string utf8to16(const std::u8string_view& s)
+    {
+        std::u16string result;
+        utf8to16(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u8string utf32tou8(const std::u32string& s)
+    {
+        std::u8string result;
+        utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u8string utf32tou8(const std::u32string_view& s)
+    {
+        std::u8string result;
+        utf32to8(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u32string utf8to32(const std::u8string& s)
+    {
+        std::u32string result;
+        utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::u32string utf8to32(const std::u8string_view& s)
+    {
+        std::u32string result;
+        utf8to32(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline std::size_t find_invalid(const std::u8string& s)
+    {   // Offset in s where the iterator overload of find_invalid stops, or std::u8string::npos if it reaches s.end()
+        std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
+        return (invalid == s.end()) ? std::u8string::npos : static_cast<std::size_t>(invalid - s.begin());
+    }
+
+    inline bool is_valid(const std::u8string& s)
+    {
+        return is_valid(s.begin(), s.end());
+    }
+
+    inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
+    {
+        std::u8string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+        return result;
+    }
+
+    inline std::u8string replace_invalid(const std::u8string& s)
+    {
+        std::u8string result;
+        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+        return result;
+    }
+
+    inline bool starts_with_bom(const std::u8string& s)
+    {
+        return starts_with_bom(s.begin(), s.end());
+    }
+ 
+} // namespace utf8
+
+#endif // header guard
+
diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h
new file mode 100644
index 00000000..173d0302
--- /dev/null
+++ b/common/include/third_party/utf8cpp/utf8/unchecked.h
@@ -0,0 +1,286 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+
+namespace utf8
+{
+    namespace unchecked
+    {
+        template <typename octet_iterator>
+        octet_iterator append(utfchar32_t cp, octet_iterator result)
+        {
+            return internal::append(cp, result);
+        }
+
+        template <typename word_iterator>
+        word_iterator append16(utfchar32_t cp, word_iterator result)
+        {
+            return internal::append16(cp, result);
+        }
+
+        template <typename octet_iterator, typename output_iterator>
+        output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
+        {
+            while (start != end) {
+                octet_iterator sequence_start = start;
+                internal::utf_error err_code = utf8::internal::validate_next(start, end);
+                switch (err_code) {
+                    case internal::UTF8_OK :
+                        for (octet_iterator it = sequence_start; it != start; ++it)
+                            *out++ = *it;
+                        break;
+                    case internal::NOT_ENOUGH_ROOM:
+                        out = utf8::unchecked::append(replacement, out);
+                        start = end;
+                        break;
+                    case internal::INVALID_LEAD:
+                        out = utf8::unchecked::append(replacement, out);
+                        ++start;
+                        break;
+                    case internal::INCOMPLETE_SEQUENCE:
+                    case internal::OVERLONG_SEQUENCE:
+                    case internal::INVALID_CODE_POINT:
+                        out = utf8::unchecked::append(replacement, out);
+                        ++start;
+                        // just one replacement mark for the sequence
+                        while (start != end && utf8::internal::is_trail(*start))
+                            ++start;
+                        break;
+                }
+            }
+            return out;
+        }
+
+        template <typename octet_iterator, typename output_iterator>
+        inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+        {
+            static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
+            return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
+        }
+
+        inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
+        {
+            std::string result;
+            replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
+            return result;
+        }
+
+        inline std::string replace_invalid(const std::string& s)
+        {
+            std::string result;
+            replace_invalid(s.begin(), s.end(), std::back_inserter(result));
+            return result;
+        }
+
+        template <typename octet_iterator>
+        utfchar32_t next(octet_iterator& it)
+        {
+            utfchar32_t cp = utf8::internal::mask8(*it);
+            switch (utf8::internal::sequence_length(it)) {
+                case 1:
+                    break;
+                case 2:
+                    ++it;
+                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
+                    break;
+                case 3:
+                    ++it;
+                    cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+                    ++it;
+                    cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
+                    break;
+                case 4:
+                    ++it;
+                    cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+                    ++it;
+                    cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff));
+                    ++it;
+                    cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
+                    break;
+            }
+            ++it;
+            return cp;
+        }
+
+        template <typename octet_iterator>
+        utfchar32_t peek_next(octet_iterator it)
+        {
+            return utf8::unchecked::next(it);
+        }
+
+        template <typename word_iterator>
+        utfchar32_t next16(word_iterator& it)
+        {   // Reads one UTF-16 code point at it and moves it past every word consumed; input is not validated
+            utfchar32_t cp = utf8::internal::mask16(*it++);
+            if (utf8::internal::is_lead_surrogate(cp))  // trail word is masked too: a signed word type would sign-extend into the sum
+                return (cp << 10) + utf8::internal::mask16(*it++) + utf8::internal::SURROGATE_OFFSET;
+            return cp;
+        }
+
+        template <typename octet_iterator>
+        utfchar32_t prior(octet_iterator& it)
+        {
+            while (utf8::internal::is_trail(*(--it))) ;
+            octet_iterator temp = it;
+            return utf8::unchecked::next(temp);
+        }
+
+        template <typename octet_iterator, typename distance_type>
+        void advance(octet_iterator& it, distance_type n)
+        {
+            const distance_type zero(0);
+            if (n < zero) {
+                // backward
+                for (distance_type i = n; i < zero; ++i)
+                    utf8::unchecked::prior(it);
+            } else {
+                // forward
+                for (distance_type i = zero; i < n; ++i)
+                    utf8::unchecked::next(it);
+            }
+        }
+
+        template <typename octet_iterator>
+        typename std::iterator_traits<octet_iterator>::difference_type
+        distance(octet_iterator first, octet_iterator last)
+        {
+            typename std::iterator_traits<octet_iterator>::difference_type dist;
+            for (dist = 0; first < last; ++dist)
+                utf8::unchecked::next(first);
+            return dist;
+        }
+
+        template <typename u16bit_iterator, typename octet_iterator>
+        octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+        {
+            while (start != end) {
+                utfchar32_t cp = utf8::internal::mask16(*start++);
+                // Take care of surrogate pairs first
+                if (utf8::internal::is_lead_surrogate(cp)) {
+                    if (start == end)           // lead surrogate is the last word: stop, nothing is written for it
+                        return result;
+                    utfchar32_t trail_surrogate = utf8::internal::mask16(*start++);  // not checked to be a trail surrogate
+                    cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
+                }
+                result = utf8::unchecked::append(cp, result);
+            }
+            return result;
+        }
+
+        template <typename u16bit_iterator, typename octet_iterator>
+        u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
+        {
+            while (start < end) {
+                utfchar32_t cp = utf8::unchecked::next(start);
+                if (cp > 0xffff) { //make a surrogate pair
+                    *result++ = static_cast<utfchar16_t>((cp >> 10)   + internal::LEAD_OFFSET);
+                    *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+                }
+                else
+                    *result++ = static_cast<utfchar16_t>(cp);
+            }
+            return result;
+        }
+
+        template <typename octet_iterator, typename u32bit_iterator>
+        octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+        {
+            while (start != end)
+                result = utf8::unchecked::append(*(start++), result);
+
+            return result;
+        }
+
+        template <typename octet_iterator, typename u32bit_iterator>
+        u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
+        {
+            while (start < end)
+                (*result++) = utf8::unchecked::next(start);
+
+            return result;
+        }
+
+        // The iterator class
+        template <typename octet_iterator>
+          class iterator {
+            octet_iterator it;
+            public:
+            typedef utfchar32_t value_type;
+            typedef utfchar32_t* pointer;
+            typedef utfchar32_t& reference;
+            typedef std::ptrdiff_t difference_type;
+            typedef std::bidirectional_iterator_tag iterator_category;
+            iterator () {}
+            explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
+            // the default "big three" are OK
+            octet_iterator base () const { return it; }
+            utfchar32_t operator * () const
+            {
+                octet_iterator temp = it;
+                return utf8::unchecked::next(temp);
+            }
+            bool operator == (const iterator& rhs) const
+            {
+                return (it == rhs.it);
+            }
+            bool operator != (const iterator& rhs) const
+            {
+                return !(operator == (rhs));
+            }
+            iterator& operator ++ ()
+            {
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return *this;
+            }
+            iterator operator ++ (int)
+            {
+                iterator temp = *this;
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return temp;
+            }
+            iterator& operator -- ()
+            {
+                utf8::unchecked::prior(it);
+                return *this;
+            }
+            iterator operator -- (int)
+            {
+                iterator temp = *this;
+                utf8::unchecked::prior(it);
+                return temp;
+            }
+          }; // class iterator
+
+    } // namespace utf8::unchecked
+} // namespace utf8
+
+#endif // header guard
+

From 2b48f475ed20ed576342f68f7a7e3afa401bf12c Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Thu, 22 Jan 2026 01:21:02 +0900
Subject: [PATCH 31/75] feat: add aos tuple sketch

---
 tuple/CMakeLists.txt                          |   2 +
 tuple/include/array_of_strings_sketch.hpp     | 150 +++++++++
 .../include/array_of_strings_sketch_impl.hpp  | 284 ++++++++++++++++++
 tuple/test/CMakeLists.txt                     |   5 +-
 .../aos_sketch_deserialize_from_java_test.cpp | 172 +++++++++++
 tuple/test/aos_sketch_serialize_for_java.cpp  | 155 ++++++++++
 tuple/test/array_of_strings_sketch_test.cpp   | 243 +++++++++++++++
 ...uple_sketch_deserialize_from_java_test.cpp |   2 +-
 8 files changed, 1011 insertions(+), 2 deletions(-)
 create mode 100644 tuple/include/array_of_strings_sketch.hpp
 create mode 100644 tuple/include/array_of_strings_sketch_impl.hpp
 create mode 100644 tuple/test/aos_sketch_deserialize_from_java_test.cpp
 create mode 100644 tuple/test/aos_sketch_serialize_for_java.cpp
 create mode 100644 tuple/test/array_of_strings_sketch_test.cpp

diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index 4b0a48c7..54df11ee 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -54,4 +54,6 @@ install(FILES
 		include/array_tuple_intersection_impl.hpp
 		include/array_tuple_a_not_b.hpp
 		include/array_tuple_a_not_b_impl.hpp
+		include/array_of_strings_sketch.hpp
+		include/array_of_strings_sketch_impl.hpp
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
new file mode 100644
index 00000000..a3f8ddd7
--- /dev/null
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_
+#define ARRAY_OF_STRINGS_SKETCH_HPP_
+
+#include <memory>
+#include <string>
+
+#include "array_tuple_sketch.hpp"
+#include "xxhash64.h"
+
+namespace datasketches {
+
+// Default update policy for array-of-strings summaries: update() overwrites the stored array with a copy of the input.
+template<typename Allocator = std::allocator<std::string>>
+class default_array_of_strings_update_policy {
+public:
+  using array_of_strings = array<std::string, Allocator>;
+  // Stores a copy of the allocator; it is passed to every array this policy builds.
+  explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator());
+  // Returns a new zero-length array.
+  array_of_strings create() const;
+  // Replaces the contents of array with a copy of input.
+  void update(array_of_strings& array, const array_of_strings& input) const;
+  // Pointer overload: a null input resets array to zero length, otherwise array becomes a copy of *input.
+  void update(array_of_strings& array, const array_of_strings* input) const;
+
+private:
+  Allocator allocator_;
+};
+
+// Serializer/deserializer for arrays of strings.
+// Requirements: all strings must be valid UTF-8 and array size must be <= 127.
+template<typename Allocator = std::allocator<std::string>>
+struct array_of_strings_serde {
+  using array_of_strings = array<std::string, Allocator>;
+  // Item layout: uint32 total_bytes, uint8 num_nodes, then for each string a uint32 length followed by its bytes.
+  void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const;
+  void deserialize(std::istream& is, array_of_strings* items, unsigned num) const;
+  size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; // returns bytes written
+  size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; // returns bytes read
+  size_t size_of_item(const array_of_strings& item) const; // serialized size of one item in bytes
+
+private:
+  static void check_num_nodes(uint8_t num_nodes); // throws std::runtime_error if num_nodes > 127
+  static uint32_t compute_total_bytes(const array_of_strings& item); // serialized size, header fields included
+  static void check_utf8(const std::string& value); // throws std::runtime_error if value is not valid UTF-8
+};
+
+/**
+ * Extended class of compact_tuple_sketch for array of strings
+ * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
+ */
+template<typename Allocator = std::allocator<std::string>>
+class compact_array_of_strings_tuple_sketch:
+  public compact_tuple_sketch<
+    array<std::string, Allocator>,
+    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+  > {
+public:
+  using array_of_strings = array<std::string, Allocator>;
+  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
+  using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
+  using vector_bytes = typename Base::vector_bytes;
+  // Converting constructor: forwards the source sketch and the ordered flag to the base class.
+  template<typename Sketch>
+  compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true);
+  // Both serialize overloads delegate to the base class, passing array_of_strings_serde for the summaries.
+  void serialize(std::ostream& os) const;
+  vector_bytes serialize(unsigned header_size_bytes = 0) const;
+  // Both deserialize overloads use array_of_strings_serde and rebind the allocator to summary_allocator.
+  static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
+      const Allocator& allocator = Allocator());
+  static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
+      const Allocator& allocator = Allocator());
+
+private:
+  explicit compact_array_of_strings_tuple_sketch(Base&& base); // wraps the base sketch produced by Base::deserialize
+};
+
+/**
+ * Extended class of update_tuple_sketch for array of strings
+ * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
+ */
+template<typename Allocator = std::allocator<std::string>>
+class update_array_of_strings_tuple_sketch:
+  public update_tuple_sketch<
+    array<std::string, Allocator>,
+    array<std::string, Allocator>,
+    default_array_of_strings_update_policy<Allocator>,
+    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+  > {
+public:
+  using array_of_strings = array<std::string, Allocator>;
+  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
+  using policy_type = default_array_of_strings_update_policy<Allocator>;
+  using Base = update_tuple_sketch<
+    array_of_strings,
+    array_of_strings,
+    policy_type,
+    summary_allocator
+  >;
+  using resize_factor = typename Base::resize_factor;
+  class builder;
+  using Base::update; // keep the base-class update overloads visible next to the one declared below
+  // Hashes the key strings with hash_key() and updates the base sketch with that hash and the given value.
+  void update(const array_of_strings& key, const array_of_strings& value);
+  compact_array_of_strings_tuple_sketch<Allocator> compact(bool ordered = true) const;
+
+private:
+  update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
+      uint64_t seed, const policy_type& policy, const summary_allocator& allocator);
+
+  // Matches Java Util.PRIME for ArrayOfStrings key hashing.
+  static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
+  // XXHash64, seeded with STRING_ARR_HASH_SEED, over the key strings with a ',' between consecutive entries.
+  static uint64_t hash_key(const array_of_strings& key);
+};
+
+template<typename Allocator>
+class update_array_of_strings_tuple_sketch<Allocator>::builder: // builder for update_array_of_strings_tuple_sketch
+  public tuple_base_builder<builder, policy_type, summary_allocator> {
+public:
+  builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator());
+  // Creates a sketch from the settings currently held by this builder.
+  update_array_of_strings_tuple_sketch build() const;
+};
+
+} /* namespace datasketches */
+
+#include "array_of_strings_sketch_impl.hpp"
+
+#endif
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
new file mode 100644
index 00000000..264f79bf
--- /dev/null
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
+#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_
+
+#include <stdexcept>
+
+#include "common_defs.hpp"
+#include "third_party/utf8cpp/utf8.h"
+
+namespace datasketches {
+
+template<typename Allocator>
+default_array_of_strings_update_policy<Allocator>::default_array_of_strings_update_policy(const Allocator& allocator):
+  allocator_(allocator) {} // copy kept so create() and update() can hand it to the arrays they build
+
+template<typename Allocator>
+auto default_array_of_strings_update_policy<Allocator>::create() const -> array_of_strings {
+  return array_of_strings(0, "", allocator_); // zero-length array built with the policy's allocator
+}
+
+// Overwrites array with a newly built array holding a copy of every string in input.
+template<typename Allocator>
+void default_array_of_strings_update_policy<Allocator>::update(array_of_strings& array, const array_of_strings& input) const {
+  const auto count = input.size();
+  array = array_of_strings(count, "", allocator_);
+  for (uint8_t idx = 0; idx < count; ++idx)
+    array[idx] = input[idx];
+}
+
+// Pointer overload: a null input resets array to zero length, otherwise array becomes a copy of *input.
+template<typename Allocator>
+void default_array_of_strings_update_policy<Allocator>::update(array_of_strings& array, const array_of_strings* input) const {
+  if (input != nullptr) {
+    // replace the current contents with a fresh array, then copy element by element
+    const auto count = input->size();
+    array = array_of_strings(count, "", allocator_);
+    for (uint8_t idx = 0; idx < count; ++idx) array[idx] = (*input)[idx];
+    return;
+  }
+  array = array_of_strings(0, "", allocator_);
+}
+
+template<typename Allocator>
+update_array_of_strings_tuple_sketch<Allocator>::update_array_of_strings_tuple_sketch(
+  uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
+  uint64_t seed, const policy_type& policy, const summary_allocator& allocator
+):
+Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} // every argument is forwarded unchanged to the base sketch
+
+// Reduces the array-of-strings key to a 64-bit hash and updates the base sketch with that hash and value.
+template<typename Allocator>
+void update_array_of_strings_tuple_sketch<Allocator>::update(const array_of_strings& key, const array_of_strings& value) {
+  // the base sketch is keyed by the hash of the string array, not by the array itself
+  const uint64_t key_hash = hash_key(key);
+  Base::update(key_hash, value);
+}
+
+// XXHash64 (seed STRING_ARR_HASH_SEED) over the key strings in order, with a ',' between consecutive entries.
+template<typename Allocator>
+uint64_t update_array_of_strings_tuple_sketch<Allocator>::hash_key(const array_of_strings& key) {
+  XXHash64 hasher(STRING_ARR_HASH_SEED);
+  for (size_t pos = 0; pos < key.size(); ++pos) {
+    if (pos > 0) hasher.add(",", 1); // separator only between entries, never after the last one
+    const auto& part = key[static_cast<uint8_t>(pos)];
+    hasher.add(part.data(), part.size());
+  }
+  return hasher.hash();
+}
+
+template<typename Allocator>
+compact_array_of_strings_tuple_sketch<Allocator> update_array_of_strings_tuple_sketch<Allocator>::compact(bool ordered) const {
+  return compact_array_of_strings_tuple_sketch<Allocator>(*this, ordered); // goes through the templated converting constructor
+}
+
+// builder
+
+template<typename Allocator>
+update_array_of_strings_tuple_sketch<Allocator>::builder::builder(
+  const policy_type& policy, const summary_allocator& allocator
+):
+tuple_base_builder<builder, policy_type, summary_allocator>(policy, allocator) {} // policy and allocator are held by the base builder
+
+// Builds a sketch from the settings currently held by this builder.
+// Trailing comments name the private sketch-constructor parameter each argument binds to.
+template<typename Allocator>
+auto update_array_of_strings_tuple_sketch<Allocator>::builder::build() const -> update_array_of_strings_tuple_sketch {
+  // this-> is required because these members come from a dependent base class
+  return update_array_of_strings_tuple_sketch(
+    this->starting_lg_size(), this->lg_k_, // lg_cur_size, lg_nom_size
+    this->rf_, this->p_,                   // rf, p
+    this->starting_theta(),                // theta
+    this->seed_,                           // seed
+    this->policy_, this->allocator_        // policy, allocator
+  );
+}
+
+template<typename Allocator>
+template<typename Sketch>
+compact_array_of_strings_tuple_sketch<Allocator>::compact_array_of_strings_tuple_sketch(
+  const Sketch& sketch, bool ordered
+): Base(sketch, ordered) {} // the conversion itself is done by the base-class constructor
+
+template<typename Allocator>
+compact_array_of_strings_tuple_sketch<Allocator>::compact_array_of_strings_tuple_sketch(
+  Base&& base
+): Base(std::move(base)) {} // takes over a base sketch, as returned by Base::deserialize
+
+template<typename Allocator>
+void compact_array_of_strings_tuple_sketch<Allocator>::serialize(std::ostream& os) const {
+  Base::serialize(os, array_of_strings_serde<Allocator>()); // summaries are encoded by array_of_strings_serde
+}
+
+template<typename Allocator>
+auto compact_array_of_strings_tuple_sketch<Allocator>::serialize(unsigned header_size_bytes) const -> vector_bytes {
+  return Base::serialize(header_size_bytes, array_of_strings_serde<Allocator>()); // summaries are encoded by array_of_strings_serde
+}
+
+// Stream overload: decodes summaries with array_of_strings_serde and wraps the resulting base sketch.
+template<typename Allocator>
+auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(std::istream& is, uint64_t seed, const Allocator& allocator)
+-> compact_array_of_strings_tuple_sketch {
+  summary_allocator rebound(allocator); // Base works with the allocator rebound to the summary type
+  auto parsed = Base::deserialize(is, seed, array_of_strings_serde<Allocator>(), rebound);
+  return compact_array_of_strings_tuple_sketch(std::move(parsed));
+}
+
+// Buffer overload: decodes summaries with array_of_strings_serde and wraps the resulting base sketch.
+template<typename Allocator>
+auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(const void* bytes, size_t size, uint64_t seed, const Allocator& allocator)
+-> compact_array_of_strings_tuple_sketch {
+  summary_allocator rebound(allocator); // Base works with the allocator rebound to the summary type
+  auto parsed = Base::deserialize(bytes, size, seed, array_of_strings_serde<Allocator>(), rebound);
+  return compact_array_of_strings_tuple_sketch(std::move(parsed));
+}
+
+// Writes each item as: uint32 total_bytes, uint8 num_nodes, then per string a uint32 length followed by its bytes.
+template<typename Allocator>
+void array_of_strings_serde<Allocator>::serialize(std::ostream& os, const array_of_strings* items, unsigned num) const {
+  for (unsigned idx = 0; idx < num; ++idx) {
+    const array_of_strings& item = items[idx];
+    const uint32_t total_bytes = compute_total_bytes(item); // also enforces the size limits
+    const uint8_t num_nodes = static_cast<uint8_t>(item.size());
+    write(os, total_bytes);
+    write(os, num_nodes);
+    const std::string* str = item.data();
+    for (uint8_t k = 0; k < num_nodes; ++k, ++str) {
+      check_utf8(*str); // throws before any byte of an invalid string is written
+      const uint32_t length = static_cast<uint32_t>(str->size());
+      write(os, length);
+      os.write(str->data(), length);
+    }
+  }
+}
+
+// Reads num items from the stream, constructing each in place at items[i]; throws std::runtime_error on bad or truncated input.
+template<typename Allocator>
+void array_of_strings_serde<Allocator>::deserialize(std::istream& is, array_of_strings* items, unsigned num) const {
+  for (unsigned i = 0; i < num; ++i) {
+    read<uint32_t>(is); // total_bytes (not needed when reading from a stream)
+    const uint8_t num_nodes = read<uint8_t>(is);
+    if (!is.good()) throw std::runtime_error("error reading from std::istream"); // num_nodes is unusable after a failed read
+    check_num_nodes(num_nodes);
+    array_of_strings array(num_nodes, "", Allocator());
+    for (uint8_t j = 0; j < num_nodes; ++j) {
+      const uint32_t length = read<uint32_t>(is);
+      if (!is.good()) throw std::runtime_error("error reading from std::istream"); // never size a buffer from a failed read
+      array[j].resize(length); // read straight into the destination string
+      if (!is.read(&array[j][0], length)) throw std::runtime_error("error reading from std::istream"); // truncated string data
+      check_utf8(array[j]);
+    }
+    new (&items[i]) array_of_strings(std::move(array));
+  }
+}
+
+// Writes num items into the buffer at ptr and returns the number of bytes written.
+// Per item: uint32 total_bytes, uint8 num_nodes, then per string a uint32 length followed by its bytes.
+template<typename Allocator>
+size_t array_of_strings_serde<Allocator>::serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const {
+  uint8_t* const base = static_cast<uint8_t*>(ptr);
+  size_t offset = 0;
+
+  for (unsigned idx = 0; idx < num; ++idx) {
+    const array_of_strings& item = items[idx];
+    const uint32_t total_bytes = compute_total_bytes(item);
+    const uint8_t num_nodes = static_cast<uint8_t>(item.size());
+    check_memory_size(offset + total_bytes, capacity); // the whole item must fit before anything is written
+    offset += copy_to_mem(total_bytes, base + offset);
+    offset += copy_to_mem(num_nodes, base + offset);
+    const std::string* str = item.data();
+    for (uint8_t k = 0; k < num_nodes; ++k, ++str) {
+      check_utf8(*str);
+      const uint32_t length = static_cast<uint32_t>(str->size());
+      offset += copy_to_mem(length, base + offset);
+      offset += copy_to_mem(str->data(), base + offset, length);
+    }
+  }
+  return offset;
+}
+
+// Reads num items from the buffer at ptr, constructing each in place at items[i]; returns the number of bytes consumed.
+template<typename Allocator>
+size_t array_of_strings_serde<Allocator>::deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const {
+  const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
+  size_t bytes_read = 0;
+  for (unsigned i = 0; i < num; ++i) {
+    check_memory_size(bytes_read + sizeof(uint32_t) + sizeof(uint8_t), capacity); // fixed-size header: total_bytes + num_nodes
+    const size_t item_start = bytes_read;
+    uint32_t total_bytes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
+    check_memory_size(item_start + total_bytes, capacity);
+    uint8_t num_nodes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
+    check_num_nodes(num_nodes);
+    array_of_strings array(num_nodes, "", Allocator());
+    for (uint8_t j = 0; j < num_nodes; ++j) {
+      check_memory_size(bytes_read + sizeof(uint32_t), capacity); // the length field itself must be inside the buffer
+      uint32_t length;
+      bytes_read += copy_from_mem(ptr8 + bytes_read, length);
+      check_memory_size(bytes_read + length, capacity); // length is untrusted: validate before allocating or copying
+      std::string value(length, '\0');
+      bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+      check_utf8(value);
+      array[j] = std::move(value);
+    }
+    new (&items[i]) array_of_strings(std::move(array));
+  }
+  return bytes_read;
+}
+
+template<typename Allocator>
+size_t array_of_strings_serde<Allocator>::size_of_item(const array_of_strings& item) const {
+  return compute_total_bytes(item); // the same value serialize() writes as total_bytes
+}
+
+// Rejects node counts above 127, the maximum array size this serde accepts.
+template<typename Allocator>
+void array_of_strings_serde<Allocator>::check_num_nodes(uint8_t num_nodes) {
+  const bool within_limit = num_nodes <= 127;
+  if (!within_limit) throw std::runtime_error("array_of_strings size exceeds 127");
+}
+
+// Serialized size of one item: uint32 total_bytes + uint8 num_nodes, plus a uint32 length and the bytes of each string.
+template<typename Allocator>
+uint32_t array_of_strings_serde<Allocator>::compute_total_bytes(const array_of_strings& item) {
+  const auto num_nodes = item.size();
+  check_num_nodes(static_cast<uint8_t>(num_nodes));
+  const size_t header_bytes = sizeof(uint32_t) + sizeof(uint8_t);
+  size_t total = header_bytes + num_nodes * sizeof(uint32_t);
+  const std::string* str = item.data();
+  for (uint32_t k = 0; k < num_nodes; ++k, ++str) total += str->size();
+  if (total > std::numeric_limits<uint32_t>::max()) {
+    throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max");
+  }
+  return static_cast<uint32_t>(total);
+}
+
+// Throws std::runtime_error unless value is a valid UTF-8 byte sequence.
+template<typename Allocator>
+void array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
+  const bool valid = utf8::is_valid(value.begin(), value.end());
+  if (!valid) throw std::runtime_error("array_of_strings contains invalid UTF-8");
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index 4ca6a503..8c561745 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES
   CXX_STANDARD_REQUIRED YES
 )
 
-file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH)
+file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH)
 string(APPEND THETA_TEST_BINARY_PATH "/")
 target_compile_definitions(tuple_test
   PRIVATE
@@ -44,6 +44,7 @@ target_sources(tuple_test
     tuple_a_not_b_test.cpp
     tuple_jaccard_similarity_test.cpp
     array_of_doubles_sketch_test.cpp
+    array_of_strings_sketch_test.cpp
     engagement_test.cpp
 )
 
@@ -52,6 +53,7 @@ target_sources(tuple_test
   PRIVATE
     aod_sketch_deserialize_from_java_test.cpp
     tuple_sketch_deserialize_from_java_test.cpp
+    aos_sketch_deserialize_from_java_test.cpp
 )
 endif()
 
@@ -60,5 +62,6 @@ target_sources(tuple_test
   PRIVATE
     aod_sketch_serialize_for_java.cpp
     tuple_sketch_serialize_for_java.cpp
+    aos_sketch_serialize_for_java.cpp
 )
 endif()
diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
new file mode 100644
index 00000000..15e9d6dd
--- /dev/null
+++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+#include <fstream>
+#include <vector>
+
+#include "array_of_strings_sketch.hpp"
+
+namespace datasketches {
+  // assume the binary sketches for this test have been generated by datasketches-java code
+  // and placed in the subdirectory called "java" in the root directory of this project
+  static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/";
+
+  TEST_CASE("aos sketch one value", "[serde_compat]") { // Java-generated sketches holding one string per summary
+    const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+    for (const unsigned n: n_arr) {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit); // a missing or truncated file throws instead of failing silently
+      is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+      REQUIRE(sketch.is_empty() == (n == 0));
+      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+      for (const auto& entry: sketch) {
+        REQUIRE(entry.first < sketch.get_theta64()); // every retained hash is below theta
+        REQUIRE(entry.second.size() == 1);
+      }
+    }
+  }
+
+  TEST_CASE("aos sketch three values", "[serde_compat]") { // Java-generated sketches holding three strings per summary
+    const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+    for (const unsigned n: n_arr) {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit); // a missing or truncated file throws instead of failing silently
+      is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+      REQUIRE(sketch.is_empty() == (n == 0));
+      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+      for (const auto& entry: sketch) {
+        REQUIRE(entry.first < sketch.get_theta64()); // every retained hash is below theta
+        REQUIRE(entry.second.size() == 3);
+      }
+    }
+  }
+
+  TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { // sketch that saw input but retained nothing
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit); // a missing or truncated file throws instead of failing silently
+    is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", std::ios::binary);
+    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+    REQUIRE_FALSE(sketch.is_empty());
+    REQUIRE(sketch.get_num_retained() == 0);
+  }
+
+  TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { // Java-generated "multikey" files, one string per summary
+    const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+    for (const unsigned n: n_arr) {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit); // a missing or truncated file throws instead of failing silently
+      is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+      REQUIRE(sketch.is_empty() == (n == 0));
+      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+      for (const auto& entry: sketch) {
+        REQUIRE(entry.first < sketch.get_theta64()); // every retained hash is below theta
+        REQUIRE(entry.second.size() == 1);
+      }
+    }
+  }
+
+  TEST_CASE("aos sketch unicode strings", "[serde_compat]") { // non-ASCII summaries must survive the Java -> C++ round trip
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit);
+    is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary);
+    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+    REQUIRE_FALSE(sketch.is_empty());
+    REQUIRE_FALSE(sketch.is_estimation_mode());
+    REQUIRE(sketch.get_num_retained() == 3);
+    // summaries expected in the file; iteration order is not assumed, so each one is matched at most once
+    const std::vector<std::vector<std::string>> expected_values = {
+      {"밸류", "값"},
+      {"📦", "🎁"},
+      {"ценить1", "ценить2"}
+    };
+    std::vector<bool> matched(expected_values.size(), false);
+    for (const auto& entry: sketch) {
+      REQUIRE(entry.first < sketch.get_theta64());
+      REQUIRE(entry.second.size() == 2);
+      // look for a not-yet-matched expected summary that equals this entry element by element
+      bool found = false;
+      for (size_t i = 0; i < expected_values.size(); ++i) {
+        if (matched[i]) continue;
+        const auto& expected = expected_values[i];
+        if (entry.second.size() != expected.size()) continue;
+        bool equal = true;
+        for (size_t j = 0; j < expected.size(); ++j) {
+          if (entry.second[j] != expected[j]) {
+            equal = false;
+            break;
+          }
+        }
+        if (equal) {
+          matched[i] = true;
+          found = true;
+          break;
+        }
+      }
+      REQUIRE(found);
+    }
+    for (bool found: matched) REQUIRE(found); // every expected summary was seen
+  }
+
+  TEST_CASE("aos sketch empty strings", "[serde_compat]") { // zero-length strings must deserialize as empty strings
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit);
+    is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary);
+    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
+    REQUIRE_FALSE(sketch.is_empty());
+    REQUIRE_FALSE(sketch.is_estimation_mode());
+    REQUIRE(sketch.get_num_retained() == 3);
+    const std::vector<std::vector<std::string>> expected_values = { // matched in any order, each at most once
+      {"empty_key_value"},
+      {""},
+      {"", ""}
+    };
+    std::vector<bool> matched(expected_values.size(), false);
+    for (const auto& entry: sketch) {
+      REQUIRE(entry.first < sketch.get_theta64());
+      // look for a not-yet-matched expected summary that equals this entry element by element
+      bool found = false;
+      for (size_t i = 0; i < expected_values.size(); ++i) {
+        if (matched[i]) continue;
+        const auto& expected = expected_values[i];
+        if (entry.second.size() != expected.size()) continue;
+        bool equal = true;
+        for (size_t j = 0; j < expected.size(); ++j) {
+          if (entry.second[j] != expected[j]) {
+            equal = false;
+            break;
+          }
+        }
+        if (equal) {
+          matched[i] = true;
+          found = true;
+          break;
+        }
+      }
+      REQUIRE(found);
+    }
+    for (bool found: matched) REQUIRE(found); // every expected summary was seen
+  }
+}
diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp
new file mode 100644
index 00000000..3a154132
--- /dev/null
+++ b/tuple/test/aos_sketch_serialize_for_java.cpp
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+#include <fstream>
+#include <initializer_list>
+
+#include "array_of_strings_sketch.hpp"
+
+namespace datasketches {
+
+using aos_sketch = update_array_of_strings_tuple_sketch<>;
+using array_of_strings = aos_sketch::array_of_strings;
+
+// Test helper: builds an array_of_strings holding copies of the given strings, in order.
+static array_of_strings make_array(std::initializer_list<std::string> items) {
+  array_of_strings result(static_cast<uint8_t>(items.size()), "");
+  size_t pos = 0;
+  for (auto it = items.begin(); it != items.end(); ++it, ++pos) {
+    result[static_cast<uint8_t>(pos)] = *it;
+  }
+  return result;
+}
+
+TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { // writes aos_1_n<N>_cpp.sk files, one string per summary
+  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+  for (const unsigned n: n_arr) {
+    auto sketch = aos_sketch::builder().build();
+    for (unsigned i = 0; i < n; ++i) {
+      array_of_strings key(1, "");
+      key[0] = std::to_string(i); // distinct key per update
+      array_of_strings value(1, "");
+      value[0] = "value" + std::to_string(i);
+      sketch.update(key, value);
+    }
+    REQUIRE(sketch.is_empty() == (n == 0));
+    REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+    std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); // relative path: current working directory
+    sketch.compact().serialize(os);
+  }
+}
+
+TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { // writes aos_3_n<N>_cpp.sk files, three strings per summary
+  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+  for (const unsigned n: n_arr) {
+    auto sketch = aos_sketch::builder().build();
+    for (unsigned i = 0; i < n; ++i) {
+      array_of_strings key(1, "");
+      key[0] = std::to_string(i); // distinct key per update
+      array_of_strings value(3, "");
+      value[0] = "a" + std::to_string(i);
+      value[1] = "b" + std::to_string(i);
+      value[2] = "c" + std::to_string(i);
+      sketch.update(key, value);
+    }
+    REQUIRE(sketch.is_empty() == (n == 0));
+    REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+    std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); // relative path: current working directory
+    sketch.compact().serialize(os);
+  }
+}
+
+TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { // sketch that saw input but retained nothing
+  auto sketch = aos_sketch::builder()
+    .set_lg_k(12)
+    .set_resize_factor(resize_factor::X8)
+    .set_p(0.01f) // with p = 0.01 the single key below is expected not to be retained (asserted further down)
+    .build();
+  array_of_strings key(1, "");
+  key[0] = "key1";
+  array_of_strings value(1, "");
+  value[0] = "value1";
+  sketch.update(key, value);
+  REQUIRE_FALSE(sketch.is_empty());
+  REQUIRE(sketch.get_num_retained() == 0);
+  std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); // relative path: current working directory
+  sketch.compact().serialize(os);
+}
+
+TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { // keys made of two strings, one string per summary
+  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+  for (const unsigned n: n_arr) {
+    auto sketch = aos_sketch::builder().build();
+    for (unsigned i = 0; i < n; ++i) {
+      array_of_strings key(2, "");
+      key[0] = "key" + std::to_string(i); // first part is unique per update
+      key[1] = "subkey" + std::to_string(i % 10); // second part cycles through ten values
+      array_of_strings value(1, "");
+      value[0] = "value" + std::to_string(i);
+      sketch.update(key, value);
+    }
+    REQUIRE(sketch.is_empty() == (n == 0));
+    REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); // within 3% of the true count
+    std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); // relative path: current working directory
+    sketch.compact().serialize(os);
+  }
+}
+
+TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { // non-ASCII keys and summaries (Korean, emoji, Cyrillic)
+  auto sketch = aos_sketch::builder().build();
+  sketch.update(
+    make_array({u8"키", u8"열쇠"}),
+    make_array({u8"밸류", u8"값"})
+  );
+  sketch.update(
+    make_array({u8"🔑", u8"🗝️"}),
+    make_array({u8"📦", u8"🎁"})
+  );
+  sketch.update(
+    make_array({u8"ключ1", u8"ключ2"}),
+    make_array({u8"ценить1", u8"ценить2"})
+  );
+  REQUIRE_FALSE(sketch.is_empty());
+  REQUIRE(sketch.get_num_retained() == 3); // three distinct keys, all retained
+  std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); // relative path: current working directory
+  sketch.compact().serialize(os);
+}
+
+TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { // empty strings used as key, as value, and as both
+  auto sketch = aos_sketch::builder().build();
+  sketch.update(
+    make_array({""}),
+    make_array({"empty_key_value"})
+  );
+  sketch.update(
+    make_array({"empty_value_key"}),
+    make_array({""})
+  );
+  sketch.update(
+    make_array({"", ""}),
+    make_array({"", ""})
+  );
+  REQUIRE_FALSE(sketch.is_empty());
+  REQUIRE(sketch.get_num_retained() == 3); // the three keys are expected to hash to distinct entries
+  std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); // relative path: current working directory
+  sketch.compact().serialize(os);
+}
+
+} /* namespace datasketches */
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
new file mode 100644
index 00000000..45e554bd
--- /dev/null
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <catch2/catch.hpp>
+
+#include "array_of_strings_sketch.hpp"
+
+namespace datasketches {
+
+using array_of_strings = array<std::string>;
+
+TEST_CASE("aos update policy", "[tuple_sketch]") { // exercises default_array_of_strings_update_policy directly, without a sketch
+  default_array_of_strings_update_policy<> policy;
+
+  SECTION("create empty") {
+    auto values = policy.create();
+    REQUIRE(values.size() == 0); // a freshly created summary holds no strings
+  }
+
+  SECTION("replace array") {
+    auto values = policy.create();
+
+    array_of_strings input(2, "", std::allocator<std::string>());
+    input[0] = "alpha";
+    input[1] = "beta";
+    policy.update(values, input);
+    REQUIRE(values.size() == 2);
+    REQUIRE(values[0] == "alpha");
+    REQUIRE(values[1] == "beta");
+    input[0] = "changed"; // mutate the input after the update ...
+    REQUIRE(values[0] == "alpha"); // ... the summary must be unaffected, i.e. it holds its own copy
+
+    array_of_strings input2(1, "", std::allocator<std::string>());
+    input2[0] = "gamma";
+    policy.update(values, input2); // second update with a shorter array
+    REQUIRE(values.size() == 1); // the summary is replaced, not appended to
+    REQUIRE(values[0] == "gamma");
+  }
+
+  SECTION("nullptr clears") {
+    array_of_strings values(2, "", std::allocator<std::string>());
+    values[0] = "one";
+    values[1] = "two";
+
+    policy.update(values, static_cast<const array_of_strings*>(nullptr)); // cast selects the pointer overload
+    REQUIRE(values.size() == 0); // a null input leaves the summary empty
+  }
+
+  SECTION("pointer input copies") {
+    auto values = policy.create();
+
+    array_of_strings input(2, "", std::allocator<std::string>());
+    input[0] = "first";
+    input[1] = "second";
+    policy.update(values, &input); // pointer overload with a non-null input
+    REQUIRE(values.size() == 2);
+    REQUIRE(values[1] == "second");
+    input[1] = "changed"; // mutate the pointee after the update ...
+    REQUIRE(values[1] == "second"); // ... the summary must still hold the old string
+  }
+}
+
+TEST_CASE("aos sketch update", "[tuple_sketch]") { // update() behaviour of update_array_of_strings_tuple_sketch for repeated, distinct and empty keys
+  auto make_array = [](std::initializer_list<const char*> entries) { // builds an array_of_strings from a brace list of C strings
+    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
+    uint8_t i = 0;
+    for (const auto* entry: entries) array[i++] = entry;
+    return array;
+  };
+
+  SECTION("same key replaces summary") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+
+    sketch.update(make_array({"alpha", "beta"}), make_array({"first"}));
+    sketch.update(make_array({"alpha", "beta"}), make_array({"second", "third"})); // same key, different value
+
+    REQUIRE(sketch.get_num_retained() == 1); // the repeated key did not create a second entry
+
+    auto it = sketch.begin();
+    REQUIRE(it != sketch.end());
+    REQUIRE(it->second.size() == 2); // the summary is the value of the second update only
+    REQUIRE(it->second[0] == "second");
+    REQUIRE(it->second[1] == "third");
+  }
+
+  SECTION("distinct keys retain multiple entries") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+
+    sketch.update(make_array({"a", "bc"}), make_array({"one"})); // both keys concatenate to "abc" ...
+    sketch.update(make_array({"ab", "c"}), make_array({"two"})); // ... but split at a different position
+
+    REQUIRE(sketch.get_num_retained() == 2); // the two keys must not be treated as the same key
+
+    bool saw_one = false;
+    bool saw_two = false;
+    for (const auto& entry: sketch) { // iteration order is not assumed, so record which summaries appear
+      REQUIRE(entry.second.size() == 1);
+      if (entry.second[0] == "one") saw_one = true;
+      if (entry.second[0] == "two") saw_two = true;
+    }
+    REQUIRE(saw_one);
+    REQUIRE(saw_two);
+  }
+
+  SECTION("empty key") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+
+    sketch.update(make_array({}), make_array({"value"})); // key is a zero-length array
+    REQUIRE(sketch.get_num_retained() == 1); // a zero-length key is still retained as an entry
+
+    auto it = sketch.begin();
+    REQUIRE(it != sketch.end());
+    REQUIRE(it->second.size() == 1);
+    REQUIRE(it->second[0] == "value");
+  }
+}
+
+TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { // round-trips compact sketches through both the stream and the byte-vector serialization paths
+  auto make_array = [](std::initializer_list<std::string> entries) { // builds an array_of_strings from a brace list of strings
+    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
+    uint8_t i = 0;
+    for (const auto& entry: entries) array[i++] = entry;
+    return array;
+  };
+
+  auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { // copies all entries out of a sketch, sorted by their uint64_t first member
+    typedef std::pair<uint64_t, array_of_strings> entry_type;
+    std::vector<entry_type> entries;
+    for (const auto& entry: sketch) entries.push_back(entry);
+    struct entry_less { // orders entries by the first member only; the summaries are not compared
+      bool operator()(const entry_type& lhs, const entry_type& rhs) const {
+        return lhs.first < rhs.first;
+      }
+    };
+    std::sort(entries.begin(), entries.end(), entry_less()); // sorting lets ordered and unordered sketches be compared index by index
+    return entries;
+  };
+
+  auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { // serializes, deserializes and compares against the original
+    std::stringstream ss;
+    ss.exceptions(std::ios::failbit | std::ios::badbit); // make stream errors throw instead of being silently ignored
+    compact_sketch.serialize(ss);
+    auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss);
+
+    auto bytes = compact_sketch.serialize(); // byte-vector path
+    auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size());
+
+    const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { // run the same checks on both deserialized copies
+      &deserialized_stream,
+      &deserialized_bytes
+    };
+    for (int list_index = 0; list_index < 2; ++list_index) {
+      const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index];
+      REQUIRE(compact_sketch.is_empty() == deserialized->is_empty());
+      REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode());
+      REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered());
+      REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained());
+      REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); // floating-point results are compared with a small absolute margin
+      REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10));
+      REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10));
+      REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10));
+
+      auto original_entries = collect_entries(compact_sketch);
+      auto round_trip_entries = collect_entries(*deserialized);
+      REQUIRE(original_entries.size() == round_trip_entries.size());
+      for (size_t i = 0; i < original_entries.size(); ++i) { // entry-by-entry comparison of first member and every summary string
+        REQUIRE(original_entries[i].first == round_trip_entries[i].first);
+        REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size());
+        for (size_t j = 0; j < original_entries[i].second.size(); ++j) {
+          REQUIRE(original_entries[i].second[static_cast<uint8_t>(j)] ==
+            round_trip_entries[i].second[static_cast<uint8_t>(j)]);
+        }
+      }
+    }
+  };
+
+  auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) { // checks the round trip for both compact(true) and compact(false)
+    auto ordered = sketch.compact(true);
+    auto unordered = sketch.compact(false);
+    check_round_trip(ordered);
+    check_round_trip(unordered);
+  };
+
+  SECTION("empty sketch") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+    exercise_ordering(sketch); // no updates at all
+  }
+
+  SECTION("single entry sketch") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+    sketch.update(make_array({"key"}), make_array({"value"}));
+    exercise_ordering(sketch);
+  }
+
+  SECTION("multiple entries exact mode") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build();
+    for (int i = 0; i < 50; ++i) { // 50 distinct keys, each with a two-string summary
+      sketch.update(
+        make_array({std::string("key-") + std::to_string(i)}),
+        make_array({std::string("value-") + std::to_string(i), "extra"})
+      );
+    }
+    REQUIRE_FALSE(sketch.is_estimation_mode()); // precondition for this section: the sketch is still exact
+    exercise_ordering(sketch);
+  }
+
+  SECTION("multiple entries estimation mode") {
+    auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
+    for (int i = 0; i < 10000; ++i) { // 10000 distinct keys with the builder's default configuration
+      sketch.update(
+        make_array({std::string("key-") + std::to_string(i)}),
+        make_array({std::string("value-") + std::to_string(i)})
+      );
+    }
+    REQUIRE(sketch.is_estimation_mode()); // precondition for this section: the sketch has left exact mode
+    exercise_ordering(sketch);
+  }
+}
+
+} /* namespace datasketches */
diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
index 408223f9..cf589cd0 100644
--- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
+++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
@@ -25,7 +25,7 @@ namespace datasketches {
 
 // assume the binary sketches for this test have been generated by datasketches-java code
 // in the subdirectory called "java" in the root directory of this project
-static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
+static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/";
 
 TEST_CASE("tuple sketch int", "[serde_compat]") {
   const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};

From 307fe02179bf4a1776897ffff7bbee1913125402 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Thu, 22 Jan 2026 01:22:16 +0900
Subject: [PATCH 32/75] test: rollback test file path

---
 tuple/test/tuple_sketch_deserialize_from_java_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
index cf589cd0..408223f9 100644
--- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
+++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp
@@ -25,7 +25,7 @@ namespace datasketches {
 
 // assume the binary sketches for this test have been generated by datasketches-java code
 // in the subdirectory called "java" in the root directory of this project
-static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/";
+static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
 
 TEST_CASE("tuple sketch int", "[serde_compat]") {
   const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};

From a1e24c80e6336bccacc02a75a256643b5958d3be Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Thu, 22 Jan 2026 01:23:42 +0900
Subject: [PATCH 33/75] chore: rollback test directory

---
 tuple/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index 8c561745..3d7ccca3 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES
   CXX_STANDARD_REQUIRED YES
 )
 
-file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH)
+file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH)
 string(APPEND THETA_TEST_BINARY_PATH "/")
 target_compile_definitions(tuple_test
   PRIVATE

From 4b87a2d569e94006e3c986a96ba17eee431ba292 Mon Sep 17 00:00:00 2001
From: Mahesh Pai <mahesh_pai@intuit.com>
Date: Sat, 24 Jan 2026 17:43:17 +0530
Subject: [PATCH 34/75] Bugfix: tdigest const_iterator returns dangling
 reference causing incorrect values

---
 tdigest/include/tdigest.hpp            |   2 +-
 tdigest/test/CMakeLists.txt            |   1 +
 tdigest/test/tdigest_iterator_test.cpp | 274 +++++++++++++++++++++++++
 3 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 tdigest/test/tdigest_iterator_test.cpp

diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp
index 2d3620b1..095752e9 100644
--- a/tdigest/include/tdigest.hpp
+++ b/tdigest/include/tdigest.hpp
@@ -316,7 +316,7 @@ template<typename T, typename A>
 class tdigest<T, A>::const_iterator {
 public:
   using iterator_category = std::input_iterator_tag;
-  using value_type = std::pair<const T&, const W>;
+  using value_type = std::pair<T, W>;
   using difference_type = void;
   using pointer = const return_value_holder<value_type>;
   using reference = const value_type;
diff --git a/tdigest/test/CMakeLists.txt b/tdigest/test/CMakeLists.txt
index 18bf3599..8dcfb4f0 100644
--- a/tdigest/test/CMakeLists.txt
+++ b/tdigest/test/CMakeLists.txt
@@ -39,6 +39,7 @@ target_sources(tdigest_test
   PRIVATE
     tdigest_test.cpp
     tdigest_custom_allocator_test.cpp
+    tdigest_iterator_test.cpp
 )
 
 if (SERDE_COMPAT)
diff --git a/tdigest/test/tdigest_iterator_test.cpp b/tdigest/test/tdigest_iterator_test.cpp
new file mode 100644
index 00000000..e7c03205
--- /dev/null
+++ b/tdigest/test/tdigest_iterator_test.cpp
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+#include <memory>
+#include <map>
+#include <vector>
+#include <set>
+
+#include "tdigest.hpp"
+
+namespace datasketches {
+
+TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") { // a range-for over the digest must yield each of the 10 inserted values once, with weight 1
+  tdigest_double td(100);
+  
+  // Insert 10 distinct values
+  for (int i = 0; i < 10; i++) {
+    td.update(static_cast<double>(i));
+  }
+  
+  // Collect all centroids via iteration
+  std::map<double, uint64_t> centroids; // centroid.first -> centroid.second (mean -> weight)
+  for (const auto&& centroid : td) {
+    centroids[centroid.first] = centroid.second;
+  }
+  
+  // Should have collected all 10 distinct values
+  REQUIRE(centroids.size() == 10);
+  
+  // Verify each value was captured correctly
+  for (int i = 0; i < 10; i++) {
+    REQUIRE(centroids.count(static_cast<double>(i)) == 1);
+    REQUIRE(centroids[static_cast<double>(i)] == 1);
+  }
+}
+
+TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") { // iterates a heap-allocated digest through explicit begin()/end() and operator->
+  // This test reproduces the bug scenario found in ClickHouse
+  std::unique_ptr<tdigest_double> td(new tdigest_double(100));
+  
+  // Insert distinct values
+  for (int i = 0; i < 10; i++) {
+    td->update(static_cast<double>(i));
+  }
+  
+  // Use explicit begin/end iterators
+  auto it = td->begin();
+  auto end_it = td->end();
+  
+  std::vector<double> means;
+  std::vector<uint64_t> weights;
+  
+  while (it != end_it) {
+    // Before the fix, accessing it->first would return garbage or same value repeatedly
+    double mean = it->first;
+    uint64_t weight = it->second;
+    means.push_back(mean);
+    weights.push_back(weight);
+    ++it;
+  }
+  
+  // Should have collected 10 centroids
+  REQUIRE(means.size() == 10);
+  REQUIRE(weights.size() == 10);
+  
+  // All means should be distinct (not all zeros or garbage)
+  std::set<double> unique_means(means.begin(), means.end()); // a set drops duplicates, so its size counts distinct means
+  REQUIRE(unique_means.size() == 10);
+  
+  // Verify all expected values are present
+  for (int i = 0; i < 10; i++) {
+    REQUIRE(unique_means.count(static_cast<double>(i)) == 1);
+  }
+}
+
+TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") { // NOTE(review): despite the name, no structured binding is used below
+  tdigest_double td(100);
+  
+  for (int i = 0; i < 5; i++) {
+    td.update(static_cast<double>(i * 10)); // inserts 0, 10, 20, 30, 40
+  }
+  
+  std::vector<std::pair<double, uint64_t>> collected;
+  
+  // Bind the dereferenced iterator to a const reference and read .first/.second from it
+  for (auto it = td.begin(); it != td.end(); ++it) {
+    const auto& centroid = *it;
+    collected.emplace_back(centroid.first, centroid.second);
+  }
+  
+  REQUIRE(collected.size() == 5);
+  
+  // Verify distinct values were collected
+  std::set<double> means;
+  for (const auto& pair : collected) {
+    means.insert(pair.first);
+    REQUIRE(pair.second == 1);  // Each value inserted once
+  }
+  
+  REQUIRE(means.size() == 5);
+  for (int i = 0; i < 5; i++) {
+    REQUIRE(means.count(static_cast<double>(i * 10)) == 1);
+  }
+}
+
+TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") { // reads every centroid through it->first / it->second
+  tdigest_double td(100);
+  
+  // Insert values
+  for (int i = 1; i <= 10; i++) {
+    td.update(static_cast<double>(i * i));  // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100
+  }
+  
+  // Access via operator->
+  std::map<double, uint64_t> centroids; // it->first -> it->second
+  auto end_it = td.end(); // end iterator obtained once, before the loop
+  for (auto it = td.begin(); it != end_it; ++it) {
+    // operator-> should return valid values
+    centroids[it->first] = it->second;
+  }
+  
+  REQUIRE(centroids.size() == 10);
+  
+  // Verify the squared values
+  for (int i = 1; i <= 10; i++) {
+    double expected = static_cast<double>(i * i);
+    REQUIRE(centroids.count(expected) == 1);
+  }
+}
+
+TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") { // range-for binding each element to const auto&& must see 10 distinct means
+  tdigest_double td(100);
+  
+  // Insert values
+  for (double d = 0.0; d < 10.0; d += 1.0) { // inserts 0.0, 1.0, ..., 9.0
+    td.update(d);
+  }
+  
+  size_t count = 0;
+  std::set<double> seen_means;
+  
+  // This pattern was working in simple tests but failing in optimized builds
+  for (const auto&& centroid : td) {
+    seen_means.insert(centroid.first);
+    count++;
+  }
+  
+  REQUIRE(count == 10);
+  REQUIRE(seen_means.size() == 10); // equal to count, so no mean was seen twice
+  
+  // Verify all values from 0 to 9 are present
+  for (int i = 0; i < 10; i++) {
+    REQUIRE(seen_means.count(static_cast<double>(i)) == 1);
+  }
+}
+
+TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") { // pairs copied out of *it must keep their values after the iterator advances
+  tdigest_double td(100);
+  
+  td.update(1.0);
+  td.update(2.0);
+  td.update(3.0);
+  
+  auto it = td.begin();
+  
+  // Store the pair
+  auto pair1 = *it; // plain auto, so pair1 is a copy of the dereferenced value
+  double mean1 = pair1.first;
+  
+  ++it;
+  
+  // Store another pair
+  auto pair2 = *it;
+  double mean2 = pair2.first;
+  
+  ++it;
+  
+  auto pair3 = *it;
+  double mean3 = pair3.first;
+  
+  // All three means should be distinct
+  REQUIRE(mean1 != mean2);
+  REQUIRE(mean2 != mean3);
+  REQUIRE(mean1 != mean3);
+  
+  // And they should match our input values
+  std::set<double> means = {mean1, mean2, mean3}; // membership check, so no iteration order is assumed
+  REQUIRE(means.count(1.0) == 1);
+  REQUIRE(means.count(2.0) == 1);
+  REQUIRE(means.count(3.0) == 1);
+}
+
+TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") { // a digest with no updates must iterate zero times
+  tdigest_double td(100);
+  
+  // Empty sketch should have begin() == end()
+  REQUIRE(td.begin() == td.end());
+  
+  // Range-based for should not execute
+  size_t count = 0;
+  for (const auto&& centroid : td) {
+    (void)centroid;  // Silence unused warning
+    count++;
+  }
+  REQUIRE(count == 0);
+}
+
+TEST_CASE("tdigest iterator: single value", "[tdigest]") { // one update must produce exactly one centroid (42.0, 1)
+  tdigest_double td(100);
+  td.update(42.0);
+  
+  size_t count = 0;
+  double captured_mean = 0.0; // initialized to values that differ from the expected 42.0 / 1
+  uint64_t captured_weight = 0;
+  
+  for (const auto&& centroid : td) {
+    captured_mean = centroid.first;
+    captured_weight = centroid.second;
+    count++;
+  }
+  
+  REQUIRE(count == 1);
+  REQUIRE(captured_mean == 42.0);
+  REQUIRE(captured_weight == 1);
+}
+
+TEST_CASE("tdigest iterator: large dataset", "[tdigest]") { // after 1000 updates the iterated centroids must be fewer than 1000 and carry all the weight
+  tdigest_double td(100);
+  
+  // Insert 1000 distinct values
+  for (int i = 0; i < 1000; i++) {
+    td.update(static_cast<double>(i));
+  }
+  
+  // Iterator should provide compressed centroids (not all 1000)
+  size_t centroid_count = 0;
+  std::set<double> unique_means;
+  uint64_t total_weight = 0; // sum of centroid.second over all centroids
+  
+  for (const auto&& centroid : td) {
+    unique_means.insert(centroid.first);
+    total_weight += centroid.second;
+    centroid_count++;
+  }
+  
+  // Should have fewer centroids than input values due to compression
+  REQUIRE(centroid_count < 1000);
+  REQUIRE(centroid_count > 0);
+  
+  // Total weight should equal number of input values
+  REQUIRE(total_weight == 1000);
+  
+  // All means should be unique (no duplicates)
+  REQUIRE(unique_means.size() == centroid_count);
+}
+
+} // namespace datasketches

From 9381dcd227a5a06f8e18803dd21aa6c877b46b25 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Sun, 25 Jan 2026 22:14:47 +0900
Subject: [PATCH 35/75] fix: empty string handling

---
 tuple/include/array_of_strings_sketch.hpp     |  66 +++-
 .../include/array_of_strings_sketch_impl.hpp  | 101 +++---
 .../aos_sketch_deserialize_from_java_test.cpp | 311 ++++++++++++------
 tuple/test/aos_sketch_serialize_for_java.cpp  |  12 +-
 tuple/test/array_of_strings_sketch_test.cpp   |  16 +-
 5 files changed, 334 insertions(+), 172 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index a3f8ddd7..4442fd64 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -49,8 +49,11 @@ class default_array_of_strings_update_policy {
 // serializer/deserializer for an array of strings
 // Requirements: all strings must be valid UTF-8 and array size must be <= 127.
 template<typename Allocator = std::allocator<std::string>>
-struct array_of_strings_serde {
+struct default_array_of_strings_serde {
   using array_of_strings = array<std::string, Allocator>;
+  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
+
+  explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());
 
   void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const;
   void deserialize(std::istream& is, array_of_strings* items, unsigned num) const;
@@ -59,6 +62,8 @@ struct array_of_strings_serde {
   size_t size_of_item(const array_of_strings& item) const;
 
 private:
+  Allocator allocator_;
+  summary_allocator summary_allocator_;
   static void check_num_nodes(uint8_t num_nodes);
   static uint32_t compute_total_bytes(const array_of_strings& item);
   static void check_utf8(const std::string& value);
@@ -79,17 +84,41 @@ class compact_array_of_strings_tuple_sketch:
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
   using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
   using vector_bytes = typename Base::vector_bytes;
-
+  using Base::serialize;
+
+  /**
+   * Copy constructor.
+   * Constructs a compact sketch from another sketch (update or compact)
+   * @param sketch sketch to be constructed from
+   * @param ordered if true make the resulting sketch ordered
+   */
   template<typename Sketch>
   compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true);
 
-  void serialize(std::ostream& os) const;
-  vector_bytes serialize(unsigned header_size_bytes = 0) const;
-
+  /**
+   * This method deserializes a sketch from a given stream.
+   * @param is input stream
+   * @param seed the seed for the hash function that was used to create the sketch
+   * @param sd instance of a SerDe
+   * @param allocator instance of an Allocator
+   * @return an instance of the sketch
+   */
+  template<typename SerDe = default_array_of_strings_serde<Allocator>>
   static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
-      const Allocator& allocator = Allocator());
+      const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
+
+  /**
+   * This method deserializes a sketch from a given array of bytes.
+   * @param bytes pointer to the array of bytes
+   * @param size the size of the array
+   * @param seed the seed for the hash function that was used to create the sketch
+   * @param sd instance of a SerDe
+   * @param allocator instance of an Allocator
+   * @return an instance of the sketch
+   */
+  template<typename SerDe = default_array_of_strings_serde<Allocator>>
   static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
-      const Allocator& allocator = Allocator());
+      const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
 
 private:
   explicit compact_array_of_strings_tuple_sketch(Base&& base);
@@ -97,20 +126,20 @@ class compact_array_of_strings_tuple_sketch:
 
 /**
  * Extended class of update_tuple_sketch for array of strings
- * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
  */
-template<typename Allocator = std::allocator<std::string>>
+template<template<typename> class Policy = default_array_of_strings_update_policy,
+         typename Allocator = std::allocator<std::string>>
 class update_array_of_strings_tuple_sketch:
   public update_tuple_sketch<
     array<std::string, Allocator>,
     array<std::string, Allocator>,
-    default_array_of_strings_update_policy<Allocator>,
+    Policy<Allocator>,
     typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
   > {
 public:
   using array_of_strings = array<std::string, Allocator>;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
-  using policy_type = default_array_of_strings_update_policy<Allocator>;
+  using policy_type = Policy<Allocator>;
   using Base = update_tuple_sketch<
     array_of_strings,
     array_of_strings,
@@ -121,7 +150,18 @@ class update_array_of_strings_tuple_sketch:
   class builder;
   using Base::update;
 
+  /**
+   * Updates the sketch with string array for both key and value.
+   * @param key the given string array key
+   * @param value the given string array value
+   */
   void update(const array_of_strings& key, const array_of_strings& value);
+
+  /**
+   * Converts this sketch to a compact sketch (ordered or unordered).
+   * @param ordered optional flag to specify if an ordered sketch should be produced
+   * @return compact array of strings sketch
+   */
   compact_array_of_strings_tuple_sketch<Allocator> compact(bool ordered = true) const;
 
 private:
@@ -134,8 +174,8 @@ class update_array_of_strings_tuple_sketch:
   static uint64_t hash_key(const array_of_strings& key);
 };
 
-template<typename Allocator>
-class update_array_of_strings_tuple_sketch<Allocator>::builder:
+template<template<typename> class Policy, typename Allocator>
+class update_array_of_strings_tuple_sketch<Policy, Allocator>::builder:
   public tuple_base_builder<builder, policy_type, summary_allocator> {
 public:
   builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator());
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 264f79bf..b95987a0 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -40,9 +40,9 @@ template<typename Allocator>
 void default_array_of_strings_update_policy<Allocator>::update(
   array_of_strings& array, const array_of_strings& input
 ) const {
-  const auto length = input.size();
-  array = array_of_strings(length, "", allocator_);
-  for (uint8_t i = 0; i < length; ++i) array[i] = input[i];
+  const auto length = static_cast<size_t>(input.size());
+  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  for (size_t i = 0; i < length; ++i) array[i] = input[i];
 }
 
 template<typename Allocator>
@@ -53,53 +53,53 @@ void default_array_of_strings_update_policy<Allocator>::update(
     array = array_of_strings(0, "", allocator_);
     return;
   }
-  const auto length = input->size();
-  array = array_of_strings(length, "", allocator_);
-  for (uint8_t i = 0; i < length; ++i) array[i] = (*input)[i];
+  const auto length = static_cast<size_t>(input->size());
+  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
 }
 
-template<typename Allocator>
-update_array_of_strings_tuple_sketch<Allocator>::update_array_of_strings_tuple_sketch(
+template<template<typename> class Policy, typename Allocator>
+update_array_of_strings_tuple_sketch<Policy, Allocator>::update_array_of_strings_tuple_sketch(
   uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
   uint64_t seed, const policy_type& policy, const summary_allocator& allocator
 ):
 Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {}
 
-template<typename Allocator>
-void update_array_of_strings_tuple_sketch<Allocator>::update(
+template<template<typename> class Policy, typename Allocator>
+void update_array_of_strings_tuple_sketch<Policy, Allocator>::update(
   const array_of_strings& key, const array_of_strings& value
 ) {
   const uint64_t hash = hash_key(key);
   Base::update(hash, value);
 }
 
-template<typename Allocator>
-uint64_t update_array_of_strings_tuple_sketch<Allocator>::hash_key(const array_of_strings& key) {
+template<template<typename> class Policy, typename Allocator>
+uint64_t update_array_of_strings_tuple_sketch<Policy, Allocator>::hash_key(const array_of_strings& key) {
   XXHash64 hasher(STRING_ARR_HASH_SEED);
   const auto size = static_cast<size_t>(key.size());
   for (size_t i = 0; i < size; ++i) {
-    const auto& entry = key[static_cast<uint8_t>(i)];
+    const auto& entry = key[i];
     hasher.add(entry.data(), entry.size());
     if (i + 1 < size) hasher.add(",", 1);
   }
   return hasher.hash();
 }
 
-template<typename Allocator>
-compact_array_of_strings_tuple_sketch<Allocator> update_array_of_strings_tuple_sketch<Allocator>::compact(bool ordered) const {
+template<template<typename> class Policy, typename Allocator>
+compact_array_of_strings_tuple_sketch<Allocator> update_array_of_strings_tuple_sketch<Policy, Allocator>::compact(bool ordered) const {
   return compact_array_of_strings_tuple_sketch<Allocator>(*this, ordered);
 }
 
 // builder
 
-template<typename Allocator>
-update_array_of_strings_tuple_sketch<Allocator>::builder::builder(
+template<template<typename> class Policy, typename Allocator>
+update_array_of_strings_tuple_sketch<Policy, Allocator>::builder::builder(
   const policy_type& policy, const summary_allocator& allocator
 ):
 tuple_base_builder<builder, policy_type, summary_allocator>(policy, allocator) {}
 
-template<typename Allocator>
-auto update_array_of_strings_tuple_sketch<Allocator>::builder::build() const -> update_array_of_strings_tuple_sketch {
+template<template<typename> class Policy, typename Allocator>
+auto update_array_of_strings_tuple_sketch<Policy, Allocator>::builder::build() const -> update_array_of_strings_tuple_sketch {
   return update_array_of_strings_tuple_sketch(
     this->starting_lg_size(),
     this->lg_k_,
@@ -124,35 +124,32 @@ compact_array_of_strings_tuple_sketch<Allocator>::compact_array_of_strings_tuple
 ): Base(std::move(base)) {}
 
 template<typename Allocator>
-void compact_array_of_strings_tuple_sketch<Allocator>::serialize(std::ostream& os) const {
-  Base::serialize(os, array_of_strings_serde<Allocator>());
-}
-
-template<typename Allocator>
-auto compact_array_of_strings_tuple_sketch<Allocator>::serialize(unsigned header_size_bytes) const -> vector_bytes {
-  return Base::serialize(header_size_bytes, array_of_strings_serde<Allocator>());
-}
-
-template<typename Allocator>
+template<typename SerDe>
 auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(
-  std::istream& is, uint64_t seed, const Allocator& allocator
+  std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator
 ) -> compact_array_of_strings_tuple_sketch {
   summary_allocator alloc(allocator);
-  auto base = Base::deserialize(is, seed, array_of_strings_serde<Allocator>(), alloc);
+  auto base = Base::deserialize(is, seed, sd, alloc);
   return compact_array_of_strings_tuple_sketch(std::move(base));
 }
 
 template<typename Allocator>
+template<typename SerDe>
 auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(
-  const void* bytes, size_t size, uint64_t seed, const Allocator& allocator
+  const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator
 ) -> compact_array_of_strings_tuple_sketch {
   summary_allocator alloc(allocator);
-  auto base = Base::deserialize(bytes, size, seed, array_of_strings_serde<Allocator>(), alloc);
+  auto base = Base::deserialize(bytes, size, seed, sd, alloc);
   return compact_array_of_strings_tuple_sketch(std::move(base));
 }
 
 template<typename Allocator>
-void array_of_strings_serde<Allocator>::serialize(
+default_array_of_strings_serde<Allocator>::default_array_of_strings_serde(const Allocator& allocator):
+  allocator_(allocator),
+  summary_allocator_(allocator) {}
+
+template<typename Allocator>
+void default_array_of_strings_serde<Allocator>::serialize(
   std::ostream& os, const array_of_strings* items, unsigned num
 ) const {
   for (unsigned i = 0; i < num; ++i) {
@@ -171,27 +168,34 @@ void array_of_strings_serde<Allocator>::serialize(
 }
 
 template<typename Allocator>
-void array_of_strings_serde<Allocator>::deserialize(
+void default_array_of_strings_serde<Allocator>::deserialize(
   std::istream& is, array_of_strings* items, unsigned num
 ) const {
   for (unsigned i = 0; i < num; ++i) {
     read<uint32_t>(is); // total_bytes
+    if (!is) throw std::runtime_error("array_of_strings stream read failed");
     const uint8_t num_nodes = read<uint8_t>(is);
+    if (!is) throw std::runtime_error("array_of_strings stream read failed");
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", Allocator());
+    array_of_strings array(num_nodes, "", allocator_);
     for (uint8_t j = 0; j < num_nodes; ++j) {
       const uint32_t length = read<uint32_t>(is);
+      if (!is) throw std::runtime_error("array_of_strings stream read failed");
       std::string value(length, '\0');
-      is.read(&value[0], length);
+      if (length != 0) {
+        is.read(value.data(), length);
+        if (!is) throw std::runtime_error("array_of_strings stream read failed");
+      }
       check_utf8(value);
       array[j] = std::move(value);
     }
-    new (&items[i]) array_of_strings(std::move(array));
+    summary_allocator alloc(summary_allocator_);
+    std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
   }
 }
 
 template<typename Allocator>
-size_t array_of_strings_serde<Allocator>::serialize(
+size_t default_array_of_strings_serde<Allocator>::serialize(
   void* ptr, size_t capacity, const array_of_strings* items, unsigned num
 ) const {
   uint8_t* ptr8 = static_cast<uint8_t*>(ptr);
@@ -216,7 +220,7 @@ size_t array_of_strings_serde<Allocator>::serialize(
 }
 
 template<typename Allocator>
-size_t array_of_strings_serde<Allocator>::deserialize(
+size_t default_array_of_strings_serde<Allocator>::deserialize(
   const void* ptr, size_t capacity, array_of_strings* items, unsigned num
 ) const {
   const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
@@ -231,34 +235,37 @@ size_t array_of_strings_serde<Allocator>::deserialize(
     uint8_t num_nodes;
     bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", Allocator());
+    array_of_strings array(num_nodes, "", allocator_);
     for (uint8_t j = 0; j < num_nodes; ++j) {
       uint32_t length;
       bytes_read += copy_from_mem(ptr8 + bytes_read, length);
       std::string value(length, '\0');
-      bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+      if (length != 0) {
+        bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length);
+      }
       check_utf8(value);
       array[j] = std::move(value);
     }
-    new (&items[i]) array_of_strings(std::move(array));
+    summary_allocator alloc(summary_allocator_);
+    std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
   }
   return bytes_read;
 }
 
 template<typename Allocator>
-size_t array_of_strings_serde<Allocator>::size_of_item(const array_of_strings& item) const {
+size_t default_array_of_strings_serde<Allocator>::size_of_item(const array_of_strings& item) const {
   return compute_total_bytes(item);
 }
 
 template<typename Allocator>
-void array_of_strings_serde<Allocator>::check_num_nodes(uint8_t num_nodes) {
+void default_array_of_strings_serde<Allocator>::check_num_nodes(uint8_t num_nodes) {
   if (num_nodes > 127) {
     throw std::runtime_error("array_of_strings size exceeds 127");
   }
 }
 
 template<typename Allocator>
-uint32_t array_of_strings_serde<Allocator>::compute_total_bytes(const array_of_strings& item) {
+uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const array_of_strings& item) {
   const auto count = item.size();
   check_num_nodes(static_cast<uint8_t>(count));
   size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t);
@@ -273,7 +280,7 @@ uint32_t array_of_strings_serde<Allocator>::compute_total_bytes(const array_of_s
 }
 
 template<typename Allocator>
-void array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
+void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
   if (!utf8::is_valid(value.begin(), value.end())) {
     throw std::runtime_error("array_of_strings contains invalid UTF-8");
   }
diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
index 15e9d6dd..af37d6c2 100644
--- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp
+++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
@@ -26,21 +26,53 @@
 namespace datasketches {
   // assume the binary sketches for this test have been generated by datasketches-java code
   // in the subdirectory called "java" in the root directory of this project
-  static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/";
+  static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
+
+  static std::vector<uint8_t> read_binary_file(const std::string& path) {
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit);
+    is.open(path, std::ios::binary);
+    is.seekg(0, std::ios::end);
+    const auto size = static_cast<size_t>(is.tellg());
+    is.seekg(0, std::ios::beg);
+    std::vector<uint8_t> bytes(size);
+    if (size != 0) {
+      is.read(reinterpret_cast<char*>(bytes.data()), size);
+    }
+    return bytes;
+  }
 
   TEST_CASE("aos sketch one value", "[serde_compat]") {
     const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
     for (const unsigned n: n_arr) {
-      std::ifstream is;
-      is.exceptions(std::ios::failbit | std::ios::badbit);
-      is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary);
-      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-      REQUIRE(sketch.is_empty() == (n == 0));
-      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
-      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
-      for (const auto& entry: sketch) {
-        REQUIRE(entry.first < sketch.get_theta64());
-        REQUIRE(entry.second.size() == 1);
+      const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk";
+      SECTION("stream") {
+        std::ifstream is;
+        is.exceptions(std::ios::failbit | std::ios::badbit);
+        is.open(path, std::ios::binary);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          is, DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 1);
+        }
+      }
+      SECTION("bytes") {
+        const auto bytes = read_binary_file(path);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 1);
+        }
       }
     }
   }
@@ -48,125 +80,204 @@ namespace datasketches {
   TEST_CASE("aos sketch three values", "[serde_compat]") {
     const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
     for (const unsigned n: n_arr) {
-      std::ifstream is;
-      is.exceptions(std::ios::failbit | std::ios::badbit);
-      is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary);
-      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-      REQUIRE(sketch.is_empty() == (n == 0));
-      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
-      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
-      for (const auto& entry: sketch) {
-        REQUIRE(entry.first < sketch.get_theta64());
-        REQUIRE(entry.second.size() == 3);
+      const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk";
+      SECTION("stream") {
+        std::ifstream is;
+        is.exceptions(std::ios::failbit | std::ios::badbit);
+        is.open(path, std::ios::binary);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          is, DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 3);
+        }
+      }
+      SECTION("bytes") {
+        const auto bytes = read_binary_file(path);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 3);
+        }
       }
     }
   }
 
   TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") {
-    std::ifstream is;
-    is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", std::ios::binary);
-    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-    REQUIRE_FALSE(sketch.is_empty());
-    REQUIRE(sketch.get_num_retained() == 0);
+    const auto path = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk";
+    SECTION("stream") {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit);
+      is.open(path, std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        is, DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      REQUIRE_FALSE(sketch.is_empty());
+      REQUIRE(sketch.get_num_retained() == 0);
+    }
+    SECTION("bytes") {
+      const auto bytes = read_binary_file(path);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      REQUIRE_FALSE(sketch.is_empty());
+      REQUIRE(sketch.get_num_retained() == 0);
+    }
   }
 
   TEST_CASE("aos sketch multi keys strings", "[serde_compat]") {
     const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
     for (const unsigned n: n_arr) {
-      std::ifstream is;
-      is.exceptions(std::ios::failbit | std::ios::badbit);
-      is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary);
-      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-      REQUIRE(sketch.is_empty() == (n == 0));
-      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
-      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
-      for (const auto& entry: sketch) {
-        REQUIRE(entry.first < sketch.get_theta64());
-        REQUIRE(entry.second.size() == 1);
+      const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk";
+      SECTION("stream") {
+        std::ifstream is;
+        is.exceptions(std::ios::failbit | std::ios::badbit);
+        is.open(path, std::ios::binary);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          is, DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 1);
+        }
+      }
+      SECTION("bytes") {
+        const auto bytes = read_binary_file(path);
+        const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+          bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+        );
+        REQUIRE(sketch.is_empty() == (n == 0));
+        REQUIRE(sketch.is_estimation_mode() == (n > 1000));
+        REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
+        for (const auto& entry: sketch) {
+          REQUIRE(entry.first < sketch.get_theta64());
+          REQUIRE(entry.second.size() == 1);
+        }
       }
     }
   }
 
   TEST_CASE("aos sketch unicode strings", "[serde_compat]") {
-    std::ifstream is;
-    is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary);
-    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-    REQUIRE_FALSE(sketch.is_empty());
-    REQUIRE_FALSE(sketch.is_estimation_mode());
-    REQUIRE(sketch.get_num_retained() == 3);
+    const auto path = testBinaryInputPath + "aos_unicode_java.sk";
+    auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
+      REQUIRE_FALSE(sketch.is_empty());
+      REQUIRE_FALSE(sketch.is_estimation_mode());
+      REQUIRE(sketch.get_num_retained() == 3);
 
-    const std::vector<std::vector<std::string>> expected_values = {
-      {"밸류", "값"},
-      {"📦", "🎁"},
-      {"ценить1", "ценить2"}
-    };
-    std::vector<bool> matched(expected_values.size(), false);
-    for (const auto& entry: sketch) {
-      REQUIRE(entry.first < sketch.get_theta64());
-      REQUIRE(entry.second.size() == 2);
+      const std::vector<std::vector<std::string>> expected_values = {
+        {"밸류", "값"},
+        {"📦", "🎁"},
+        {"ценить1", "ценить2"}
+      };
+      std::vector<bool> matched(expected_values.size(), false);
+      for (const auto& entry: sketch) {
+        REQUIRE(entry.first < sketch.get_theta64());
+        REQUIRE(entry.second.size() == 2);
 
-      bool found = false;
-      for (size_t i = 0; i < expected_values.size(); ++i) {
-        if (matched[i]) continue;
-        const auto& expected = expected_values[i];
-        if (entry.second.size() != expected.size()) continue;
-        bool equal = true;
-        for (size_t j = 0; j < expected.size(); ++j) {
-          if (entry.second[j] != expected[j]) {
-            equal = false;
+        bool found = false;
+        for (size_t i = 0; i < expected_values.size(); ++i) {
+          if (matched[i]) continue;
+          const auto& expected = expected_values[i];
+          if (entry.second.size() != expected.size()) continue;
+          bool equal = true;
+          for (size_t j = 0; j < expected.size(); ++j) {
+            if (entry.second[j] != expected[j]) {
+              equal = false;
+              break;
+            }
+          }
+          if (equal) {
+            matched[i] = true;
+            found = true;
             break;
           }
         }
-        if (equal) {
-          matched[i] = true;
-          found = true;
-          break;
-        }
+        REQUIRE(found);
       }
-      REQUIRE(found);
+      for (bool found: matched) REQUIRE(found);
+    };
+    SECTION("stream") {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit);
+      is.open(path, std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        is, DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      check(sketch);
+    }
+    SECTION("bytes") {
+      const auto bytes = read_binary_file(path);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      check(sketch);
     }
-    for (bool found: matched) REQUIRE(found);
   }
 
   TEST_CASE("aos sketch empty strings", "[serde_compat]") {
-    std::ifstream is;
-    is.exceptions(std::ios::failbit | std::ios::badbit);
-    is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary);
-    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is);
-    REQUIRE_FALSE(sketch.is_empty());
-    REQUIRE_FALSE(sketch.is_estimation_mode());
-    REQUIRE(sketch.get_num_retained() == 3);
-    const std::vector<std::vector<std::string>> expected_values = {
-      {"empty_key_value"},
-      {""},
-      {"", ""}
-    };
-    std::vector<bool> matched(expected_values.size(), false);
-    for (const auto& entry: sketch) {
-      REQUIRE(entry.first < sketch.get_theta64());
+    const auto path = testBinaryInputPath + "aos_empty_strings_java.sk";
+    auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
+      REQUIRE_FALSE(sketch.is_empty());
+      REQUIRE_FALSE(sketch.is_estimation_mode());
+      REQUIRE(sketch.get_num_retained() == 3);
+      const std::vector<std::vector<std::string>> expected_values = {
+        {"empty_key_value"},
+        {""},
+        {"", ""}
+      };
+      std::vector<bool> matched(expected_values.size(), false);
+      for (const auto& entry: sketch) {
+        REQUIRE(entry.first < sketch.get_theta64());
 
-      bool found = false;
-      for (size_t i = 0; i < expected_values.size(); ++i) {
-        if (matched[i]) continue;
-        const auto& expected = expected_values[i];
-        if (entry.second.size() != expected.size()) continue;
-        bool equal = true;
-        for (size_t j = 0; j < expected.size(); ++j) {
-          if (entry.second[j] != expected[j]) {
-            equal = false;
+        bool found = false;
+        for (size_t i = 0; i < expected_values.size(); ++i) {
+          if (matched[i]) continue;
+          const auto& expected = expected_values[i];
+          if (entry.second.size() != expected.size()) continue;
+          bool equal = true;
+          for (size_t j = 0; j < expected.size(); ++j) {
+            if (entry.second[j] != expected[j]) {
+              equal = false;
+              break;
+            }
+          }
+          if (equal) {
+            matched[i] = true;
+            found = true;
             break;
           }
         }
-        if (equal) {
-          matched[i] = true;
-          found = true;
-          break;
-        }
+        REQUIRE(found);
       }
-      REQUIRE(found);
+      for (bool found: matched) REQUIRE(found);
+    };
+    SECTION("stream") {
+      std::ifstream is;
+      is.exceptions(std::ios::failbit | std::ios::badbit);
+      is.open(path, std::ios::binary);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        is, DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      check(sketch);
+    }
+    SECTION("bytes") {
+      const auto bytes = read_binary_file(path);
+      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
+        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+      );
+      check(sketch);
     }
-    for (bool found: matched) REQUIRE(found);
   }
 }
diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp
index 3a154132..db506825 100644
--- a/tuple/test/aos_sketch_serialize_for_java.cpp
+++ b/tuple/test/aos_sketch_serialize_for_java.cpp
@@ -52,7 +52,7 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") {
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os);
+    sketch.compact().serialize(os, default_array_of_strings_serde<>());
   }
 }
 
@@ -72,7 +72,7 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") {
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os);
+    sketch.compact().serialize(os, default_array_of_strings_serde<>());
   }
 }
 
@@ -90,7 +90,7 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") {
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 0);
   std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os);
+  sketch.compact().serialize(os, default_array_of_strings_serde<>());
 }
 
 TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
@@ -108,7 +108,7 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os);
+    sketch.compact().serialize(os, default_array_of_strings_serde<>());
   }
 }
 
@@ -129,7 +129,7 @@ TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") {
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 3);
   std::ofstream os("aos_unicode_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os);
+  sketch.compact().serialize(os, default_array_of_strings_serde<>());
 }
 
 TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") {
@@ -149,7 +149,7 @@ TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") {
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 3);
   std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os);
+  sketch.compact().serialize(os, default_array_of_strings_serde<>());
 }
 
 } /* namespace datasketches */
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 45e554bd..3e3673aa 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -65,7 +65,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
     values[0] = "one";
     values[1] = "two";
 
-    policy.update(values, static_cast<const array_of_strings*>(nullptr));
+    policy.update(values, nullptr);
     REQUIRE(values.size() == 0);
   }
 
@@ -162,11 +162,15 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
   auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) {
     std::stringstream ss;
     ss.exceptions(std::ios::failbit | std::ios::badbit);
-    compact_sketch.serialize(ss);
-    auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss);
-
-    auto bytes = compact_sketch.serialize();
-    auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size());
+    compact_sketch.serialize(ss, default_array_of_strings_serde<>());
+    auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(
+      ss, DEFAULT_SEED, default_array_of_strings_serde<>()
+    );
+
+    auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>());
+    auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(
+      bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
+    );
 
     const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = {
       &deserialized_stream,

From 46c945d5c9f59a0bfd1e7afe6c8256fdd3e8d4f4 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Tue, 27 Jan 2026 01:11:18 +0900
Subject: [PATCH 36/75] refactor: remove update sketch

---
 tuple/include/array_of_strings_sketch.hpp     | 79 ++++++-------------
 .../include/array_of_strings_sketch_impl.hpp  | 55 +++----------
 tuple/test/aos_sketch_serialize_for_java.cpp  | 43 ++++------
 tuple/test/array_of_strings_sketch_test.cpp   | 43 ++++++----
 4 files changed, 79 insertions(+), 141 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index 4442fd64..db147723 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -69,6 +69,12 @@ struct default_array_of_strings_serde {
   static void check_utf8(const std::string& value);
 };
 
+/**
+ * Hashes an array of strings using ArrayOfStrings-compatible hashing.
+ */
+template<typename Allocator = std::allocator<std::string>>
+uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key);
+
 /**
  * Extended class of compact_tuple_sketch for array of strings
  * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
@@ -125,63 +131,26 @@ class compact_array_of_strings_tuple_sketch:
 };
 
 /**
- * Extended class of update_tuple_sketch for array of strings
+ * Convenience alias for update_tuple_sketch for array of strings
  */
-template<template<typename> class Policy = default_array_of_strings_update_policy,
-         typename Allocator = std::allocator<std::string>>
-class update_array_of_strings_tuple_sketch:
-  public update_tuple_sketch<
-    array<std::string, Allocator>,
-    array<std::string, Allocator>,
-    Policy<Allocator>,
-    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
-  > {
-public:
-  using array_of_strings = array<std::string, Allocator>;
-  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
-  using policy_type = Policy<Allocator>;
-  using Base = update_tuple_sketch<
-    array_of_strings,
-    array_of_strings,
-    policy_type,
-    summary_allocator
-  >;
-  using resize_factor = typename Base::resize_factor;
-  class builder;
-  using Base::update;
-
-  /**
-   * Updates the sketch with string array for both key and value.
-   * @param key the given string array key
-   * @param value the given string array value
-   */
-  void update(const array_of_strings& key, const array_of_strings& value);
+template<typename Allocator = std::allocator<std::string>,
+         typename Policy = default_array_of_strings_update_policy<Allocator>>
+using update_array_of_strings_tuple_sketch = update_tuple_sketch<
+  array<std::string, Allocator>,
+  array<std::string, Allocator>,
+  Policy,
+  typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+>;
 
-  /**
-   * Converts this sketch to a compact sketch (ordered or unordered).
-   * @param ordered optional flag to specify if an ordered sketch should be produced
-   * @return compact array of strings sketch
-   */
-  compact_array_of_strings_tuple_sketch<Allocator> compact(bool ordered = true) const;
-
-private:
-  update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
-      uint64_t seed, const policy_type& policy, const summary_allocator& allocator);
-
-  // Matches Java Util.PRIME for ArrayOfStrings key hashing.
-  static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
-
-  static uint64_t hash_key(const array_of_strings& key);
-};
-
-template<template<typename> class Policy, typename Allocator>
-class update_array_of_strings_tuple_sketch<Policy, Allocator>::builder:
-  public tuple_base_builder<builder, policy_type, summary_allocator> {
-public:
-  builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator());
-
-  update_array_of_strings_tuple_sketch build() const;
-};
+/**
+ * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
+ * @param sketch input sketch
+ * @param ordered optional flag to specify if an ordered sketch should be produced
+ * @return compact array of strings sketch
+ */
+template<typename Allocator = std::allocator<std::string>, typename Policy = default_array_of_strings_update_policy<Allocator>>
+compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
+  const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);
 
 } /* namespace datasketches */
 
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index b95987a0..01a3daba 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -58,23 +58,10 @@ void default_array_of_strings_update_policy<Allocator>::update(
   for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
 }
 
-template<template<typename> class Policy, typename Allocator>
-update_array_of_strings_tuple_sketch<Policy, Allocator>::update_array_of_strings_tuple_sketch(
-  uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
-  uint64_t seed, const policy_type& policy, const summary_allocator& allocator
-):
-Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {}
-
-template<template<typename> class Policy, typename Allocator>
-void update_array_of_strings_tuple_sketch<Policy, Allocator>::update(
-  const array_of_strings& key, const array_of_strings& value
-) {
-  const uint64_t hash = hash_key(key);
-  Base::update(hash, value);
-}
-
-template<template<typename> class Policy, typename Allocator>
-uint64_t update_array_of_strings_tuple_sketch<Policy, Allocator>::hash_key(const array_of_strings& key) {
+template<typename Allocator>
+uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key) {
+  // Matches Java Util.PRIME for ArrayOfStrings key hashing.
+  static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
   XXHash64 hasher(STRING_ARR_HASH_SEED);
   const auto size = static_cast<size_t>(key.size());
   for (size_t i = 0; i < size; ++i) {
@@ -85,31 +72,11 @@ uint64_t update_array_of_strings_tuple_sketch<Policy, Allocator>::hash_key(const
   return hasher.hash();
 }
 
-template<template<typename> class Policy, typename Allocator>
-compact_array_of_strings_tuple_sketch<Allocator> update_array_of_strings_tuple_sketch<Policy, Allocator>::compact(bool ordered) const {
-  return compact_array_of_strings_tuple_sketch<Allocator>(*this, ordered);
-}
-
-// builder
-
-template<template<typename> class Policy, typename Allocator>
-update_array_of_strings_tuple_sketch<Policy, Allocator>::builder::builder(
-  const policy_type& policy, const summary_allocator& allocator
-):
-tuple_base_builder<builder, policy_type, summary_allocator>(policy, allocator) {}
-
-template<template<typename> class Policy, typename Allocator>
-auto update_array_of_strings_tuple_sketch<Policy, Allocator>::builder::build() const -> update_array_of_strings_tuple_sketch {
-  return update_array_of_strings_tuple_sketch(
-    this->starting_lg_size(),
-    this->lg_k_,
-    this->rf_,
-    this->p_,
-    this->starting_theta(),
-    this->seed_,
-    this->policy_,
-    this->allocator_
-  );
+template<typename Allocator, typename Policy>
+compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
+  const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered
+) {
+  return compact_array_of_strings_tuple_sketch<Allocator>(sketch, ordered);
 }
 
 template<typename Allocator>
@@ -183,7 +150,7 @@ void default_array_of_strings_serde<Allocator>::deserialize(
       if (!is) throw std::runtime_error("array_of_strings stream read failed");
       std::string value(length, '\0');
       if (length != 0) {
-        is.read(value.data(), length);
+        is.read(&value[0], length);
         if (!is) throw std::runtime_error("array_of_strings stream read failed");
       }
       check_utf8(value);
@@ -241,7 +208,7 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
       bytes_read += copy_from_mem(ptr8 + bytes_read, length);
       std::string value(length, '\0');
       if (length != 0) {
-        bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length);
+        bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
       }
       check_utf8(value);
       array[j] = std::move(value);
diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp
index db506825..c6eb0dfc 100644
--- a/tuple/test/aos_sketch_serialize_for_java.cpp
+++ b/tuple/test/aos_sketch_serialize_for_java.cpp
@@ -26,7 +26,7 @@
 namespace datasketches {
 
 using aos_sketch = update_array_of_strings_tuple_sketch<>;
-using array_of_strings = aos_sketch::array_of_strings;
+using array_of_strings = array<std::string>;
 
 static array_of_strings make_array(std::initializer_list<std::string> items) {
   array_of_strings array(static_cast<uint8_t>(items.size()), "");
@@ -47,12 +47,12 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") {
       key[0] = std::to_string(i);
       array_of_strings value(1, "");
       value[0] = "value" + std::to_string(i);
-      sketch.update(key, value);
+      sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os, default_array_of_strings_serde<>());
+    compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
   }
 }
 
@@ -67,12 +67,12 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") {
       value[0] = "a" + std::to_string(i);
       value[1] = "b" + std::to_string(i);
       value[2] = "c" + std::to_string(i);
-      sketch.update(key, value);
+      sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os, default_array_of_strings_serde<>());
+    compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
   }
 }
 
@@ -86,11 +86,11 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") {
   key[0] = "key1";
   array_of_strings value(1, "");
   value[0] = "value1";
-  sketch.update(key, value);
+  sketch.update(hash_array_of_strings_key(key), value);
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 0);
   std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os, default_array_of_strings_serde<>());
+  compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
 }
 
 TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
@@ -103,53 +103,44 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
       key[1] = "subkey" + std::to_string(i % 10);
       array_of_strings value(1, "");
       value[0] = "value" + std::to_string(i);
-      sketch.update(key, value);
+      sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
     REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
     std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
-    sketch.compact().serialize(os, default_array_of_strings_serde<>());
+    compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
   }
 }
 
 TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") {
   auto sketch = aos_sketch::builder().build();
   sketch.update(
-    make_array({u8"키", u8"열쇠"}),
+    hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})),
     make_array({u8"밸류", u8"값"})
   );
   sketch.update(
-    make_array({u8"🔑", u8"🗝️"}),
+    hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})),
     make_array({u8"📦", u8"🎁"})
   );
   sketch.update(
-    make_array({u8"ключ1", u8"ключ2"}),
+    hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})),
     make_array({u8"ценить1", u8"ценить2"})
   );
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 3);
   std::ofstream os("aos_unicode_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os, default_array_of_strings_serde<>());
+  compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
 }
 
 TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") {
   auto sketch = aos_sketch::builder().build();
-  sketch.update(
-    make_array({""}),
-    make_array({"empty_key_value"})
-  );
-  sketch.update(
-    make_array({"empty_value_key"}),
-    make_array({""})
-  );
-  sketch.update(
-    make_array({"", ""}),
-    make_array({"", ""})
-  );
+  sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"}));
+  sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""}));
+  sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""}));
   REQUIRE_FALSE(sketch.is_empty());
   REQUIRE(sketch.get_num_retained() == 3);
   std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary);
-  sketch.compact().serialize(os, default_array_of_strings_serde<>());
+  compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>());
 }
 
 } /* namespace datasketches */
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 3e3673aa..59cc04ca 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -18,7 +18,6 @@
  */
 
 #include <algorithm>
-#include <iostream>
 #include <fstream>
 #include <sstream>
 #include <string>
@@ -94,8 +93,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") {
   SECTION("same key replaces summary") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
 
-    sketch.update(make_array({"alpha", "beta"}), make_array({"first"}));
-    sketch.update(make_array({"alpha", "beta"}), make_array({"second", "third"}));
+    sketch.update(
+      hash_array_of_strings_key(make_array({"alpha", "beta"})),
+      make_array({"first"})
+    );
+    sketch.update(
+      hash_array_of_strings_key(make_array({"alpha", "beta"})),
+      make_array({"second", "third"})
+    );
 
     REQUIRE(sketch.get_num_retained() == 1);
 
@@ -109,8 +114,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") {
   SECTION("distinct keys retain multiple entries") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
 
-    sketch.update(make_array({"a", "bc"}), make_array({"one"}));
-    sketch.update(make_array({"ab", "c"}), make_array({"two"}));
+    sketch.update(
+      hash_array_of_strings_key(make_array({"a", "bc"})),
+      make_array({"one"})
+    );
+    sketch.update(
+      hash_array_of_strings_key(make_array({"ab", "c"})),
+      make_array({"two"})
+    );
 
     REQUIRE(sketch.get_num_retained() == 2);
 
@@ -128,7 +139,7 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") {
   SECTION("empty key") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
 
-    sketch.update(make_array({}), make_array({"value"}));
+    sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"}));
     REQUIRE(sketch.get_num_retained() == 1);
 
     auto it = sketch.begin();
@@ -201,46 +212,46 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
     }
   };
 
-  auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) {
-    auto ordered = sketch.compact(true);
-    auto unordered = sketch.compact(false);
+  auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) {
+    auto ordered = compact_array_of_strings_sketch(sketch, true);
+    auto unordered = compact_array_of_strings_sketch(sketch, false);
     check_round_trip(ordered);
     check_round_trip(unordered);
   };
 
   SECTION("empty sketch") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
-    exercise_ordering(sketch);
+    run_tests(sketch);
   }
 
   SECTION("single entry sketch") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
-    sketch.update(make_array({"key"}), make_array({"value"}));
-    exercise_ordering(sketch);
+    sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"}));
+    run_tests(sketch);
   }
 
   SECTION("multiple entries exact mode") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build();
     for (int i = 0; i < 50; ++i) {
       sketch.update(
-        make_array({std::string("key-") + std::to_string(i)}),
+        hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})),
         make_array({std::string("value-") + std::to_string(i), "extra"})
       );
     }
     REQUIRE_FALSE(sketch.is_estimation_mode());
-    exercise_ordering(sketch);
+    run_tests(sketch);
   }
 
   SECTION("multiple entries estimation mode") {
     auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
     for (int i = 0; i < 10000; ++i) {
       sketch.update(
-        make_array({std::string("key-") + std::to_string(i)}),
+        hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})),
         make_array({std::string("value-") + std::to_string(i)})
       );
     }
     REQUIRE(sketch.is_estimation_mode());
-    exercise_ordering(sketch);
+    run_tests(sketch);
   }
 }
 

From 1b91666377a34e49548097caf9f482a45a5b4e93 Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Tue, 27 Jan 2026 13:23:09 +0530
Subject: [PATCH 37/75] BugFix: SIGABRT in quantiles_sketch::deserialize():
 dereferencing empty std::optional (libc++ verbose_abort)

---
 kll/include/kll_sketch_impl.hpp             | 37 +++++++++++++--------
 quantiles/include/quantiles_sketch_impl.hpp | 37 +++++++++++++--------
 req/include/req_sketch_impl.hpp             | 37 +++++++++++++--------
 sampling/include/ebpps_sample_impl.hpp      | 25 +++++++++-----
 4 files changed, 86 insertions(+), 50 deletions(-)

diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index fde0a314..44fe6a15 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -24,6 +24,7 @@
 #include <iomanip>
 #include <sstream>
 #include <stdexcept>
+#include <type_traits>
 
 #include "conditional_forward.hpp"
 #include "count_zeros.hpp"
@@ -481,18 +482,22 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(std::istream& is, const Ser
     read(is, levels.data(), sizeof(levels[0]) * num_levels);
   }
   levels[num_levels] = capacity;
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
   if (!is_single_item) {
-    sd.deserialize(is, &*tmp, 1);
+    // Space to deserialize min and max.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    sd.deserialize(is, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    min_item.emplace(*tmp);
-    (*tmp).~T();
-    sd.deserialize(is, &*tmp, 1);
+    min_item.emplace(std::move(*tmp));
+    tmp->~T();
+    sd.deserialize(is, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    max_item.emplace(*tmp);
-    (*tmp).~T();
+    max_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
   A alloc(allocator);
   auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
@@ -565,18 +570,22 @@ kll_sketch<T, C, A> kll_sketch<T, C, A>::deserialize(const void* bytes, size_t s
     ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
   }
   levels[num_levels] = capacity;
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
   if (!is_single_item) {
-    ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+    // Space to deserialize min and max.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    min_item.emplace(*tmp);
-    (*tmp).~T();
-    ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+    min_item.emplace(std::move(*tmp));
+    tmp->~T();
+    ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    max_item.emplace(*tmp);
-    (*tmp).~T();
+    max_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
   A alloc(allocator);
   auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp
index 50c82c18..2dacf21e 100644
--- a/quantiles/include/quantiles_sketch_impl.hpp
+++ b/quantiles/include/quantiles_sketch_impl.hpp
@@ -25,6 +25,7 @@
 #include <stdexcept>
 #include <iomanip>
 #include <sstream>
+#include <type_traits>
 
 #include "count_zeros.hpp"
 #include "conditional_forward.hpp"
@@ -393,18 +394,22 @@ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde
   const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
   const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
 
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
 
-  serde.deserialize(is, &*tmp, 1);
+  // Space to deserialize min and max.
+  // serde::deserialize expects allocated but not initialized storage.
+  typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+  T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+  serde.deserialize(is, tmp, 1);
   // serde call did not throw, repackage and cleanup
-  min_item.emplace(*tmp);
-  (*tmp).~T();
-  serde.deserialize(is, &*tmp, 1);
+  min_item.emplace(std::move(*tmp));
+  tmp->~T();
+  serde.deserialize(is, tmp, 1);
   // serde call did not throw, repackage and cleanup
-  max_item.emplace(*tmp);
-  (*tmp).~T();
+  max_item.emplace(std::move(*tmp));
+  tmp->~T();
 
   if (serial_version == 1) {
     read<uint64_t>(is); // no longer used
@@ -507,18 +512,22 @@ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, cons
   const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
   const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
 
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
 
-  ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+  // Space to deserialize min and max.
+  // serde::deserialize expects allocated but not initialized storage.
+  typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+  T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+  ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1);
   // serde call did not throw, repackage and cleanup
-  min_item.emplace(*tmp);
-  (*tmp).~T();
-  ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+  min_item.emplace(std::move(*tmp));
+  tmp->~T();
+  ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1);
   // serde call did not throw, repackage and cleanup
-  max_item.emplace(*tmp);
-  (*tmp).~T();
+  max_item.emplace(std::move(*tmp));
+  tmp->~T();
 
   if (serial_version == 1) {
     uint64_t unused_long;
diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp
index a28e74e2..3c1c2fc1 100755
--- a/req/include/req_sketch_impl.hpp
+++ b/req/include/req_sketch_impl.hpp
@@ -22,6 +22,7 @@
 
 #include <sstream>
 #include <stdexcept>
+#include <type_traits>
 
 namespace datasketches {
 
@@ -461,7 +462,6 @@ req_sketch<T, C, A> req_sketch<T, C, A>::deserialize(std::istream& is, const Ser
   const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK);
   if (is_empty) return req_sketch(k, hra, comparator, allocator);
 
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
 
@@ -472,14 +472,19 @@ req_sketch<T, C, A> req_sketch<T, C, A>::deserialize(std::istream& is, const Ser
   uint64_t n = 1;
   if (num_levels > 1) {
     n = read<uint64_t>(is);
-    sd.deserialize(is, &*tmp, 1);
+    // Space to deserialize min and max.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    sd.deserialize(is, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    min_item.emplace(*tmp);
-    (*tmp).~T();
-    sd.deserialize(is, &*tmp, 1);
+    min_item.emplace(std::move(*tmp));
+    tmp->~T();
+    sd.deserialize(is, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    max_item.emplace(*tmp);
-    (*tmp).~T();
+    max_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
 
   if (raw_items) {
@@ -537,7 +542,6 @@ req_sketch<T, C, A> req_sketch<T, C, A>::deserialize(const void* bytes, size_t s
   const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK);
   if (is_empty) return req_sketch(k, hra, comparator, allocator);
 
-  optional<T> tmp; // space to deserialize min and max
   optional<T> min_item;
   optional<T> max_item;
 
@@ -549,14 +553,19 @@ req_sketch<T, C, A> req_sketch<T, C, A>::deserialize(const void* bytes, size_t s
   if (num_levels > 1) {
     ensure_minimum_memory(end_ptr - ptr, sizeof(n));
     ptr += copy_from_mem(ptr, n);
-    ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+    // Space to deserialize min and max.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    min_item.emplace(*tmp);
-    (*tmp).~T();
-    ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+    min_item.emplace(std::move(*tmp));
+    tmp->~T();
+    ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1);
     // serde call did not throw, repackage and cleanup
-    max_item.emplace(*tmp);
-    (*tmp).~T();
+    max_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
 
   if (raw_items) {
diff --git a/sampling/include/ebpps_sample_impl.hpp b/sampling/include/ebpps_sample_impl.hpp
index 88a86ae0..c48b32aa 100644
--- a/sampling/include/ebpps_sample_impl.hpp
+++ b/sampling/include/ebpps_sample_impl.hpp
@@ -28,6 +28,7 @@
 #include <cmath>
 #include <string>
 #include <sstream>
+#include <type_traits>
 
 namespace datasketches {
 
@@ -365,11 +366,15 @@ std::pair<ebpps_sample<T, A>, size_t> ebpps_sample<T, A>::deserialize(const uint
 
   optional<T> partial_item;
   if (has_partial) {
-    optional<T> tmp; // space to deserialize
-    ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1);
+    // Space to deserialize.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1);
     // serde did not throw so place item and clean up
-    partial_item.emplace(*tmp);
-    (*tmp).~T();
+    partial_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
 
   return std::pair<ebpps_sample<T,A>, size_t>(
@@ -400,11 +405,15 @@ ebpps_sample<T, A> ebpps_sample<T, A>::deserialize(std::istream& is, const SerDe
 
   optional<T> partial_item;
   if (has_partial) {
-    optional<T> tmp; // space to deserialize
-    sd.deserialize(is, &*tmp, 1);
+    // Space to deserialize.
+    // serde::deserialize expects allocated but not initialized storage.
+    typename std::aligned_storage<sizeof(T), alignof(T)>::type tmp_storage;
+    T* tmp = reinterpret_cast<T*>(&tmp_storage);
+
+    sd.deserialize(is, tmp, 1);
     // serde did not throw so place item and clean up
-    partial_item.emplace(*tmp);
-    (*tmp).~T();
+    partial_item.emplace(std::move(*tmp));
+    tmp->~T();
   }
 
   if (!is.good()) throw std::runtime_error("error reading from std::istream");

From 342248f294020a362a35c34d9b40ea7224d27438 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Wed, 28 Jan 2026 00:37:41 +0900
Subject: [PATCH 38/75] fix: control array and element life cycle

---
 tuple/include/array_tuple_sketch.hpp | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp
index 547b240c..54a000e8 100644
--- a/tuple/include/array_tuple_sketch.hpp
+++ b/tuple/include/array_tuple_sketch.hpp
@@ -34,17 +34,22 @@ class array {
 public:
   using value_type = T;
   using allocator_type = Allocator;
+  using alloc_traits = std::allocator_traits<Allocator>;
 
-  explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()):
-  allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) {
-    std::fill(array_, array_ + size_, value);
+  explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()):
+  allocator_(allocator), size_(size), array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) {
+    for (uint8_t i = 0; i < size_; ++i) {
+      alloc_traits::construct(allocator_, array_ + i, value);
+    }
   }
   array(const array& other):
     allocator_(other.allocator_),
     size_(other.size_),
-    array_(allocator_.allocate(size_))
+    array_(size_ == 0 ? nullptr : allocator_.allocate(size_))
   {
-    std::copy(other.array_, other.array_ + size_, array_);
+    for (uint8_t i = 0; i < size_; ++i) {
+      alloc_traits::construct(allocator_, array_ + i, other.array_[i]);
+    }
   }
   array(array&& other) noexcept:
     allocator_(std::move(other.allocator_)),
@@ -52,9 +57,15 @@ class array {
     array_(other.array_)
   {
     other.array_ = nullptr;
+    other.size_ = 0;
   }
   ~array() {
-    if (array_ != nullptr) allocator_.deallocate(array_, size_);
+    if (array_ != nullptr) {
+      for (uint8_t i = 0; i < size_; ++i) {
+        alloc_traits::destroy(allocator_, array_ + i);
+      }
+      allocator_.deallocate(array_, size_);
+    }
   }
   array& operator=(const array& other) {
     array copy(other);

From 2c712e99ff17d8c6eff3c2c1dc53db5b10e0613d Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Wed, 28 Jan 2026 00:48:12 +0900
Subject: [PATCH 39/75] fix: null ptr to empty array

---
 tuple/include/array_tuple_sketch.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp
index 54a000e8..416816e0 100644
--- a/tuple/include/array_tuple_sketch.hpp
+++ b/tuple/include/array_tuple_sketch.hpp
@@ -37,7 +37,7 @@ class array {
   using alloc_traits = std::allocator_traits<Allocator>;
 
   explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()):
-  allocator_(allocator), size_(size), array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) {
+  allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) {
     for (uint8_t i = 0; i < size_; ++i) {
       alloc_traits::construct(allocator_, array_ + i, value);
     }
@@ -45,7 +45,7 @@ class array {
   array(const array& other):
     allocator_(other.allocator_),
     size_(other.size_),
-    array_(size_ == 0 ? nullptr : allocator_.allocate(size_))
+    array_(allocator_.allocate(size_))
   {
     for (uint8_t i = 0; i < size_; ++i) {
       alloc_traits::construct(allocator_, array_ + i, other.array_[i]);

From d463bfb159cd338fd88094ba600a50cfcc0e34d2 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Wed, 28 Jan 2026 00:54:32 +0900
Subject: [PATCH 40/75] test: serde validation cases

---
 .../include/array_of_strings_sketch_impl.hpp  |  3 ---
 tuple/test/array_of_strings_sketch_test.cpp   | 26 +++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 01a3daba..f5fa0652 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -240,9 +240,6 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
   for (uint32_t j = 0; j < count; ++j) {
     total += data[j].size();
   }
-  if (total > std::numeric_limits<uint32_t>::max()) {
-    throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max");
-  }
   return static_cast<uint32_t>(total);
 }
 
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 59cc04ca..74b225b0 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -255,4 +255,30 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
   }
 }
 
+TEST_CASE("aos serde validation", "[tuple_sketch]") {
+  default_array_of_strings_serde<> serde;
+
+  SECTION("invalid utf8 rejected") {
+    array_of_strings array(1, "", std::allocator<std::string>());
+    const std::string invalid_utf8("\xC3\x28", 2);
+    array[0] = invalid_utf8;
+    std::stringstream ss;
+    ss.exceptions(std::ios::failbit | std::ios::badbit);
+    REQUIRE_THROWS_WITH(
+      serde.serialize(ss, &array, 1),
+      Catch::Matchers::Contains("invalid UTF-8")
+    );
+  }
+
+  SECTION("too many nodes rejected") {
+    array_of_strings array(128, "", std::allocator<std::string>());
+    std::stringstream ss;
+    ss.exceptions(std::ios::failbit | std::ios::badbit);
+    REQUIRE_THROWS_WITH(
+      serde.serialize(ss, &array, 1),
+      Catch::Matchers::Contains("size exceeds 127")
+    );
+  }
+}
+
 } /* namespace datasketches */

From 3b3a13de52445e663472f630bd41080810977649 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Wed, 28 Jan 2026 03:32:39 +0900
Subject: [PATCH 41/75] perf: avoid allocation if data type is primitive

---
 tuple/include/array_tuple_sketch.hpp | 38 +++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp
index 416816e0..9baa2010 100644
--- a/tuple/include/array_tuple_sketch.hpp
+++ b/tuple/include/array_tuple_sketch.hpp
@@ -22,6 +22,9 @@
 
 #include <vector>
 #include <memory>
+#include <type_traits>
+#include <cstring>
+#include <algorithm>
 
 #include "serde.hpp"
 #include "tuple_sketch.hpp"
@@ -38,18 +41,14 @@ class array {
 
   explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()):
   allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) {
-    for (uint8_t i = 0; i < size_; ++i) {
-      alloc_traits::construct(allocator_, array_ + i, value);
-    }
+    init_values(value, std::is_trivially_copyable<T>());
   }
   array(const array& other):
     allocator_(other.allocator_),
     size_(other.size_),
     array_(allocator_.allocate(size_))
   {
-    for (uint8_t i = 0; i < size_; ++i) {
-      alloc_traits::construct(allocator_, array_ + i, other.array_[i]);
-    }
+    copy_from(other, std::is_trivially_copyable<T>());
   }
   array(array&& other) noexcept:
     allocator_(std::move(other.allocator_)),
@@ -61,9 +60,7 @@ class array {
   }
   ~array() {
     if (array_ != nullptr) {
-      for (uint8_t i = 0; i < size_; ++i) {
-        alloc_traits::destroy(allocator_, array_ + i);
-      }
+      destroy_values(std::is_trivially_destructible<T>());
       allocator_.deallocate(array_, size_);
     }
   }
@@ -90,6 +87,29 @@ class array {
     return true;
   }
 private:
+  void init_values(const T& value, std::true_type) { // trivially copyable T: bulk copy-construct, bypassing the allocator
+    std::uninitialized_fill(array_, array_ + size_, value); // storage is fresh from allocate(): construct, do not assign
+  }
+  void init_values(const T& value, std::false_type) { // general T: construct each element through the allocator
+    for (uint8_t i = 0; i < size_; ++i) {
+      alloc_traits::construct(allocator_, array_ + i, value);
+    }
+  }
+  void copy_from(const array& other, std::true_type) { // trivially copyable T: bulk copy-construct from other
+    std::uninitialized_copy(other.array_, other.array_ + size_, array_); // requires copy construction only, not assignment
+  }
+  void copy_from(const array& other, std::false_type) { // general T: copy-construct each element through the allocator
+    for (uint8_t i = 0; i < size_; ++i) {
+      alloc_traits::construct(allocator_, array_ + i, other.array_[i]);
+    }
+  }
+  void destroy_values(std::true_type) {} // trivially destructible T: nothing to run before deallocate()
+  void destroy_values(std::false_type) { // general T: destroy each element through the allocator
+    for (uint8_t i = 0; i < size_; ++i) {
+      alloc_traits::destroy(allocator_, array_ + i);
+    }
+  }
+
   Allocator allocator_;
   uint8_t size_;
   T* array_;

From 189d22de11b53ff48bb279d066586abbd5af71b9 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Wed, 28 Jan 2026 03:33:57 +0900
Subject: [PATCH 42/75] chore: remove unused header

---
 tuple/include/array_tuple_sketch.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp
index 9baa2010..03761ff4 100644
--- a/tuple/include/array_tuple_sketch.hpp
+++ b/tuple/include/array_tuple_sketch.hpp
@@ -23,7 +23,6 @@
 #include <vector>
 #include <memory>
 #include <type_traits>
-#include <cstring>
 #include <algorithm>
 
 #include "serde.hpp"

From ba2aa6909aa5269af7fc3aa67ad9ced4f75a2938 Mon Sep 17 00:00:00 2001
From: Mahesh G Pai <mahesh.pai.r@gmail.com>
Date: Tue, 27 Jan 2026 18:54:09 +0530
Subject: [PATCH 43/75] Added testcases

---
 .github/workflows/hardening.yml            |  59 +++++++
 common/test/CMakeLists.txt                 |  69 +++++++-
 common/test/deserialize_hardening_test.cpp | 188 +++++++++++++++++++++
 3 files changed, 310 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/hardening.yml
 create mode 100644 common/test/deserialize_hardening_test.cpp

diff --git a/.github/workflows/hardening.yml b/.github/workflows/hardening.yml
new file mode 100644
index 00000000..e264ebd9
--- /dev/null
+++ b/.github/workflows/hardening.yml
@@ -0,0 +1,59 @@
+name: libc++ Hardening Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  workflow_dispatch:
+
+env:
+  BUILD_TYPE: Debug
+
+jobs:
+  hardening-test:
+    name: C++17 with libc++ Hardening Mode
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          persist-credentials: false
+      
+      - name: Install LLVM and libc++
+        run: |
+          # Install LLVM/Clang with libc++
+          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+          sudo add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
+          sudo apt-get update
+          sudo apt-get install -y clang-18 libc++-18-dev libc++abi-18-dev
+          
+      - name: Configure with C++17 and libc++ hardening
+        env:
+          CC: clang-18
+          CXX: clang++-18
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \
+            -DCMAKE_CXX_STANDARD=17 \
+            -DBUILD_TESTS=ON \
+            -DCMAKE_CXX_FLAGS="-stdlib=libc++ -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG" \
+            -DCMAKE_EXE_LINKER_FLAGS="-stdlib=libc++ -lc++abi"
+      
+      - name: Build hardening tests
+        run: cmake --build build --target hardening_test --config ${{ env.BUILD_TYPE }}
+      
+      - name: Run hardening tests
+        run: |
+          cd build
+          ./common/test/hardening_test "[deserialize_hardening]"
+      
+      - name: Report results
+        if: success()  # only claim success when every previous step actually passed
+        run: |
+          echo "✅ Tests passed with libc++ hardening enabled!"
+          echo "This verifies the fix for issue #477 prevents SIGABRT."
diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt
index 7593bd0b..d190b628 100644
--- a/common/test/CMakeLists.txt
+++ b/common/test/CMakeLists.txt
@@ -75,12 +75,28 @@ target_sources(common_test
 # now the integration test part
 add_executable(integration_test)
 
-target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple common_test_lib)
-
-set_target_properties(integration_test PROPERTIES
-  CXX_STANDARD 11
-  CXX_STANDARD_REQUIRED YES
-)
+target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib)
+
+# Use the requested CMAKE_CXX_STANDARD when it is 17, 20 or 23; otherwise fall back to C++11
+# This allows hardening tests to use std::optional with libc++ hardening
+if(DEFINED CMAKE_CXX_STANDARD)
+  if(CMAKE_CXX_STANDARD MATCHES "17|20|23")
+    set_target_properties(integration_test PROPERTIES
+      CXX_STANDARD ${CMAKE_CXX_STANDARD}
+      CXX_STANDARD_REQUIRED YES
+    )
+  else()
+    set_target_properties(integration_test PROPERTIES
+      CXX_STANDARD 11
+      CXX_STANDARD_REQUIRED YES
+    )
+  endif()
+else()
+  set_target_properties(integration_test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+  )
+endif()
 
 add_test(
   NAME integration_test
@@ -91,3 +107,44 @@ target_sources(integration_test
   PRIVATE
     integration_test.cpp
 )
+
+# Separate hardening test executable (header-only, no pre-compiled libs)
+# This ensures the sketch code is compiled with C++17 + hardening
+# Always build this target - it uses CMAKE_CXX_STANDARD when that is set and >= 17, otherwise C++17
+message(STATUS "CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}")
+
+add_executable(hardening_test)
+target_link_libraries(hardening_test common common_test_lib)
+
+# Include directories for header-only sketch implementations
+target_include_directories(hardening_test PRIVATE
+  ${CMAKE_SOURCE_DIR}/quantiles/include
+  ${CMAKE_SOURCE_DIR}/kll/include
+  ${CMAKE_SOURCE_DIR}/req/include
+  ${CMAKE_SOURCE_DIR}/common/include
+)
+
+# Use C++17 minimum for hardening tests
+if(CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17)
+  set_target_properties(hardening_test PROPERTIES
+    CXX_STANDARD ${CMAKE_CXX_STANDARD}
+    CXX_STANDARD_REQUIRED YES
+  )
+  message(STATUS "hardening_test will use C++${CMAKE_CXX_STANDARD}")
+else()
+  set_target_properties(hardening_test PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED YES
+  )
+  message(STATUS "hardening_test will use C++17 (default)")
+endif()
+
+add_test(
+  NAME hardening_test
+  COMMAND hardening_test "[deserialize_hardening]"
+)
+
+target_sources(hardening_test
+  PRIVATE
+    deserialize_hardening_test.cpp
+)
diff --git a/common/test/deserialize_hardening_test.cpp b/common/test/deserialize_hardening_test.cpp
new file mode 100644
index 00000000..64e654b4
--- /dev/null
+++ b/common/test/deserialize_hardening_test.cpp
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+#include <sstream>
+#include <vector>
+
+// Include all affected sketch types
+#include <quantiles_sketch.hpp>
+#include <kll_sketch.hpp>
+#include <req_sketch.hpp>
+
+namespace datasketches {
+
+/**
+ * Test for fix of issue #477:
+ * BUG: SIGABRT in deserialize(): dereferencing empty std::optional (libc++ verbose_abort)
+ * 
+ * These tests exercise the actual deserialization code path that contained the bug.
+ * With buggy code (&*tmp on empty optional) and hardening enabled, these will SIGABRT.
+ * With fixed code (aligned_storage), these pass normally.
+ * 
+ * IMPORTANT: These tests actually call deserialize() on multi-item sketches, which
+ * exercises the buggy code path where min/max are deserialized.
+ */
+
+TEST_CASE("quantiles_sketch: deserialize multi-item sketch", "[deserialize_hardening]") {
+  // Create sketch with multiple items (so min/max are stored in serialization)
+  quantiles_sketch<double> sketch(128);
+  for (int i = 0; i < 1000; i++) {
+    sketch.update(static_cast<double>(i));
+  }
+  
+  // Serialize
+  auto bytes = sketch.serialize();
+  
+  // Deserialize - WITH BUGGY CODE AND HARDENING, THIS WILL SIGABRT HERE
+  // The bug is: sd.deserialize(is, &*tmp, 1) where tmp is empty optional
+  auto sketch2 = quantiles_sketch<double>::deserialize(bytes.data(), bytes.size());
+  
+  // Verify deserialization worked correctly
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
+}
+
+TEST_CASE("quantiles_sketch: deserialize from stream", "[deserialize_hardening]") {
+  quantiles_sketch<float> sketch(256);
+  for (int i = 0; i < 2000; i++) {
+    sketch.update(static_cast<float>(i) * 0.5f);
+  }
+  
+  // Serialize to stream
+  std::stringstream ss;
+  sketch.serialize(ss);
+  
+  // Deserialize from stream - exercises the buggy code path
+  auto sketch2 = quantiles_sketch<float>::deserialize(ss);
+  
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+}
+
+TEST_CASE("kll_sketch: deserialize multi-item sketch", "[deserialize_hardening]") {
+  kll_sketch<float> sketch(200);
+  for (int i = 0; i < 1500; i++) {
+    sketch.update(static_cast<float>(i));
+  }
+  
+  auto bytes = sketch.serialize();
+  
+  // Deserialize - exercises buggy &*tmp code path
+  auto sketch2 = kll_sketch<float>::deserialize(bytes.data(), bytes.size());
+  
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+}
+
+TEST_CASE("kll_sketch: deserialize from stream", "[deserialize_hardening]") {
+  kll_sketch<int> sketch(400);
+  for (int i = 0; i < 3000; i++) {
+    sketch.update(i);
+  }
+  
+  std::stringstream ss;
+  sketch.serialize(ss);
+  
+  // Deserialize from stream
+  auto sketch2 = kll_sketch<int>::deserialize(ss);
+  
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+}
+
+TEST_CASE("req_sketch: deserialize multi-level sketch", "[deserialize_hardening]") {
+  // REQ sketch only has the bug when num_levels > 1
+  // We need to add enough items to trigger multiple levels
+  req_sketch<float> sketch(12);
+  for (int i = 0; i < 10000; i++) {
+    sketch.update(static_cast<float>(i));
+  }
+  
+  auto bytes = sketch.serialize();
+  
+  // Deserialize - exercises buggy code path when num_levels > 1
+  auto sketch2 = req_sketch<float>::deserialize(bytes.data(), bytes.size());
+  
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+}
+
+TEST_CASE("req_sketch: deserialize from stream", "[deserialize_hardening]") {
+  req_sketch<double> sketch(20);
+  for (int i = 0; i < 15000; i++) {
+    sketch.update(static_cast<double>(i) * 0.1);
+  }
+  
+  std::stringstream ss;
+  sketch.serialize(ss);
+  
+  // Deserialize from stream
+  auto sketch2 = req_sketch<double>::deserialize(ss);
+  
+  REQUIRE(sketch2.get_n() == sketch.get_n());
+  REQUIRE(sketch2.get_min_item() == sketch.get_min_item());
+  REQUIRE(sketch2.get_max_item() == sketch.get_max_item());
+}
+
+TEST_CASE("multiple sketch types: stress test", "[deserialize_hardening]") {
+  SECTION("quantiles with various sizes") {
+    for (int k : {64, 128, 256}) {
+      quantiles_sketch<int> sketch(k);
+      for (int i = 0; i < 5000; i++) {
+        sketch.update(i);
+      }
+      auto bytes = sketch.serialize();
+      auto sketch2 = quantiles_sketch<int>::deserialize(bytes.data(), bytes.size());
+      REQUIRE(sketch2.get_n() == sketch.get_n());
+    }
+  }
+  
+  SECTION("kll with various sizes") {
+    for (int k : {100, 200, 400}) {
+      kll_sketch<double> sketch(k);
+      for (int i = 0; i < 4000; i++) {
+        sketch.update(static_cast<double>(i) / 10.0);
+      }
+      auto bytes = sketch.serialize();
+      auto sketch2 = kll_sketch<double>::deserialize(bytes.data(), bytes.size());
+      REQUIRE(sketch2.get_n() == sketch.get_n());
+    }
+  }
+  
+  SECTION("req with various sizes") {
+    for (int k : {12, 20}) {
+      req_sketch<float> sketch(k);
+      for (int i = 0; i < 8000; i++) {
+        sketch.update(static_cast<float>(i));
+      }
+      auto bytes = sketch.serialize();
+      auto sketch2 = req_sketch<float>::deserialize(bytes.data(), bytes.size());
+      REQUIRE(sketch2.get_n() == sketch.get_n());
+    }
+  }
+}
+
+} // namespace datasketches

From 4894e5e7b156f1ce6a909ef84321ab081078c16c Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Sun, 1 Feb 2026 20:08:24 +0900
Subject: [PATCH 44/75] fix: allocation handling for string in deserialize

---
 tuple/include/array_of_strings_sketch.hpp     | 50 +++++++++++-----
 .../include/array_of_strings_sketch_impl.hpp  | 32 +++++-----
 .../aos_sketch_deserialize_from_java_test.cpp | 11 +++-
 tuple/test/aos_sketch_serialize_for_java.cpp  | 58 ++++++++++++-------
 tuple/test/array_of_strings_sketch_test.cpp   | 35 +++++++----
 5 files changed, 124 insertions(+), 62 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index db147723..4ee3bc9c 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -28,11 +28,22 @@
 
 namespace datasketches {
 
+template<typename Allocator>
+struct array_of_strings_types {
+  using string_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<char>;
+  using string_type = std::basic_string<char, std::char_traits<char>, string_allocator>;
+  using array_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<string_type>;
+  using array_of_strings = array<string_type, array_allocator>;
+};
+
 // default update policy for an array of strings
-template<typename Allocator = std::allocator<std::string>>
+template<typename Allocator = std::allocator<char>>
 class default_array_of_strings_update_policy {
 public:
-  using array_of_strings = array<std::string, Allocator>;
+  using string_allocator = typename array_of_strings_types<Allocator>::string_allocator;
+  using string_type = typename array_of_strings_types<Allocator>::string_type;
+  using array_allocator = typename array_of_strings_types<Allocator>::array_allocator;
+  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
 
   explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator());
 
@@ -48,9 +59,12 @@ class default_array_of_strings_update_policy {
 
 // serializer/deserializer for an array of strings
 // Requirements: all strings must be valid UTF-8 and array size must be <= 127.
-template<typename Allocator = std::allocator<std::string>>
+template<typename Allocator = std::allocator<char>>
 struct default_array_of_strings_serde {
-  using array_of_strings = array<std::string, Allocator>;
+  using string_allocator = typename array_of_strings_types<Allocator>::string_allocator;
+  using string_type = typename array_of_strings_types<Allocator>::string_type;
+  using array_allocator = typename array_of_strings_types<Allocator>::array_allocator;
+  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
 
   explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());
@@ -66,27 +80,29 @@ struct default_array_of_strings_serde {
   summary_allocator summary_allocator_;
   static void check_num_nodes(uint8_t num_nodes);
   static uint32_t compute_total_bytes(const array_of_strings& item);
-  static void check_utf8(const std::string& value);
+  static void check_utf8(const string_type& value);
 };
 
 /**
  * Hashes an array of strings using ArrayOfStrings-compatible hashing.
  */
-template<typename Allocator = std::allocator<std::string>>
-uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key);
+template<typename Allocator = std::allocator<char>>
+uint64_t hash_array_of_strings_key(const typename array_of_strings_types<Allocator>::array_of_strings& key);
 
 /**
  * Extended class of compact_tuple_sketch for array of strings
  * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
  */
-template<typename Allocator = std::allocator<std::string>>
+template<typename Allocator = std::allocator<char>>
 class compact_array_of_strings_tuple_sketch:
   public compact_tuple_sketch<
-    array<std::string, Allocator>,
-    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+    typename array_of_strings_types<Allocator>::array_of_strings,
+    typename std::allocator_traits<Allocator>::template rebind_alloc<
+      typename array_of_strings_types<Allocator>::array_of_strings
+    >
   > {
 public:
-  using array_of_strings = array<std::string, Allocator>;
+  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
   using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
   using vector_bytes = typename Base::vector_bytes;
@@ -133,13 +149,15 @@ class compact_array_of_strings_tuple_sketch:
 /**
  * Convenience alias for update_tuple_sketch for array of strings
  */
-template<typename Allocator = std::allocator<std::string>,
+template<typename Allocator = std::allocator<char>,
          typename Policy = default_array_of_strings_update_policy<Allocator>>
 using update_array_of_strings_tuple_sketch = update_tuple_sketch<
-  array<std::string, Allocator>,
-  array<std::string, Allocator>,
+  typename array_of_strings_types<Allocator>::array_of_strings,
+  typename array_of_strings_types<Allocator>::array_of_strings,
   Policy,
-  typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+  typename std::allocator_traits<Allocator>::template rebind_alloc<
+    typename array_of_strings_types<Allocator>::array_of_strings
+  >
 >;
 
 /**
@@ -148,7 +166,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch<
  * @param ordered optional flag to specify if an ordered sketch should be produced
  * @return compact array of strings sketch
  */
-template<typename Allocator = std::allocator<std::string>, typename Policy = default_array_of_strings_update_policy<Allocator>>
+template<typename Allocator = std::allocator<char>, typename Policy = default_array_of_strings_update_policy<Allocator>>
 compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
   const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);
 
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index f5fa0652..e8725c55 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -33,7 +33,8 @@ default_array_of_strings_update_policy<Allocator>::default_array_of_strings_upda
 
 template<typename Allocator>
 auto default_array_of_strings_update_policy<Allocator>::create() const -> array_of_strings {
-  return array_of_strings(0, "", allocator_);
+  const string_type empty{string_allocator(allocator_)};
+  return array_of_strings(0, empty, array_allocator(allocator_));
 }
 
 template<typename Allocator>
@@ -41,7 +42,8 @@ void default_array_of_strings_update_policy<Allocator>::update(
   array_of_strings& array, const array_of_strings& input
 ) const {
   const auto length = static_cast<size_t>(input.size());
-  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  const string_type empty{string_allocator(allocator_)};
+  array = array_of_strings(static_cast<uint8_t>(length), empty, array_allocator(allocator_));
   for (size_t i = 0; i < length; ++i) array[i] = input[i];
 }
 
@@ -50,16 +52,18 @@ void default_array_of_strings_update_policy<Allocator>::update(
   array_of_strings& array, const array_of_strings* input
 ) const {
   if (input == nullptr) {
-    array = array_of_strings(0, "", allocator_);
+    const string_type empty{string_allocator(allocator_)};
+    array = array_of_strings(0, empty, array_allocator(allocator_));
     return;
   }
   const auto length = static_cast<size_t>(input->size());
-  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  const string_type empty{string_allocator(allocator_)};
+  array = array_of_strings(static_cast<uint8_t>(length), empty, array_allocator(allocator_));
   for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
 }
 
 template<typename Allocator>
-uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key) {
+uint64_t hash_array_of_strings_key(const typename array_of_strings_types<Allocator>::array_of_strings& key) {
   // Matches Java Util.PRIME for ArrayOfStrings key hashing.
   static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
   XXHash64 hasher(STRING_ARR_HASH_SEED);
@@ -124,7 +128,7 @@ void default_array_of_strings_serde<Allocator>::serialize(
     const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
     write(os, total_bytes);
     write(os, num_nodes);
-    const std::string* data = items[i].data();
+    const string_type* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
       check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
@@ -144,11 +148,12 @@ void default_array_of_strings_serde<Allocator>::deserialize(
     const uint8_t num_nodes = read<uint8_t>(is);
     if (!is) throw std::runtime_error("array_of_strings stream read failed");
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", allocator_);
+    const string_type empty{string_allocator(allocator_)};
+    array_of_strings array(num_nodes, empty, array_allocator(allocator_));
     for (uint8_t j = 0; j < num_nodes; ++j) {
       const uint32_t length = read<uint32_t>(is);
       if (!is) throw std::runtime_error("array_of_strings stream read failed");
-      std::string value(length, '\0');
+      string_type value(length, '\0', string_allocator(allocator_));
       if (length != 0) {
         is.read(&value[0], length);
         if (!is) throw std::runtime_error("array_of_strings stream read failed");
@@ -174,7 +179,7 @@ size_t default_array_of_strings_serde<Allocator>::serialize(
     check_memory_size(bytes_written + total_bytes, capacity);
     bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written);
     bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
-    const std::string* data = items[i].data();
+    const string_type* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
       check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
@@ -202,11 +207,12 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
     uint8_t num_nodes;
     bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", allocator_);
+    const string_type empty{string_allocator(allocator_)};
+    array_of_strings array(num_nodes, empty, array_allocator(allocator_));
     for (uint8_t j = 0; j < num_nodes; ++j) {
       uint32_t length;
       bytes_read += copy_from_mem(ptr8 + bytes_read, length);
-      std::string value(length, '\0');
+      string_type value(length, '\0', string_allocator(allocator_));
       if (length != 0) {
         bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
       }
@@ -236,7 +242,7 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
   const auto count = item.size();
   check_num_nodes(static_cast<uint8_t>(count));
   size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t);
-  const std::string* data = item.data();
+  const string_type* data = item.data();
   for (uint32_t j = 0; j < count; ++j) {
     total += data[j].size();
   }
@@ -244,7 +250,7 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
 }
 
 template<typename Allocator>
-void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
+void default_array_of_strings_serde<Allocator>::check_utf8(const string_type& value) {
   if (!utf8::is_valid(value.begin(), value.end())) {
     throw std::runtime_error("array_of_strings contains invalid UTF-8");
   }
diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
index af37d6c2..a623f618 100644
--- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp
+++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <algorithm>
 #include <catch2/catch.hpp>
 #include <fstream>
 #include <vector>
@@ -24,6 +25,12 @@
 #include "array_of_strings_sketch.hpp"
 
 namespace datasketches {
+  using types = array_of_strings_types<std::allocator<char>>;
+  using string_type = types::string_type;
+
+  static bool equals_string(const string_type& lhs, const std::string& rhs) {
+    return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
+  }
   // assume the binary sketches for this test have been generated by datasketches-java code
   // in the subdirectory called "java" in the root directory of this project
   static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
@@ -193,7 +200,7 @@ namespace datasketches {
           if (entry.second.size() != expected.size()) continue;
           bool equal = true;
           for (size_t j = 0; j < expected.size(); ++j) {
-            if (entry.second[j] != expected[j]) {
+            if (!equals_string(entry.second[j], expected[j])) {
               equal = false;
               break;
             }
@@ -248,7 +255,7 @@ namespace datasketches {
           if (entry.second.size() != expected.size()) continue;
           bool equal = true;
           for (size_t j = 0; j < expected.size(); ++j) {
-            if (entry.second[j] != expected[j]) {
+            if (!equals_string(entry.second[j], expected[j])) {
               equal = false;
               break;
             }
diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp
index c6eb0dfc..ab5fd53b 100644
--- a/tuple/test/aos_sketch_serialize_for_java.cpp
+++ b/tuple/test/aos_sketch_serialize_for_java.cpp
@@ -26,13 +26,18 @@
 namespace datasketches {
 
 using aos_sketch = update_array_of_strings_tuple_sketch<>;
-using array_of_strings = array<std::string>;
+using types = array_of_strings_types<std::allocator<char>>;
+using array_of_strings = types::array_of_strings;
+using string_allocator = types::string_allocator;
+using string_type = types::string_type;
+using array_allocator = types::array_allocator;
 
 static array_of_strings make_array(std::initializer_list<std::string> items) {
-  array_of_strings array(static_cast<uint8_t>(items.size()), "");
+  const string_type empty{string_allocator()};
+  array_of_strings array(static_cast<uint8_t>(items.size()), empty, array_allocator());
   size_t i = 0;
   for (const auto& item: items) {
-    array[static_cast<uint8_t>(i)] = item;
+    array[static_cast<uint8_t>(i)] = string_type(item.data(), item.size(), string_allocator());
     ++i;
   }
   return array;
@@ -43,10 +48,13 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      array_of_strings key(1, "");
-      key[0] = std::to_string(i);
-      array_of_strings value(1, "");
-      value[0] = "value" + std::to_string(i);
+      const string_type empty{string_allocator()};
+      array_of_strings key(1, empty, array_allocator());
+      const std::string key_value = std::to_string(i);
+      key[0] = string_type(key_value.data(), key_value.size(), string_allocator());
+      array_of_strings value(1, empty, array_allocator());
+      const std::string value_str = "value" + std::to_string(i);
+      value[0] = string_type(value_str.data(), value_str.size(), string_allocator());
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
@@ -61,12 +69,17 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      array_of_strings key(1, "");
-      key[0] = std::to_string(i);
-      array_of_strings value(3, "");
-      value[0] = "a" + std::to_string(i);
-      value[1] = "b" + std::to_string(i);
-      value[2] = "c" + std::to_string(i);
+      const string_type empty{string_allocator()};
+      array_of_strings key(1, empty, array_allocator());
+      const std::string key_value = std::to_string(i);
+      key[0] = string_type(key_value.data(), key_value.size(), string_allocator());
+      array_of_strings value(3, empty, array_allocator());
+      const std::string value_a = "a" + std::to_string(i);
+      const std::string value_b = "b" + std::to_string(i);
+      const std::string value_c = "c" + std::to_string(i);
+      value[0] = string_type(value_a.data(), value_a.size(), string_allocator());
+      value[1] = string_type(value_b.data(), value_b.size(), string_allocator());
+      value[2] = string_type(value_c.data(), value_c.size(), string_allocator());
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
@@ -82,9 +95,10 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") {
     .set_resize_factor(resize_factor::X8)
     .set_p(0.01f)
     .build();
-  array_of_strings key(1, "");
+  const string_type empty{string_allocator()};
+  array_of_strings key(1, empty, array_allocator());
   key[0] = "key1";
-  array_of_strings value(1, "");
+  array_of_strings value(1, empty, array_allocator());
   value[0] = "value1";
   sketch.update(hash_array_of_strings_key(key), value);
   REQUIRE_FALSE(sketch.is_empty());
@@ -98,11 +112,15 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      array_of_strings key(2, "");
-      key[0] = "key" + std::to_string(i);
-      key[1] = "subkey" + std::to_string(i % 10);
-      array_of_strings value(1, "");
-      value[0] = "value" + std::to_string(i);
+      const string_type empty{string_allocator()};
+      array_of_strings key(2, empty, array_allocator());
+      const std::string key0 = "key" + std::to_string(i);
+      const std::string key1 = "subkey" + std::to_string(i % 10);
+      key[0] = string_type(key0.data(), key0.size(), string_allocator());
+      key[1] = string_type(key1.data(), key1.size(), string_allocator());
+      array_of_strings value(1, empty, array_allocator());
+      const std::string value_str = "value" + std::to_string(i);
+      value[0] = string_type(value_str.data(), value_str.size(), string_allocator());
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 74b225b0..8e1f1582 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -29,7 +29,11 @@
 
 namespace datasketches {
 
-using array_of_strings = array<std::string>;
+using types = array_of_strings_types<std::allocator<char>>;
+using array_of_strings = types::array_of_strings;
+using string_allocator = types::string_allocator;
+using string_type = types::string_type;
+using array_allocator = types::array_allocator;
 
 TEST_CASE("aos update policy", "[tuple_sketch]") {
   default_array_of_strings_update_policy<> policy;
@@ -42,7 +46,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   SECTION("replace array") {
     auto values = policy.create();
 
-    array_of_strings input(2, "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings input(2, empty, array_allocator());
     input[0] = "alpha";
     input[1] = "beta";
     policy.update(values, input);
@@ -52,7 +57,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
     input[0] = "changed";
     REQUIRE(values[0] == "alpha");
 
-    array_of_strings input2(1, "", std::allocator<std::string>());
+    array_of_strings input2(1, empty, array_allocator());
     input2[0] = "gamma";
     policy.update(values, input2);
     REQUIRE(values.size() == 1);
@@ -60,7 +65,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   }
 
   SECTION("nullptr clears") {
-    array_of_strings values(2, "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings values(2, empty, array_allocator());
     values[0] = "one";
     values[1] = "two";
 
@@ -71,7 +77,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   SECTION("pointer input copies") {
     auto values = policy.create();
 
-    array_of_strings input(2, "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings input(2, empty, array_allocator());
     input[0] = "first";
     input[1] = "second";
     policy.update(values, &input);
@@ -84,7 +91,8 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
 
 TEST_CASE("aos sketch update", "[tuple_sketch]") {
   auto make_array = [](std::initializer_list<const char*> entries) {
-    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings array(static_cast<uint8_t>(entries.size()), empty, array_allocator());
     uint8_t i = 0;
     for (const auto* entry: entries) array[i++] = entry;
     return array;
@@ -151,9 +159,12 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") {
 
 TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
   auto make_array = [](std::initializer_list<std::string> entries) {
-    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings array(static_cast<uint8_t>(entries.size()), empty, array_allocator());
     uint8_t i = 0;
-    for (const auto& entry: entries) array[i++] = entry;
+    for (const auto& entry: entries) {
+      array[i++] = string_type(entry.data(), entry.size(), string_allocator());
+    }
     return array;
   };
 
@@ -259,8 +270,9 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") {
   default_array_of_strings_serde<> serde;
 
   SECTION("invalid utf8 rejected") {
-    array_of_strings array(1, "", std::allocator<std::string>());
-    const std::string invalid_utf8("\xC3\x28", 2);
+    const string_type empty{string_allocator()};
+    array_of_strings array(1, empty, array_allocator());
+    const string_type invalid_utf8("\xC3\x28", 2, string_allocator());
     array[0] = invalid_utf8;
     std::stringstream ss;
     ss.exceptions(std::ios::failbit | std::ios::badbit);
@@ -271,7 +283,8 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") {
   }
 
   SECTION("too many nodes rejected") {
-    array_of_strings array(128, "", std::allocator<std::string>());
+    const string_type empty{string_allocator()};
+    array_of_strings array(128, empty, array_allocator());
     std::stringstream ss;
     ss.exceptions(std::ios::failbit | std::ios::badbit);
     REQUIRE_THROWS_WITH(

From 852b26bfd180fe48aafab307a5391232b011012b Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Tue, 3 Feb 2026 01:29:47 +0900
Subject: [PATCH 45/75] test: add missing kll long sketch compatibility cases

---
 .../kll_sketch_deserialize_from_java_test.cpp | 24 +++++++++++++++++++
 kll/test/kll_sketch_serialize_for_java.cpp    | 10 ++++++++
 2 files changed, 34 insertions(+)

diff --git a/kll/test/kll_sketch_deserialize_from_java_test.cpp b/kll/test/kll_sketch_deserialize_from_java_test.cpp
index 795486ae..65efc3e5 100644
--- a/kll/test/kll_sketch_deserialize_from_java_test.cpp
+++ b/kll/test/kll_sketch_deserialize_from_java_test.cpp
@@ -100,4 +100,28 @@ TEST_CASE("kll string", "[serde_compat]") {
   }
 }
 
+TEST_CASE("kll long", "[serde_compat]") { // checks kll_long_n<N>_java.sk inputs for N in n_arr
+  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+  for (const unsigned n: n_arr) {
+    std::ifstream is;
+    is.exceptions(std::ios::failbit | std::ios::badbit); // throw on open or read failure
+    is.open(testBinaryInputPath + "kll_long_n" + std::to_string(n) + "_java.sk", std::ios::binary);
+    const auto sketch = kll_sketch<int64_t>::deserialize(is); // fixed width: plain long is 32 bits on LLP64 (Windows)
+    REQUIRE(sketch.is_empty() == (n == 0));
+    REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
+    REQUIRE(sketch.get_n() == n);
+    if (n > 0) { // min, max and iteration are only checked for non-empty sketches
+      REQUIRE(sketch.get_min_item() == 1);
+      REQUIRE(sketch.get_max_item() == static_cast<int64_t>(n));
+      uint64_t weight = 0;
+      for (const auto pair: sketch) { // pair.first is the item, pair.second its weight
+        REQUIRE(pair.first >= sketch.get_min_item());
+        REQUIRE(pair.first <= sketch.get_max_item());
+        weight += pair.second;
+      }
+      REQUIRE(weight == sketch.get_n()); // weights must add up to the stream length
+    }
+  }
+}
+
 } /* namespace datasketches */
diff --git a/kll/test/kll_sketch_serialize_for_java.cpp b/kll/test/kll_sketch_serialize_for_java.cpp
index 00b8913d..22b75774 100644
--- a/kll/test/kll_sketch_serialize_for_java.cpp
+++ b/kll/test/kll_sketch_serialize_for_java.cpp
@@ -43,6 +43,16 @@ TEST_CASE("kll sketch double generate", "[serialize_for_java]") {
   }
 }
 
+TEST_CASE("kll sketch long generate", "[serialize_for_java]") { // writes kll_long_n<N>_cpp.sk for N in n_arr
+  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
+  for (const unsigned n: n_arr) {
+    kll_sketch<int64_t> sketch; // fixed width: plain long is 32 bits on LLP64 (Windows)
+    for (unsigned i = 1; i <= n; ++i) sketch.update(i); // feed the values 1..n
+    std::ofstream os("kll_long_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
+    sketch.serialize(os);
+  }
+}
+
 struct compare_as_number {
   bool operator()(const std::string& a, const std::string& b) const {
     return std::stoi(a) < std::stoi(b);

From f5fb9d9d9142aed46295fb98888ee6b9b414e73f Mon Sep 17 00:00:00 2001
From: Mahesh Pai <mahesh_pai@intuit.com>
Date: Wed, 4 Feb 2026 14:55:58 +0530
Subject: [PATCH 46/75] Address review comments: simplify C++ standard selection for tests

---
 common/test/CMakeLists.txt | 48 +++++++++++++-------------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt
index d190b628..c3e937a2 100644
--- a/common/test/CMakeLists.txt
+++ b/common/test/CMakeLists.txt
@@ -77,26 +77,15 @@ add_executable(integration_test)
 
 target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib)
 
-# Use C++17 if CMAKE_CXX_STANDARD is set to 17+, otherwise C++11
-# This allows hardening tests to use std::optional with libc++ hardening
+# Use CMAKE_CXX_STANDARD if defined, otherwise C++11
+set(_integration_cxx_standard 11)
 if(DEFINED CMAKE_CXX_STANDARD)
-  if(CMAKE_CXX_STANDARD MATCHES "17|20|23")
-    set_target_properties(integration_test PROPERTIES
-      CXX_STANDARD ${CMAKE_CXX_STANDARD}
-      CXX_STANDARD_REQUIRED YES
-    )
-  else()
-    set_target_properties(integration_test PROPERTIES
-      CXX_STANDARD 11
-      CXX_STANDARD_REQUIRED YES
-    )
-  endif()
-else()
-  set_target_properties(integration_test PROPERTIES
-    CXX_STANDARD 11
-    CXX_STANDARD_REQUIRED YES
-  )
+  set(_integration_cxx_standard ${CMAKE_CXX_STANDARD})
 endif()
+set_target_properties(integration_test PROPERTIES
+  CXX_STANDARD ${_integration_cxx_standard}
+  CXX_STANDARD_REQUIRED YES
+)
 
 add_test(
   NAME integration_test
@@ -110,8 +99,7 @@ target_sources(integration_test
 
 # Separate hardening test executable (header-only, no pre-compiled libs)
 # This ensures the sketch code is compiled with C++17 + hardening
-# Always build this target - it will use CMAKE_CXX_STANDARD if set, otherwise C++17
-message(STATUS "CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}")
+# Always build this target - it will use CMAKE_CXX_STANDARD if set (and >= 17), otherwise C++17
 
 add_executable(hardening_test)
 target_link_libraries(hardening_test common common_test_lib)
@@ -125,19 +113,15 @@ target_include_directories(hardening_test PRIVATE
 )
 
 # Use C++17 minimum for hardening tests
-if(CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17)
-  set_target_properties(hardening_test PROPERTIES
-    CXX_STANDARD ${CMAKE_CXX_STANDARD}
-    CXX_STANDARD_REQUIRED YES
-  )
-  message(STATUS "hardening_test will use C++${CMAKE_CXX_STANDARD}")
-else()
-  set_target_properties(hardening_test PROPERTIES
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED YES
-  )
-  message(STATUS "hardening_test will use C++17 (default)")
+set(_hardening_cxx_standard 17)
+if(DEFINED CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17)
+  set(_hardening_cxx_standard ${CMAKE_CXX_STANDARD})
 endif()
+set_target_properties(hardening_test PROPERTIES
+  CXX_STANDARD ${_hardening_cxx_standard}
+  CXX_STANDARD_REQUIRED YES
+)
+message(STATUS "hardening_test will use C++${_hardening_cxx_standard}")
 
 add_test(
   NAME hardening_test

From fccb2385f3a66416fc34c0d7bd0513696721ecd9 Mon Sep 17 00:00:00 2001
From: tison <wander4096@gmail.com>
Date: Thu, 5 Feb 2026 21:44:05 +0800
Subject: [PATCH 47/75] Fix error message for empty window data in compressor
 (#482)

---
 cpc/include/cpc_compressor_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpc/include/cpc_compressor_impl.hpp b/cpc/include/cpc_compressor_impl.hpp
index 062e2e0e..0cc24b19 100644
--- a/cpc/include/cpc_compressor_impl.hpp
+++ b/cpc/include/cpc_compressor_impl.hpp
@@ -157,7 +157,7 @@ void cpc_compressor<A>::compress(const cpc_sketch_alloc<A>& source, compressed_s
       break;
     case cpc_sketch_alloc<A>::flavor::PINNED:
       compress_pinned_flavor(source, result);
-      if (result.window_data.size() == 0) throw std::logic_error("window is not expected");
+      if (result.window_data.size() == 0) throw std::logic_error("window is expected");
       break;
     case cpc_sketch_alloc<A>::flavor::SLIDING:
       compress_sliding_flavor(source, result);

From 79cb75cc0e873922ab80cc6f32f84220710afe10 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Fri, 6 Feb 2026 12:22:43 +0900
Subject: [PATCH 48/75] fix: compare sizes in array operator==

---
 tuple/include/array_tuple_sketch.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp
index 03761ff4..d331f8b1 100644
--- a/tuple/include/array_tuple_sketch.hpp
+++ b/tuple/include/array_tuple_sketch.hpp
@@ -82,6 +82,7 @@ class array {
   T* data() { return array_; }
   const T* data() const { return array_; }
   bool operator==(const array& other) const {
+    if (size_ != other.size_) return false;
     for (uint8_t i = 0; i < size_; ++i) if (array_[i] != other.array_[i]) return false;
     return true;
   }

From 2a59f114871027c9b068bbfca39350c9cf3f2da8 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Fri, 6 Feb 2026 23:32:55 +0900
Subject: [PATCH 49/75] Revert "fix: allocation handling for string in
 deserialize"

This reverts commit 4894e5e7b156f1ce6a909ef84321ab081078c16c.
---
 tuple/include/array_of_strings_sketch.hpp     | 50 +++++-----------
 .../include/array_of_strings_sketch_impl.hpp  | 32 +++++-----
 .../aos_sketch_deserialize_from_java_test.cpp | 11 +---
 tuple/test/aos_sketch_serialize_for_java.cpp  | 58 +++++++------------
 tuple/test/array_of_strings_sketch_test.cpp   | 35 ++++-------
 5 files changed, 62 insertions(+), 124 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index 4ee3bc9c..db147723 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -28,22 +28,11 @@
 
 namespace datasketches {
 
-template<typename Allocator>
-struct array_of_strings_types {
-  using string_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<char>;
-  using string_type = std::basic_string<char, std::char_traits<char>, string_allocator>;
-  using array_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<string_type>;
-  using array_of_strings = array<string_type, array_allocator>;
-};
-
 // default update policy for an array of strings
-template<typename Allocator = std::allocator<char>>
+template<typename Allocator = std::allocator<std::string>>
 class default_array_of_strings_update_policy {
 public:
-  using string_allocator = typename array_of_strings_types<Allocator>::string_allocator;
-  using string_type = typename array_of_strings_types<Allocator>::string_type;
-  using array_allocator = typename array_of_strings_types<Allocator>::array_allocator;
-  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
+  using array_of_strings = array<std::string, Allocator>;
 
   explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator());
 
@@ -59,12 +48,9 @@ class default_array_of_strings_update_policy {
 
 // serializer/deserializer for an array of strings
 // Requirements: all strings must be valid UTF-8 and array size must be <= 127.
-template<typename Allocator = std::allocator<char>>
+template<typename Allocator = std::allocator<std::string>>
 struct default_array_of_strings_serde {
-  using string_allocator = typename array_of_strings_types<Allocator>::string_allocator;
-  using string_type = typename array_of_strings_types<Allocator>::string_type;
-  using array_allocator = typename array_of_strings_types<Allocator>::array_allocator;
-  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
+  using array_of_strings = array<std::string, Allocator>;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
 
   explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());
@@ -80,29 +66,27 @@ struct default_array_of_strings_serde {
   summary_allocator summary_allocator_;
   static void check_num_nodes(uint8_t num_nodes);
   static uint32_t compute_total_bytes(const array_of_strings& item);
-  static void check_utf8(const string_type& value);
+  static void check_utf8(const std::string& value);
 };
 
 /**
  * Hashes an array of strings using ArrayOfStrings-compatible hashing.
  */
-template<typename Allocator = std::allocator<char>>
-uint64_t hash_array_of_strings_key(const typename array_of_strings_types<Allocator>::array_of_strings& key);
+template<typename Allocator = std::allocator<std::string>>
+uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key);
 
 /**
  * Extended class of compact_tuple_sketch for array of strings
  * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
  */
-template<typename Allocator = std::allocator<char>>
+template<typename Allocator = std::allocator<std::string>>
 class compact_array_of_strings_tuple_sketch:
   public compact_tuple_sketch<
-    typename array_of_strings_types<Allocator>::array_of_strings,
-    typename std::allocator_traits<Allocator>::template rebind_alloc<
-      typename array_of_strings_types<Allocator>::array_of_strings
-    >
+    array<std::string, Allocator>,
+    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
   > {
 public:
-  using array_of_strings = typename array_of_strings_types<Allocator>::array_of_strings;
+  using array_of_strings = array<std::string, Allocator>;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
   using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
   using vector_bytes = typename Base::vector_bytes;
@@ -149,15 +133,13 @@ class compact_array_of_strings_tuple_sketch:
 /**
  * Convenience alias for update_tuple_sketch for array of strings
  */
-template<typename Allocator = std::allocator<char>,
+template<typename Allocator = std::allocator<std::string>,
          typename Policy = default_array_of_strings_update_policy<Allocator>>
 using update_array_of_strings_tuple_sketch = update_tuple_sketch<
-  typename array_of_strings_types<Allocator>::array_of_strings,
-  typename array_of_strings_types<Allocator>::array_of_strings,
+  array<std::string, Allocator>,
+  array<std::string, Allocator>,
   Policy,
-  typename std::allocator_traits<Allocator>::template rebind_alloc<
-    typename array_of_strings_types<Allocator>::array_of_strings
-  >
+  typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
 >;
 
 /**
@@ -166,7 +148,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch<
  * @param ordered optional flag to specify if an ordered sketch should be produced
  * @return compact array of strings sketch
  */
-template<typename Allocator = std::allocator<char>, typename Policy = default_array_of_strings_update_policy<Allocator>>
+template<typename Allocator = std::allocator<std::string>, typename Policy = default_array_of_strings_update_policy<Allocator>>
 compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
   const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);
 
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index e8725c55..f5fa0652 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -33,8 +33,7 @@ default_array_of_strings_update_policy<Allocator>::default_array_of_strings_upda
 
 template<typename Allocator>
 auto default_array_of_strings_update_policy<Allocator>::create() const -> array_of_strings {
-  const string_type empty{string_allocator(allocator_)};
-  return array_of_strings(0, empty, array_allocator(allocator_));
+  return array_of_strings(0, "", allocator_);
 }
 
 template<typename Allocator>
@@ -42,8 +41,7 @@ void default_array_of_strings_update_policy<Allocator>::update(
   array_of_strings& array, const array_of_strings& input
 ) const {
   const auto length = static_cast<size_t>(input.size());
-  const string_type empty{string_allocator(allocator_)};
-  array = array_of_strings(static_cast<uint8_t>(length), empty, array_allocator(allocator_));
+  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
   for (size_t i = 0; i < length; ++i) array[i] = input[i];
 }
 
@@ -52,18 +50,16 @@ void default_array_of_strings_update_policy<Allocator>::update(
   array_of_strings& array, const array_of_strings* input
 ) const {
   if (input == nullptr) {
-    const string_type empty{string_allocator(allocator_)};
-    array = array_of_strings(0, empty, array_allocator(allocator_));
+    array = array_of_strings(0, "", allocator_);
     return;
   }
   const auto length = static_cast<size_t>(input->size());
-  const string_type empty{string_allocator(allocator_)};
-  array = array_of_strings(static_cast<uint8_t>(length), empty, array_allocator(allocator_));
+  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
   for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
 }
 
 template<typename Allocator>
-uint64_t hash_array_of_strings_key(const typename array_of_strings_types<Allocator>::array_of_strings& key) {
+uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key) {
   // Matches Java Util.PRIME for ArrayOfStrings key hashing.
   static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
   XXHash64 hasher(STRING_ARR_HASH_SEED);
@@ -128,7 +124,7 @@ void default_array_of_strings_serde<Allocator>::serialize(
     const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
     write(os, total_bytes);
     write(os, num_nodes);
-    const string_type* data = items[i].data();
+    const std::string* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
       check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
@@ -148,12 +144,11 @@ void default_array_of_strings_serde<Allocator>::deserialize(
     const uint8_t num_nodes = read<uint8_t>(is);
     if (!is) throw std::runtime_error("array_of_strings stream read failed");
     check_num_nodes(num_nodes);
-    const string_type empty{string_allocator(allocator_)};
-    array_of_strings array(num_nodes, empty, array_allocator(allocator_));
+    array_of_strings array(num_nodes, "", allocator_);
     for (uint8_t j = 0; j < num_nodes; ++j) {
       const uint32_t length = read<uint32_t>(is);
       if (!is) throw std::runtime_error("array_of_strings stream read failed");
-      string_type value(length, '\0', string_allocator(allocator_));
+      std::string value(length, '\0');
       if (length != 0) {
         is.read(&value[0], length);
         if (!is) throw std::runtime_error("array_of_strings stream read failed");
@@ -179,7 +174,7 @@ size_t default_array_of_strings_serde<Allocator>::serialize(
     check_memory_size(bytes_written + total_bytes, capacity);
     bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written);
     bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
-    const string_type* data = items[i].data();
+    const std::string* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
       check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
@@ -207,12 +202,11 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
     uint8_t num_nodes;
     bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
     check_num_nodes(num_nodes);
-    const string_type empty{string_allocator(allocator_)};
-    array_of_strings array(num_nodes, empty, array_allocator(allocator_));
+    array_of_strings array(num_nodes, "", allocator_);
     for (uint8_t j = 0; j < num_nodes; ++j) {
       uint32_t length;
       bytes_read += copy_from_mem(ptr8 + bytes_read, length);
-      string_type value(length, '\0', string_allocator(allocator_));
+      std::string value(length, '\0');
       if (length != 0) {
         bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
       }
@@ -242,7 +236,7 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
   const auto count = item.size();
   check_num_nodes(static_cast<uint8_t>(count));
   size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t);
-  const string_type* data = item.data();
+  const std::string* data = item.data();
   for (uint32_t j = 0; j < count; ++j) {
     total += data[j].size();
   }
@@ -250,7 +244,7 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
 }
 
 template<typename Allocator>
-void default_array_of_strings_serde<Allocator>::check_utf8(const string_type& value) {
+void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
   if (!utf8::is_valid(value.begin(), value.end())) {
     throw std::runtime_error("array_of_strings contains invalid UTF-8");
   }
diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
index a623f618..af37d6c2 100644
--- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp
+++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp
@@ -17,7 +17,6 @@
  * under the License.
  */
 
-#include <algorithm>
 #include <catch2/catch.hpp>
 #include <fstream>
 #include <vector>
@@ -25,12 +24,6 @@
 #include "array_of_strings_sketch.hpp"
 
 namespace datasketches {
-  using types = array_of_strings_types<std::allocator<char>>;
-  using string_type = types::string_type;
-
-  static bool equals_string(const string_type& lhs, const std::string& rhs) {
-    return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin());
-  }
   // assume the binary sketches for this test have been generated by datasketches-java code
   // in the subdirectory called "java" in the root directory of this project
   static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
@@ -200,7 +193,7 @@ namespace datasketches {
           if (entry.second.size() != expected.size()) continue;
           bool equal = true;
           for (size_t j = 0; j < expected.size(); ++j) {
-            if (!equals_string(entry.second[j], expected[j])) {
+            if (entry.second[j] != expected[j]) {
               equal = false;
               break;
             }
@@ -255,7 +248,7 @@ namespace datasketches {
           if (entry.second.size() != expected.size()) continue;
           bool equal = true;
           for (size_t j = 0; j < expected.size(); ++j) {
-            if (!equals_string(entry.second[j], expected[j])) {
+            if (entry.second[j] != expected[j]) {
               equal = false;
               break;
             }
diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp
index ab5fd53b..c6eb0dfc 100644
--- a/tuple/test/aos_sketch_serialize_for_java.cpp
+++ b/tuple/test/aos_sketch_serialize_for_java.cpp
@@ -26,18 +26,13 @@
 namespace datasketches {
 
 using aos_sketch = update_array_of_strings_tuple_sketch<>;
-using types = array_of_strings_types<std::allocator<char>>;
-using array_of_strings = types::array_of_strings;
-using string_allocator = types::string_allocator;
-using string_type = types::string_type;
-using array_allocator = types::array_allocator;
+using array_of_strings = array<std::string>;
 
 static array_of_strings make_array(std::initializer_list<std::string> items) {
-  const string_type empty{string_allocator()};
-  array_of_strings array(static_cast<uint8_t>(items.size()), empty, array_allocator());
+  array_of_strings array(static_cast<uint8_t>(items.size()), "");
   size_t i = 0;
   for (const auto& item: items) {
-    array[static_cast<uint8_t>(i)] = string_type(item.data(), item.size(), string_allocator());
+    array[static_cast<uint8_t>(i)] = item;
     ++i;
   }
   return array;
@@ -48,13 +43,10 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      const string_type empty{string_allocator()};
-      array_of_strings key(1, empty, array_allocator());
-      const std::string key_value = std::to_string(i);
-      key[0] = string_type(key_value.data(), key_value.size(), string_allocator());
-      array_of_strings value(1, empty, array_allocator());
-      const std::string value_str = "value" + std::to_string(i);
-      value[0] = string_type(value_str.data(), value_str.size(), string_allocator());
+      array_of_strings key(1, "");
+      key[0] = std::to_string(i);
+      array_of_strings value(1, "");
+      value[0] = "value" + std::to_string(i);
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
@@ -69,17 +61,12 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      const string_type empty{string_allocator()};
-      array_of_strings key(1, empty, array_allocator());
-      const std::string key_value = std::to_string(i);
-      key[0] = string_type(key_value.data(), key_value.size(), string_allocator());
-      array_of_strings value(3, empty, array_allocator());
-      const std::string value_a = "a" + std::to_string(i);
-      const std::string value_b = "b" + std::to_string(i);
-      const std::string value_c = "c" + std::to_string(i);
-      value[0] = string_type(value_a.data(), value_a.size(), string_allocator());
-      value[1] = string_type(value_b.data(), value_b.size(), string_allocator());
-      value[2] = string_type(value_c.data(), value_c.size(), string_allocator());
+      array_of_strings key(1, "");
+      key[0] = std::to_string(i);
+      array_of_strings value(3, "");
+      value[0] = "a" + std::to_string(i);
+      value[1] = "b" + std::to_string(i);
+      value[2] = "c" + std::to_string(i);
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
@@ -95,10 +82,9 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") {
     .set_resize_factor(resize_factor::X8)
     .set_p(0.01f)
     .build();
-  const string_type empty{string_allocator()};
-  array_of_strings key(1, empty, array_allocator());
+  array_of_strings key(1, "");
   key[0] = "key1";
-  array_of_strings value(1, empty, array_allocator());
+  array_of_strings value(1, "");
   value[0] = "value1";
   sketch.update(hash_array_of_strings_key(key), value);
   REQUIRE_FALSE(sketch.is_empty());
@@ -112,15 +98,11 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") {
   for (const unsigned n: n_arr) {
     auto sketch = aos_sketch::builder().build();
     for (unsigned i = 0; i < n; ++i) {
-      const string_type empty{string_allocator()};
-      array_of_strings key(2, empty, array_allocator());
-      const std::string key0 = "key" + std::to_string(i);
-      const std::string key1 = "subkey" + std::to_string(i % 10);
-      key[0] = string_type(key0.data(), key0.size(), string_allocator());
-      key[1] = string_type(key1.data(), key1.size(), string_allocator());
-      array_of_strings value(1, empty, array_allocator());
-      const std::string value_str = "value" + std::to_string(i);
-      value[0] = string_type(value_str.data(), value_str.size(), string_allocator());
+      array_of_strings key(2, "");
+      key[0] = "key" + std::to_string(i);
+      key[1] = "subkey" + std::to_string(i % 10);
+      array_of_strings value(1, "");
+      value[0] = "value" + std::to_string(i);
       sketch.update(hash_array_of_strings_key(key), value);
     }
     REQUIRE(sketch.is_empty() == (n == 0));
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 8e1f1582..74b225b0 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -29,11 +29,7 @@
 
 namespace datasketches {
 
-using types = array_of_strings_types<std::allocator<char>>;
-using array_of_strings = types::array_of_strings;
-using string_allocator = types::string_allocator;
-using string_type = types::string_type;
-using array_allocator = types::array_allocator;
+using array_of_strings = array<std::string>;
 
 TEST_CASE("aos update policy", "[tuple_sketch]") {
   default_array_of_strings_update_policy<> policy;
@@ -46,8 +42,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   SECTION("replace array") {
     auto values = policy.create();
 
-    const string_type empty{string_allocator()};
-    array_of_strings input(2, empty, array_allocator());
+    array_of_strings input(2, "", std::allocator<std::string>());
     input[0] = "alpha";
     input[1] = "beta";
     policy.update(values, input);
@@ -57,7 +52,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
     input[0] = "changed";
     REQUIRE(values[0] == "alpha");
 
-    array_of_strings input2(1, empty, array_allocator());
+    array_of_strings input2(1, "", std::allocator<std::string>());
     input2[0] = "gamma";
     policy.update(values, input2);
     REQUIRE(values.size() == 1);
@@ -65,8 +60,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   }
 
   SECTION("nullptr clears") {
-    const string_type empty{string_allocator()};
-    array_of_strings values(2, empty, array_allocator());
+    array_of_strings values(2, "", std::allocator<std::string>());
     values[0] = "one";
     values[1] = "two";
 
@@ -77,8 +71,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
   SECTION("pointer input copies") {
     auto values = policy.create();
 
-    const string_type empty{string_allocator()};
-    array_of_strings input(2, empty, array_allocator());
+    array_of_strings input(2, "", std::allocator<std::string>());
     input[0] = "first";
     input[1] = "second";
     policy.update(values, &input);
@@ -91,8 +84,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") {
 
 TEST_CASE("aos sketch update", "[tuple_sketch]") {
   auto make_array = [](std::initializer_list<const char*> entries) {
-    const string_type empty{string_allocator()};
-    array_of_strings array(static_cast<uint8_t>(entries.size()), empty, array_allocator());
+    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
     uint8_t i = 0;
     for (const auto* entry: entries) array[i++] = entry;
     return array;
@@ -159,12 +151,9 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") {
 
 TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
   auto make_array = [](std::initializer_list<std::string> entries) {
-    const string_type empty{string_allocator()};
-    array_of_strings array(static_cast<uint8_t>(entries.size()), empty, array_allocator());
+    array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
     uint8_t i = 0;
-    for (const auto& entry: entries) {
-      array[i++] = string_type(entry.data(), entry.size(), string_allocator());
-    }
+    for (const auto& entry: entries) array[i++] = entry;
     return array;
   };
 
@@ -270,9 +259,8 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") {
   default_array_of_strings_serde<> serde;
 
   SECTION("invalid utf8 rejected") {
-    const string_type empty{string_allocator()};
-    array_of_strings array(1, empty, array_allocator());
-    const string_type invalid_utf8("\xC3\x28", 2, string_allocator());
+    array_of_strings array(1, "", std::allocator<std::string>());
+    const std::string invalid_utf8("\xC3\x28", 2);
     array[0] = invalid_utf8;
     std::stringstream ss;
     ss.exceptions(std::ios::failbit | std::ios::badbit);
@@ -283,8 +271,7 @@ TEST_CASE("aos serde validation", "[tuple_sketch]") {
   }
 
   SECTION("too many nodes rejected") {
-    const string_type empty{string_allocator()};
-    array_of_strings array(128, empty, array_allocator());
+    array_of_strings array(128, "", std::allocator<std::string>());
     std::stringstream ss;
     ss.exceptions(std::ios::failbit | std::ios::badbit);
     REQUIRE_THROWS_WITH(

From 25ce65cfd2c20995e99b2e8dd0cea99b308925f3 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sat, 7 Feb 2026 00:57:47 +0900
Subject: [PATCH 50/75] refactor: change allocator only for array_of_strings

---
 tuple/include/array_of_strings_sketch.hpp     | 42 +++++++------------
 .../include/array_of_strings_sketch_impl.hpp  | 38 +++++++----------
 tuple/test/array_of_strings_sketch_test.cpp   |  4 +-
 3 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index db147723..60c43ca9 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -28,29 +28,24 @@
 
 namespace datasketches {
 
+using array_of_strings = array<std::string>;
+
 // default update policy for an array of strings
-template<typename Allocator = std::allocator<std::string>>
 class default_array_of_strings_update_policy {
 public:
-  using array_of_strings = array<std::string, Allocator>;
-
-  explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator());
+  default_array_of_strings_update_policy() = default;
 
   array_of_strings create() const;
 
   void update(array_of_strings& array, const array_of_strings& input) const;
 
   void update(array_of_strings& array, const array_of_strings* input) const;
-
-private:
-  Allocator allocator_;
 };
 
 // serializer/deserializer for an array of strings
 // Requirements: all strings must be valid UTF-8 and array size must be <= 127.
-template<typename Allocator = std::allocator<std::string>>
+template<typename Allocator = std::allocator<array_of_strings>>
 struct default_array_of_strings_serde {
-  using array_of_strings = array<std::string, Allocator>;
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
 
   explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());
@@ -62,7 +57,6 @@ struct default_array_of_strings_serde {
   size_t size_of_item(const array_of_strings& item) const;
 
 private:
-  Allocator allocator_;
   summary_allocator summary_allocator_;
   static void check_num_nodes(uint8_t num_nodes);
   static uint32_t compute_total_bytes(const array_of_strings& item);
@@ -72,23 +66,17 @@ struct default_array_of_strings_serde {
 /**
  * Hashes an array of strings using ArrayOfStrings-compatible hashing.
  */
-template<typename Allocator = std::allocator<std::string>>
-uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key);
+uint64_t hash_array_of_strings_key(const array_of_strings& key);
 
 /**
  * Extended class of compact_tuple_sketch for array of strings
  * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
  */
-template<typename Allocator = std::allocator<std::string>>
+template<typename Allocator = std::allocator<array_of_strings>>
 class compact_array_of_strings_tuple_sketch:
-  public compact_tuple_sketch<
-    array<std::string, Allocator>,
-    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
-  > {
+  public compact_tuple_sketch<array_of_strings, Allocator> {
 public:
-  using array_of_strings = array<std::string, Allocator>;
-  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
-  using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
+  using Base = compact_tuple_sketch<array_of_strings, Allocator>;
   using vector_bytes = typename Base::vector_bytes;
   using Base::serialize;
 
@@ -133,13 +121,13 @@ class compact_array_of_strings_tuple_sketch:
 /**
  * Convenience alias for update_tuple_sketch for array of strings
  */
-template<typename Allocator = std::allocator<std::string>,
-         typename Policy = default_array_of_strings_update_policy<Allocator>>
+template<typename Allocator = std::allocator<array_of_strings>,
+         typename Policy = default_array_of_strings_update_policy>
 using update_array_of_strings_tuple_sketch = update_tuple_sketch<
-  array<std::string, Allocator>,
-  array<std::string, Allocator>,
+  array_of_strings,
+  array_of_strings,
   Policy,
-  typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
+  Allocator
 >;
 
 /**
@@ -148,7 +136,7 @@ using update_array_of_strings_tuple_sketch = update_tuple_sketch<
  * @param ordered optional flag to specify if an ordered sketch should be produced
  * @return compact array of strings sketch
  */
-template<typename Allocator = std::allocator<std::string>, typename Policy = default_array_of_strings_update_policy<Allocator>>
+template<typename Allocator = std::allocator<array_of_strings>, typename Policy = default_array_of_strings_update_policy>
 compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
   const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);
 
@@ -156,4 +144,4 @@ compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch
 
 #include "array_of_strings_sketch_impl.hpp"
 
-#endif
+#endif
\ No newline at end of file
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index f5fa0652..78b683d7 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -22,44 +22,37 @@
 
 #include <stdexcept>
 
+#include "array_of_strings_sketch.hpp"
 #include "common_defs.hpp"
 #include "third_party/utf8cpp/utf8.h"
 
 namespace datasketches {
 
-template<typename Allocator>
-default_array_of_strings_update_policy<Allocator>::default_array_of_strings_update_policy(const Allocator& allocator):
-  allocator_(allocator) {}
-
-template<typename Allocator>
-auto default_array_of_strings_update_policy<Allocator>::create() const -> array_of_strings {
-  return array_of_strings(0, "", allocator_);
+inline array_of_strings default_array_of_strings_update_policy::create() const {
+  return array_of_strings(0, "");
 }
 
-template<typename Allocator>
-void default_array_of_strings_update_policy<Allocator>::update(
+inline void default_array_of_strings_update_policy::update(
   array_of_strings& array, const array_of_strings& input
 ) const {
   const auto length = static_cast<size_t>(input.size());
-  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  array = array_of_strings(static_cast<uint8_t>(length), "");
   for (size_t i = 0; i < length; ++i) array[i] = input[i];
 }
 
-template<typename Allocator>
-void default_array_of_strings_update_policy<Allocator>::update(
+inline void default_array_of_strings_update_policy::update(
   array_of_strings& array, const array_of_strings* input
 ) const {
   if (input == nullptr) {
-    array = array_of_strings(0, "", allocator_);
+    array = array_of_strings(0, "");
     return;
   }
   const auto length = static_cast<size_t>(input->size());
-  array = array_of_strings(static_cast<uint8_t>(length), "", allocator_);
+  array = array_of_strings(static_cast<uint8_t>(length), "");
   for (size_t i = 0; i < length; ++i) array[i] = (*input)[i];
 }
 
-template<typename Allocator>
-uint64_t hash_array_of_strings_key(const array<std::string, Allocator>& key) {
+inline uint64_t hash_array_of_strings_key(const array_of_strings& key) {
   // Matches Java Util.PRIME for ArrayOfStrings key hashing.
   static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;
   XXHash64 hasher(STRING_ARR_HASH_SEED);
@@ -95,8 +88,7 @@ template<typename SerDe>
 auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(
   std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator
 ) -> compact_array_of_strings_tuple_sketch {
-  summary_allocator alloc(allocator);
-  auto base = Base::deserialize(is, seed, sd, alloc);
+  auto base = Base::deserialize(is, seed, sd, allocator);
   return compact_array_of_strings_tuple_sketch(std::move(base));
 }
 
@@ -105,14 +97,12 @@ template<typename SerDe>
 auto compact_array_of_strings_tuple_sketch<Allocator>::deserialize(
   const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator
 ) -> compact_array_of_strings_tuple_sketch {
-  summary_allocator alloc(allocator);
-  auto base = Base::deserialize(bytes, size, seed, sd, alloc);
+  auto base = Base::deserialize(bytes, size, seed, sd, allocator);
   return compact_array_of_strings_tuple_sketch(std::move(base));
 }
 
 template<typename Allocator>
 default_array_of_strings_serde<Allocator>::default_array_of_strings_serde(const Allocator& allocator):
-  allocator_(allocator),
   summary_allocator_(allocator) {}
 
 template<typename Allocator>
@@ -144,7 +134,7 @@ void default_array_of_strings_serde<Allocator>::deserialize(
     const uint8_t num_nodes = read<uint8_t>(is);
     if (!is) throw std::runtime_error("array_of_strings stream read failed");
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", allocator_);
+    array_of_strings array(num_nodes, "");
     for (uint8_t j = 0; j < num_nodes; ++j) {
       const uint32_t length = read<uint32_t>(is);
       if (!is) throw std::runtime_error("array_of_strings stream read failed");
@@ -202,7 +192,7 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
     uint8_t num_nodes;
     bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
     check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "", allocator_);
+    array_of_strings array(num_nodes, "");
     for (uint8_t j = 0; j < num_nodes; ++j) {
       uint32_t length;
       bytes_read += copy_from_mem(ptr8 + bytes_read, length);
@@ -252,4 +242,4 @@ void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& va
 
 } /* namespace datasketches */
 
-#endif
+#endif
\ No newline at end of file
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index 74b225b0..dc21aceb 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -29,10 +29,8 @@
 
 namespace datasketches {
 
-using array_of_strings = array<std::string>;
-
 TEST_CASE("aos update policy", "[tuple_sketch]") {
-  default_array_of_strings_update_policy<> policy;
+  default_array_of_strings_update_policy policy;
 
   SECTION("create empty") {
     auto values = policy.create();

From 7f05c0305b62fd9988aee70dffaf631c79565846 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sat, 7 Feb 2026 00:59:18 +0900
Subject: [PATCH 51/75] style: add newline at end of files

---
 tuple/include/array_of_strings_sketch.hpp      | 2 +-
 tuple/include/array_of_strings_sketch_impl.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index 60c43ca9..ac49fd5b 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -144,4 +144,4 @@ compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch
 
 #include "array_of_strings_sketch_impl.hpp"
 
-#endif
\ No newline at end of file
+#endif
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 78b683d7..81045472 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -242,4 +242,4 @@ void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& va
 
 } /* namespace datasketches */
 
-#endif
\ No newline at end of file
+#endif

From 2956f150933438368d539ab42b19bf8e4c8665e1 Mon Sep 17 00:00:00 2001
From: yaojun <940334249@qq.com>
Date: Tue, 10 Feb 2026 15:19:52 +0800
Subject: [PATCH 52/75] Add clang-tidy and check script and fix the warnings
 under count directory

---
 .clang-tidy                      | 36 ++++++++++++++++++++++++++++++++
 .pre-commit-config.yaml          | 29 +++++++++++++++++++++++++
 count/include/count_min_impl.hpp |  5 +++--
 3 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 .clang-tidy
 create mode 100644 .pre-commit-config.yaml

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 00000000..93e3edeb
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+---
+Checks: |
+  clang-diagnostic-*,
+  clang-analyzer-*,
+  -clang-analyzer-alpha*,
+  google-*,
+  modernize-*,
+  -modernize-avoid-c-arrays,
+  -modernize-use-trailing-return-type,
+  -modernize-use-nodiscard,
+
+CheckOptions:
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..262fd02e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# To use this, install the python package `pre-commit` and
+# run once `pre-commit install`. This will setup a git pre-commit-hook
+# that is executed on each commit and will report the linting problems.
+# To run all hooks on all files use `pre-commit run -a`
+
+repos:
+  - repo: https://github.com/pocc/pre-commit-hooks
+    rev: v1.3.5
+    hooks:
+      - id: clang-tidy
+        args: ['--quiet', '-p=build/compile_commands.json', '--config-file=.clang-tidy']
+        types_or: [c++, c]
\ No newline at end of file
diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp
index 45376e7b..99b0a41e 100644
--- a/count/include/count_min_impl.hpp
+++ b/count/include/count_min_impl.hpp
@@ -74,7 +74,7 @@ uint64_t count_min_sketch<W,A>::get_seed() const {
 
 template<typename W, typename A>
 double count_min_sketch<W,A>::get_relative_error() const {
-  return exp(1.0) / double(_num_buckets);
+  return exp(1.0) / static_cast<double>(_num_buckets);
 }
 
 template<typename W, typename A>
@@ -449,8 +449,9 @@ string<A> count_min_sketch<W,A>::to_string() const {
   // count the number of used entries in the sketch
   uint64_t num_nonzero = 0;
   for (const auto entry: _sketch_array) {
-    if (entry != static_cast<W>(0.0))
+    if (entry != static_cast<W>(0.0)){
       ++num_nonzero;
+    }
   }
 
   // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.

From 9bf45794d3030e21c9d9f6910a2469a163feced7 Mon Sep 17 00:00:00 2001
From: yaojun <940334249@qq.com>
Date: Sat, 31 Jan 2026 13:48:52 +0800
Subject: [PATCH 53/75] fix: Add the missing braces and support one-line
 statements

---
 .clang-tidy                      |  2 +-
 count/include/count_min_impl.hpp | 32 ++++++++++++++------------------
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 93e3edeb..d0cdc6e9 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -27,7 +27,7 @@ Checks: |
 
 CheckOptions:
   - key:             google-readability-braces-around-statements.ShortStatementLines
-    value:           '1'
+    value:           '0'
   - key:             google-readability-function-size.StatementThreshold
     value:           '800'
   - key:             google-readability-namespace-comments.ShortNamespaceLines
diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp
index 99b0a41e..2f2629fc 100644
--- a/count/include/count_min_impl.hpp
+++ b/count/include/count_min_impl.hpp
@@ -39,7 +39,9 @@ _num_buckets(num_buckets),
 _sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
 _seed(seed),
 _total_weight(0) {
-  if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
+  if (num_buckets < 3) {
+    throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
+  }
 
   // This check is to ensure later compatibility with a Java implementation whose maximum size can only
   // be 2^31-1.  We check only against 2^30 for simplicity.
@@ -147,7 +149,7 @@ W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&
 
 template<typename W, typename A>
 W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
-  if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
+  if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch.
   return get_estimate(item.c_str(), item.length());
 }
 
@@ -176,7 +178,7 @@ void count_min_sketch<W,A>::update(int64_t item, W weight) {
 
 template<typename W, typename A>
 void count_min_sketch<W,A>::update(const std::string& item, W weight) {
-  if (item.empty()) return;
+  if (item.empty()) { return; }
   update(item.c_str(), item.length(), weight);
 }
 
@@ -201,7 +203,7 @@ W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_b
 
 template<typename W, typename A>
 W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
-  if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
+  if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch.
   return get_upper_bound(item.c_str(), item.length());
 }
 
@@ -218,7 +220,7 @@ W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_b
 
 template<typename W, typename A>
 W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
-  if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
+  if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch.
   return get_lower_bound(item.c_str(), item.length());
 }
 
@@ -232,17 +234,13 @@ void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch) {
   /*
   * Merges this sketch into other_sketch sketch by elementwise summing of buckets
   */
-  if (this == &other_sketch) {
-    throw std::invalid_argument( "Cannot merge a sketch with itself." );
-  }
+  if (this == &other_sketch) { throw std::invalid_argument( "Cannot merge a sketch with itself." ); }
 
   bool acceptable_config =
     (get_num_hashes() == other_sketch.get_num_hashes())   &&
     (get_num_buckets() == other_sketch.get_num_buckets()) &&
     (get_seed() == other_sketch.get_seed());
-  if (!acceptable_config) {
-    throw std::invalid_argument( "Incompatible sketch configuration." );
-  }
+  if (!acceptable_config) { throw std::invalid_argument( "Incompatible sketch configuration." ); }
 
   // Merge step - iterate over the other vector and add the weights to this sketch
   auto it = _sketch_array.begin(); // This is a std::vector iterator.
@@ -290,7 +288,7 @@ void count_min_sketch<W,A>::serialize(std::ostream& os) const {
   write(os, nhashes);
   write(os, seed_hash);
   write(os, unused8);
-  if (is_empty()) return; // sketch is empty, no need to write further bytes.
+  if (is_empty()) { return; } // sketch is empty, no need to write further bytes.
 
   // Long 2
   write(os, _total_weight);
@@ -327,7 +325,7 @@ auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A
   }
   count_min_sketch c(nhashes, nbuckets, seed, allocator);
   const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
-  if (is_empty == 1) return c; // sketch is empty, no need to read further.
+  if (is_empty == 1) { return c; } // sketch is empty, no need to read further.
 
   // Set the sketch weight and read in the sketch values
   const auto weight = read<W>(is);
@@ -373,7 +371,7 @@ auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vecto
   ptr += copy_to_mem(nhashes, ptr);
   ptr += copy_to_mem(seed_hash, ptr);
   ptr += copy_to_mem(null_characters_8, ptr);
-  if (is_empty()) return bytes; // sketch is empty, no need to write further bytes.
+  if (is_empty()) { return bytes; } // sketch is empty, no need to write further bytes.
 
   // Long 2
   const W t_weight = _total_weight;
@@ -423,7 +421,7 @@ auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t
   }
   count_min_sketch c(nhashes, nbuckets, seed, allocator);
   const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
-  if (is_empty) return c; // sketch is empty, no need to read further.
+  if (is_empty) { return c; } // sketch is empty, no need to read further.
 
   ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));
 
@@ -449,9 +447,7 @@ string<A> count_min_sketch<W,A>::to_string() const {
   // count the number of used entries in the sketch
   uint64_t num_nonzero = 0;
   for (const auto entry: _sketch_array) {
-    if (entry != static_cast<W>(0.0)){
-      ++num_nonzero;
-    }
+    if (entry != static_cast<W>(0.0)) { ++num_nonzero; }
   }
 
   // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.

From 13bb3a922457f15e6d15e389b61291a9651f6b06 Mon Sep 17 00:00:00 2001
From: syaojun <libevent@yeah.net>
Date: Mon, 23 Feb 2026 14:07:27 +0800
Subject: [PATCH 54/75] perf: Replace push_back with emplace_back to optimize
 object construction

---
 theta/include/theta_set_difference_base_impl.hpp | 2 +-
 tuple/include/array_tuple_sketch_impl.hpp        | 4 ++--
 tuple/include/tuple_sketch_impl.hpp              | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/theta/include/theta_set_difference_base_impl.hpp b/theta/include/theta_set_difference_base_impl.hpp
index 02317816..40f94a2f 100644
--- a/theta/include/theta_set_difference_base_impl.hpp
+++ b/theta/include/theta_set_difference_base_impl.hpp
@@ -69,7 +69,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
         const uint64_t hash = EK()(entry);
         if (hash < theta) {
           auto result = table.find(hash);
-          if (!result.second) entries.push_back(conditional_forward<FwdSketch>(entry));
+          if (!result.second) entries.emplace_back(conditional_forward<FwdSketch>(entry));
         } else if (a.is_ordered()) {
           break; // early stop
         }
diff --git a/tuple/include/array_tuple_sketch_impl.hpp b/tuple/include/array_tuple_sketch_impl.hpp
index 42b39216..ad0c999c 100644
--- a/tuple/include/array_tuple_sketch_impl.hpp
+++ b/tuple/include/array_tuple_sketch_impl.hpp
@@ -166,7 +166,7 @@ compact_array_tuple_sketch<Array, Allocator> compact_array_tuple_sketch<Array, A
     for (size_t i = 0; i < num_entries; ++i) {
       Array summary(num_values, 0, allocator);
       read(is, summary.data(), num_values * sizeof(typename Array::value_type));
-      entries.push_back(Entry(keys[i], std::move(summary)));
+      entries.emplace_back(keys[i], std::move(summary));
     }
   }
   if (!is.good()) throw std::runtime_error("error reading from std::istream");
@@ -213,7 +213,7 @@ compact_array_tuple_sketch<Array, Allocator> compact_array_tuple_sketch<Array, A
     for (size_t i = 0; i < num_entries; ++i) {
       Array summary(num_values, 0, allocator);
       ptr += copy_from_mem(ptr, summary.data(), num_values * sizeof(typename Array::value_type));
-      entries.push_back(Entry(keys[i], std::move(summary)));
+      entries.emplace_back(keys[i], std::move(summary));
     }
   }
   const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index e5bf8340..b3f2d0bd 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -315,7 +315,7 @@ entries_(other.get_allocator())
 {
   entries_.reserve(other.get_num_retained());
   for (uint64_t hash: other) {
-    entries_.push_back(Entry(hash, summary));
+    entries_.emplace_back(hash, summary);
   }
   if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator());
 }
@@ -518,7 +518,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(std::istream&
     for (size_t i = 0; i < num_entries; ++i) {
       const auto key = read<uint64_t>(is);
       sd.deserialize(is, summary.get(), 1);
-      entries.push_back(Entry(key, std::move(*summary)));
+      entries.emplace_back(key, std::move(*summary));
       (*summary).~S();
     }
   }
@@ -585,7 +585,7 @@ compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::deserialize(const void* b
       uint64_t key;
       ptr += copy_from_mem(ptr, key);
       ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1);
-      entries.push_back(Entry(key, std::move(*summary)));
+      entries.emplace_back(key, std::move(*summary));
       (*summary).~S();
     }
   }

From c764d901ed9ce53ad41a28f60331411ce3445707 Mon Sep 17 00:00:00 2001
From: syaojun <libevent@yeah.net>
Date: Mon, 23 Feb 2026 14:31:11 +0800
Subject: [PATCH 55/75] style(kll): Add braces to single-line if statements for
 consistency

---
 kll/include/kll_helper_impl.hpp | 20 ++++++++++----------
 kll/include/kll_sketch_impl.hpp | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/kll/include/kll_helper_impl.hpp b/kll/include/kll_helper_impl.hpp
index bb92bdc7..31534d9a 100644
--- a/kll/include/kll_helper_impl.hpp
+++ b/kll/include/kll_helper_impl.hpp
@@ -36,17 +36,17 @@ bool kll_helper::is_odd(uint32_t value) {
 }
 
 uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) {
-  if (denom > numer) return 0;
+  if (denom > numer) { return 0; }
   uint8_t count = 0;
   while (true) {
     denom <<= 1;
-    if (denom > numer) return count;
+    if (denom > numer) { return count; }
     count++;
   }
 }
 
 uint8_t kll_helper::ub_on_num_levels(uint64_t n) {
-  if (n == 0) return 1;
+  if (n == 0) { return 1; }
   return 1 + floor_of_log2_of_fraction(n, 1);
 }
 
@@ -65,8 +65,8 @@ uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t heigh
 }
 
 uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
-  if (depth > 60) throw std::invalid_argument("depth > 60");
-  if (depth <= 30) return int_cap_aux_aux(k, depth);
+  if (depth > 60) { throw std::invalid_argument("depth > 60"); }
+  if (depth <= 30) { return int_cap_aux_aux(k, depth); }
   const uint8_t half = depth / 2;
   const uint8_t rest = depth - half;
   const uint16_t tmp = int_cap_aux_aux(k, half);
@@ -74,11 +74,11 @@ uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
 }
 
 uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
-  if (depth > 30) throw std::invalid_argument("depth > 30");
+  if (depth > 30) { throw std::invalid_argument("depth > 30"); }
   const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
   const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
   const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
-  if (result > k) throw std::logic_error("result > k");
+  if (result > k) { throw std::logic_error("result > k"); }
   return static_cast<uint16_t>(result);
 }
 
@@ -94,7 +94,7 @@ uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t*
 
 template <typename T>
 void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
-  if (!is_even(length)) throw std::invalid_argument("length must be even");
+  if (!is_even(length)) { throw std::invalid_argument("length must be even"); }
   const uint32_t half_length = length / 2;
 #ifdef KLL_VALIDATION
   const uint32_t offset = deterministic_offset();
@@ -110,7 +110,7 @@ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
 
 template <typename T>
 void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
-  if (!is_even(length)) throw std::invalid_argument("length must be even");
+  if (!is_even(length)) { throw std::invalid_argument("length must be even"); }
   const uint32_t half_length = length / 2;
 #ifdef KLL_VALIDATION
   const uint32_t offset = deterministic_offset();
@@ -206,7 +206,7 @@ template <typename T, typename C>
 kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
         uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted)
 {
-  if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed
+  if (num_levels_in == 0) { throw std::invalid_argument("num_levels_in == 0"); } // things are too weird if zero levels are allowed
   const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0];
   uint8_t current_num_levels = num_levels_in;
   uint32_t current_item_count = starting_item_count; // decreases with each compaction
diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp
index 44fe6a15..b12a39c8 100644
--- a/kll/include/kll_sketch_impl.hpp
+++ b/kll/include/kll_sketch_impl.hpp
@@ -199,7 +199,7 @@ void kll_sketch<T, C, A>::update_min_max(const T& item) {
 
 template<typename T, typename C, typename A>
 uint32_t kll_sketch<T, C, A>::internal_update() {
-  if (levels_[0] == 0) compress_while_updating();
+  if (levels_[0] == 0) { compress_while_updating(); }
   n_++;
   is_level_zero_sorted_ = false;
   return --levels_[0];
@@ -208,7 +208,7 @@ uint32_t kll_sketch<T, C, A>::internal_update() {
 template<typename T, typename C, typename A>
 template<typename FwdSk>
 void kll_sketch<T, C, A>::merge(FwdSk&& other) {
-  if (other.is_empty()) return;
+  if (other.is_empty()) { return; }
   if (m_ != other.m_) {
     throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
   }
@@ -224,9 +224,9 @@ void kll_sketch<T, C, A>::merge(FwdSk&& other) {
     const uint32_t index = internal_update();
     new (&items_[index]) T(conditional_forward<FwdSk>(other.items_[i]));
   }
-  if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
+  if (other.num_levels_ >= 2) { merge_higher_levels(other, final_n); }
   n_ = final_n;
-  if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
+  if (other.is_estimation_mode()) { min_k_ = std::min(min_k_, other.min_k_); }
   assert_correct_total_weight();
   reset_sorted_view();
 }
@@ -258,13 +258,13 @@ bool kll_sketch<T, C, A>::is_estimation_mode() const {
 
 template<typename T, typename C, typename A>
 T kll_sketch<T, C, A>::get_min_item() const {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   return *min_item_;
 }
 
 template<typename T, typename C, typename A>
 T kll_sketch<T, C, A>::get_max_item() const {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   return *max_item_;
 }
 
@@ -280,28 +280,28 @@ A kll_sketch<T, C, A>::get_allocator() const {
 
 template<typename T, typename C, typename A>
 double kll_sketch<T, C, A>::get_rank(const T& item, bool inclusive) const {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   setup_sorted_view();
   return sorted_view_->get_rank(item, inclusive);
 }
 
 template<typename T, typename C, typename A>
 auto kll_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   setup_sorted_view();
   return sorted_view_->get_PMF(split_points, size, inclusive);
 }
 
 template<typename T, typename C, typename A>
 auto kll_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   setup_sorted_view();
   return sorted_view_->get_CDF(split_points, size, inclusive);
 }
 
 template<typename T, typename C, typename A>
 auto kll_sketch<T, C, A>::get_quantile(double rank, bool inclusive) const -> quantile_return_type {
-  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
+  if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); }
   if ((rank < 0.0) || (rank > 1.0)) {
     throw std::invalid_argument("normalized rank cannot be less than zero or greater than 1.0");
   }

From a46fc2f00e83fc152b7aeb6b7a27c7cb15a9494d Mon Sep 17 00:00:00 2001
From: syaojun <libevent@yeah.net>
Date: Mon, 23 Feb 2026 14:55:18 +0800
Subject: [PATCH 56/75] style(hll): Add braces to single-line if statements for
 consistency

---
 hll/include/CouponHashSet-internal.hpp      |  3 +--
 hll/include/CouponList-internal.hpp         |  3 +--
 hll/include/CubicInterpolation-internal.hpp |  6 +++---
 hll/include/Hll4Array-internal.hpp          |  2 +-
 hll/include/HllArray-internal.hpp           | 16 ++++++++--------
 hll/include/HllUnion-internal.hpp           | 16 ++++++++++------
 hll/include/coupon_iterator-internal.hpp    |  4 ++--
 7 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/hll/include/CouponHashSet-internal.hpp b/hll/include/CouponHashSet-internal.hpp
index 7474cf2c..2ec4d6a8 100644
--- a/hll/include/CouponHashSet-internal.hpp
+++ b/hll/include/CouponHashSet-internal.hpp
@@ -176,8 +176,7 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator)
     read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t));
   } 
 
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream"); 
+  if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } 
 
   return ptr.release();
 }
diff --git a/hll/include/CouponList-internal.hpp b/hll/include/CouponList-internal.hpp
index a240a000..c92820e2 100644
--- a/hll/include/CouponList-internal.hpp
+++ b/hll/include/CouponList-internal.hpp
@@ -162,8 +162,7 @@ CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
     read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t));
   }
 
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream"); 
+  if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } 
 
   return ptr.release();
 }
diff --git a/hll/include/CubicInterpolation-internal.hpp b/hll/include/CubicInterpolation-internal.hpp
index 9677b99d..fb74c402 100644
--- a/hll/include/CubicInterpolation-internal.hpp
+++ b/hll/include/CubicInterpolation-internal.hpp
@@ -165,10 +165,10 @@ static int recursiveFindStraddle(const double xArr[], const int l, const int r,
     throw std::logic_error("target value invariant violated in search");
   }
 
-  if (l+1 == r) return (l);
+  if (l+1 == r) { return (l); }
   m = l + ((r-l)/2);
-  if (xArr[m] <= x) return (recursiveFindStraddle<A>(xArr, m, r, x));
-  else              return (recursiveFindStraddle<A>(xArr, l, m, x));
+  if (xArr[m] <= x) { return (recursiveFindStraddle<A>(xArr, m, r, x)); }
+  else              { return (recursiveFindStraddle<A>(xArr, l, m, x)); }
 }
 
 
diff --git a/hll/include/Hll4Array-internal.hpp b/hll/include/Hll4Array-internal.hpp
index 9d22006b..082f168f 100644
--- a/hll/include/Hll4Array-internal.hpp
+++ b/hll/include/Hll4Array-internal.hpp
@@ -131,7 +131,7 @@ uint8_t Hll4Array<A>::getSlot(uint32_t slotNo) const {
 
 template<typename A>
 uint8_t Hll4Array<A>::adjustRawValue(uint32_t slot, uint8_t value) const {
-  if (value != hll_constants::AUX_TOKEN) return value + this->curMin_;
+  if (value != hll_constants::AUX_TOKEN) { return value + this->curMin_; }
   return auxHashMap_->mustFindValueFor(slot);
 }
 
diff --git a/hll/include/HllArray-internal.hpp b/hll/include/HllArray-internal.hpp
index 8986f068..62ea7f78 100644
--- a/hll/include/HllArray-internal.hpp
+++ b/hll/include/HllArray-internal.hpp
@@ -142,15 +142,16 @@ HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len, const A& allocat
   HllArray<A>* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
   sketch->putCurMin(curMin);
   sketch->putOutOfOrderFlag(oooFlag);
-  if (!oooFlag) sketch->putHipAccum(hip);
+  if (!oooFlag) { sketch->putHipAccum(hip); }
   sketch->putKxQ0(kxq0);
   sketch->putKxQ1(kxq1);
   sketch->putNumAtCurMin(numAtCurMin);
 
   std::memcpy(sketch->hllByteArr_.data(), data + hll_constants::HLL_BYTE_ARR_START, arrayBytes);
 
-  if (auxHashMap != nullptr)
+  if (auxHashMap != nullptr) {
     ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
+  }
 
   aux_ptr.release();
   return sketch;
@@ -193,7 +194,7 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
   const auto hip = read<double>(is);
   const auto kxq0 = read<double>(is);
   const auto kxq1 = read<double>(is);
-  if (!oooFlag) sketch->putHipAccum(hip);
+  if (!oooFlag) { sketch->putHipAccum(hip); }
   sketch->putKxQ0(kxq0);
   sketch->putKxQ1(kxq1);
 
@@ -209,8 +210,7 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
     ((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
   }
 
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream"); 
+  if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } 
 
   return sketch_ptr.release();
 }
@@ -545,7 +545,7 @@ template<typename A>
 void HllArray<A>::hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue) {
   const uint32_t configK = 1 << this->getLgConfigK();
   // update hip BEFORE updating kxq
-  if (!oooFlag_) hipAccum_ += configK / (kxq0_ + kxq1_);
+  if (!oooFlag_) { hipAccum_ += configK / (kxq0_ + kxq1_); }
   // update kxq0 and kxq1; subtract first, then add
   if (oldValue < 32) { kxq0_ -= INVERSE_POWERS_OF_2[oldValue]; }
   else               { kxq1_ -= INVERSE_POWERS_OF_2[oldValue]; }
@@ -648,7 +648,7 @@ array_(array), array_size_(array_size), index_(index), hll_type_(hll_type), exce
 {
   while (index_ < array_size_) {
     value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
-    if (all_ || value_ != hll_constants::EMPTY) break;
+    if (all_ || value_ != hll_constants::EMPTY) { break; }
     ++index_;
   }
 }
@@ -657,7 +657,7 @@ template<typename A>
 typename HllArray<A>::const_iterator& HllArray<A>::const_iterator::operator++() {
   while (++index_ < array_size_) {
     value_ = get_value(array_, index_, hll_type_, exceptions_, offset_);
-    if (all_ || value_ != hll_constants::EMPTY) break;
+    if (all_ || value_ != hll_constants::EMPTY) { break; }
   }
   return *this;
 }
diff --git a/hll/include/HllUnion-internal.hpp b/hll/include/HllUnion-internal.hpp
index 3a5a926c..27adab74 100644
--- a/hll/include/HllUnion-internal.hpp
+++ b/hll/include/HllUnion-internal.hpp
@@ -44,13 +44,13 @@ hll_sketch_alloc<A> hll_union_alloc<A>::get_result(target_hll_type target_type)
 
 template<typename A>
 void hll_union_alloc<A>::update(const hll_sketch_alloc<A>& sketch) {
-  if (sketch.is_empty()) return;
+  if (sketch.is_empty()) { return; }
   union_impl(sketch, lg_max_k_);
 }
 
 template<typename A>
 void hll_union_alloc<A>::update(hll_sketch_alloc<A>&& sketch) {
-  if (sketch.is_empty()) return;
+  if (sketch.is_empty()) { return; }
   if (gadget_.is_empty() && sketch.get_target_type() == HLL_8 && sketch.get_lg_config_k() <= lg_max_k_) {
     if (sketch.get_current_mode() == HLL || sketch.get_lg_config_k() == lg_max_k_) {
       gadget_ = std::move(sketch);
@@ -131,29 +131,33 @@ void hll_union_alloc<A>::coupon_update(uint32_t coupon) {
 
 template<typename A>
 double hll_union_alloc<A>::get_estimate() const {
-  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL)
+  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) {
     static_cast<HllArray<A>*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min();
+  }
   return gadget_.get_estimate();
 }
 
 template<typename A>
 double hll_union_alloc<A>::get_composite_estimate() const {
-  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL)
+  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) {
     static_cast<HllArray<A>*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min();
+  }
   return gadget_.get_composite_estimate();
 }
 
 template<typename A>
 double hll_union_alloc<A>::get_lower_bound(uint8_t num_std_dev) const {
-  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL)
+  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) {
     static_cast<HllArray<A>*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min();
+  }
   return gadget_.get_lower_bound(num_std_dev);
 }
 
 template<typename A>
 double hll_union_alloc<A>::get_upper_bound(uint8_t num_std_dev) const {
-  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL)
+  if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) {
     static_cast<HllArray<A>*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min();
+  }
   return gadget_.get_upper_bound(num_std_dev);
 }
 
diff --git a/hll/include/coupon_iterator-internal.hpp b/hll/include/coupon_iterator-internal.hpp
index 84133ffb..356517ec 100644
--- a/hll/include/coupon_iterator-internal.hpp
+++ b/hll/include/coupon_iterator-internal.hpp
@@ -28,7 +28,7 @@ template<typename A>
 coupon_iterator<A>::coupon_iterator(const uint32_t* array, size_t array_size, size_t index, bool all):
 array_(array), array_size_(array_size), index_(index), all_(all) {
   while (index_ < array_size_) {
-    if (all_ || array_[index_] != hll_constants::EMPTY) break;
+    if (all_ || array_[index_] != hll_constants::EMPTY) { break; }
     ++index_;
   }
 }
@@ -36,7 +36,7 @@ array_(array), array_size_(array_size), index_(index), all_(all) {
 template<typename A>
 coupon_iterator<A>& coupon_iterator<A>::operator++() {
   while (++index_ < array_size_) {
-    if (all_ || array_[index_] != hll_constants::EMPTY) break;
+    if (all_ || array_[index_] != hll_constants::EMPTY) { break; }
   }
   return *this;
 }

From b444a2ad2db9d70c4782e081cb4d92eb9b6cb8b8 Mon Sep 17 00:00:00 2001
From: syaojun <libevent@yeah.net>
Date: Mon, 23 Feb 2026 15:00:40 +0800
Subject: [PATCH 57/75] style(fi): Add braces to single-line if statements for
 consistency

---
 fi/include/frequent_items_sketch_impl.hpp  | 17 ++++++++---------
 fi/include/reverse_purge_hash_map_impl.hpp | 10 +++++-----
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fi/include/frequent_items_sketch_impl.hpp b/fi/include/frequent_items_sketch_impl.hpp
index acbd2ee1..3eba188b 100644
--- a/fi/include/frequent_items_sketch_impl.hpp
+++ b/fi/include/frequent_items_sketch_impl.hpp
@@ -45,13 +45,13 @@ map(
   allocator
 )
 {
-  if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
+  if (lg_start_map_size > lg_max_map_size) { throw std::invalid_argument("starting size must not be greater than maximum size"); }
 }
 
 template<typename T, typename W, typename H, typename E, typename A>
 void frequent_items_sketch<T, W, H, E, A>::update(const T& item, W weight) {
   check_weight(weight);
-  if (weight == 0) return;
+  if (weight == 0) { return; }
   total_weight += weight;
   offset += map.adjust_or_insert(item, weight);
 }
@@ -59,14 +59,14 @@ void frequent_items_sketch<T, W, H, E, A>::update(const T& item, W weight) {
 template<typename T, typename W, typename H, typename E, typename A>
 void frequent_items_sketch<T, W, H, E, A>::update(T&& item, W weight) {
   check_weight(weight);
-  if (weight == 0) return;
+  if (weight == 0) { return; }
   total_weight += weight;
   offset += map.adjust_or_insert(std::move(item), weight);
 }
 
 template<typename T, typename W, typename H, typename E, typename A>
 void frequent_items_sketch<T, W, H, E, A>::merge(const frequent_items_sketch& other) {
-  if (other.is_empty()) return;
+  if (other.is_empty()) { return; }
   const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
   for (auto it: other.map) {
     update(it.first, it.second);
@@ -77,7 +77,7 @@ void frequent_items_sketch<T, W, H, E, A>::merge(const frequent_items_sketch& ot
 
 template<typename T, typename W, typename H, typename E, typename A>
 void frequent_items_sketch<T, W, H, E, A>::merge(frequent_items_sketch&& other) {
-  if (other.is_empty()) return;
+  if (other.is_empty()) { return; }
   const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
   for (auto it: other.map) {
     update(std::move(it.first), it.second);
@@ -105,7 +105,7 @@ template<typename T, typename W, typename H, typename E, typename A>
 W frequent_items_sketch<T, W, H, E, A>::get_estimate(const T& item) const {
   // if item is tracked estimate = weight + offset, otherwise 0
   const W weight = map.get(item);
-  if (weight > 0) return weight + offset;
+  if (weight > 0) { return weight + offset; }
   return 0;
 }
 
@@ -210,7 +210,7 @@ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const Ser
 template<typename T, typename W, typename H, typename E, typename A>
 template<typename SerDe>
 size_t frequent_items_sketch<T, W, H, E, A>::get_serialized_size_bytes(const SerDe& sd) const {
-  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
+  if (is_empty()) { return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t); }
   size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
   for (auto it: map) size += sd.size_of_item(it.first);
   return size;
@@ -328,8 +328,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
     sketch.total_weight = total_weight;
     sketch.offset = offset;
   }
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream"); 
+  if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } 
   return sketch;
 }
 
diff --git a/fi/include/reverse_purge_hash_map_impl.hpp b/fi/include/reverse_purge_hash_map_impl.hpp
index fa2ad824..63909cf3 100644
--- a/fi/include/reverse_purge_hash_map_impl.hpp
+++ b/fi/include/reverse_purge_hash_map_impl.hpp
@@ -74,7 +74,7 @@ states_(nullptr)
       if (other.states_[i] > 0) {
         new (&keys_[i]) K(other.keys_[i]);
         values_[i] = other.values_[i];
-         if (--num == 0) break;
+         if (--num == 0) { break; }
       }
     }
   }
@@ -105,7 +105,7 @@ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
     for (uint32_t i = 0; i < size; i++) {
       if (is_active(i)) {
         keys_[i].~K();
-        if (--num_active_ == 0) break;
+        if (--num_active_ == 0) { break; }
       }
     }
   }
@@ -166,7 +166,7 @@ V reverse_purge_hash_map<K, V, H, E, A>::get(const K& key) const {
   const uint32_t mask = (1 << lg_cur_size_) - 1;
   uint32_t probe = fmix64(H()(key)) & mask;
   while (is_active(probe)) {
-    if (E()(keys_[probe], key)) return values_[probe];
+    if (E()(keys_[probe], key)) { return values_[probe]; }
     probe = (probe + 1) & mask;
   }
   return 0;
@@ -271,7 +271,7 @@ void reverse_purge_hash_map<K, V, H, E, A>::hash_delete(uint32_t delete_index) {
     probe = (probe + 1) & mask;
     drift++;
     // only used for theoretical analysis
-    if (drift >= DRIFT_LIMIT) throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT");
+    if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT"); }
   }
 }
 
@@ -289,7 +289,7 @@ uint32_t reverse_purge_hash_map<K, V, H, E, A>::internal_adjust_or_insert(const
     index = (index + 1) & mask;
     drift++;
     // only used for theoretical analysis
-    if (drift >= DRIFT_LIMIT) throw std::logic_error("drift limit reached");
+    if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift limit reached"); }
   }
   // adding the key and value to the table
   if (num_active_ > get_capacity()) {

From 4e92e0bd71db0d00af93e3be6fc8a3f9152173e1 Mon Sep 17 00:00:00 2001
From: syaojun <libevent@yeah.net>
Date: Mon, 23 Feb 2026 15:13:51 +0800
Subject: [PATCH 58/75] style(cpc): Fix missing braces in if statements in
 cpc/include

---
 cpc/include/cpc_sketch_impl.hpp | 69 +++++++++++++++++----------------
 cpc/include/cpc_union_impl.hpp  | 54 +++++++++++++-------------
 cpc/include/cpc_util.hpp        | 14 +++----
 cpc/include/icon_estimator.hpp  | 10 ++---
 cpc/include/u32_table_impl.hpp  | 32 +++++++--------
 5 files changed, 90 insertions(+), 89 deletions(-)

diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp
index 84709cdc..80f111f1 100644
--- a/cpc/include/cpc_sketch_impl.hpp
+++ b/cpc/include/cpc_sketch_impl.hpp
@@ -73,7 +73,7 @@ bool cpc_sketch_alloc<A>::is_empty() const {
 
 template<typename A>
 double cpc_sketch_alloc<A>::get_estimate() const {
-  if (!was_merged) return get_hip_estimate();
+  if (!was_merged) { return get_hip_estimate(); }
   return get_icon_estimate();
 }
 
@@ -92,7 +92,7 @@ double cpc_sketch_alloc<A>::get_lower_bound(unsigned kappa) const {
   if (kappa < 1 || kappa > 3) {
     throw std::invalid_argument("kappa must be 1, 2 or 3");
   }
-  if (!was_merged) return get_hip_confidence_lb<A>(*this, kappa);
+  if (!was_merged) { return get_hip_confidence_lb<A>(*this, kappa); }
   return get_icon_confidence_lb<A>(*this, kappa);
 }
 
@@ -101,13 +101,13 @@ double cpc_sketch_alloc<A>::get_upper_bound(unsigned kappa) const {
   if (kappa < 1 || kappa > 3) {
     throw std::invalid_argument("kappa must be 1, 2 or 3");
   }
-  if (!was_merged) return get_hip_confidence_ub<A>(*this, kappa);
+  if (!was_merged) { return get_hip_confidence_ub<A>(*this, kappa); }
   return get_icon_confidence_ub<A>(*this, kappa);
 }
 
 template<typename A>
 void cpc_sketch_alloc<A>::update(const std::string& value) {
-  if (value.empty()) return;
+  if (value.empty()) { return; }
   update(value.c_str(), value.length());
 }
 
@@ -173,15 +173,15 @@ void cpc_sketch_alloc<A>::update(float value) {
 }
 
 static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
-  if (lg_k > 26) throw std::logic_error("lg_k > 26");
+  if (lg_k > 26) { throw std::logic_error("lg_k > 26"); }
   const uint32_t k = 1 << lg_k;
   uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
-  if (col > 63) col = 63; // clip so that 0 <= col <= 63
+  if (col > 63) { col = 63; } // clip so that 0 <= col <= 63
   const uint32_t row = hash0 & (k - 1);
   uint32_t row_col = (row << 6) | col;
   // To avoid the hash table's "empty" value, we change the row of the following pair.
   // This case is extremely unlikely, but we might as well handle it.
-  if (row_col == UINT32_MAX) row_col ^= 1 << 6;
+  if (row_col == UINT32_MAX) { row_col ^= 1 << 6; }
   return row_col;
 }
 
@@ -195,7 +195,7 @@ void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
 template<typename A>
 void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
   const uint8_t col = row_col & 63;
-  if (col < first_interesting_column) return; // important speed optimization
+  if (col < first_interesting_column) { return; } // important speed optimization
   // window size is 0 until sketch is promoted from sparse to windowed
   if (sliding_window.size() == 0) {
     update_sparse(row_col);
@@ -208,26 +208,26 @@ template<typename A>
 void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
   const uint32_t k = 1 << lg_k;
   const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
-  if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
+  if (c32pre >= 3 * k) { throw std::logic_error("c32pre >= 3 * k"); } // C < 3K/32, in other words flavor == SPARSE
   bool is_novel = surprising_value_table.maybe_insert(row_col);
   if (is_novel) {
     num_coupons++;
     update_hip(row_col);
     const uint64_t c32post = static_cast<uint64_t>(num_coupons) << 5;
-    if (c32post >= 3 * k) promote_sparse_to_windowed(); // C >= 3K/32
+    if (c32post >= 3 * k) { promote_sparse_to_windowed(); } // C >= 3K/32
   }
 }
 
 // the flavor is HYBRID, PINNED, or SLIDING
 template<typename A>
 void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
-  if (window_offset > 56) throw std::logic_error("wrong window offset");
+  if (window_offset > 56) { throw std::logic_error("wrong window offset"); }
   const uint32_t k = 1 << lg_k;
   const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
-  if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
+  if (c32pre < 3 * k) { throw std::logic_error("c32pre < 3 * k"); } // C >= 3K/32, in other words flavor >= HYBRID
   const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
   const uint64_t w8pre = static_cast<uint64_t>(window_offset) << 3;
-  if (c8pre >= (27 + w8pre) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
+  if (c8pre >= (27 + w8pre) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + (K * window_offset)
 
   bool is_novel = false;
   const uint8_t col = row_col & 63;
@@ -235,7 +235,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
   if (col < window_offset) { // track the surprising 0's "before" the window
     is_novel = surprising_value_table.maybe_delete(row_col); // inverted logic
   } else if (col < window_offset + 8) { // track the 8 bits inside the window
-    if (col < window_offset) throw std::logic_error("col < window_offset");
+    if (col < window_offset) { throw std::logic_error("col < window_offset"); }
     const uint32_t row = row_col >> 6;
     const uint8_t old_bits = sliding_window[row];
     const uint8_t new_bits = old_bits | (1 << (col - window_offset));
@@ -244,7 +244,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
       is_novel = true;
     }
   } else { // track the surprising 1's "after" the window
-    if (col < window_offset + 8) throw std::logic_error("col < window_offset + 8");
+    if (col < window_offset + 8) { throw std::logic_error("col < window_offset + 8"); }
     is_novel = surprising_value_table.maybe_insert(row_col); // normal logic
   }
 
@@ -254,9 +254,9 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
     const uint64_t c8post = static_cast<uint64_t>(num_coupons) << 3;
     if (c8post >= (27 + w8pre) * k) {
       move_window();
-      if (window_offset < 1 || window_offset > 56) throw std::logic_error("wrong window offset");
+      if (window_offset < 1 || window_offset > 56) { throw std::logic_error("wrong window offset"); }
       const uint64_t w8post = static_cast<uint64_t>(window_offset) << 3;
-      if (c8post >= (27 + w8post) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
+      if (c8post >= (27 + w8post) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + (K * window_offset)
     }
   }
 }
@@ -276,7 +276,7 @@ template<typename A>
 void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
   const uint32_t k = 1 << lg_k;
   const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
-  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
+  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) { throw std::logic_error("wrong c32"); }
 
   sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it)
 
@@ -285,7 +285,7 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
   const uint32_t* old_slots = surprising_value_table.get_slots();
   const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
 
-  if (window_offset != 0) throw std::logic_error("window_offset != 0");
+  if (window_offset != 0) { throw std::logic_error("window_offset != 0"); }
 
   for (uint32_t i = 0; i < old_num_slots; i++) {
     const uint32_t row_col = old_slots[i];
@@ -297,7 +297,7 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
       } else {
         // cannot use u32_table::must_insert(), because it doesn't provide for growth
         const bool is_novel = new_table.maybe_insert(row_col);
-        if (!is_novel) throw std::logic_error("is_novel != true");
+        if (!is_novel) { throw std::logic_error("is_novel != true"); }
       }
     }
   }
@@ -308,17 +308,17 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
 template<typename A>
 void cpc_sketch_alloc<A>::move_window() {
   const uint8_t new_offset = window_offset + 1;
-  if (new_offset > 56) throw std::logic_error("new_offset > 56");
-  if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
+  if (new_offset > 56) { throw std::logic_error("new_offset > 56"); }
+  if (new_offset != determine_correct_offset(lg_k, num_coupons)) { throw std::logic_error("new_offset is wrong"); }
 
-  if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
+  if (sliding_window.size() == 0) { throw std::logic_error("no sliding window"); }
   const uint32_t k = 1 << lg_k;
 
   // Construct the full-sized bit matrix that corresponds to the sketch
   vector_u64 bit_matrix = build_bit_matrix();
 
   // refresh the KXP register on every 8th window shift.
-  if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
+  if ((new_offset & 0x7) == 0) { refresh_kxp(bit_matrix.data()); }
 
   surprising_value_table.clear(); // the new number of surprises will be about the same
 
@@ -339,14 +339,14 @@ void cpc_sketch_alloc<A>::move_window() {
       pattern = pattern ^ (static_cast<uint64_t>(1) << col); // erase the 1
       const uint32_t row_col = (i << 6) | col;
       const bool is_novel = surprising_value_table.maybe_insert(row_col);
-      if (!is_novel) throw std::logic_error("is_novel != true");
+      if (!is_novel) { throw std::logic_error("is_novel != true"); }
     }
   }
 
   window_offset = new_offset;
 
   first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored);
-  if (first_interesting_column > new_offset) first_interesting_column = new_offset; // corner case
+  if (first_interesting_column > new_offset) { first_interesting_column = new_offset; } // corner case
 }
 
 // The KXP register is a double with roughly 50 bits of precision, but
@@ -438,7 +438,7 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
       write(os, compressed.table_num_entries);
       // HIP values can be in two different places in the sequence of fields
       // this is the first HIP decision point
-      if (has_hip) write_hip(os);
+      if (has_hip) { write_hip(os); }
     }
     if (has_table) {
       write(os, compressed.table_data_words);
@@ -447,7 +447,7 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
       write(os, compressed.window_data_words);
     }
     // this is the second HIP decision point
-    if (has_hip && !(has_table && has_window)) write_hip(os);
+    if (has_hip && !(has_table && has_window)) { write_hip(os); }
     if (has_window) {
       write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
     }
@@ -494,7 +494,7 @@ auto cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_
       ptr += copy_to_mem(compressed.table_num_entries, ptr);
       // HIP values can be in two different places in the sequence of fields
       // this is the first HIP decision point
-      if (has_hip) ptr += copy_hip_to_mem(ptr);
+      if (has_hip) { ptr += copy_hip_to_mem(ptr); }
     }
     if (has_table) {
       ptr += copy_to_mem(compressed.table_data_words, ptr);
@@ -503,7 +503,7 @@ auto cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_
       ptr += copy_to_mem(compressed.window_data_words, ptr);
     }
     // this is the second HIP decision point
-    if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
+    if (has_hip && !(has_table && has_window)) { ptr += copy_hip_to_mem(ptr); }
     if (has_window) {
       ptr += copy_to_mem(compressed.window_data.data(), ptr, compressed.window_data_words * sizeof(uint32_t));
     }
@@ -511,7 +511,7 @@ auto cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_
       ptr += copy_to_mem(compressed.table_data.data(), ptr, compressed.table_data_words * sizeof(uint32_t));
     }
   }
-  if (ptr != bytes.data() + size) throw std::logic_error("serialized size mismatch");
+  if (ptr != bytes.data() + size) { throw std::logic_error("serialized size mismatch"); }
   return bytes;
 }
 
@@ -561,7 +561,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
       compressed.table_data.resize(compressed.table_data_words);
       read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
     }
-    if (!has_window) compressed.table_num_entries = num_coupons;
+    if (!has_window) { compressed.table_num_entries = num_coupons; }
   }
 
   uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
@@ -583,8 +583,9 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
   }
   uncompressed_state<A> uncompressed(allocator);
   get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
-  if (!is.good())
-    throw std::runtime_error("error reading from std::istream"); 
+  if (!is.good()) {
+    throw std::runtime_error("error reading from std::istream");
+  } 
   return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
       std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
 }
diff --git a/cpc/include/cpc_union_impl.hpp b/cpc/include/cpc_union_impl.hpp
index f277107f..673aa7a4 100644
--- a/cpc/include/cpc_union_impl.hpp
+++ b/cpc/include/cpc_union_impl.hpp
@@ -109,15 +109,15 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
         + std::to_string(seed_hash_sketch));
   }
   const auto src_flavor = sketch.determine_flavor();
-  if (cpc_sketch_alloc<A>::flavor::EMPTY == src_flavor) return;
+  if (cpc_sketch_alloc<A>::flavor::EMPTY == src_flavor) { return; }
 
-  if (sketch.get_lg_k() < lg_k) reduce_k(sketch.get_lg_k());
-  if (sketch.get_lg_k() < lg_k) throw std::logic_error("sketch lg_k < union lg_k");
+  if (sketch.get_lg_k() < lg_k) { reduce_k(sketch.get_lg_k()); }
+  if (sketch.get_lg_k() < lg_k) { throw std::logic_error("sketch lg_k < union lg_k"); }
 
-  if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit matrix are absent");
+  if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit matrix are absent"); }
 
   if (cpc_sketch_alloc<A>::flavor::SPARSE == src_flavor && accumulator != nullptr)  { // Case A
-    if (bit_matrix.size() > 0) throw std::logic_error("union bit_matrix is not expected");
+    if (bit_matrix.size() > 0) { throw std::logic_error("union bit_matrix is not expected"); }
     const auto initial_dest_flavor = accumulator->determine_flavor();
     if (cpc_sketch_alloc<A>::flavor::EMPTY != initial_dest_flavor &&
         cpc_sketch_alloc<A>::flavor::SPARSE != initial_dest_flavor) throw std::logic_error("wrong flavor");
@@ -138,24 +138,24 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
   }
 
   if (cpc_sketch_alloc<A>::flavor::SPARSE == src_flavor && bit_matrix.size() > 0)  { // Case B
-    if (accumulator != nullptr) throw std::logic_error("union accumulator != null");
+    if (accumulator != nullptr) { throw std::logic_error("union accumulator != null"); }
     or_table_into_matrix(sketch.surprising_value_table);
     return;
   }
 
   if (cpc_sketch_alloc<A>::flavor::HYBRID != src_flavor && cpc_sketch_alloc<A>::flavor::PINNED != src_flavor
-      && cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor");
+      && cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); }
 
   // source is past SPARSE mode, so make sure that dest is a bit matrix
   if (accumulator != nullptr) {
-    if (bit_matrix.size() > 0) throw std::logic_error("union bit matrix is not expected");
+    if (bit_matrix.size() > 0) { throw std::logic_error("union bit matrix is not expected"); }
     const auto dst_flavor = accumulator->determine_flavor();
     if (cpc_sketch_alloc<A>::flavor::EMPTY != dst_flavor && cpc_sketch_alloc<A>::flavor::SPARSE != dst_flavor) {
       throw std::logic_error("wrong flavor");
     }
     switch_to_bit_matrix();
   }
-  if (bit_matrix.size() == 0) throw std::logic_error("union bit_matrix is expected");
+  if (bit_matrix.size() == 0) { throw std::logic_error("union bit_matrix is expected"); }
 
   if (cpc_sketch_alloc<A>::flavor::HYBRID == src_flavor || cpc_sketch_alloc<A>::flavor::PINNED == src_flavor) { // Case C
     or_window_into_matrix(sketch.sliding_window, sketch.window_offset, sketch.get_lg_k());
@@ -165,7 +165,7 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
 
   // SLIDING mode involves inverted logic, so we can't just walk the source sketch.
   // Instead, we convert it to a bitMatrix that can be OR'ed into the destination.
-  if (cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D
+  if (cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); } // Case D
   vector_u64 src_matrix = sketch.build_bit_matrix();
   or_matrix_into_matrix(src_matrix, sketch.get_lg_k());
 }
@@ -173,20 +173,20 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
 template<typename A>
 cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result() const {
   if (accumulator != nullptr) {
-    if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
+    if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); }
     return get_result_from_accumulator();
   }
-  if (bit_matrix.size() == 0) throw std::logic_error("bit_matrix is expected");
+  if (bit_matrix.size() == 0) { throw std::logic_error("bit_matrix is expected"); }
   return get_result_from_bit_matrix();
 }
 
 template<typename A>
 cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
-  if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
+  if (lg_k != accumulator->get_lg_k()) { throw std::logic_error("lg_k != accumulator->lg_k"); }
   if (accumulator->get_num_coupons() == 0) {
     return cpc_sketch_alloc<A>(lg_k, seed, accumulator->get_allocator());
   }
-  if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
+  if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) { throw std::logic_error("wrong flavor"); }
   cpc_sketch_alloc<A> copy(*accumulator);
   copy.was_merged = true;
   return copy;
@@ -199,7 +199,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
 
   const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
   if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
-      && flavor != cpc_sketch_alloc<A>::flavor::SLIDING) throw std::logic_error("wrong flavor");
+      && flavor != cpc_sketch_alloc<A>::flavor::SLIDING) { throw std::logic_error("wrong flavor"); }
 
   const uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
 
@@ -208,7 +208,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
 
   // dynamically growing caused snowplow effect
   uint8_t table_lg_size = lg_k - 4; // K/16; in some cases this will end up being oversized
-  if (table_lg_size < 2) table_lg_size = 2;
+  if (table_lg_size < 2) { table_lg_size = 2; }
   u32_table<A> table(table_lg_size, 6 + lg_k, bit_matrix.get_allocator());
 
   // the following should work even when the offset is zero
@@ -229,14 +229,14 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
       pattern = pattern ^ (static_cast<uint64_t>(1) << col); // erase the 1
       const uint32_t row_col = (i << 6) | col;
       bool is_novel = table.maybe_insert(row_col);
-      if (!is_novel) throw std::logic_error("is_novel != true");
+      if (!is_novel) { throw std::logic_error("is_novel != true"); }
     }
   }
 
   // at this point we could shrink an oversized hash table, but the relative waste isn't very big
 
   uint8_t first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored);
-  if (first_interesting_column > offset) first_interesting_column = offset; // corner case
+  if (first_interesting_column > offset) { first_interesting_column = offset; } // corner case
 
   // HIP-related fields will contain zeros, and that is okay
   return cpc_sketch_alloc<A>(lg_k, num_coupons, first_interesting_column, std::move(table), std::move(sliding_window), false, 0, 0, seed);
@@ -260,9 +260,9 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
   // Using a golden ratio stride fixes the snowplow effect.
   const double golden = 0.6180339887498949025;
   uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
-  if (stride < 2) throw std::logic_error("stride < 2");
-  if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
-  if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
+  if (stride < 2) { throw std::logic_error("stride < 2"); }
+  if (stride == ((stride >> 1) << 1)) { stride += 1; } // force the stride to be odd
+  if (stride < 3 || stride >= num_slots) { throw std::out_of_range("stride out of range"); }
 
   for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
     j &= num_slots - 1;
@@ -290,7 +290,7 @@ void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
 
 template<typename A>
 void cpc_union_alloc<A>::or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k) {
-  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
+  if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); }
   const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
   const uint32_t src_k = 1 << src_lg_k;
   for (uint32_t src_row = 0; src_row < src_k; src_row++) {
@@ -300,7 +300,7 @@ void cpc_union_alloc<A>::or_window_into_matrix(const vector_bytes& sliding_windo
 
 template<typename A>
 void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k) {
-  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
+  if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); }
   const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
   const uint32_t src_k = 1 << src_lg_k;
   for (uint32_t src_row = 0; src_row < src_k; src_row++) {
@@ -310,11 +310,11 @@ void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64& src_matrix, uin
 
 template<typename A>
 void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
-  if (new_lg_k >= lg_k) throw std::logic_error("new LgK >= union lgK");
-  if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit_matrix are absent");
+  if (new_lg_k >= lg_k) { throw std::logic_error("new LgK >= union lgK"); }
+  if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit_matrix are absent"); }
 
   if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix
-    if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
+    if (accumulator != nullptr) { throw std::logic_error("accumulator is not null"); }
     vector_u64 old_matrix = std::move(bit_matrix);
     const uint8_t old_lg_k = lg_k;
     const uint32_t new_k = 1 << new_lg_k;
@@ -325,7 +325,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
   }
 
   if (accumulator != nullptr) { // downsample the unioner's sketch
-    if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
+    if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); }
     if (!accumulator->is_empty()) {
       cpc_sketch_alloc<A> old_accumulator(*accumulator);
       *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed, old_accumulator.get_allocator());
diff --git a/cpc/include/cpc_util.hpp b/cpc/include/cpc_util.hpp
index e5664951..c9da8ab7 100644
--- a/cpc/include/cpc_util.hpp
+++ b/cpc/include/cpc_util.hpp
@@ -25,19 +25,19 @@
 namespace datasketches {
 
 static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
-  if (y == 0) throw std::invalid_argument("divide_longs_rounding_up: bad argument");
+  if (y == 0) { throw std::invalid_argument("divide_longs_rounding_up: bad argument"); }
   const uint64_t quotient = x / y;
-  if (quotient * y == x) return (quotient);
-  else return quotient + 1;
+  if (quotient * y == x) { return (quotient); }
+  else { return quotient + 1; }
 }
 
 static inline uint8_t floor_log2_of_long(uint64_t x) {
-  if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
+  if (x < 1) { throw std::invalid_argument("floor_log2_of_long: bad argument"); }
   uint8_t p = 0;
   uint64_t y = 1;
   while (true) {
-    if (y == x) return p;
-    if (y > x) return p - 1;
+    if (y == x) { return p; }
+    if (y > x) { return p - 1; }
     p += 1;
     y <<= 1;
   }
@@ -98,7 +98,7 @@ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, ui
   }
 
 static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
-  if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
+  if ((length & 0x7) != 0) { throw std::invalid_argument("the length of the array must be a multiple of 8"); }
   uint32_t total = 0;
   uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
   fours = twos = ones = 0;
diff --git a/cpc/include/icon_estimator.hpp b/cpc/include/icon_estimator.hpp
index fb3c0c60..ade787e5 100644
--- a/cpc/include/icon_estimator.hpp
+++ b/cpc/include/icon_estimator.hpp
@@ -246,14 +246,14 @@ static inline double icon_exponential_approximation(double k, double c) {
 }
 
 static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
-  if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
-  if (c < 2) return ((c == 0) ? 0.0 : 1.0);
+  if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) { throw std::out_of_range("lg_k out of range"); }
+  if (c < 2) { return ((c == 0) ? 0.0 : 1.0); }
   const uint32_t k = 1 << lg_k;
   const double double_k = static_cast<double>(k);
   const double double_c = static_cast<double>(c);
   // Differing thresholds ensure that the approximated estimator is monotonically increasing.
   const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
-  if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
+  if (double_c > (threshold_factor * double_k)) { return icon_exponential_approximation(double_k, double_c); }
   const double factor = evaluate_polynomial(
       ICON_POLYNOMIAL_COEFFICIENTS,
       ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
@@ -265,8 +265,8 @@ static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
   // The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
   const double term = 1.0 + (ratio * ratio * ratio / 66.774757);
   const double result = double_c * factor * term;
-  if (result >= double_c) return result;
-  else return double_c;
+  if (result >= double_c) { return result; }
+  else { return double_c; }
 }
 
 } /* namespace datasketches */
diff --git a/cpc/include/u32_table_impl.hpp b/cpc/include/u32_table_impl.hpp
index 62cd7dac..85797bcf 100644
--- a/cpc/include/u32_table_impl.hpp
+++ b/cpc/include/u32_table_impl.hpp
@@ -43,8 +43,8 @@ num_valid_bits(num_valid_bits),
 num_items(0),
 slots(1ULL << lg_size, UINT32_MAX, allocator)
 {
-  if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
-  if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
+  if (lg_size < 2) { throw std::invalid_argument("lg_size must be >= 2"); }
+  if (num_valid_bits < 1 || num_valid_bits > 32) { throw std::invalid_argument("num_valid_bits must be between 1 and 32"); }
 }
 
 template<typename A>
@@ -71,8 +71,8 @@ void u32_table<A>::clear() {
 template<typename A>
 bool u32_table<A>::maybe_insert(uint32_t item) {
   const uint32_t index = lookup(item);
-  if (slots[index] == item) return false;
-  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
+  if (slots[index] == item) { return false; }
+  if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); }
   slots[index] = item;
   num_items++;
   if (U32_TABLE_UPSIZE_DENOM * num_items > U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) {
@@ -84,9 +84,9 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
 template<typename A>
 bool u32_table<A>::maybe_delete(uint32_t item) {
   const uint32_t index = lookup(item);
-  if (slots[index] == UINT32_MAX) return false;
-  if (slots[index] != item) throw std::logic_error("item does not exist");
-  if (num_items == 0) throw std::logic_error("delete error");
+  if (slots[index] == UINT32_MAX) { return false; }
+  if (slots[index] != item) { throw std::logic_error("item does not exist"); }
+  if (num_items == 0) { throw std::logic_error("delete error"); }
   // delete the item
   slots[index] = UINT32_MAX;
   num_items--;
@@ -129,7 +129,7 @@ uint32_t u32_table<A>::lookup(uint32_t item) const {
   const uint32_t mask = size - 1;
   const uint8_t shift = num_valid_bits - lg_size;
   uint32_t probe = item >> shift;
-  if (probe > mask) throw std::logic_error("probe out of range");
+  if (probe > mask) { throw std::logic_error("probe out of range"); }
   while (slots[probe] != item && slots[probe] != UINT32_MAX) {
     probe = (probe + 1) & mask;
   }
@@ -140,17 +140,17 @@ uint32_t u32_table<A>::lookup(uint32_t item) const {
 template<typename A>
 void u32_table<A>::must_insert(uint32_t item) {
   const uint32_t index = lookup(item);
-  if (slots[index] == item) throw std::logic_error("item exists");
-  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
+  if (slots[index] == item) { throw std::logic_error("item exists"); }
+  if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); }
   slots[index] = item;
 }
 
 template<typename A>
 void u32_table<A>::rebuild(uint8_t new_lg_size) {
-  if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
+  if (new_lg_size < 2) { throw std::logic_error("lg_size must be >= 2"); }
   const uint32_t old_size = 1 << lg_size;
   const uint32_t new_size = 1 << new_lg_size;
-  if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
+  if (new_size <= num_items) { throw std::logic_error("new_size <= num_items"); }
   vector_u32 old_slots = std::move(slots);
   slots = vector_u32(new_size, UINT32_MAX, old_slots.get_allocator());
   lg_size = new_lg_size;
@@ -169,7 +169,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
 // The result is nearly sorted, so make sure to use an efficient sort for that case
 template<typename A>
 auto u32_table<A>::unwrapping_get_items() const -> vector_u32 {
-  if (num_items == 0) return vector_u32(slots.get_allocator());
+  if (num_items == 0) { return vector_u32(slots.get_allocator()); }
   const uint32_t table_size = 1 << lg_size;
   vector_u32 result(num_items, 0, slots.get_allocator());
   size_t i = 0;
@@ -187,9 +187,9 @@ auto u32_table<A>::unwrapping_get_items() const -> vector_u32 {
   // the rest of the table is processed normally
   while (i < table_size) {
     const uint32_t item = slots[i++];
-    if (item != UINT32_MAX) result[l++] = item;
+    if (item != UINT32_MAX) { result[l++] = item; }
   }
-  if (l != r + 1) throw std::logic_error("unwrapping error");
+  if (l != r + 1) { throw std::logic_error("unwrapping error"); }
   return result;
 }
 
@@ -213,7 +213,7 @@ void u32_table<A>::merge(
     else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; }
     else                          { arr_c[c] = arr_b[b++]; }
   }
-  if (a != lim_a || b != lim_b) throw std::logic_error("merging error");
+  if (a != lim_a || b != lim_b) { throw std::logic_error("merging error"); }
 }
 
 // In applications where the input array is already nearly sorted,

From c9bf1e88a08f2d0fe58fddee5ed79a7ee703039b Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sun, 8 Mar 2026 22:44:06 +0900
Subject: [PATCH 59/75] refactor: rollback utf8 validation

---
 NOTICE                                        |   3 -
 common/CMakeLists.txt                         |  11 -
 common/include/third_party/utf8cpp/LICENSE    |  23 -
 common/include/third_party/utf8cpp/utf8.h     |  46 --
 .../third_party/utf8cpp/utf8/checked.h        | 359 -------------
 .../include/third_party/utf8cpp/utf8/core.h   | 500 ------------------
 .../include/third_party/utf8cpp/utf8/cpp11.h  |  70 ---
 .../include/third_party/utf8cpp/utf8/cpp17.h  |  96 ----
 .../include/third_party/utf8cpp/utf8/cpp20.h  | 124 -----
 .../third_party/utf8cpp/utf8/unchecked.h      | 286 ----------
 tuple/include/array_of_strings_sketch.hpp     |  27 +-
 .../include/array_of_strings_sketch_impl.hpp  |  13 -
 tuple/test/array_of_strings_sketch_test.cpp   |  12 -
 13 files changed, 22 insertions(+), 1548 deletions(-)
 delete mode 100644 common/include/third_party/utf8cpp/LICENSE
 delete mode 100644 common/include/third_party/utf8cpp/utf8.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/checked.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/core.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp11.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp17.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/cpp20.h
 delete mode 100644 common/include/third_party/utf8cpp/utf8/unchecked.h

diff --git a/NOTICE b/NOTICE
index 6a2376d9..11ba6f6c 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,6 +10,3 @@ The Apache Software Foundation (http://www.apache.org/).
 
 Prior to moving to ASF, the software for this project was developed at
 Yahoo Inc. (https://developer.yahoo.com).
-
-This product includes utf8cpp (https://github.com/nemtrif/utfcpp),
-licensed under the Boost Software License, Version 1.0.
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 2d5c7330..8514433b 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,14 +51,3 @@ install(FILES
       include/serde.hpp
       include/xxhash64.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
-
-install(FILES
-      include/third_party/utf8cpp/utf8.h
-      include/third_party/utf8cpp/utf8/checked.h
-      include/third_party/utf8cpp/utf8/core.h
-      include/third_party/utf8cpp/utf8/cpp11.h
-      include/third_party/utf8cpp/utf8/cpp17.h
-      include/third_party/utf8cpp/utf8/cpp20.h
-      include/third_party/utf8cpp/utf8/unchecked.h
-      include/third_party/utf8cpp/LICENSE
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp")
diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE
deleted file mode 100644
index 36b7cd93..00000000
--- a/common/include/third_party/utf8cpp/LICENSE
+++ /dev/null
@@ -1,23 +0,0 @@
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h
deleted file mode 100644
index b5135309..00000000
--- a/common/include/third_party/utf8cpp/utf8.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-/*
-To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro
-and set it to one of the values used by the __cplusplus predefined macro.
-
-For instance,
-    #define UTF_CPP_CPLUSPLUS 199711L
-will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard.
-Some library features will be disabled.
-
-If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus.
-*/
-
-#include "utf8/checked.h"
-#include "utf8/unchecked.h"
-
-#endif // header guard
diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h
deleted file mode 100644
index 96ceb4d5..00000000
--- a/common/include/third_party/utf8cpp/utf8/checked.h
+++ /dev/null
@@ -1,359 +0,0 @@
-// Copyright 2006-2016 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "core.h"
-#include <stdexcept>
-
-namespace utf8
-{
-    // Base for the exceptions that may be thrown from the library
-    class exception : public ::std::exception {
-    };
-
-    // Exceptions that may be thrown from the library functions.
-    class invalid_code_point : public exception {
-        utfchar32_t cp;
-    public:
-        invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {}
-        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
-        utfchar32_t code_point() const {return cp;}
-    };
-
-    class invalid_utf8 : public exception {
-        utfchar8_t u8;
-    public:
-        invalid_utf8 (utfchar8_t u) : u8(u) {}
-        invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {}
-        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
-        utfchar8_t utf8_octet() const {return u8;}
-    };
-
-    class invalid_utf16 : public exception {
-        utfchar16_t u16;
-    public:
-        invalid_utf16 (utfchar16_t u) : u16(u) {}
-        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
-        utfchar16_t utf16_word() const {return u16;}
-    };
-
-    class not_enough_room : public exception {
-    public:
-        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
-    };
-
-    /// The library API - functions intended to be called by the users
-
-    template <typename octet_iterator>
-    octet_iterator append(utfchar32_t cp, octet_iterator result)
-    {
-        if (!utf8::internal::is_code_point_valid(cp))
-            throw invalid_code_point(cp);
-
-        return internal::append(cp, result);
-    }
-
-    inline void append(utfchar32_t cp, std::string& s)
-    {
-        append(cp, std::back_inserter(s));
-    }
-
-    template <typename word_iterator>
-    word_iterator append16(utfchar32_t cp, word_iterator result)
-    {
-        if (!utf8::internal::is_code_point_valid(cp))
-            throw invalid_code_point(cp);
-
-        return internal::append16(cp, result);
-    }
-
-    template <typename octet_iterator, typename output_iterator>
-    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
-    {
-        while (start != end) {
-            octet_iterator sequence_start = start;
-            internal::utf_error err_code = utf8::internal::validate_next(start, end);
-            switch (err_code) {
-                case internal::UTF8_OK :
-                    for (octet_iterator it = sequence_start; it != start; ++it)
-                        *out++ = *it;
-                    break;
-                case internal::NOT_ENOUGH_ROOM:
-                    out = utf8::append (replacement, out);
-                    start = end;
-                    break;
-                case internal::INVALID_LEAD:
-                    out = utf8::append (replacement, out);
-                    ++start;
-                    break;
-                case internal::INCOMPLETE_SEQUENCE:
-                case internal::OVERLONG_SEQUENCE:
-                case internal::INVALID_CODE_POINT:
-                    out = utf8::append (replacement, out);
-                    ++start;
-                    // just one replacement mark for the sequence
-                    while (start != end && utf8::internal::is_trail(*start))
-                        ++start;
-                    break;
-            }
-        }
-        return out;
-    }
-
-    template <typename octet_iterator, typename output_iterator>
-    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
-    {
-        static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
-        return utf8::replace_invalid(start, end, out, replacement_marker);
-    }
-
-    inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-        return result;
-    }
-
-    inline std::string replace_invalid(const std::string& s)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    template <typename octet_iterator>
-    utfchar32_t next(octet_iterator& it, octet_iterator end)
-    {
-        utfchar32_t cp = 0;
-        internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
-        switch (err_code) {
-            case internal::UTF8_OK :
-                break;
-            case internal::NOT_ENOUGH_ROOM :
-                throw not_enough_room();
-            case internal::INVALID_LEAD :
-            case internal::INCOMPLETE_SEQUENCE :
-            case internal::OVERLONG_SEQUENCE :
-                throw invalid_utf8(static_cast<utfchar8_t>(*it));
-            case internal::INVALID_CODE_POINT :
-                throw invalid_code_point(cp);
-        }
-        return cp;
-    }
-
-    template <typename word_iterator>
-    utfchar32_t next16(word_iterator& it, word_iterator end)
-    {
-        utfchar32_t cp = 0;
-        internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp);
-        if (err_code == internal::NOT_ENOUGH_ROOM)
-            throw not_enough_room();
-        return cp;
-    }
-
-    template <typename octet_iterator>
-    utfchar32_t peek_next(octet_iterator it, octet_iterator end)
-    {
-        return utf8::next(it, end);
-    }
-
-    template <typename octet_iterator>
-    utfchar32_t prior(octet_iterator& it, octet_iterator start)
-    {
-        // can't do much if it == start
-        if (it == start)
-            throw not_enough_room();
-
-        octet_iterator end = it;
-        // Go back until we hit either a lead octet or start
-        while (utf8::internal::is_trail(*(--it)))
-            if (it == start)
-                throw invalid_utf8(*it); // error - no lead byte in the sequence
-        return utf8::peek_next(it, end);
-    }
-
-    template <typename octet_iterator, typename distance_type>
-    void advance (octet_iterator& it, distance_type n, octet_iterator end)
-    {
-        const distance_type zero(0);
-        if (n < zero) {
-            // backward
-            for (distance_type i = n; i < zero; ++i)
-                utf8::prior(it, end);
-        } else {
-            // forward
-            for (distance_type i = zero; i < n; ++i)
-                utf8::next(it, end);
-        }
-    }
-
-    template <typename octet_iterator>
-    typename std::iterator_traits<octet_iterator>::difference_type
-    distance (octet_iterator first, octet_iterator last)
-    {
-        typename std::iterator_traits<octet_iterator>::difference_type dist;
-        for (dist = 0; first < last; ++dist)
-            utf8::next(first, last);
-        return dist;
-    }
-
-    template <typename u16bit_iterator, typename octet_iterator>
-    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
-    {
-        while (start != end) {
-            utfchar32_t cp = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
-            // Take care of surrogate pairs first
-            if (utf8::internal::is_lead_surrogate(cp)) {
-                if (start != end) {
-                    const utfchar32_t trail_surrogate = static_cast<utfchar32_t>(utf8::internal::mask16(*start++));
-                    if (utf8::internal::is_trail_surrogate(trail_surrogate))
-                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
-                    else
-                        throw invalid_utf16(static_cast<utfchar16_t>(trail_surrogate));
-                }
-                else
-                    throw invalid_utf16(static_cast<utfchar16_t>(cp));
-
-            }
-            // Lone trail surrogate
-            else if (utf8::internal::is_trail_surrogate(cp))
-                throw invalid_utf16(static_cast<utfchar16_t>(cp));
-
-            result = utf8::append(cp, result);
-        }
-        return result;
-    }
-
-    template <typename u16bit_iterator, typename octet_iterator>
-    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
-    {
-        while (start < end) {
-            const utfchar32_t cp = utf8::next(start, end);
-            if (cp > 0xffff) { //make a surrogate pair
-                *result++ = static_cast<utfchar16_t>((cp >> 10)   + internal::LEAD_OFFSET);
-                *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
-            }
-            else
-                *result++ = static_cast<utfchar16_t>(cp);
-        }
-        return result;
-    }
-
-    template <typename octet_iterator, typename u32bit_iterator>
-    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
-    {
-        while (start != end)
-            result = utf8::append(*(start++), result);
-
-        return result;
-    }
-
-    template <typename octet_iterator, typename u32bit_iterator>
-    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
-    {
-        while (start < end)
-            (*result++) = utf8::next(start, end);
-
-        return result;
-    }
-
-    // The iterator class
-    template <typename octet_iterator>
-    class iterator {
-      octet_iterator it;
-      octet_iterator range_start;
-      octet_iterator range_end;
-      public:
-      typedef utfchar32_t value_type;
-      typedef utfchar32_t* pointer;
-      typedef utfchar32_t& reference;
-      typedef std::ptrdiff_t difference_type;
-      typedef std::bidirectional_iterator_tag iterator_category;
-      iterator () {}
-      explicit iterator (const octet_iterator& octet_it,
-                         const octet_iterator& rangestart,
-                         const octet_iterator& rangeend) :
-               it(octet_it), range_start(rangestart), range_end(rangeend)
-      {
-          if (it < range_start || it > range_end)
-              throw std::out_of_range("Invalid utf-8 iterator position");
-      }
-      // the default "big three" are OK
-      octet_iterator base () const { return it; }
-      utfchar32_t operator * () const
-      {
-          octet_iterator temp = it;
-          return utf8::next(temp, range_end);
-      }
-      bool operator == (const iterator& rhs) const
-      {
-          if (range_start != rhs.range_start || range_end != rhs.range_end)
-              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
-          return (it == rhs.it);
-      }
-      bool operator != (const iterator& rhs) const
-      {
-          return !(operator == (rhs));
-      }
-      iterator& operator ++ ()
-      {
-          utf8::next(it, range_end);
-          return *this;
-      }
-      iterator operator ++ (int)
-      {
-          iterator temp = *this;
-          utf8::next(it, range_end);
-          return temp;
-      }
-      iterator& operator -- ()
-      {
-          utf8::prior(it, range_start);
-          return *this;
-      }
-      iterator operator -- (int)
-      {
-          iterator temp = *this;
-          utf8::prior(it, range_start);
-          return temp;
-      }
-    }; // class iterator
-
-} // namespace utf8
-
-#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
-#include "cpp20.h"
-#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
-#include "cpp17.h"
-#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
-#include "cpp11.h"
-#endif // C++ 11 or later
-
-#endif //header guard
-
diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h
deleted file mode 100644
index 8e128c18..00000000
--- a/common/include/third_party/utf8cpp/utf8/core.h
+++ /dev/null
@@ -1,500 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include <iterator>
-#include <cstring>
-#include <string>
-
-// Determine the C++ standard version.
-// If the user defines UTF_CPP_CPLUSPLUS, use that.
-// Otherwise, trust the unreliable predefined macro __cplusplus
-
-#if !defined UTF_CPP_CPLUSPLUS
-    #define UTF_CPP_CPLUSPLUS __cplusplus
-#endif
-
-#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
-    #define UTF_CPP_OVERRIDE override
-    #define UTF_CPP_NOEXCEPT noexcept
-    #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert");
-#else // C++ 98/03
-    #define UTF_CPP_OVERRIDE
-    #define UTF_CPP_NOEXCEPT throw()
-    // Not worth simulating static_assert:
-    #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition);
-#endif // C++ 11 or later
-
-
-namespace utf8
-{
-// The typedefs for 8-bit, 16-bit and 32-bit code units
-#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
-    #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later
-        typedef char8_t         utfchar8_t;
-    #else // C++ 11/14/17
-        typedef unsigned char   utfchar8_t;
-    #endif
-    typedef char16_t        utfchar16_t;
-    typedef char32_t        utfchar32_t;
-#else // C++ 98/03
-    typedef unsigned char   utfchar8_t;
-    typedef unsigned short  utfchar16_t;
-    typedef unsigned int    utfchar32_t;
-#endif // C++ 11 or later
-
-// Helper code - not intended to be directly called by the library users. May be changed at any time
-namespace internal
-{
-    // Unicode constants
-    // Leading (high) surrogates: 0xd800 - 0xdbff
-    // Trailing (low) surrogates: 0xdc00 - 0xdfff
-    const utfchar16_t LEAD_SURROGATE_MIN  = 0xd800u;
-    const utfchar16_t LEAD_SURROGATE_MAX  = 0xdbffu;
-    const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u;
-    const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu;
-    const utfchar16_t LEAD_OFFSET         = 0xd7c0u;       // LEAD_SURROGATE_MIN - (0x10000 >> 10)
-    const utfchar32_t SURROGATE_OFFSET    = 0xfca02400u;   // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN
-
-    // Maximum valid value for a Unicode code point
-    const utfchar32_t CODE_POINT_MAX      = 0x0010ffffu;
-
-    template<typename octet_type>
-    inline utfchar8_t mask8(octet_type oc)
-    {
-        return static_cast<utfchar8_t>(0xff & oc);
-    }
-
-    template<typename u16_type>
-    inline utfchar16_t mask16(u16_type oc)
-    {
-        return static_cast<utfchar16_t>(0xffff & oc);
-    }
-
-    template<typename octet_type>
-    inline bool is_trail(octet_type oc)
-    {
-        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
-    }
-
-    inline bool is_lead_surrogate(utfchar32_t cp)
-    {
-        return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(LEAD_SURROGATE_MAX));
-    }
-
-    inline bool is_trail_surrogate(utfchar32_t cp)
-    {
-        return (cp >= static_cast<utfchar32_t>(TRAIL_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
-    }
-
-    inline bool is_surrogate(utfchar32_t cp)
-    {
-        return (cp >= static_cast<utfchar32_t>(LEAD_SURROGATE_MIN) && cp <= static_cast<utfchar32_t>(TRAIL_SURROGATE_MAX));
-    }
-
-    inline bool is_code_point_valid(utfchar32_t cp)
-    {
-        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
-    }
-
-    inline bool is_in_bmp(utfchar32_t cp)
-    {
-        return cp < utfchar32_t(0x10000);
-    }
-
-    template <typename octet_iterator>
-    int sequence_length(octet_iterator lead_it)
-    {
-        const utfchar8_t lead = utf8::internal::mask8(*lead_it);
-        if (lead < 0x80)
-            return 1;
-        else if ((lead >> 5) == 0x6)
-            return 2;
-        else if ((lead >> 4) == 0xe)
-            return 3;
-        else if ((lead >> 3) == 0x1e)
-            return 4;
-        else
-            return 0;
-    }
-
-    inline bool is_overlong_sequence(utfchar32_t cp, int length)
-    {
-        if (cp < 0x80) {
-            if (length != 1)
-                return true;
-        }
-        else if (cp < 0x800) {
-            if (length != 2)
-                return true;
-        }
-        else if (cp < 0x10000) {
-            if (length != 3)
-                return true;
-        }
-        return false;
-    }
-
-    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
-
-    /// Helper for get_sequence_x
-    template <typename octet_iterator>
-    utf_error increase_safely(octet_iterator& it, const octet_iterator end)
-    {
-        if (++it == end)
-            return NOT_ENOUGH_ROOM;
-
-        if (!utf8::internal::is_trail(*it))
-            return INCOMPLETE_SEQUENCE;
-
-        return UTF8_OK;
-    }
-
-    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
-
-    /// get_sequence_x functions decode utf-8 sequences of the length x
-    template <typename octet_iterator>
-    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
-    {
-        if (it == end)
-            return NOT_ENOUGH_ROOM;
-
-        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
-
-        return UTF8_OK;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
-    {
-        if (it == end)
-            return NOT_ENOUGH_ROOM;
-
-        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
-
-        return UTF8_OK;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
-    {
-        if (it == end)
-            return NOT_ENOUGH_ROOM;
-
-        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
-
-        return UTF8_OK;
-    }
-
-    template <typename octet_iterator>
-    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
-    {
-        if (it == end)
-           return NOT_ENOUGH_ROOM;
-
-        code_point = static_cast<utfchar32_t>(utf8::internal::mask8(*it));
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = static_cast<utfchar32_t>(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff));
-
-        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
-
-        code_point = static_cast<utfchar32_t>(code_point + ((*it) & 0x3f));
-
-        return UTF8_OK;
-    }
-
-    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
-
-    template <typename octet_iterator>
-    utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point)
-    {
-        if (it == end)
-            return NOT_ENOUGH_ROOM;
-
-        // Save the original value of it so we can go back in case of failure
-        // Of course, it does not make much sense with i.e. stream iterators
-        octet_iterator original_it = it;
-
-        utfchar32_t cp = 0;
-        // Determine the sequence length based on the lead octet
-        const int length = utf8::internal::sequence_length(it);
-
-        // Get trail octets and calculate the code point
-        utf_error err = UTF8_OK;
-        switch (length) {
-            case 0:
-                return INVALID_LEAD;
-            case 1:
-                err = utf8::internal::get_sequence_1(it, end, cp);
-                break;
-            case 2:
-                err = utf8::internal::get_sequence_2(it, end, cp);
-            break;
-            case 3:
-                err = utf8::internal::get_sequence_3(it, end, cp);
-            break;
-            case 4:
-                err = utf8::internal::get_sequence_4(it, end, cp);
-            break;
-        }
-
-        if (err == UTF8_OK) {
-            // Decoding succeeded. Now, security checks...
-            if (utf8::internal::is_code_point_valid(cp)) {
-                if (!utf8::internal::is_overlong_sequence(cp, length)){
-                    // Passed! Return here.
-                    code_point = cp;
-                    ++it;
-                    return UTF8_OK;
-                }
-                else
-                    err = OVERLONG_SEQUENCE;
-            }
-            else
-                err = INVALID_CODE_POINT;
-        }
-
-        // Failure branch - restore the original value of the iterator
-        it = original_it;
-        return err;
-    }
-
-    template <typename octet_iterator>
-    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
-        utfchar32_t ignored;
-        return utf8::internal::validate_next(it, end, ignored);
-    }
-
-    template <typename word_iterator>
-    utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point)
-    {
-        // Make sure the iterator dereferences a large enough type
-        typedef typename std::iterator_traits<word_iterator>::value_type word_type;
-        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
-        // Check the edge case:
-        if (it == end)
-            return NOT_ENOUGH_ROOM;
-        // Save the original value of it so we can go back in case of failure
-        // Of course, it does not make much sense with i.e. stream iterators
-        word_iterator original_it = it;
-
-        utf_error err = UTF8_OK;
-
-        const utfchar16_t first_word = *it++;
-        if (!is_surrogate(first_word)) {
-            code_point = first_word;
-            return UTF8_OK;
-        }
-        else {
-            if (it == end)
-                err = NOT_ENOUGH_ROOM;
-            else if (is_lead_surrogate(first_word)) {
-                const utfchar16_t second_word = *it++;
-                if (is_trail_surrogate(static_cast<utfchar32_t>(second_word))) {
-                    code_point = static_cast<utfchar32_t>(first_word << 10) +  static_cast<utfchar32_t>(second_word) + SURROGATE_OFFSET;
-                    return UTF8_OK;
-                } else
-                    err = INCOMPLETE_SEQUENCE;
-
-            } else {
-                err = INVALID_LEAD;
-            }
-        }
-        // error branch
-        it = original_it;
-        return err;
-    }
-
-    // Internal implementation of both checked and unchecked append() function
-    // This function will be invoked by the overloads below, as they will know
-    // the octet_type.
-    template <typename octet_iterator, typename octet_type>
-    octet_iterator append(utfchar32_t cp, octet_iterator result) {
-        if (cp < 0x80)                        // one octet
-            *(result++) = static_cast<octet_type>(cp);
-        else if (cp < 0x800) {                // two octets
-            *(result++) = static_cast<octet_type>((cp >> 6)          | 0xc0);
-            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
-        }
-        else if (cp < 0x10000) {              // three octets
-            *(result++) = static_cast<octet_type>((cp >> 12)         | 0xe0);
-            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
-        }
-        else {                                // four octets
-            *(result++) = static_cast<octet_type>((cp >> 18)         | 0xf0);
-            *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
-            *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
-            *(result++) = static_cast<octet_type>((cp & 0x3f)        | 0x80);
-        }
-        return result;
-    }
-
-    // One of the following overloads will be invoked from the API calls
-
-    // A simple (but dangerous) case: the caller appends byte(s) to a char array
-    inline char* append(utfchar32_t cp, char* result) {
-        return append<char*, char>(cp, result);
-    }
-
-    // Hopefully, most common case: the caller uses back_inserter
-    // i.e. append(cp, std::back_inserter(str));
-    template<typename container_type>
-    std::back_insert_iterator<container_type> append
-            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
-        return append<std::back_insert_iterator<container_type>,
-            typename container_type::value_type>(cp, result);
-    }
-
-    // The caller uses some other kind of output operator - not covered above
-    // Note that in this case we are not able to determine octet_type
-    // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong.
-    template <typename octet_iterator>
-    octet_iterator append(utfchar32_t cp, octet_iterator result) {
-        return append<octet_iterator, utfchar8_t>(cp, result);
-    }
-
-    // Internal implementation of both checked and unchecked append16() function
-    // This function will be invoked by the overloads below, as they will know
-    // the word_type.
-    template <typename word_iterator, typename word_type>
-    word_iterator append16(utfchar32_t cp, word_iterator result) {
-        UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t));
-        if (is_in_bmp(cp))
-            *(result++) = static_cast<word_type>(cp);
-        else {
-            // Code points from the supplementary planes are encoded via surrogate pairs
-            *(result++) = static_cast<word_type>(LEAD_OFFSET + (cp >> 10));
-            *(result++) = static_cast<word_type>(TRAIL_SURROGATE_MIN + (cp & 0x3FF));
-        }
-        return result;
-    }
-
-    // Hopefully, most common case: the caller uses back_inserter
-    // i.e. append16(cp, std::back_inserter(str));
-    template<typename container_type>
-    std::back_insert_iterator<container_type> append16
-            (utfchar32_t cp, std::back_insert_iterator<container_type> result) {
-        return append16<std::back_insert_iterator<container_type>,
-            typename container_type::value_type>(cp, result);
-    }
-
-    // The caller uses some other kind of output operator - not covered above
-    // Note that in this case we are not able to determine word_type
-    // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong.
-    template <typename word_iterator>
-    word_iterator append16(utfchar32_t cp, word_iterator result) {
-        return append16<word_iterator, utfchar16_t>(cp, result);
-    }
-
-} // namespace internal
-
-    /// The library API - functions intended to be called by the users
-
-    // Byte order mark
-    const utfchar8_t bom[] = {0xef, 0xbb, 0xbf};
-
-    template <typename octet_iterator>
-    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
-    {
-        octet_iterator result = start;
-        while (result != end) {
-            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
-            if (err_code != internal::UTF8_OK)
-                return result;
-        }
-        return result;
-    }
-
-    inline const char* find_invalid(const char* str)
-    {
-        const char* end = str + std::strlen(str);
-        return find_invalid(str, end);
-    }
-
-    inline std::size_t find_invalid(const std::string& s)
-    {
-        std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
-        return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
-    }
-
-    template <typename octet_iterator>
-    inline bool is_valid(octet_iterator start, octet_iterator end)
-    {
-        return (utf8::find_invalid(start, end) == end);
-    }
-
-    inline bool is_valid(const char* str)
-    {
-        return (*(utf8::find_invalid(str)) == '\0');
-    }
-
-    inline bool is_valid(const std::string& s)
-    {
-        return is_valid(s.begin(), s.end());
-    }
-
-
-
-    template <typename octet_iterator>
-    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
-    {
-        return (
-            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
-            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
-            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
-           );
-    }
-
-    inline bool starts_with_bom(const std::string& s)
-    {
-        return starts_with_bom(s.begin(), s.end());
-    }
-} // namespace utf8
-
-#endif // header guard
-
diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h
deleted file mode 100644
index 691633c8..00000000
--- a/common/include/third_party/utf8cpp/utf8/cpp11.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2018 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
-#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1
-
-#include "checked.h"
-
-namespace utf8
-{
-    inline void append16(utfchar32_t cp, std::u16string& s)
-    {
-        append16(cp, std::back_inserter(s));
-    }
-
-    inline std::string utf16to8(const std::u16string& s)
-    {
-        std::string result;
-        utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u16string utf8to16(const std::string& s)
-    {
-        std::u16string result;
-        utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::string utf32to8(const std::u32string& s)
-    {
-        std::string result;
-        utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u32string utf8to32(const std::string& s)
-    {
-        std::u32string result;
-        utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-} // namespace utf8
-
-#endif // header guard
-
diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h
deleted file mode 100644
index 07587300..00000000
--- a/common/include/third_party/utf8cpp/utf8/cpp17.h
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright 2018 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
-#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
-
-#include "cpp11.h"
-
-namespace utf8
-{
-    inline std::string utf16to8(std::u16string_view s)
-    {
-        std::string result;
-        utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u16string utf8to16(std::string_view s)
-    {
-        std::u16string result;
-        utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::string utf32to8(std::u32string_view s)
-    {
-        std::string result;
-        utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u32string utf8to32(std::string_view s)
-    {
-        std::u32string result;
-        utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::size_t find_invalid(std::string_view s)
-    {
-        std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
-        return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
-    }
-
-    inline bool is_valid(std::string_view s)
-    {
-        return is_valid(s.begin(), s.end());
-    }
-
-    inline std::string replace_invalid(std::string_view s, char32_t replacement)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-        return result;
-    }
-
-    inline std::string replace_invalid(std::string_view s)
-    {
-        std::string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline bool starts_with_bom(std::string_view s)
-    {
-        return starts_with_bom(s.begin(), s.end());
-    }
-
-} // namespace utf8
-
-#endif // header guard
-
diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h
deleted file mode 100644
index 07b61d0f..00000000
--- a/common/include/third_party/utf8cpp/utf8/cpp20.h
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright 2022 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
-#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9
-
-#include "cpp17.h"
-
-namespace utf8
-{
-    inline std::u8string utf16tou8(const std::u16string& s)
-    {
-        std::u8string result;
-        utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u8string utf16tou8(std::u16string_view s)
-    {
-        std::u8string result;
-        utf16to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u16string utf8to16(const std::u8string& s)
-    {
-        std::u16string result;
-        utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u16string utf8to16(const std::u8string_view& s)
-    {
-        std::u16string result;
-        utf8to16(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u8string utf32tou8(const std::u32string& s)
-    {
-        std::u8string result;
-        utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u8string utf32tou8(const std::u32string_view& s)
-    {
-        std::u8string result;
-        utf32to8(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u32string utf8to32(const std::u8string& s)
-    {
-        std::u32string result;
-        utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::u32string utf8to32(const std::u8string_view& s)
-    {
-        std::u32string result;
-        utf8to32(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline std::size_t find_invalid(const std::u8string& s)
-    {
-        std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end());
-        return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
-    }
-
-    inline bool is_valid(const std::u8string& s)
-    {
-        return is_valid(s.begin(), s.end());
-    }
-
-    inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement)
-    {
-        std::u8string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-        return result;
-    }
-
-    inline std::u8string replace_invalid(const std::u8string& s)
-    {
-        std::u8string result;
-        replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-        return result;
-    }
-
-    inline bool starts_with_bom(const std::u8string& s)
-    {
-        return starts_with_bom(s.begin(), s.end());
-    }
- 
-} // namespace utf8
-
-#endif // header guard
-
diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h
deleted file mode 100644
index 173d0302..00000000
--- a/common/include/third_party/utf8cpp/utf8/unchecked.h
+++ /dev/null
@@ -1,286 +0,0 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "core.h"
-
-namespace utf8
-{
-    namespace unchecked
-    {
-        template <typename octet_iterator>
-        octet_iterator append(utfchar32_t cp, octet_iterator result)
-        {
-            return internal::append(cp, result);
-        }
-
-        template <typename word_iterator>
-        word_iterator append16(utfchar32_t cp, word_iterator result)
-        {
-            return internal::append16(cp, result);
-        }
-
-        template <typename octet_iterator, typename output_iterator>
-        output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
-        {
-            while (start != end) {
-                octet_iterator sequence_start = start;
-                internal::utf_error err_code = utf8::internal::validate_next(start, end);
-                switch (err_code) {
-                    case internal::UTF8_OK :
-                        for (octet_iterator it = sequence_start; it != start; ++it)
-                            *out++ = *it;
-                        break;
-                    case internal::NOT_ENOUGH_ROOM:
-                        out = utf8::unchecked::append(replacement, out);
-                        start = end;
-                        break;
-                    case internal::INVALID_LEAD:
-                        out = utf8::unchecked::append(replacement, out);
-                        ++start;
-                        break;
-                    case internal::INCOMPLETE_SEQUENCE:
-                    case internal::OVERLONG_SEQUENCE:
-                    case internal::INVALID_CODE_POINT:
-                        out = utf8::unchecked::append(replacement, out);
-                        ++start;
-                        // just one replacement mark for the sequence
-                        while (start != end && utf8::internal::is_trail(*start))
-                            ++start;
-                        break;
-                }
-            }
-            return out;
-        }
-
-        template <typename octet_iterator, typename output_iterator>
-        inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
-        {
-            static const utfchar32_t replacement_marker = static_cast<utfchar32_t>(utf8::internal::mask16(0xfffd));
-            return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
-        }
-
-        inline std::string replace_invalid(const std::string& s, utfchar32_t replacement)
-        {
-            std::string result;
-            replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
-            return result;
-        }
-
-        inline std::string replace_invalid(const std::string& s)
-        {
-            std::string result;
-            replace_invalid(s.begin(), s.end(), std::back_inserter(result));
-            return result;
-        }
-
-        template <typename octet_iterator>
-        utfchar32_t next(octet_iterator& it)
-        {
-            utfchar32_t cp = utf8::internal::mask8(*it);
-            switch (utf8::internal::sequence_length(it)) {
-                case 1:
-                    break;
-                case 2:
-                    ++it;
-                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
-                    break;
-                case 3:
-                    ++it;
-                    cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
-                    ++it;
-                    cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
-                    break;
-                case 4:
-                    ++it;
-                    cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
-                    ++it;
-                    cp = static_cast<utfchar32_t>(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff));
-                    ++it;
-                    cp = static_cast<utfchar32_t>(cp + ((*it) & 0x3f));
-                    break;
-            }
-            ++it;
-            return cp;
-        }
-
-        template <typename octet_iterator>
-        utfchar32_t peek_next(octet_iterator it)
-        {
-            return utf8::unchecked::next(it);
-        }
-
-        template <typename word_iterator>
-        utfchar32_t next16(word_iterator& it)
-        {
-            utfchar32_t cp = utf8::internal::mask16(*it++);
-            if (utf8::internal::is_lead_surrogate(cp))
-                return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET;
-            return cp;
-        }
-
-        template <typename octet_iterator>
-        utfchar32_t prior(octet_iterator& it)
-        {
-            while (utf8::internal::is_trail(*(--it))) ;
-            octet_iterator temp = it;
-            return utf8::unchecked::next(temp);
-        }
-
-        template <typename octet_iterator, typename distance_type>
-        void advance(octet_iterator& it, distance_type n)
-        {
-            const distance_type zero(0);
-            if (n < zero) {
-                // backward
-                for (distance_type i = n; i < zero; ++i)
-                    utf8::unchecked::prior(it);
-            } else {
-                // forward
-                for (distance_type i = zero; i < n; ++i)
-                    utf8::unchecked::next(it);
-            }
-        }
-
-        template <typename octet_iterator>
-        typename std::iterator_traits<octet_iterator>::difference_type
-        distance(octet_iterator first, octet_iterator last)
-        {
-            typename std::iterator_traits<octet_iterator>::difference_type dist;
-            for (dist = 0; first < last; ++dist)
-                utf8::unchecked::next(first);
-            return dist;
-        }
-
-        template <typename u16bit_iterator, typename octet_iterator>
-        octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result)
-        {
-            while (start != end) {
-                utfchar32_t cp = utf8::internal::mask16(*start++);
-                // Take care of surrogate pairs first
-                if (utf8::internal::is_lead_surrogate(cp)) {
-                    if (start == end)
-                        return result;
-                    utfchar32_t trail_surrogate = utf8::internal::mask16(*start++);
-                    cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
-                }
-                result = utf8::unchecked::append(cp, result);
-            }
-            return result;
-        }
-
-        template <typename u16bit_iterator, typename octet_iterator>
-        u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result)
-        {
-            while (start < end) {
-                utfchar32_t cp = utf8::unchecked::next(start);
-                if (cp > 0xffff) { //make a surrogate pair
-                    *result++ = static_cast<utfchar16_t>((cp >> 10)   + internal::LEAD_OFFSET);
-                    *result++ = static_cast<utfchar16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
-                }
-                else
-                    *result++ = static_cast<utfchar16_t>(cp);
-            }
-            return result;
-        }
-
-        template <typename octet_iterator, typename u32bit_iterator>
-        octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result)
-        {
-            while (start != end)
-                result = utf8::unchecked::append(*(start++), result);
-
-            return result;
-        }
-
-        template <typename octet_iterator, typename u32bit_iterator>
-        u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result)
-        {
-            while (start < end)
-                (*result++) = utf8::unchecked::next(start);
-
-            return result;
-        }
-
-        // The iterator class
-        template <typename octet_iterator>
-          class iterator {
-            octet_iterator it;
-            public:
-            typedef utfchar32_t value_type;
-            typedef utfchar32_t* pointer;
-            typedef utfchar32_t& reference;
-            typedef std::ptrdiff_t difference_type;
-            typedef std::bidirectional_iterator_tag iterator_category;
-            iterator () {}
-            explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
-            // the default "big three" are OK
-            octet_iterator base () const { return it; }
-            utfchar32_t operator * () const
-            {
-                octet_iterator temp = it;
-                return utf8::unchecked::next(temp);
-            }
-            bool operator == (const iterator& rhs) const
-            {
-                return (it == rhs.it);
-            }
-            bool operator != (const iterator& rhs) const
-            {
-                return !(operator == (rhs));
-            }
-            iterator& operator ++ ()
-            {
-                ::std::advance(it, utf8::internal::sequence_length(it));
-                return *this;
-            }
-            iterator operator ++ (int)
-            {
-                iterator temp = *this;
-                ::std::advance(it, utf8::internal::sequence_length(it));
-                return temp;
-            }
-            iterator& operator -- ()
-            {
-                utf8::unchecked::prior(it);
-                return *this;
-            }
-            iterator operator -- (int)
-            {
-                iterator temp = *this;
-                utf8::unchecked::prior(it);
-                return temp;
-            }
-          }; // class iterator
-
-    } // namespace utf8::unchecked
-} // namespace utf8
-
-#endif // header guard
-
diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp
index ac49fd5b..296c0a87 100644
--- a/tuple/include/array_of_strings_sketch.hpp
+++ b/tuple/include/array_of_strings_sketch.hpp
@@ -42,8 +42,16 @@ class default_array_of_strings_update_policy {
   void update(array_of_strings& array, const array_of_strings* input) const;
 };
 
-// serializer/deserializer for an array of strings
-// Requirements: all strings must be valid UTF-8 and array size must be <= 127.
+/**
+ * Serializer/deserializer for an array of strings.
+ *
+ * Requirements:
+ * - Array size must be <= 127.
+ *
+ * This serde does not perform UTF-8 validation. Callers must ensure strings
+ * are valid UTF-8 before serialization to guarantee interoperability with
+ * Java, Go, and Rust implementations.
+ */
 template<typename Allocator = std::allocator<array_of_strings>>
 struct default_array_of_strings_serde {
   using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
@@ -60,7 +68,6 @@ struct default_array_of_strings_serde {
   summary_allocator summary_allocator_;
   static void check_num_nodes(uint8_t num_nodes);
   static uint32_t compute_total_bytes(const array_of_strings& item);
-  static void check_utf8(const std::string& value);
 };
 
 /**
@@ -69,8 +76,18 @@ struct default_array_of_strings_serde {
 uint64_t hash_array_of_strings_key(const array_of_strings& key);
 
 /**
- * Extended class of compact_tuple_sketch for array of strings
- * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
+ * Extended class of compact_tuple_sketch for array of strings.
+ *
+ * Requirements:
+ * - Array size must be <= 127.
+ *
+ * UTF-8 compatibility:
+ * Serialized sketches are intended to be language and platform independent.
+ * Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings.
+ * This C++ implementation does not validate UTF-8; it is the caller's
+ * responsibility to ensure all strings are valid UTF-8 before calling update().
+ * Non-UTF-8 strings may serialize successfully but will fail to deserialize
+ * in other language implementations.
  */
 template<typename Allocator = std::allocator<array_of_strings>>
 class compact_array_of_strings_tuple_sketch:
diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 81045472..26751d66 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -22,9 +22,7 @@
 
 #include <stdexcept>
 
-#include "array_of_strings_sketch.hpp"
 #include "common_defs.hpp"
-#include "third_party/utf8cpp/utf8.h"
 
 namespace datasketches {
 
@@ -116,7 +114,6 @@ void default_array_of_strings_serde<Allocator>::serialize(
     write(os, num_nodes);
     const std::string* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
-      check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
       write(os, length);
       os.write(data[j].data(), length);
@@ -143,7 +140,6 @@ void default_array_of_strings_serde<Allocator>::deserialize(
         is.read(&value[0], length);
         if (!is) throw std::runtime_error("array_of_strings stream read failed");
       }
-      check_utf8(value);
       array[j] = std::move(value);
     }
     summary_allocator alloc(summary_allocator_);
@@ -166,7 +162,6 @@ size_t default_array_of_strings_serde<Allocator>::serialize(
     bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
     const std::string* data = items[i].data();
     for (uint8_t j = 0; j < num_nodes; ++j) {
-      check_utf8(data[j]);
       const uint32_t length = static_cast<uint32_t>(data[j].size());
 
       bytes_written += copy_to_mem(length, ptr8 + bytes_written);
@@ -200,7 +195,6 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
       if (length != 0) {
         bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
       }
-      check_utf8(value);
       array[j] = std::move(value);
     }
     summary_allocator alloc(summary_allocator_);
@@ -233,13 +227,6 @@ uint32_t default_array_of_strings_serde<Allocator>::compute_total_bytes(const ar
   return static_cast<uint32_t>(total);
 }
 
-template<typename Allocator>
-void default_array_of_strings_serde<Allocator>::check_utf8(const std::string& value) {
-  if (!utf8::is_valid(value.begin(), value.end())) {
-    throw std::runtime_error("array_of_strings contains invalid UTF-8");
-  }
-}
-
 } /* namespace datasketches */
 
 #endif
diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp
index dc21aceb..5507c071 100644
--- a/tuple/test/array_of_strings_sketch_test.cpp
+++ b/tuple/test/array_of_strings_sketch_test.cpp
@@ -256,18 +256,6 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
 TEST_CASE("aos serde validation", "[tuple_sketch]") {
   default_array_of_strings_serde<> serde;
 
-  SECTION("invalid utf8 rejected") {
-    array_of_strings array(1, "", std::allocator<std::string>());
-    const std::string invalid_utf8("\xC3\x28", 2);
-    array[0] = invalid_utf8;
-    std::stringstream ss;
-    ss.exceptions(std::ios::failbit | std::ios::badbit);
-    REQUIRE_THROWS_WITH(
-      serde.serialize(ss, &array, 1),
-      Catch::Matchers::Contains("invalid UTF-8")
-    );
-  }
-
   SECTION("too many nodes rejected") {
     array_of_strings array(128, "", std::allocator<std::string>());
     std::stringstream ss;

From bc447d2307cd6501119dfdfd6889946e35d709c2 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sat, 14 Mar 2026 00:42:35 +0900
Subject: [PATCH 60/75] fix: destroy in the failure after partial success

---
 .../include/array_of_strings_sketch_impl.hpp  | 93 ++++++++++++-------
 1 file changed, 58 insertions(+), 35 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 26751d66..7884c5e5 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -125,25 +125,39 @@ template<typename Allocator>
 void default_array_of_strings_serde<Allocator>::deserialize(
   std::istream& is, array_of_strings* items, unsigned num
 ) const {
-  for (unsigned i = 0; i < num; ++i) {
-    read<uint32_t>(is); // total_bytes
-    if (!is) throw std::runtime_error("array_of_strings stream read failed");
-    const uint8_t num_nodes = read<uint8_t>(is);
-    if (!is) throw std::runtime_error("array_of_strings stream read failed");
-    check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "");
-    for (uint8_t j = 0; j < num_nodes; ++j) {
-      const uint32_t length = read<uint32_t>(is);
-      if (!is) throw std::runtime_error("array_of_strings stream read failed");
-      std::string value(length, '\0');
-      if (length != 0) {
-        is.read(&value[0], length);
-        if (!is) throw std::runtime_error("array_of_strings stream read failed");
+  unsigned i = 0;
+  bool failure = false;
+  try {
+    for (; i < num; ++i) {
+      read<uint32_t>(is); // total_bytes
+      if (!is) { failure = true; break; }
+      const uint8_t num_nodes = read<uint8_t>(is);
+      if (!is) { failure = true; break; }
+      check_num_nodes(num_nodes);
+      array_of_strings array(num_nodes, "");
+      for (uint8_t j = 0; j < num_nodes; ++j) {
+        const uint32_t length = read<uint32_t>(is);
+        if (!is) { failure = true; break; }
+        std::string value(length, '\0');
+        if (length != 0) {
+          is.read(&value[0], length);
+          if (!is) { failure = true; break; }
+        }
+        array[j] = std::move(value);
       }
-      array[j] = std::move(value);
+      if (failure) break;
+      summary_allocator alloc(summary_allocator_);
+      std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
     }
+  } catch (std::istream::failure&) {
+    failure = true;
+  }
+  if (failure) {
     summary_allocator alloc(summary_allocator_);
-    std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
+    for (unsigned j = 0; j < i; ++j) {
+      std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
+    }
+    throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i));
   }
 }
 
@@ -177,28 +191,37 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
 ) const {
   const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
   size_t bytes_read = 0;
-
-  for (unsigned i = 0; i < num; ++i) {
-    check_memory_size(bytes_read + sizeof(uint32_t), capacity);
-    const size_t item_start = bytes_read;
-    uint32_t total_bytes;
-    bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
-    check_memory_size(item_start + total_bytes, capacity);
-    uint8_t num_nodes;
-    bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
-    check_num_nodes(num_nodes);
-    array_of_strings array(num_nodes, "");
-    for (uint8_t j = 0; j < num_nodes; ++j) {
-      uint32_t length;
-      bytes_read += copy_from_mem(ptr8 + bytes_read, length);
-      std::string value(length, '\0');
-      if (length != 0) {
-        bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+  unsigned i = 0;
+
+  try {
+    for (; i < num; ++i) {
+      check_memory_size(bytes_read + sizeof(uint32_t), capacity);
+      const size_t item_start = bytes_read;
+      uint32_t total_bytes;
+      bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
+      check_memory_size(item_start + total_bytes, capacity);
+      uint8_t num_nodes;
+      bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
+      check_num_nodes(num_nodes);
+      array_of_strings array(num_nodes, "");
+      for (uint8_t j = 0; j < num_nodes; ++j) {
+        uint32_t length;
+        bytes_read += copy_from_mem(ptr8 + bytes_read, length);
+        std::string value(length, '\0');
+        if (length != 0) {
+          bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+        }
+        array[j] = std::move(value);
       }
-      array[j] = std::move(value);
+      summary_allocator alloc(summary_allocator_);
+      std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
     }
+  } catch (...) {
     summary_allocator alloc(summary_allocator_);
-    std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
+    for (unsigned j = 0; j < i; ++j) {
+      std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
+    }
+    throw;
   }
   return bytes_read;
 }

From 12a5116abe1065f57c1858b4b5a3362103ebe018 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sat, 14 Mar 2026 01:13:20 +0900
Subject: [PATCH 61/75] fix: more stricter check

---
 .../include/array_of_strings_sketch_impl.hpp  | 40 +++++++++++++------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index 7884c5e5..f38dc1fb 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -107,17 +107,23 @@ template<typename Allocator>
 void default_array_of_strings_serde<Allocator>::serialize(
   std::ostream& os, const array_of_strings* items, unsigned num
 ) const {
-  for (unsigned i = 0; i < num; ++i) {
-    const uint32_t total_bytes = compute_total_bytes(items[i]);
-    const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
-    write(os, total_bytes);
-    write(os, num_nodes);
-    const std::string* data = items[i].data();
-    for (uint8_t j = 0; j < num_nodes; ++j) {
-      const uint32_t length = static_cast<uint32_t>(data[j].size());
-      write(os, length);
-      os.write(data[j].data(), length);
+  unsigned i = 0;
+  try {
+    for (; i < num; ++i) {
+      const uint32_t total_bytes = compute_total_bytes(items[i]);
+      const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
+      write(os, total_bytes);
+      write(os, num_nodes);
+      const std::string* data = items[i].data();
+      for (uint8_t j = 0; j < num_nodes; ++j) {
+        const uint32_t length = static_cast<uint32_t>(data[j].size());
+        write(os, length);
+        os.write(data[j].data(), length);
+      }
     }
+  } catch (std::runtime_error& e) {
+    if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
+    throw std::runtime_error("array_of_strings stream write failed at item " + std::to_string(i));
   }
 }
 
@@ -149,7 +155,7 @@ void default_array_of_strings_serde<Allocator>::deserialize(
       summary_allocator alloc(summary_allocator_);
       std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
     }
-  } catch (std::istream::failure&) {
+  } catch (...) {
     failure = true;
   }
   if (failure) {
@@ -191,6 +197,7 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
 ) const {
   const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
   size_t bytes_read = 0;
+
   unsigned i = 0;
 
   try {
@@ -200,15 +207,21 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
       uint32_t total_bytes;
       bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
       check_memory_size(item_start + total_bytes, capacity);
+
+      check_memory_size(bytes_read + sizeof(uint8_t), capacity);
       uint8_t num_nodes;
       bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
       check_num_nodes(num_nodes);
+
       array_of_strings array(num_nodes, "");
       for (uint8_t j = 0; j < num_nodes; ++j) {
+        check_memory_size(bytes_read + sizeof(uint32_t), capacity);
         uint32_t length;
         bytes_read += copy_from_mem(ptr8 + bytes_read, length);
+
         std::string value(length, '\0');
         if (length != 0) {
+          check_memory_size(bytes_read + length, capacity);
           bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
         }
         array[j] = std::move(value);
@@ -216,12 +229,13 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
       summary_allocator alloc(summary_allocator_);
       std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
     }
-  } catch (...) {
+  } catch (std::exception& e) {
     summary_allocator alloc(summary_allocator_);
     for (unsigned j = 0; j < i; ++j) {
       std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
     }
-    throw;
+    if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
+    throw std::runtime_error("array_of_strings bytes read failed at item " + std::to_string(i));
   }
   return bytes_read;
 }

From 7617df45a1f2d6e8cad54b31aa5b77b007214874 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Sat, 14 Mar 2026 01:33:23 +0900
Subject: [PATCH 62/75] refactor: change code for consistency

---
 tuple/include/array_of_strings_sketch_impl.hpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp
index f38dc1fb..400df477 100644
--- a/tuple/include/array_of_strings_sketch_impl.hpp
+++ b/tuple/include/array_of_strings_sketch_impl.hpp
@@ -155,8 +155,13 @@ void default_array_of_strings_serde<Allocator>::deserialize(
       summary_allocator alloc(summary_allocator_);
       std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
     }
-  } catch (...) {
-    failure = true;
+  } catch (std::exception& e) {
+    summary_allocator alloc(summary_allocator_);
+    for (unsigned j = 0; j < i; ++j) {
+      std::allocator_traits<summary_allocator>::destroy(alloc, &items[j]);
+    }
+    if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw;
+    throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i));
   }
   if (failure) {
     summary_allocator alloc(summary_allocator_);
@@ -219,9 +224,9 @@ size_t default_array_of_strings_serde<Allocator>::deserialize(
         uint32_t length;
         bytes_read += copy_from_mem(ptr8 + bytes_read, length);
 
+        check_memory_size(bytes_read + length, capacity);
         std::string value(length, '\0');
         if (length != 0) {
-          check_memory_size(bytes_read + length, capacity);
           bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
         }
         array[j] = std::move(value);

From c65472084a7f314163c12fe2b4476f0b6dcb7a9a Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 16 Mar 2026 15:21:27 +0900
Subject: [PATCH 63/75] doc: update utf8 compatibility about serde

---
 common/include/serde.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/include/serde.hpp b/common/include/serde.hpp
index ad20fe63..c4e46d7d 100644
--- a/common/include/serde.hpp
+++ b/common/include/serde.hpp
@@ -132,6 +132,11 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
 /// ItemsSketch<String> with ArrayOfStringsSerDe in Java.
 /// The length of each string is stored as a 32-bit integer (historically),
 /// which may be too wasteful. Treat this as an example.
+///
+/// This implementation treats std::string as an arbitrary byte container.
+/// It does not check whether string contents are valid UTF-8.
+///
+/// Use a UTF-8-validating SerDe when cross-language portability is required.
 template<>
 struct serde<std::string> {
   /// @copydoc serde::serialize

From 04104c04dffa15f47f07f65d476f2ee78b531b42 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 16 Mar 2026 15:23:11 +0900
Subject: [PATCH 64/75] doc: add comments about utf8 compatibility for tuple
 sketch

---
 tuple/include/tuple_sketch.hpp | 41 ++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index cbfd9f11..7b636a78 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -46,6 +46,11 @@ struct pair_extract_key {
 /**
  * Base class for Tuple sketch.
  * This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key.
+ *
+ * The Summary type may retain string values.
+ * For Summary containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
  */
 template<
   typename Summary,
@@ -253,6 +258,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given string.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * @param key string to update the sketch with
    * @param value to update the sketch with
    */
@@ -261,6 +269,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given unsigned 64-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * @param key uint64_t to update the sketch with
    * @param value to update the sketch with
    */
@@ -269,6 +280,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given signed 64-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * @param key int64_t to update the sketch with
    * @param value to update the sketch with
    */
@@ -277,6 +291,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given unsigned 32-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key uint32_t to update the sketch with
    * @param value to update the sketch with
@@ -286,6 +303,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given signed 32-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key int32_t to update the sketch with
    * @param value to update the sketch with
@@ -295,6 +315,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given unsigned 16-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key uint16_t to update the sketch with
    * @param value to update the sketch with
@@ -304,6 +327,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given signed 16-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key int16_t to update the sketch with
    * @param value to update the sketch with
@@ -313,6 +339,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given unsigned 8-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key uint8_t to update the sketch with
    * @param value to update the sketch with
@@ -322,6 +351,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given signed 8-bit integer.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key int8_t to update the sketch with
    * @param value to update the sketch with
@@ -331,6 +363,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given double-precision floating point value.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key double to update the sketch with
    * @param value to update the sketch with
@@ -340,6 +375,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
 
   /**
    * Update this sketch with a given floating point value.
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * For compatibility with Java implementation.
    * @param key float to update the sketch with
    * @param value to update the sketch with
@@ -357,6 +395,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
    * Otherwise two sketches that should represent overlapping sets will be disjoint
    * For instance, for signed 32-bit values call update(int32_t) method above,
    * which does widening conversion to int64_t, if compatibility with Java is expected
+   * If the summary contains strings and cross-language portability is required,
+   * callers should ensure that any strings in the summary
+   * use a compatible encoding (valid UTF-8).
    * @param key pointer to the data
    * @param length of the data in bytes
    * @param value to update the sketch with

From 1cfe24520492e331a732d6f640cd7ab705d93583 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 16 Mar 2026 15:23:20 +0900
Subject: [PATCH 65/75] doc: add comments about utf8 compatibility for sampling
 sketches

---
 sampling/include/ebpps_sketch.hpp   | 13 +++++++++++++
 sampling/include/var_opt_sketch.hpp |  9 +++++++++
 sampling/include/var_opt_union.hpp  |  6 +++++-
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/sampling/include/ebpps_sketch.hpp b/sampling/include/ebpps_sketch.hpp
index 038b5a30..615d37b8 100644
--- a/sampling/include/ebpps_sketch.hpp
+++ b/sampling/include/ebpps_sketch.hpp
@@ -50,6 +50,11 @@ namespace ebpps_constants {
  * The sample may be smaller than k and the resulting size of the sample potentially includes
  * a probabilistic component, meaning the resulting sample size is not always constant.
  *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
+ *
  * @author Jon Malkin
  */
 template<
@@ -71,6 +76,8 @@ class ebpps_sketch {
     /**
      * Updates this sketch with the given data item with the given weight.
      * This method takes an lvalue.
+     * If cross-language portability is required, callers should ensure that
+     * the input string uses a compatible encoding (valid UTF-8).
      * @param item an item from a stream of items
      * @param weight the weight of the item
      */
@@ -79,6 +86,8 @@ class ebpps_sketch {
     /**
      * Updates this sketch with the given data item with the given weight.
      * This method takes an rvalue.
+     * If cross-language portability is required, callers should ensure that
+     * the input string uses a compatible encoding (valid UTF-8).
      * @param item an item from a stream of items
      * @param weight the weight of the item
      */
@@ -87,6 +96,8 @@ class ebpps_sketch {
     /**
      * Merges the provided sketch into the current one.
      * This method takes an lvalue.
+     * If sketches contain strings, callers are responsible for ensuring that
+     * both sketches were built using compatible string encodings.
      * @param sketch the sketch to merge into the current object
      */
     void merge(const ebpps_sketch<T, A>& sketch);
@@ -94,6 +105,8 @@ class ebpps_sketch {
     /**
      * Merges the provided sketch into the current one.
      * This method takes an rvalue.
+     * If sketches contain strings, callers are responsible for ensuring that
+     * both sketches were built using compatible string encodings.
      * @param sketch the sketch to merge into the current object
      */
     void merge(ebpps_sketch<T, A>&& sketch);
diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 1324883c..6b157caa 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -57,6 +57,11 @@ namespace var_opt_constants {
  * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
  * subset sum estimation.
  *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
+ *
  * author Kevin Lang
  * author Jon Malkin
  */
@@ -111,6 +116,8 @@ class var_opt_sketch {
     /**
      * Updates this sketch with the given data item with the given weight.
      * This method takes an lvalue.
+     * If cross-language portability is required, callers should ensure that
+     * the input string uses a compatible encoding (valid UTF-8).
      * @param item an item from a stream of items
      * @param weight the weight of the item
      */
@@ -119,6 +126,8 @@ class var_opt_sketch {
     /**
      * Updates this sketch with the given data item with the given weight.
      * This method takes an rvalue.
+     * If cross-language portability is required, callers should ensure that
+     * the input string uses a compatible encoding (valid UTF-8).
      * @param item an item from a stream of items
      * @param weight the weight of the item
      */
diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp
index 0e4f76d8..68d1ac4b 100644
--- a/sampling/include/var_opt_union.hpp
+++ b/sampling/include/var_opt_union.hpp
@@ -65,13 +65,17 @@ class var_opt_union {
   /**
    * Updates this union with the given sketch
    * This method takes an lvalue.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param sk a sketch to add to the union
    */
   void update(const var_opt_sketch<T, A>& sk);
-  
+
   /**
    * Updates this union with the given sketch
    * This method takes an rvalue.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param sk a sketch to add to the union
    */
   void update(var_opt_sketch<T, A>&& sk);

From 14c20a636404874b858b374311b470bc5d64dcff Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 16 Mar 2026 15:23:35 +0900
Subject: [PATCH 66/75] doc: add comments about utf8 compatibility for
 frequency sketch

---
 fi/include/frequent_items_sketch.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp
index 0aa9514c..87ee174e 100644
--- a/fi/include/frequent_items_sketch.hpp
+++ b/fi/include/frequent_items_sketch.hpp
@@ -44,6 +44,11 @@ enum frequent_items_error_type {
  * Based on Java implementation here:
  * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
  * @author Alexander Saydakov
+ *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
  */
 template<
   typename T,
@@ -74,6 +79,8 @@ class frequent_items_sketch {
 
   /**
    * Update this sketch with an item and a positive weight (frequency count).
+   * If cross-language portability is required, callers should ensure that
+   * the input string uses a compatible encoding (valid UTF-8).
    * @param item for which the weight should be increased (lvalue)
    * @param weight the amount by which the weight of the item should be increased
    * A count of zero is a no-op, and a negative count will throw an exception.
@@ -82,6 +89,8 @@ class frequent_items_sketch {
 
   /**
    * Update this sketch with an item and a positive weight (frequency count).
+   * If cross-language portability is required, callers should ensure that
+   * the input string uses a compatible encoding (valid UTF-8).
    * @param item for which the weight should be increased (rvalue)
    * @param weight the amount by which the weight of the item should be increased
    * A count of zero is a no-op, and a negative count will throw an exception.
@@ -91,6 +100,8 @@ class frequent_items_sketch {
   /**
    * This function merges the other sketch into this one.
    * The other sketch may be of a different size.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param other sketch to be merged into this (lvalue)
    */
   void merge(const frequent_items_sketch& other);
@@ -98,6 +109,8 @@ class frequent_items_sketch {
   /**
    * This function merges the other sketch into this one.
    * The other sketch may be of a different size.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param other sketch to be merged into this (rvalue)
    */
   void merge(frequent_items_sketch&& other);

From a9b42755072b079fd90b29b9851adc121015c58e Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Mon, 16 Mar 2026 15:24:16 +0900
Subject: [PATCH 67/75] doc: add comments about utf8 compatibility for
 quantiles sketches

---
 kll/include/kll_sketch.hpp             | 11 ++++++++++-
 quantiles/include/quantiles_sketch.hpp |  9 +++++++++
 req/include/req_sketch.hpp             |  9 +++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp
index 904587a1..d672c419 100644
--- a/kll/include/kll_sketch.hpp
+++ b/kll/include/kll_sketch.hpp
@@ -46,6 +46,11 @@ namespace kll_constants {
  * and nearly optimal accuracy per retained item.
  * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
  *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
+ *
  * <p>This is a stochastic streaming sketch that enables near real-time analysis of the
  * approximate distribution of items from a very large stream in a single pass, requiring only
  * that the items are comparable.
@@ -56,7 +61,7 @@ namespace kll_constants {
  * <p>As of May 2020, this implementation produces serialized sketches which are binary-compatible
  * with the equivalent Java implementation only when template parameter T = float
  * (32-bit single precision values).
- * 
+ *
  * <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
  * item is defined as its index <i>(1 to N)</i> in inclusive mode
  * or <i>(0 to N-1)</i> in exclusive mode
@@ -225,6 +230,8 @@ class kll_sketch {
 
     /**
      * Updates this sketch with the given data item.
+     * If cross-language portability is required, callers should ensure that
+     * the input string uses a compatible encoding (valid UTF-8).
      * @param item from a stream of items
      */
     template<typename FwdT>
@@ -232,6 +239,8 @@ class kll_sketch {
 
     /**
      * Merges another sketch into this one.
+     * If sketches contain strings, callers are responsible for ensuring that
+     * both sketches were built using compatible string encodings.
      * @param other sketch to merge into this one
      */
     template<typename FwdSk>
diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp
index b1e2e3c1..e995e3e3 100644
--- a/quantiles/include/quantiles_sketch.hpp
+++ b/quantiles/include/quantiles_sketch.hpp
@@ -47,6 +47,11 @@ namespace quantiles_constants {
  * The analysis is obtained using get_rank() and get_quantile() functions,
  * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
  *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
+ *
  * <p>Consider a large stream of one million values such as packet sizes coming into a network node.
  * The natural rank of any specific size value is its index in the hypothetical sorted
  * array of values.
@@ -206,6 +211,8 @@ class quantiles_sketch {
 
   /**
    * Updates this sketch with the given data item.
+   * If cross-language portability is required, callers should ensure that
+   * the input string uses a compatible encoding (valid UTF-8).
    * @param item from a stream of items
    */
   template<typename FwdT>
@@ -213,6 +220,8 @@ class quantiles_sketch {
 
   /**
    * Merges another sketch into this one.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param other sketch to merge into this one
    */
   template<typename FwdSk>
diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp
index 21ccac0c..52295bd2 100755
--- a/req/include/req_sketch.hpp
+++ b/req/include/req_sketch.hpp
@@ -35,6 +35,11 @@ namespace datasketches {
  * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty,
  * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý.
  *
+ * Sketch that may retain string values.
+ * For sketches containing strings, cross-language portability depends on
+ * using compatible string encodings. This class does not by itself enforce
+ * UTF-8 validity for all string inputs.
+ *
  * <p>Reference: https://arxiv.org/abs/2004.01668</p>
  *
  * <p>This implementation differs from the algorithm described in the paper in the following:</p>
@@ -179,6 +184,8 @@ class req_sketch {
 
   /**
    * Updates this sketch with the given data item.
+   * If cross-language portability is required, callers should ensure that
+   * the input string uses a compatible encoding (valid UTF-8).
    * @param item from a stream of items
    */
   template<typename FwdT>
@@ -186,6 +193,8 @@ class req_sketch {
 
   /**
    * Merges another sketch into this one.
+   * If sketches contain strings, callers are responsible for ensuring that
+   * both sketches were built using compatible string encodings.
    * @param other sketch to merge into this one
    */
   template<typename FwdSk>

From bda16fd2287cc523f6422d4781a51da2429e70c4 Mon Sep 17 00:00:00 2001
From: Lee Rhodes <leerho@users.noreply.github.com>
Date: Sat, 21 Mar 2026 23:06:03 -0700
Subject: [PATCH 68/75] Update GHA Code Coverage workflow (#493)

---
 .github/workflows/code_coverage.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml
index 060242fa..09a8dbc9 100644
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Generate coverage .info
         run: cmake --build build --target coverage_report
       - name: Post to Coveralls
-        uses: coverallsapp/github-action@master
+        uses: coverallsapp/github-action@v2
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           path-to-lcov: build/lcov.info

From 5e20ad04e53b3bdcf7dc4ff6f964818731da09a5 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Tue, 24 Mar 2026 14:25:25 +0900
Subject: [PATCH 69/75] fix: allow positive weight only

---
 sampling/include/var_opt_sketch.hpp      |  2 +-
 sampling/include/var_opt_sketch_impl.hpp |  7 +++----
 sampling/test/var_opt_sketch_test.cpp    | 14 ++++++++++----
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp
index 6b157caa..df080c6e 100644
--- a/sampling/include/var_opt_sketch.hpp
+++ b/sampling/include/var_opt_sketch.hpp
@@ -272,7 +272,7 @@ class var_opt_sketch {
     typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
     typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
 
-    static const uint32_t MIN_LG_ARR_ITEMS = 3;
+    static const uint32_t MIN_LG_ARR_ITEMS = 4;
 
     static const uint8_t PREAMBLE_LONGS_EMPTY  = 1;
     static const uint8_t PREAMBLE_LONGS_WARMUP = 3;
diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp
index 36ee3fc8..30d526af 100644
--- a/sampling/include/var_opt_sketch_impl.hpp
+++ b/sampling/include/var_opt_sketch_impl.hpp
@@ -772,12 +772,11 @@ string<A> var_opt_sketch<T, A>::items_to_string(bool print_gap) const {
 template<typename T, typename A>
 template<typename O>
 void var_opt_sketch<T, A>::update(O&& item, double weight, bool mark) {
-  if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) {
-    throw std::invalid_argument("Item weights must be nonnegative and finite. Found: "
+  if (weight <= 0.0 || std::isnan(weight) || std::isinf(weight)) {
+    throw std::invalid_argument("Item weights must be positive and finite. Found: "
                                 + std::to_string(weight));
-  } else if (weight == 0.0) {
-    return;
   }
+
   ++n_;
 
   if (r_ == 0) {
diff --git a/sampling/test/var_opt_sketch_test.cpp b/sampling/test/var_opt_sketch_test.cpp
index 71d16e91..179d7016 100644
--- a/sampling/test/var_opt_sketch_test.cpp
+++ b/sampling/test/var_opt_sketch_test.cpp
@@ -178,11 +178,17 @@ TEST_CASE("varopt sketch: non-empty degenerate sketch", "[var_opt_sketch]") {
 
 TEST_CASE("varopt sketch: invalid weight", "[var_opt_sketch]") {
   var_opt_sketch<std::string> sk(100, resize_factor::X2);
-  REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
 
-  // should not throw but sketch should still be empty
-  sk.update("zero weight", 0.0);
-  REQUIRE(sk.is_empty());
+  // Negative
+  REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument);
+  // Zero
+  REQUIRE_THROWS_AS(sk.update("zero_weight", 0.0), std::invalid_argument);
+  // NaN
+  REQUIRE_THROWS_AS(sk.update("NaN_weight", std::numeric_limits<double>::quiet_NaN()), std::invalid_argument);
+  // +Inf
+  REQUIRE_THROWS_AS(sk.update("positive_infinity", std::numeric_limits<double>::infinity()), std::invalid_argument);
+  // -Inf
+  REQUIRE_THROWS_AS(sk.update("negative_infinity", -std::numeric_limits<double>::infinity()), std::invalid_argument);
 }
 
 TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") {

From de35ce73d52f7c4b13d87129892a50cf50a6f948 Mon Sep 17 00:00:00 2001
From: lani_karrot <lani@daangn.com>
Date: Wed, 25 Mar 2026 09:55:12 +0900
Subject: [PATCH 70/75] ci: upload coverage report directly

---
 .github/workflows/code_coverage.yml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml
index 09a8dbc9..69fa94ec 100644
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -37,7 +37,8 @@ jobs:
       - name: Generate coverage .info
         run: cmake --build build --target coverage_report
       - name: Post to Coveralls
-        uses: coverallsapp/github-action@v2
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          path-to-lcov: build/lcov.info
+        run: |
+          curl -sL https://coveralls.io/coveralls-linux.tar.gz | tar -xz
+          ./coveralls report build/lcov.info
+        env:
+          COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From 53588892848771e4a90238ca07a32ec990e605d3 Mon Sep 17 00:00:00 2001
From: Lee Rhodes <leerho@gmail.com>
Date: Fri, 27 Mar 2026 10:35:46 -0700
Subject: [PATCH 71/75] fix get_RSE()

---
 req/include/req_sketch_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp
index 3c1c2fc1..7f0b4557 100755
--- a/req/include/req_sketch_impl.hpp
+++ b/req/include/req_sketch_impl.hpp
@@ -293,7 +293,7 @@ double req_sketch<T, C, A>::get_rank_upper_bound(double rank, uint8_t num_std_de
 
 template<typename T, typename C, typename A>
 double req_sketch<T, C, A>::get_RSE(uint16_t k, double rank, bool hra, uint64_t n) {
-  return get_rank_lb(k, 2, rank, 1, n, hra);
+  return get_rank_ub(k, 2, rank, 1, n, hra) - rank;
 }
 
 template<typename T, typename C, typename A>

From 9130a0751ee48882680a4c80284372c20c901a3d Mon Sep 17 00:00:00 2001
From: Lee Rhodes <leerho@gmail.com>
Date: Fri, 27 Mar 2026 16:19:16 -0700
Subject: [PATCH 72/75] add get_RSE() test

---
 req/test/req_sketch_test.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp
index 2a338b8a..d9c9a16e 100755
--- a/req/test/req_sketch_test.cpp
+++ b/req/test/req_sketch_test.cpp
@@ -43,6 +43,7 @@ TEST_CASE("req sketch: empty", "[req_sketch]") {
   REQUIRE_FALSE(sketch.is_estimation_mode());
   REQUIRE(sketch.get_n() == 0);
   REQUIRE(sketch.get_num_retained() == 0);
+  REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, true, 0) == 0);
   REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
   REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
   REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
@@ -61,6 +62,7 @@ TEST_CASE("req sketch: single value, lra", "[req_sketch]") {
   REQUIRE_FALSE(sketch.is_estimation_mode());
   REQUIRE(sketch.get_n() == 1);
   REQUIRE(sketch.get_num_retained() == 1);
+  REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, false, sketch.get_n()) == 0);
   REQUIRE(sketch.get_rank(1.0f, false) == 0);
   REQUIRE(sketch.get_rank(1.0f) == 1);
   REQUIRE(sketch.get_rank(1.1f, false) == 1);

From 0a885718cad4f32e7cc240dba288d0a12b6b09b0 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Sat, 11 Apr 2026 17:44:14 +0900
Subject: [PATCH 73/75] fix: missing header when compiling with gcc 15

---
 common/include/serde.hpp              | 1 +
 fi/include/reverse_purge_hash_map.hpp | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/common/include/serde.hpp b/common/include/serde.hpp
index c4e46d7d..02c2fc16 100644
--- a/common/include/serde.hpp
+++ b/common/include/serde.hpp
@@ -20,6 +20,7 @@
 #ifndef DATASKETCHES_SERDE_HPP_
 #define DATASKETCHES_SERDE_HPP_
 
+#include <cstdint>
 #include <cstring>
 #include <iostream>
 #include <memory>
diff --git a/fi/include/reverse_purge_hash_map.hpp b/fi/include/reverse_purge_hash_map.hpp
index b75abc43..5d59c187 100644
--- a/fi/include/reverse_purge_hash_map.hpp
+++ b/fi/include/reverse_purge_hash_map.hpp
@@ -20,8 +20,9 @@
 #ifndef REVERSE_PURGE_HASH_MAP_HPP_
 #define REVERSE_PURGE_HASH_MAP_HPP_
 
-#include <memory>
+#include <cstdint>
 #include <iterator>
+#include <memory>
 
 namespace datasketches {
 

From 44e6fb33d23e86cbef437ba54c7d2bfa1b06e7f1 Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Sat, 11 Apr 2026 17:44:32 +0900
Subject: [PATCH 74/75] ci: change build coverage

---
 .github/workflows/build_cmake.yml | 178 +++++++++++++++++++++++++++---
 1 file changed, 165 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index aee7ec3d..6687f8e7 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -6,7 +6,7 @@ env:
   BUILD_TYPE: Release
 
 jobs:
-  build:
+  build-native:
     name: ${{ matrix.config.name }}
     runs-on: ${{ matrix.config.os }}
     strategy:
@@ -14,23 +14,16 @@ jobs:
       matrix:
         config:
         - {
-            name: "MacOS Latest, Clang",
-            os: macos-latest,
+            name: "macOS 15, Clang",
+            os: macos-15,
             test_target: test,
             cc: "clang", cxx: "clang++"
           }
         - {
-            name: "Ubuntu Latest, GCC",
-            os: ubuntu-latest,
-            test_target: test,
-            cc: "gcc", cxx: "g++"
-          }
-        - {
-            name: "Windows Latest, MSVC",
-            os: windows-latest,
+            name: "Windows 2022, MSVC",
+            os: windows-2022,
             test_target: RUN_TESTS,
-            cc: "cl", cxx: "cl",
-            environment_script: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/Auxiliary/Build/vcvars64.bat"
+            cc: "cl", cxx: "cl"
           }
         #- {
         #    name: "Windows Latest, MinGW+gcc",
@@ -52,3 +45,162 @@ jobs:
         run: cmake --build build --config Release --target ${{ matrix.config.test_target }}
       - name: Install headers
         run: cmake --build build -t install
+
+  build-ubuntu-gcc:
+    name: Compiler / ${{ matrix.config.name }}
+    runs-on: ubuntu-24.04
+    container:
+      image: ${{ matrix.config.image }}
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {
+            name: "Ubuntu 24.04, GCC 9",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-9", cxx: "g++-9",
+            packages: "gcc-9 g++-9",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 24.04, GCC 10",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-10", cxx: "g++-10",
+            packages: "gcc-10 g++-10",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 24.04, GCC 11",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-11", cxx: "g++-11",
+            packages: "gcc-11 g++-11",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 24.04, GCC 12",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-12", cxx: "g++-12",
+            packages: "gcc-12 g++-12",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 24.04, GCC 13",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-13", cxx: "g++-13",
+            packages: "gcc-13 g++-13",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 24.04, GCC 14",
+            image: "ubuntu:24.04",
+            test_target: test,
+            cc: "gcc-14", cxx: "g++-14",
+            packages: "gcc-14 g++-14",
+            cxx_standard: "11"
+          }
+        - {
+            name: "Ubuntu 25.10, GCC 15",
+            image: "ubuntu:25.10",
+            test_target: test,
+            cc: "gcc-15", cxx: "g++-15",
+            packages: "gcc-15 g++-15",
+            cxx_standard: "11"
+          }
+    steps:
+      - name: Install build dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            ca-certificates \
+            cmake \
+            git \
+            make \
+            ${{ matrix.config.packages }}
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          persist-credentials: false
+      - name: Configure
+        env:
+          CC: ${{ matrix.config.cc }}
+          CXX: ${{ matrix.config.cxx }}
+        run: cmake -B build -S . -DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test
+      - name: Build C++ unit tests
+        run: cmake --build build --config Release
+      - name: Run C++ tests
+        run: cmake --build build --config Release --target ${{ matrix.config.test_target }}
+      - name: Install headers
+        run: cmake --build build -t install
+
+  build-ubuntu-std:
+    name: Standard / Ubuntu 25.10, GCC 15, C++${{ matrix.config.cxx_standard }}
+    runs-on: ubuntu-24.04
+    container:
+      image: ubuntu:25.10
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+        - {
+            cxx_standard: "11",
+            test_target: test
+          }
+        - {
+            cxx_standard: "14",
+            test_target: test
+          }
+        - {
+            cxx_standard: "17",
+            test_target: test
+          }
+        - {
+            cxx_standard: "20",
+            test_target: test
+          }
+        - {
+            cxx_standard: "23",
+            test_target: test,
+          }
+    steps:
+      - name: Install build dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            ca-certificates \
+            cmake \
+            gcc-15 \
+            g++-15 \
+            git \
+            make
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          persist-credentials: false
+      - name: Configure
+        env:
+          CC: gcc-15
+          CXX: g++-15
+        run: cmake -B build -S . -DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test
+      - name: Build C++ unit tests
+        run: cmake --build build --config Release
+      - name: Run C++ tests
+        run: cmake --build build --config Release --target ${{ matrix.config.test_target }}
+      - name: Install headers
+        run: cmake --build build -t install

From bbd13d27958383c66df4fd3cefe18b2cdd66826f Mon Sep 17 00:00:00 2001
From: proost <proost@apache.org>
Date: Sat, 11 Apr 2026 23:43:25 +0900
Subject: [PATCH 75/75] fix: pin Windows runner to windows-2025

---
 .github/workflows/build_cmake.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index 6687f8e7..d8a53900 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -20,8 +20,8 @@ jobs:
             cc: "clang", cxx: "clang++"
           }
         - {
-            name: "Windows 2022, MSVC",
-            os: windows-2022,
+            name: "Windows 2025, MSVC",
+            os: windows-2025,
             test_target: RUN_TESTS,
             cc: "cl", cxx: "cl"
           }