diff --git a/.asf.yaml b/.asf.yaml index 45d974b1..f15ed263 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -2,3 +2,11 @@ github: homepage: https://datasketches.apache.org ghp_branch: gh-pages ghp_path: /docs + + protected_branches: + master: + required_pull_request_reviews: + dismiss_stale_reviews: false + required_approving_review_count: 1 + required_signatures: false + required_conversation_resolution: false diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..d0cdc6e9 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +--- +Checks: | + clang-diagnostic-*, + clang-analyzer-*, + -clang-analyzer-alpha*, + google-*, + modernize-*, + -modernize-avoid-c-arrays, + -modernize-use-trailing-return-type, + -modernize-use-nodiscard, + +CheckOptions: + - key: google-readability-braces-around-statements.ShortStatementLines + value: '0' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' \ No newline at end of file diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml index aee7ec3d..d8a53900 100644 --- a/.github/workflows/build_cmake.yml +++ b/.github/workflows/build_cmake.yml @@ -6,7 +6,7 @@ env: BUILD_TYPE: Release jobs: - build: + build-native: name: ${{ matrix.config.name }} runs-on: ${{ matrix.config.os }} strategy: @@ -14,23 +14,16 @@ jobs: matrix: config: - { - name: "MacOS Latest, Clang", - os: macos-latest, + name: "macOS 15, Clang", + os: macos-15, test_target: test, cc: "clang", cxx: "clang++" } - { - name: "Ubuntu Latest, GCC", - os: ubuntu-latest, - test_target: test, - cc: "gcc", cxx: "g++" - } - - { - name: "Windows Latest, MSVC", - os: windows-latest, + name: "Windows 2025, MSVC", + os: windows-2025, test_target: RUN_TESTS, - cc: "cl", cxx: "cl", - environment_script: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/Auxiliary/Build/vcvars64.bat" + cc: "cl", cxx: "cl" } #- { # name: "Windows Latest, MinGW+gcc", @@ -52,3 +45,162 @@ jobs: run: cmake --build build --config Release --target ${{ matrix.config.test_target }} - name: Install headers run: cmake --build build -t install + + build-ubuntu-gcc: + name: Compiler / ${{ matrix.config.name }} + runs-on: ubuntu-24.04 + container: + image: ${{ matrix.config.image }} + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + config: + - { + name: "Ubuntu 24.04, GCC 9", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-9", cxx: "g++-9", + packages: "gcc-9 g++-9", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 10", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-10", cxx: "g++-10", + packages: "gcc-10 g++-10", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 11", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-11", cxx: "g++-11", + packages: "gcc-11 g++-11", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 12", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-12", cxx: "g++-12", + packages: "gcc-12 g++-12", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 13", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-13", cxx: "g++-13", + packages: "gcc-13 g++-13", + cxx_standard: "11" + } + - { + name: "Ubuntu 24.04, GCC 14", + image: "ubuntu:24.04", + test_target: test, + cc: "gcc-14", cxx: "g++-14", + packages: "gcc-14 g++-14", + cxx_standard: "11" + } + - { + name: "Ubuntu 25.10, GCC 15", + image: "ubuntu:25.10", + test_target: test, + cc: "gcc-15", cxx: "g++-15", + packages: "gcc-15 g++-15", + cxx_standard: "11" + } + steps: + - name: Install build dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt-get update + apt-get install -y --no-install-recommends \ + ca-certificates \ + cmake \ + git \ + make \ + ${{ matrix.config.packages }} + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + - name: Configure + env: + CC: ${{ matrix.config.cc }} + CXX: ${{ matrix.config.cxx }} + run: cmake -B build -S . -DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test + - name: Build C++ unit tests + run: cmake --build build --config Release + - name: Run C++ tests + run: cmake --build build --config Release --target ${{ matrix.config.test_target }} + - name: Install headers + run: cmake --build build -t install + + build-ubuntu-std: + name: Standard / Ubuntu 25.10, GCC 15, C++${{ matrix.config.cxx_standard }} + runs-on: ubuntu-24.04 + container: + image: ubuntu:25.10 + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + config: + - { + cxx_standard: "11", + test_target: test + } + - { + cxx_standard: "14", + test_target: test + } + - { + cxx_standard: "17", + test_target: test + } + - { + cxx_standard: "20", + test_target: test + } + - { + cxx_standard: "23", + test_target: test, + } + steps: + - name: Install build dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt-get update + apt-get install -y --no-install-recommends \ + ca-certificates \ + cmake \ + gcc-15 \ + g++-15 \ + git \ + make + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + - name: Configure + env: + CC: gcc-15 + CXX: g++-15 + run: cmake -B build -S . -DCMAKE_CXX_STANDARD=${{ matrix.config.cxx_standard }} -DCMAKE_INSTALL_PREFIX=./install_test + - name: Build C++ unit tests + run: cmake --build build --config Release + - name: Run C++ tests + run: cmake --build build --config Release --target ${{ matrix.config.test_target }} + - name: Install headers + run: cmake --build build -t install diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml index 060242fa..69fa94ec 100644 --- a/.github/workflows/code_coverage.yml +++ b/.github/workflows/code_coverage.yml @@ -37,7 +37,8 @@ jobs: - name: Generate coverage .info run: cmake --build build --target coverage_report - name: Post to Coveralls - uses: coverallsapp/github-action@master - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: build/lcov.info + run: | + curl -sL https://coveralls.io/coveralls-linux.tar.gz | tar -xz + ./coveralls report build/lcov.info + env: + COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/hardening.yml b/.github/workflows/hardening.yml new file mode 100644 index 00000000..e264ebd9 --- /dev/null +++ b/.github/workflows/hardening.yml @@ -0,0 +1,59 @@ +name: libc++ Hardening Tests + +on: + push: + branches: + - master + pull_request: + branches: + - master + workflow_dispatch: + +env: + BUILD_TYPE: Debug + +jobs: + hardening-test: + name: C++17 with libc++ Hardening Mode + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + + - name: Install LLVM and libc++ + run: | + # Install LLVM/Clang with libc++ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + sudo add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + sudo apt-get update + sudo apt-get install -y clang-18 libc++-18-dev libc++abi-18-dev + + - name: Configure with C++17 and libc++ hardening + env: + CC: clang-18 + CXX: clang++-18 + run: | + cmake -B build -S . \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS="-stdlib=libc++ -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG" \ + -DCMAKE_EXE_LINKER_FLAGS="-stdlib=libc++ -lc++abi" + + - name: Build hardening tests + run: cmake --build build --target hardening_test --config ${{ env.BUILD_TYPE }} + + - name: Run hardening tests + run: | + cd build + ./common/test/hardening_test "[deserialize_hardening]" + + - name: Report results + if: always() + run: | + echo "✅ Tests passed with libc++ hardening enabled!" + echo "This verifies the fix for issue #477 prevents SIGABRT." diff --git a/.github/workflows/serde_compat.yml b/.github/workflows/serde_compat.yml index 33c31801..81547ee7 100644 --- a/.github/workflows/serde_compat.yml +++ b/.github/workflows/serde_compat.yml @@ -12,16 +12,16 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Checkout Java - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: apache/datasketches-java path: java - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: - java-version: '17' + java-version: '25' distribution: 'temurin' - name: Run Java run: cd java && mvn test -P generate-java-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..262fd02e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# To use this, install the python package `pre-commit` and +# run once `pre-commit install`. This will setup a git pre-commit-hook +# that is executed on each commit and will report the linting problems. +# To run all hooks on all files use `pre-commit run -a` + +repos: + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-tidy + args: ['--quiet', '-p=build/compile_commands.json', '--config-file=.clang-tidy'] + types_or: [c++, c] \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 056bb701..c469e456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,7 +59,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT CMAKE_CXX_COMPILER_VERSION VERS add_compile_options(-Wimplicit-fallthrough=3) endif() -# Code generation options, to ensure shaerd libraries work and are portable +# Code generation options, to ensure shared libraries work and are portable set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_C_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index bdce6af9..0bdf0791 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,3 @@ # Code of Conduct -We adhere to the Apache Softare Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct). \ No newline at end of file +We adhere to the Apache Software Foundation's [Code of Conduct](https://www.apache.org/foundation/policies/conduct). diff --git a/README.md b/README.md index 4b216167..57125686 100644 --- a/README.md +++ b/README.md @@ -1,106 +1,99 @@ # Apache DataSketches Core C++ Library Component -This is the core C++ component of the Apache DataSketches library. It contains all of the key sketching algorithms that are in the Java component and can be accessed directly from user applications. -This component is also a dependency of other components of the library that create adaptors for target systems, such as PostgreSQL. +This is the core C++ component of the Apache DataSketches library. It contains all the key sketching algorithms from the Java implementation and can be accessed directly by user applications. -Note that we have a parallel core component for [Java]((https://github.com/apache/datasketches-java) and [Python]((https://github.com/apache/datasketches-python) implementations of the same sketch algorithms. +This component is also a dependency of other library components that create adaptors for target systems, such as [PostgreSQL](https://github.com/apache/datasketches-postgresql). + +Note that we have parallel core library components for Java, Python, and GO implementations of many of the same sketch algorithms: + +- [datasketches-java](https://github.com/apache/datasketches-java) +- [datasketches-python](https://github.com/apache/datasketches-python) +- [datasketches-go](https://github.com/apache/datasketches-go) Please visit the main [Apache DataSketches website](https://datasketches.apache.org) for more information. -If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. +If you are interested in making contributions to this site, please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. --- This code requires C++11. -This library is header-only. The build process provided is only for building unit tests. +This library is header-only. The provided build process is only for unit tests. -Building the unit tests requires cmake 3.12.0 or higher. +Building the unit tests requires CMake 3.12.0 or higher. -Installing the latest cmake on OSX: brew install cmake +Installing the latest CMake on OSX: `brew install cmake`. -Building and running unit tests using cmake for OSX and Linux: +Building and running unit tests using CMake for OSX and Linux: -``` - $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release - $ cmake --build build/Release -t all test +```shell +cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release +cmake --build build/Release -t all test ``` -Building and running unit tests using cmake for Windows from the command line: +Building and running unit tests using CMake for Windows from the command line: -``` - $ cd build - $ cmake .. - $ cd .. - $ cmake --build build --config Release - $ cmake --build build --config Release --target RUN_TESTS +```shell +cd build +cmake .. +cd .. +cmake --build build --config Release +cmake --build build --config Release --target RUN_TESTS ``` -To install a local distribution (OSX and Linux), use the following command. The -CMAKE_INSTALL_PREFIX variable controls the destination. If not specified, it -defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below, -the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include, -/tmp/install/DataSketches/lib, etc) +To install a local distribution (OSX and Linux), use the following command. The `CMAKE_INSTALL_PREFIX` variable controls the destination. If not specified, it defaults to installing in /usr (/usr/include, /usr/lib, etc). In the command below, the installation will be in /tmp/install/DataSketches (/tmp/install/DataSketches/include, /tmp/install/DataSketches/lib, etc). -``` - $ cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches - $ cmake --build build/Release -t install +```shell +cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/install/DataSketches +cmake --build build/Release -t install ``` -To generate an installable package using cmake's built in cpack packaging tool, -use the following command. The type of packaging is controlled by the CPACK_GENERATOR -variable (semi-colon separated list). Cmake usually supports packaging types such as RPM, -DEB, STGZ, TGZ, TZ, ZIP, etc. +To generate an installable package using CMake's built-in cpack packaging tool, use the following command. The type of packaging is controlled by the `CPACK_GENERATOR` variable (semi-colon separated list). CMake usually supports packaging formats such as RPM, DEB, STGZ, TGZ, TZ, and ZIP. -``` - $ cmake3 -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" - $ cmake3 --build build/Release -t package +```shell +cmake -S . -B build/Release -DCMAKE_BUILD_TYPE=Release -DCPACK_GENERATOR="RPM;STGZ;TGZ" +cmake --build build/Release -t package ``` The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways. -If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some -way, then CMake's `find_package` command can be used like this: -``` - find_package(DataSketches 3.2 REQUIRED) - target_link_library(my_dependent_target PUBLIC ${DATASKETCHES_LIB}) +If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some way, then CMake's `find_package` command can be used like this: + +```cmake +find_package(DataSketches 3.2 REQUIRED) +target_link_library(my_dependent_target PUBLIC ${DATASKETCHES_LIB}) ``` When used with find_package, DataSketches exports several variables, including - - `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported. - - `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files. - Because cmake automatically includes the interface directories for included target libraries when - using `target_link_library`, under normal circumstances there will be no need to include this directly. - - `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling - in DataSketches should reference this with `target_link_library` in order to set up all the correct dependencies - and include paths. - -If you don't have DataSketches installed locally, dependent projects can pull it directly -from GitHub using CMake's `ExternalProject` module. The code would look something like this: - -``` - cmake_policy(SET CMP0097 NEW) - include(ExternalProject) - ExternalProject_Add(datasketches - GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git - GIT_TAG 3.2.0 - GIT_SHALLOW true - GIT_SUBMODULES "" - INSTALL_DIR /tmp/datasketches-prefix - CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix - - # Override the install command to add DESTDIR - # This is necessary to work around an oddity in the RPM (but not other) package - # generation, as CMake otherwise picks up the Datasketch files when building - # an RPM for a dependent package. (RPM scans the directory for files in addition to installing - # those files referenced in an "install" rule in the cmake file) - INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install - ) - ExternalProject_Get_property(datasketches INSTALL_DIR) - set(datasketches_INSTALL_DIR ${INSTALL_DIR}) - message("Source dir of datasketches = ${datasketches_INSTALL_DIR}") - target_include_directories(my_dependent_target - PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches) - add_dependencies(my_dependent_target datasketches) +- `DATASKETCHES_VERSION`: The version number of the datasketches package that was imported. +- `DATASKETCHES_INCLUDE_DIR`: The directory that should be added to access DataSketches include files. Because CMake automatically includes the interface directories for included target libraries when using `target_link_library`, under normal circumstances, there will be no need to include this directly +- `DATASKETCHES_LIB`: The name of the DataSketches target to include as a dependency. Projects pulling in DataSketches should reference this with `target_link_library` in order to set up all the correct dependencies and include paths. + +If you don't have DataSketches installed locally, dependent projects can pull it directly from GitHub using CMake's `ExternalProject` module. The code would look something like this: + +```cmake +cmake_policy(SET CMP0097 NEW) +include(ExternalProject) +ExternalProject_Add(datasketches + GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git + GIT_TAG 3.2.0 + GIT_SHALLOW true + GIT_SUBMODULES "" + INSTALL_DIR /tmp/datasketches-prefix + CMAKE_ARGS -DBUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=/tmp/datasketches-prefix + + # Override the install command to add DESTDIR + # This is necessary to work around an oddity in the RPM (but not other) package + # generation, as CMake otherwise picks up the Datasketch files when building + # an RPM for a dependent package. (RPM scans the directory for files in addition to installing + # those files referenced in an "install" rule in the cmake file) + INSTALL_COMMAND env DESTDIR= ${CMAKE_COMMAND} --build . --target install +) +ExternalProject_Get_property(datasketches INSTALL_DIR) +set(datasketches_INSTALL_DIR ${INSTALL_DIR}) +message("Source dir of datasketches = ${datasketches_INSTALL_DIR}") +target_include_directories(my_dependent_target + PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches) +add_dependencies(my_dependent_target datasketches) ``` diff --git a/common/include/binomial_bounds.hpp b/common/include/binomial_bounds.hpp index 3b73535b..ff7cccc9 100644 --- a/common/include/binomial_bounds.hpp +++ b/common/include/binomial_bounds.hpp @@ -441,8 +441,8 @@ class binomial_bounds { } static void check_theta(double theta) { - if (theta < 0 || theta > 1) { - throw std::invalid_argument("theta must be in [0, 1]"); + if (theta <= 0 || theta > 1) { + throw std::invalid_argument("theta must be in (0, 1]"); } } diff --git a/common/include/serde.hpp b/common/include/serde.hpp index ad20fe63..02c2fc16 100644 --- a/common/include/serde.hpp +++ b/common/include/serde.hpp @@ -20,6 +20,7 @@ #ifndef DATASKETCHES_SERDE_HPP_ #define DATASKETCHES_SERDE_HPP_ +#include #include #include #include @@ -132,6 +133,11 @@ struct serde::value>::type> { /// ItemsSketch with ArrayOfStringsSerDe in Java. /// The length of each string is stored as a 32-bit integer (historically), /// which may be too wasteful. Treat this as an example. +/// +/// This implementation treats std::string as an arbitrary byte container. +/// It does not check whether string contents are valid UTF-8. +/// +/// Use a UTF-8-validating SerDe when cross-language portability is required. template<> struct serde { /// @copydoc serde::serialize diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt index c598c353..c3e937a2 100644 --- a/common/test/CMakeLists.txt +++ b/common/test/CMakeLists.txt @@ -69,15 +69,21 @@ target_sources(common_test PRIVATE quantiles_sorted_view_test.cpp optional_test.cpp + binomial_bounds_test.cpp ) # now the integration test part add_executable(integration_test) -target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple common_test_lib) +target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib) +# Use CMAKE_CXX_STANDARD if defined, otherwise C++11 +set(_integration_cxx_standard 11) +if(DEFINED CMAKE_CXX_STANDARD) + set(_integration_cxx_standard ${CMAKE_CXX_STANDARD}) +endif() set_target_properties(integration_test PROPERTIES - CXX_STANDARD 11 + CXX_STANDARD ${_integration_cxx_standard} CXX_STANDARD_REQUIRED YES ) @@ -90,3 +96,39 @@ target_sources(integration_test PRIVATE integration_test.cpp ) + +# Separate hardening test executable (header-only, no pre-compiled libs) +# This ensures the sketch code is compiled with C++17 + hardening +# Always build this target - it will use CMAKE_CXX_STANDARD if set (and >= 17), otherwise C++17 + +add_executable(hardening_test) +target_link_libraries(hardening_test common common_test_lib) + +# Include directories for header-only sketch implementations +target_include_directories(hardening_test PRIVATE + ${CMAKE_SOURCE_DIR}/quantiles/include + ${CMAKE_SOURCE_DIR}/kll/include + ${CMAKE_SOURCE_DIR}/req/include + ${CMAKE_SOURCE_DIR}/common/include +) + +# Use C++17 minimum for hardening tests +set(_hardening_cxx_standard 17) +if(DEFINED CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17) + set(_hardening_cxx_standard ${CMAKE_CXX_STANDARD}) +endif() +set_target_properties(hardening_test PROPERTIES + CXX_STANDARD ${_hardening_cxx_standard} + CXX_STANDARD_REQUIRED YES +) +message(STATUS "hardening_test will use C++${_hardening_cxx_standard}") + +add_test( + NAME hardening_test + COMMAND hardening_test "[deserialize_hardening]" +) + +target_sources(hardening_test + PRIVATE + deserialize_hardening_test.cpp +) diff --git a/common/test/binomial_bounds_test.cpp b/common/test/binomial_bounds_test.cpp new file mode 100644 index 00000000..6bde0910 --- /dev/null +++ b/common/test/binomial_bounds_test.cpp @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "binomial_bounds.hpp" + +namespace datasketches { + +TEST_CASE("binomial_bounds: get_lower_bound", "[common]") { + + SECTION("num_samples == 0") { + double result = binomial_bounds::get_lower_bound(0, 0.5, 1); + REQUIRE(result == 0.0); + } + + SECTION("theta == 1.0") { + double result = binomial_bounds::get_lower_bound(100, 1.0, 1); + REQUIRE(result == 100.0); + } + + SECTION("num_samples == 1") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples == 1, stddev=2") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples == 1, stddev=3") { + double result = binomial_bounds::get_lower_bound(1, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120") { + double result = binomial_bounds::get_lower_bound(121, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=2") { + double result = binomial_bounds::get_lower_bound(200, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=3") { + double result = binomial_bounds::get_lower_bound(500, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5)") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 1); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 2); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") { + double result = binomial_bounds::get_lower_bound(50, 1.0 - 1e-6, 3); + REQUIRE(std::abs(result - 50.0) < 50.0 * 0.01); + } + + SECTION("2 <= num_samples <= 120 AND theta < num_samples/360") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 1); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND theta < num_samples/360, stddev=2") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 2); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND theta < num_samples/360, stddev=3") { + double result = binomial_bounds::get_lower_bound(100, 0.001, 3); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta (exact calculation)") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=2") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("2 <= num_samples <= 120 AND middle range theta, stddev=3") { + double result = binomial_bounds::get_lower_bound(10, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("theta=0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(10, 0.0, 1), std::invalid_argument); + } + + SECTION("theta very close to 0") { + double result = binomial_bounds::get_lower_bound(10, 1e-10, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=2 boundary") { + double result = binomial_bounds::get_lower_bound(2, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=120 boundary") { + double result = binomial_bounds::get_lower_bound(120, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("estimate clamping case") { + double result = binomial_bounds::get_lower_bound(10, 0.9, 1); + double estimate = 10.0 / 0.9; + REQUIRE(result <= estimate); + } + + SECTION("invalid theta < 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, -0.1, 1), std::invalid_argument); + } + + SECTION("invalid theta > 1") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 1.1, 1), std::invalid_argument); + } + + SECTION("invalid stddev = 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 0), std::invalid_argument); + } + + SECTION("invalid stddev = 4") { + REQUIRE_THROWS_AS(binomial_bounds::get_lower_bound(100, 0.5, 4), std::invalid_argument); + } +} + +TEST_CASE("binomial_bounds: get_upper_bound", "[common]") { + + SECTION("theta == 1.0") { + double result = binomial_bounds::get_upper_bound(100, 1.0, 1); + REQUIRE(result == 100.0); + } + + SECTION("num_samples == 0") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 1); + REQUIRE(result > 0.0); + } + + SECTION("num_samples == 0, stddev=2") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 2); + REQUIRE(result > 0.0); + } + + SECTION("num_samples == 0, stddev=3") { + double result = binomial_bounds::get_upper_bound(0, 0.5, 3); + REQUIRE(result > 0.0); + } + + SECTION("num_samples > 120") { + double result = binomial_bounds::get_upper_bound(121, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=2") { + double result = binomial_bounds::get_upper_bound(200, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples > 120, stddev=3") { + double result = binomial_bounds::get_upper_bound(500, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5)") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 1); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=2") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 2); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta > (1-1e-5), stddev=3") { + double result = binomial_bounds::get_upper_bound(50, 1.0 - 1e-6, 3); + REQUIRE(result == 51.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < num_samples/360") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 1); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < num_samples/360, stddev=2") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 2); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND theta < num_samples/360, stddev=3") { + double result = binomial_bounds::get_upper_bound(100, 0.001, 3); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta (exact calculation)") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=2") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 2); + REQUIRE(result >= 0.0); + } + + SECTION("1 <= num_samples <= 120 AND middle range theta, stddev=3") { + double result = binomial_bounds::get_upper_bound(10, 0.5, 3); + REQUIRE(result >= 0.0); + } + + SECTION("theta=0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(10, 0.0, 1), std::invalid_argument); + } + + SECTION("theta very close to 0") { + double result = binomial_bounds::get_upper_bound(10, 1e-10, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=1 boundary") { + double result = binomial_bounds::get_upper_bound(1, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("num_samples=120 boundary") { + double result = binomial_bounds::get_upper_bound(120, 0.5, 1); + REQUIRE(result >= 0.0); + } + + SECTION("estimate clamping case") { + double result = binomial_bounds::get_upper_bound(10, 0.9, 1); + double estimate = 10.0 / 0.9; + REQUIRE(result >= estimate); + } + + SECTION("invalid theta < 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, -0.1, 1), std::invalid_argument); + } + + SECTION("invalid theta > 1") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 1.1, 1), std::invalid_argument); + } + + SECTION("invalid stddev = 0") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 0), std::invalid_argument); + } + + SECTION("invalid stddev = 4") { + REQUIRE_THROWS_AS(binomial_bounds::get_upper_bound(100, 0.5, 4), std::invalid_argument); + } +} + +} /* namespace datasketches */ diff --git a/common/test/deserialize_hardening_test.cpp b/common/test/deserialize_hardening_test.cpp new file mode 100644 index 00000000..64e654b4 --- /dev/null +++ b/common/test/deserialize_hardening_test.cpp @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +// Include all affected sketch types +#include +#include +#include + +namespace datasketches { + +/** + * Test for fix of issue #477: + * BUG: SIGABRT in deserialize(): dereferencing empty std::optional (libc++ verbose_abort) + * + * These tests exercise the actual deserialization code path that contained the bug. + * With buggy code (&*tmp on empty optional) and hardening enabled, these will SIGABRT. + * With fixed code (aligned_storage), these pass normally. + * + * IMPORTANT: These tests actually call deserialize() on multi-item sketches, which + * exercises the buggy code path where min/max are deserialized. + */ + +TEST_CASE("quantiles_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + // Create sketch with multiple items (so min/max are stored in serialization) + quantiles_sketch sketch(128); + for (int i = 0; i < 1000; i++) { + sketch.update(static_cast(i)); + } + + // Serialize + auto bytes = sketch.serialize(); + + // Deserialize - WITH BUGGY CODE AND HARDENING, THIS WILL SIGABRT HERE + // The bug is: sd.deserialize(is, &*tmp, 1) where tmp is empty optional + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + + // Verify deserialization worked correctly + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); + REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5)); +} + +TEST_CASE("quantiles_sketch: deserialize from stream", "[deserialize_hardening]") { + quantiles_sketch sketch(256); + for (int i = 0; i < 2000; i++) { + sketch.update(static_cast(i) * 0.5f); + } + + // Serialize to stream + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream - exercises the buggy code path + auto sketch2 = quantiles_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + kll_sketch sketch(200); + for (int i = 0; i < 1500; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy &*tmp code path + auto sketch2 = kll_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize from stream", "[deserialize_hardening]") { + kll_sketch sketch(400); + for (int i = 0; i < 3000; i++) { + sketch.update(i); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = kll_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize multi-level sketch", "[deserialize_hardening]") { + // REQ sketch only has the bug when num_levels > 1 + // We need to add enough items to trigger multiple levels + req_sketch sketch(12); + for (int i = 0; i < 10000; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy code path when num_levels > 1 + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize from stream", "[deserialize_hardening]") { + req_sketch sketch(20); + for (int i = 0; i < 15000; i++) { + sketch.update(static_cast(i) * 0.1); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = req_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("multiple sketch types: stress test", "[deserialize_hardening]") { + SECTION("quantiles with various sizes") { + for (int k : {64, 128, 256}) { + quantiles_sketch sketch(k); + for (int i = 0; i < 5000; i++) { + sketch.update(i); + } + auto bytes = sketch.serialize(); + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("kll with various sizes") { + for (int k : {100, 200, 400}) { + kll_sketch sketch(k); + for (int i = 0; i < 4000; i++) { + sketch.update(static_cast(i) / 10.0); + } + auto bytes = sketch.serialize(); + auto sketch2 = kll_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("req with various sizes") { + for (int k : {12, 20}) { + req_sketch sketch(k); + for (int i = 0; i < 8000; i++) { + sketch.update(static_cast(i)); + } + auto bytes = sketch.serialize(); + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } +} + +} // namespace datasketches diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 45376e7b..2f2629fc 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -39,7 +39,9 @@ _num_buckets(num_buckets), _sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator), _seed(seed), _total_weight(0) { - if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1."); + if (num_buckets < 3) { + throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1."); + } // This check is to ensure later compatibility with a Java implementation whose maximum size can only // be 2^31-1. We check only against 2^30 for simplicity. @@ -74,7 +76,7 @@ uint64_t count_min_sketch::get_seed() const { template double count_min_sketch::get_relative_error() const { - return exp(1.0) / double(_num_buckets); + return exp(1.0) / static_cast(_num_buckets); } template @@ -147,7 +149,7 @@ W count_min_sketch::get_estimate(int64_t item) const {return get_estimate(& template W count_min_sketch::get_estimate(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. return get_estimate(item.c_str(), item.length()); } @@ -176,7 +178,7 @@ void count_min_sketch::update(int64_t item, W weight) { template void count_min_sketch::update(const std::string& item, W weight) { - if (item.empty()) return; + if (item.empty()) { return; } update(item.c_str(), item.length(), weight); } @@ -201,7 +203,7 @@ W count_min_sketch::get_upper_bound(int64_t item) const {return get_upper_b template W count_min_sketch::get_upper_bound(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. return get_upper_bound(item.c_str(), item.length()); } @@ -218,7 +220,7 @@ W count_min_sketch::get_lower_bound(int64_t item) const {return get_lower_b template W count_min_sketch::get_lower_bound(const std::string& item) const { - if (item.empty()) return 0; // Empty strings are not inserted into the sketch. + if (item.empty()) { return 0; } // Empty strings are not inserted into the sketch. return get_lower_bound(item.c_str(), item.length()); } @@ -232,17 +234,13 @@ void count_min_sketch::merge(const count_min_sketch &other_sketch) { /* * Merges this sketch into other_sketch sketch by elementwise summing of buckets */ - if (this == &other_sketch) { - throw std::invalid_argument( "Cannot merge a sketch with itself." ); - } + if (this == &other_sketch) { throw std::invalid_argument( "Cannot merge a sketch with itself." ); } bool acceptable_config = (get_num_hashes() == other_sketch.get_num_hashes()) && (get_num_buckets() == other_sketch.get_num_buckets()) && (get_seed() == other_sketch.get_seed()); - if (!acceptable_config) { - throw std::invalid_argument( "Incompatible sketch configuration." ); - } + if (!acceptable_config) { throw std::invalid_argument( "Incompatible sketch configuration." ); } // Merge step - iterate over the other vector and add the weights to this sketch auto it = _sketch_array.begin(); // This is a std::vector iterator. @@ -290,7 +288,7 @@ void count_min_sketch::serialize(std::ostream& os) const { write(os, nhashes); write(os, seed_hash); write(os, unused8); - if (is_empty()) return; // sketch is empty, no need to write further bytes. + if (is_empty()) { return; } // sketch is empty, no need to write further bytes. // Long 2 write(os, _total_weight); @@ -327,7 +325,7 @@ auto count_min_sketch::deserialize(std::istream& is, uint64_t seed, const A } count_min_sketch c(nhashes, nbuckets, seed, allocator); const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; - if (is_empty == 1) return c; // sketch is empty, no need to read further. + if (is_empty == 1) { return c; } // sketch is empty, no need to read further. // Set the sketch weight and read in the sketch values const auto weight = read(is); @@ -373,7 +371,7 @@ auto count_min_sketch::serialize(unsigned header_size_bytes) const -> vecto ptr += copy_to_mem(nhashes, ptr); ptr += copy_to_mem(seed_hash, ptr); ptr += copy_to_mem(null_characters_8, ptr); - if (is_empty()) return bytes; // sketch is empty, no need to write further bytes. + if (is_empty()) { return bytes; } // sketch is empty, no need to write further bytes. // Long 2 const W t_weight = _total_weight; @@ -423,7 +421,7 @@ auto count_min_sketch::deserialize(const void* bytes, size_t size, uint64_t } count_min_sketch c(nhashes, nbuckets, seed, allocator); const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; - if (is_empty) return c; // sketch is empty, no need to read further. + if (is_empty) { return c; } // sketch is empty, no need to read further. ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes)); @@ -449,8 +447,7 @@ string count_min_sketch::to_string() const { // count the number of used entries in the sketch uint64_t num_nonzero = 0; for (const auto entry: _sketch_array) { - if (entry != static_cast(0.0)) - ++num_nonzero; + if (entry != static_cast(0.0)) { ++num_nonzero; } } // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. diff --git a/count/test/count_min_test.cpp b/count/test/count_min_test.cpp index 143be1b8..8b7ae0a7 100644 --- a/count/test/count_min_test.cpp +++ b/count/test/count_min_test.cpp @@ -55,7 +55,7 @@ TEST_CASE("CM init") { TEST_CASE("CM parameter suggestions", "[error parameters]") { // Bucket suggestions - REQUIRE_THROWS(count_min_sketch::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_buckets(-1.0), "Relative error must be at least 0."); REQUIRE(count_min_sketch::suggest_num_buckets(0.2) == 14); REQUIRE(count_min_sketch::suggest_num_buckets(0.1) == 28); REQUIRE(count_min_sketch::suggest_num_buckets(0.05) == 55); @@ -69,8 +69,8 @@ TEST_CASE("CM parameter suggestions", "[error parameters]") { REQUIRE(count_min_sketch(n_hashes, 272).get_relative_error() <= 0.01); // Hash suggestions - REQUIRE_THROWS(count_min_sketch::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ); - REQUIRE_THROWS(count_min_sketch::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ); + REQUIRE_THROWS_WITH(count_min_sketch::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ); REQUIRE(count_min_sketch::suggest_num_hashes(0.682689492) == 2); // 1 STDDEV REQUIRE(count_min_sketch::suggest_num_hashes(0.954499736) == 4); // 2 STDDEV REQUIRE(count_min_sketch::suggest_num_hashes(0.997300204) == 6); // 3 STDDEV @@ -161,9 +161,9 @@ TEST_CASE("CM merge - reject", "[reject cases]") { std::vector> sketches = {s1, s2, s3}; // Fail cases - REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." ); + REQUIRE_THROWS_WITH(s.merge(s), "Cannot merge a sketch with itself." ); for (count_min_sketch sk : sketches) { - REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." ); + REQUIRE_THROWS_WITH(s.merge(sk), "Incompatible sketch configuration." ); } } diff --git a/cpc/include/cpc_compressor_impl.hpp b/cpc/include/cpc_compressor_impl.hpp index 062e2e0e..0cc24b19 100644 --- a/cpc/include/cpc_compressor_impl.hpp +++ b/cpc/include/cpc_compressor_impl.hpp @@ -157,7 +157,7 @@ void cpc_compressor::compress(const cpc_sketch_alloc& source, compressed_s break; case cpc_sketch_alloc::flavor::PINNED: compress_pinned_flavor(source, result); - if (result.window_data.size() == 0) throw std::logic_error("window is not expected"); + if (result.window_data.size() == 0) throw std::logic_error("window is expected"); break; case cpc_sketch_alloc::flavor::SLIDING: compress_sliding_flavor(source, result); diff --git a/cpc/include/cpc_sketch_impl.hpp b/cpc/include/cpc_sketch_impl.hpp index 84709cdc..80f111f1 100644 --- a/cpc/include/cpc_sketch_impl.hpp +++ b/cpc/include/cpc_sketch_impl.hpp @@ -73,7 +73,7 @@ bool cpc_sketch_alloc::is_empty() const { template double cpc_sketch_alloc::get_estimate() const { - if (!was_merged) return get_hip_estimate(); + if (!was_merged) { return get_hip_estimate(); } return get_icon_estimate(); } @@ -92,7 +92,7 @@ double cpc_sketch_alloc::get_lower_bound(unsigned kappa) const { if (kappa < 1 || kappa > 3) { throw std::invalid_argument("kappa must be 1, 2 or 3"); } - if (!was_merged) return get_hip_confidence_lb(*this, kappa); + if (!was_merged) { return get_hip_confidence_lb(*this, kappa); } return get_icon_confidence_lb(*this, kappa); } @@ -101,13 +101,13 @@ double cpc_sketch_alloc::get_upper_bound(unsigned kappa) const { if (kappa < 1 || kappa > 3) { throw std::invalid_argument("kappa must be 1, 2 or 3"); } - if (!was_merged) return get_hip_confidence_ub(*this, kappa); + if (!was_merged) { return get_hip_confidence_ub(*this, kappa); } return get_icon_confidence_ub(*this, kappa); } template void cpc_sketch_alloc::update(const std::string& value) { - if (value.empty()) return; + if (value.empty()) { return; } update(value.c_str(), value.length()); } @@ -173,15 +173,15 @@ void cpc_sketch_alloc::update(float value) { } static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) { - if (lg_k > 26) throw std::logic_error("lg_k > 26"); + if (lg_k > 26) { throw std::logic_error("lg_k > 26"); } const uint32_t k = 1 << lg_k; uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64 - if (col > 63) col = 63; // clip so that 0 <= col <= 63 + if (col > 63) { col = 63; } // clip so that 0 <= col <= 63 const uint32_t row = hash0 & (k - 1); uint32_t row_col = (row << 6) | col; // To avoid the hash table's "empty" value, we change the row of the following pair. // This case is extremely unlikely, but we might as well handle it. - if (row_col == UINT32_MAX) row_col ^= 1 << 6; + if (row_col == UINT32_MAX) { row_col ^= 1 << 6; } return row_col; } @@ -195,7 +195,7 @@ void cpc_sketch_alloc::update(const void* value, size_t size) { template void cpc_sketch_alloc::row_col_update(uint32_t row_col) { const uint8_t col = row_col & 63; - if (col < first_interesting_column) return; // important speed optimization + if (col < first_interesting_column) { return; } // important speed optimization // window size is 0 until sketch is promoted from sparse to windowed if (sliding_window.size() == 0) { update_sparse(row_col); @@ -208,26 +208,26 @@ template void cpc_sketch_alloc::update_sparse(uint32_t row_col) { const uint32_t k = 1 << lg_k; const uint64_t c32pre = static_cast(num_coupons) << 5; - if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE + if (c32pre >= 3 * k) { throw std::logic_error("c32pre >= 3 * k"); } // C < 3K/32, in other words flavor == SPARSE bool is_novel = surprising_value_table.maybe_insert(row_col); if (is_novel) { num_coupons++; update_hip(row_col); const uint64_t c32post = static_cast(num_coupons) << 5; - if (c32post >= 3 * k) promote_sparse_to_windowed(); // C >= 3K/32 + if (c32post >= 3 * k) { promote_sparse_to_windowed(); } // C >= 3K/32 } } // the flavor is HYBRID, PINNED, or SLIDING template void cpc_sketch_alloc::update_windowed(uint32_t row_col) { - if (window_offset > 56) throw std::logic_error("wrong window offset"); + if (window_offset > 56) { throw std::logic_error("wrong window offset"); } const uint32_t k = 1 << lg_k; const uint64_t c32pre = static_cast(num_coupons) << 5; - if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID + if (c32pre < 3 * k) { throw std::logic_error("c32pre < 3 * k"); } // C < 3K/32, in other words flavor >= HYBRID const uint64_t c8pre = static_cast(num_coupons) << 3; const uint64_t w8pre = static_cast(window_offset) << 3; - if (c8pre >= (27 + w8pre) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset) + if (c8pre >= (27 + w8pre) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + (K * window_offset) bool is_novel = false; const uint8_t col = row_col & 63; @@ -235,7 +235,7 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { if (col < window_offset) { // track the surprising 0's "before" the window is_novel = surprising_value_table.maybe_delete(row_col); // inverted logic } else if (col < window_offset + 8) { // track the 8 bits inside the window - if (col < window_offset) throw std::logic_error("col < window_offset"); + if (col < window_offset) { throw std::logic_error("col < window_offset"); } const uint32_t row = row_col >> 6; const uint8_t old_bits = sliding_window[row]; const uint8_t new_bits = old_bits | (1 << (col - window_offset)); @@ -244,7 +244,7 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { is_novel = true; } } else { // track the surprising 1's "after" the window - if (col < window_offset + 8) throw std::logic_error("col < window_offset + 8"); + if (col < window_offset + 8) { throw std::logic_error("col < window_offset + 8"); } is_novel = surprising_value_table.maybe_insert(row_col); // normal logic } @@ -254,9 +254,9 @@ void cpc_sketch_alloc::update_windowed(uint32_t row_col) { const uint64_t c8post = static_cast(num_coupons) << 3; if (c8post >= (27 + w8pre) * k) { move_window(); - if (window_offset < 1 || window_offset > 56) throw std::logic_error("wrong window offset"); + if (window_offset < 1 || window_offset > 56) { throw std::logic_error("wrong window offset"); } const uint64_t w8post = static_cast(window_offset) << 3; - if (c8post >= (27 + w8post) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset) + if (c8post >= (27 + w8post) * k) { throw std::logic_error("c8pre is wrong"); } // C < (K * 27/8) + (K * window_offset) } } } @@ -276,7 +276,7 @@ template void cpc_sketch_alloc::promote_sparse_to_windowed() { const uint32_t k = 1 << lg_k; const uint64_t c32 = static_cast(num_coupons) << 5; - if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32"); + if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) { throw std::logic_error("wrong c32"); } sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it) @@ -285,7 +285,7 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { const uint32_t* old_slots = surprising_value_table.get_slots(); const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size(); - if (window_offset != 0) throw std::logic_error("window_offset != 0"); + if (window_offset != 0) { throw std::logic_error("window_offset != 0"); } for (uint32_t i = 0; i < old_num_slots; i++) { const uint32_t row_col = old_slots[i]; @@ -297,7 +297,7 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { } else { // cannot use u32_table::must_insert(), because it doesn't provide for growth const bool is_novel = new_table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } } @@ -308,17 +308,17 @@ void cpc_sketch_alloc::promote_sparse_to_windowed() { template void cpc_sketch_alloc::move_window() { const uint8_t new_offset = window_offset + 1; - if (new_offset > 56) throw std::logic_error("new_offset > 56"); - if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong"); + if (new_offset > 56) { throw std::logic_error("new_offset > 56"); } + if (new_offset != determine_correct_offset(lg_k, num_coupons)) { throw std::logic_error("new_offset is wrong"); } - if (sliding_window.size() == 0) throw std::logic_error("no sliding window"); + if (sliding_window.size() == 0) { throw std::logic_error("no sliding window"); } const uint32_t k = 1 << lg_k; // Construct the full-sized bit matrix that corresponds to the sketch vector_u64 bit_matrix = build_bit_matrix(); // refresh the KXP register on every 8th window shift. - if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data()); + if ((new_offset & 0x7) == 0) { refresh_kxp(bit_matrix.data()); } surprising_value_table.clear(); // the new number of surprises will be about the same @@ -339,14 +339,14 @@ void cpc_sketch_alloc::move_window() { pattern = pattern ^ (static_cast(1) << col); // erase the 1 const uint32_t row_col = (i << 6) | col; const bool is_novel = surprising_value_table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } window_offset = new_offset; first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored); - if (first_interesting_column > new_offset) first_interesting_column = new_offset; // corner case + if (first_interesting_column > new_offset) { first_interesting_column = new_offset; } // corner case } // The KXP register is a double with roughly 50 bits of precision, but @@ -438,7 +438,7 @@ void cpc_sketch_alloc::serialize(std::ostream& os) const { write(os, compressed.table_num_entries); // HIP values can be in two different places in the sequence of fields // this is the first HIP decision point - if (has_hip) write_hip(os); + if (has_hip) { write_hip(os); } } if (has_table) { write(os, compressed.table_data_words); @@ -447,7 +447,7 @@ void cpc_sketch_alloc::serialize(std::ostream& os) const { write(os, compressed.window_data_words); } // this is the second HIP decision point - if (has_hip && !(has_table && has_window)) write_hip(os); + if (has_hip && !(has_table && has_window)) { write_hip(os); } if (has_window) { write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t)); } @@ -494,7 +494,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += copy_to_mem(compressed.table_num_entries, ptr); // HIP values can be in two different places in the sequence of fields // this is the first HIP decision point - if (has_hip) ptr += copy_hip_to_mem(ptr); + if (has_hip) { ptr += copy_hip_to_mem(ptr); } } if (has_table) { ptr += copy_to_mem(compressed.table_data_words, ptr); @@ -503,7 +503,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += copy_to_mem(compressed.window_data_words, ptr); } // this is the second HIP decision point - if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr); + if (has_hip && !(has_table && has_window)) { ptr += copy_hip_to_mem(ptr); } if (has_window) { ptr += copy_to_mem(compressed.window_data.data(), ptr, compressed.window_data_words * sizeof(uint32_t)); } @@ -511,7 +511,7 @@ auto cpc_sketch_alloc::serialize(unsigned header_size_bytes) const -> vector_ ptr += copy_to_mem(compressed.table_data.data(), ptr, compressed.table_data_words * sizeof(uint32_t)); } } - if (ptr != bytes.data() + size) throw std::logic_error("serialized size mismatch"); + if (ptr != bytes.data() + size) { throw std::logic_error("serialized size mismatch"); } return bytes; } @@ -561,7 +561,7 @@ cpc_sketch_alloc cpc_sketch_alloc::deserialize(std::istream& is, uint64_t compressed.table_data.resize(compressed.table_data_words); read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t)); } - if (!has_window) compressed.table_num_entries = num_coupons; + if (!has_window) { compressed.table_num_entries = num_coupons; } } uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window); @@ -583,8 +583,9 @@ cpc_sketch_alloc cpc_sketch_alloc::deserialize(std::istream& is, uint64_t } uncompressed_state uncompressed(allocator); get_compressor().uncompress(compressed, uncompressed, lg_k, num_coupons); - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { + throw std::runtime_error("error reading from std::istream"); + } return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table), std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed); } diff --git a/cpc/include/cpc_union_impl.hpp b/cpc/include/cpc_union_impl.hpp index f277107f..673aa7a4 100644 --- a/cpc/include/cpc_union_impl.hpp +++ b/cpc/include/cpc_union_impl.hpp @@ -109,15 +109,15 @@ void cpc_union_alloc::internal_update(S&& sketch) { + std::to_string(seed_hash_sketch)); } const auto src_flavor = sketch.determine_flavor(); - if (cpc_sketch_alloc::flavor::EMPTY == src_flavor) return; + if (cpc_sketch_alloc::flavor::EMPTY == src_flavor) { return; } - if (sketch.get_lg_k() < lg_k) reduce_k(sketch.get_lg_k()); - if (sketch.get_lg_k() < lg_k) throw std::logic_error("sketch lg_k < union lg_k"); + if (sketch.get_lg_k() < lg_k) { reduce_k(sketch.get_lg_k()); } + if (sketch.get_lg_k() < lg_k) { throw std::logic_error("sketch lg_k < union lg_k"); } - if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit matrix are absent"); + if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit matrix are absent"); } if (cpc_sketch_alloc::flavor::SPARSE == src_flavor && accumulator != nullptr) { // Case A - if (bit_matrix.size() > 0) throw std::logic_error("union bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("union bit_matrix is not expected"); } const auto initial_dest_flavor = accumulator->determine_flavor(); if (cpc_sketch_alloc::flavor::EMPTY != initial_dest_flavor && cpc_sketch_alloc::flavor::SPARSE != initial_dest_flavor) throw std::logic_error("wrong flavor"); @@ -138,24 +138,24 @@ void cpc_union_alloc::internal_update(S&& sketch) { } if (cpc_sketch_alloc::flavor::SPARSE == src_flavor && bit_matrix.size() > 0) { // Case B - if (accumulator != nullptr) throw std::logic_error("union accumulator != null"); + if (accumulator != nullptr) { throw std::logic_error("union accumulator != null"); } or_table_into_matrix(sketch.surprising_value_table); return; } if (cpc_sketch_alloc::flavor::HYBRID != src_flavor && cpc_sketch_alloc::flavor::PINNED != src_flavor - && cpc_sketch_alloc::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); + && cpc_sketch_alloc::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); } // source is past SPARSE mode, so make sure that dest is a bit matrix if (accumulator != nullptr) { - if (bit_matrix.size() > 0) throw std::logic_error("union bit matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("union bit matrix is not expected"); } const auto dst_flavor = accumulator->determine_flavor(); if (cpc_sketch_alloc::flavor::EMPTY != dst_flavor && cpc_sketch_alloc::flavor::SPARSE != dst_flavor) { throw std::logic_error("wrong flavor"); } switch_to_bit_matrix(); } - if (bit_matrix.size() == 0) throw std::logic_error("union bit_matrix is expected"); + if (bit_matrix.size() == 0) { throw std::logic_error("union bit_matrix is expected"); } if (cpc_sketch_alloc::flavor::HYBRID == src_flavor || cpc_sketch_alloc::flavor::PINNED == src_flavor) { // Case C or_window_into_matrix(sketch.sliding_window, sketch.window_offset, sketch.get_lg_k()); @@ -165,7 +165,7 @@ void cpc_union_alloc::internal_update(S&& sketch) { // SLIDING mode involves inverted logic, so we can't just walk the source sketch. // Instead, we convert it to a bitMatrix that can be OR'ed into the destination. - if (cpc_sketch_alloc::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D + if (cpc_sketch_alloc::flavor::SLIDING != src_flavor) { throw std::logic_error("wrong flavor"); } // Case D vector_u64 src_matrix = sketch.build_bit_matrix(); or_matrix_into_matrix(src_matrix, sketch.get_lg_k()); } @@ -173,20 +173,20 @@ void cpc_union_alloc::internal_update(S&& sketch) { template cpc_sketch_alloc cpc_union_alloc::get_result() const { if (accumulator != nullptr) { - if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); } return get_result_from_accumulator(); } - if (bit_matrix.size() == 0) throw std::logic_error("bit_matrix is expected"); + if (bit_matrix.size() == 0) { throw std::logic_error("bit_matrix is expected"); } return get_result_from_bit_matrix(); } template cpc_sketch_alloc cpc_union_alloc::get_result_from_accumulator() const { - if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k"); + if (lg_k != accumulator->get_lg_k()) { throw std::logic_error("lg_k != accumulator->lg_k"); } if (accumulator->get_num_coupons() == 0) { return cpc_sketch_alloc(lg_k, seed, accumulator->get_allocator()); } - if (accumulator->determine_flavor() != cpc_sketch_alloc::flavor::SPARSE) throw std::logic_error("wrong flavor"); + if (accumulator->determine_flavor() != cpc_sketch_alloc::flavor::SPARSE) { throw std::logic_error("wrong flavor"); } cpc_sketch_alloc copy(*accumulator); copy.was_merged = true; return copy; @@ -199,7 +199,7 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { const auto flavor = cpc_sketch_alloc::determine_flavor(lg_k, num_coupons); if (flavor != cpc_sketch_alloc::flavor::HYBRID && flavor != cpc_sketch_alloc::flavor::PINNED - && flavor != cpc_sketch_alloc::flavor::SLIDING) throw std::logic_error("wrong flavor"); + && flavor != cpc_sketch_alloc::flavor::SLIDING) { throw std::logic_error("wrong flavor"); } const uint8_t offset = cpc_sketch_alloc::determine_correct_offset(lg_k, num_coupons); @@ -208,7 +208,7 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { // dynamically growing caused snowplow effect uint8_t table_lg_size = lg_k - 4; // K/16; in some cases this will end up being oversized - if (table_lg_size < 2) table_lg_size = 2; + if (table_lg_size < 2) { table_lg_size = 2; } u32_table table(table_lg_size, 6 + lg_k, bit_matrix.get_allocator()); // the following should work even when the offset is zero @@ -229,14 +229,14 @@ cpc_sketch_alloc cpc_union_alloc::get_result_from_bit_matrix() const { pattern = pattern ^ (static_cast(1) << col); // erase the 1 const uint32_t row_col = (i << 6) | col; bool is_novel = table.maybe_insert(row_col); - if (!is_novel) throw std::logic_error("is_novel != true"); + if (!is_novel) { throw std::logic_error("is_novel != true"); } } } // at this point we could shrink an oversized hash table, but the relative waste isn't very big uint8_t first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored); - if (first_interesting_column > offset) first_interesting_column = offset; // corner case + if (first_interesting_column > offset) { first_interesting_column = offset; } // corner case // HIP-related fields will contain zeros, and that is okay return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(table), std::move(sliding_window), false, 0, 0, seed); @@ -260,9 +260,9 @@ void cpc_union_alloc::walk_table_updating_sketch(const u32_table& table) { // Using a golden ratio stride fixes the snowplow effect. const double golden = 0.6180339887498949025; uint32_t stride = static_cast(golden * static_cast(num_slots)); - if (stride < 2) throw std::logic_error("stride < 2"); - if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd - if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range"); + if (stride < 2) { throw std::logic_error("stride < 2"); } + if (stride == ((stride >> 1) << 1)) { stride += 1; } // force the stride to be odd + if (stride < 3 || stride >= num_slots) { throw std::out_of_range("stride out of range"); } for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) { j &= num_slots - 1; @@ -290,7 +290,7 @@ void cpc_union_alloc::or_table_into_matrix(const u32_table& table) { template void cpc_union_alloc::or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k) { - if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK"); + if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); } const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK const uint32_t src_k = 1 << src_lg_k; for (uint32_t src_row = 0; src_row < src_k; src_row++) { @@ -300,7 +300,7 @@ void cpc_union_alloc::or_window_into_matrix(const vector_bytes& sliding_windo template void cpc_union_alloc::or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k) { - if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK"); + if (lg_k > src_lg_k) { throw std::logic_error("dst LgK > src LgK"); } const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK const uint32_t src_k = 1 << src_lg_k; for (uint32_t src_row = 0; src_row < src_k; src_row++) { @@ -310,11 +310,11 @@ void cpc_union_alloc::or_matrix_into_matrix(const vector_u64& src_matrix, uin template void cpc_union_alloc::reduce_k(uint8_t new_lg_k) { - if (new_lg_k >= lg_k) throw std::logic_error("new LgK >= union lgK"); - if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit_matrix are absent"); + if (new_lg_k >= lg_k) { throw std::logic_error("new LgK >= union lgK"); } + if (accumulator == nullptr && bit_matrix.size() == 0) { throw std::logic_error("both accumulator and bit_matrix are absent"); } if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix - if (accumulator != nullptr) throw std::logic_error("accumulator is not null"); + if (accumulator != nullptr) { throw std::logic_error("accumulator is not null"); } vector_u64 old_matrix = std::move(bit_matrix); const uint8_t old_lg_k = lg_k; const uint32_t new_k = 1 << new_lg_k; @@ -325,7 +325,7 @@ void cpc_union_alloc::reduce_k(uint8_t new_lg_k) { } if (accumulator != nullptr) { // downsample the unioner's sketch - if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected"); + if (bit_matrix.size() > 0) { throw std::logic_error("bit_matrix is not expected"); } if (!accumulator->is_empty()) { cpc_sketch_alloc old_accumulator(*accumulator); *accumulator = cpc_sketch_alloc(new_lg_k, seed, old_accumulator.get_allocator()); diff --git a/cpc/include/cpc_util.hpp b/cpc/include/cpc_util.hpp index e5664951..c9da8ab7 100644 --- a/cpc/include/cpc_util.hpp +++ b/cpc/include/cpc_util.hpp @@ -25,19 +25,19 @@ namespace datasketches { static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) { - if (y == 0) throw std::invalid_argument("divide_longs_rounding_up: bad argument"); + if (y == 0) { throw std::invalid_argument("divide_longs_rounding_up: bad argument"); } const uint64_t quotient = x / y; - if (quotient * y == x) return (quotient); - else return quotient + 1; + if (quotient * y == x) { return (quotient); } + else { return quotient + 1; } } static inline uint8_t floor_log2_of_long(uint64_t x) { - if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument"); + if (x < 1) { throw std::invalid_argument("floor_log2_of_long: bad argument"); } uint8_t p = 0; uint64_t y = 1; while (true) { - if (y == x) return p; - if (y > x) return p - 1; + if (y == x) { return p; } + if (y > x) { return p - 1; } p += 1; y <<= 1; } @@ -98,7 +98,7 @@ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, ui } static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) { - if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8"); + if ((length & 0x7) != 0) { throw std::invalid_argument("the length of the array must be a multiple of 8"); } uint32_t total = 0; uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights; fours = twos = ones = 0; diff --git a/cpc/include/icon_estimator.hpp b/cpc/include/icon_estimator.hpp index fb3c0c60..ade787e5 100644 --- a/cpc/include/icon_estimator.hpp +++ b/cpc/include/icon_estimator.hpp @@ -246,14 +246,14 @@ static inline double icon_exponential_approximation(double k, double c) { } static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) { - if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range"); - if (c < 2) return ((c == 0) ? 0.0 : 1.0); + if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) { throw std::out_of_range("lg_k out of range"); } + if (c < 2) { return ((c == 0) ? 0.0 : 1.0); } const uint32_t k = 1 << lg_k; const double double_k = static_cast(k); const double double_c = static_cast(c); // Differing thresholds ensure that the approximated estimator is monotonically increasing. const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6); - if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c); + if (double_c > (threshold_factor * double_k)) { return icon_exponential_approximation(double_k, double_c); } const double factor = evaluate_polynomial( ICON_POLYNOMIAL_COEFFICIENTS, ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K), @@ -265,8 +265,8 @@ static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) { // The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS const double term = 1.0 + (ratio * ratio * ratio / 66.774757); const double result = double_c * factor * term; - if (result >= double_c) return result; - else return double_c; + if (result >= double_c) { return result; } + else { return double_c; } } } /* namespace datasketches */ diff --git a/cpc/include/u32_table_impl.hpp b/cpc/include/u32_table_impl.hpp index 62cd7dac..85797bcf 100644 --- a/cpc/include/u32_table_impl.hpp +++ b/cpc/include/u32_table_impl.hpp @@ -43,8 +43,8 @@ num_valid_bits(num_valid_bits), num_items(0), slots(1ULL << lg_size, UINT32_MAX, allocator) { - if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2"); - if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32"); + if (lg_size < 2) { throw std::invalid_argument("lg_size must be >= 2"); } + if (num_valid_bits < 1 || num_valid_bits > 32) { throw std::invalid_argument("num_valid_bits must be between 1 and 32"); } } template @@ -71,8 +71,8 @@ void u32_table::clear() { template bool u32_table::maybe_insert(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == item) return false; - if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert"); + if (slots[index] == item) { return false; } + if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); } slots[index] = item; num_items++; if (U32_TABLE_UPSIZE_DENOM * num_items > U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) { @@ -84,9 +84,9 @@ bool u32_table::maybe_insert(uint32_t item) { template bool u32_table::maybe_delete(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == UINT32_MAX) return false; - if (slots[index] != item) throw std::logic_error("item does not exist"); - if (num_items == 0) throw std::logic_error("delete error"); + if (slots[index] == UINT32_MAX) { return false; } + if (slots[index] != item) { throw std::logic_error("item does not exist"); } + if (num_items == 0) { throw std::logic_error("delete error"); } // delete the item slots[index] = UINT32_MAX; num_items--; @@ -129,7 +129,7 @@ uint32_t u32_table::lookup(uint32_t item) const { const uint32_t mask = size - 1; const uint8_t shift = num_valid_bits - lg_size; uint32_t probe = item >> shift; - if (probe > mask) throw std::logic_error("probe out of range"); + if (probe > mask) { throw std::logic_error("probe out of range"); } while (slots[probe] != item && slots[probe] != UINT32_MAX) { probe = (probe + 1) & mask; } @@ -140,17 +140,17 @@ uint32_t u32_table::lookup(uint32_t item) const { template void u32_table::must_insert(uint32_t item) { const uint32_t index = lookup(item); - if (slots[index] == item) throw std::logic_error("item exists"); - if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert"); + if (slots[index] == item) { throw std::logic_error("item exists"); } + if (slots[index] != UINT32_MAX) { throw std::logic_error("could not insert"); } slots[index] = item; } template void u32_table::rebuild(uint8_t new_lg_size) { - if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2"); + if (new_lg_size < 2) { throw std::logic_error("lg_size must be >= 2"); } const uint32_t old_size = 1 << lg_size; const uint32_t new_size = 1 << new_lg_size; - if (new_size <= num_items) throw std::logic_error("new_size <= num_items"); + if (new_size <= num_items) { throw std::logic_error("new_size <= num_items"); } vector_u32 old_slots = std::move(slots); slots = vector_u32(new_size, UINT32_MAX, old_slots.get_allocator()); lg_size = new_lg_size; @@ -169,7 +169,7 @@ void u32_table::rebuild(uint8_t new_lg_size) { // The result is nearly sorted, so make sure to use an efficient sort for that case template auto u32_table::unwrapping_get_items() const -> vector_u32 { - if (num_items == 0) return vector_u32(slots.get_allocator()); + if (num_items == 0) { return vector_u32(slots.get_allocator()); } const uint32_t table_size = 1 << lg_size; vector_u32 result(num_items, 0, slots.get_allocator()); size_t i = 0; @@ -187,9 +187,9 @@ auto u32_table::unwrapping_get_items() const -> vector_u32 { // the rest of the table is processed normally while (i < table_size) { const uint32_t item = slots[i++]; - if (item != UINT32_MAX) result[l++] = item; + if (item != UINT32_MAX) { result[l++] = item; } } - if (l != r + 1) throw std::logic_error("unwrapping error"); + if (l != r + 1) { throw std::logic_error("unwrapping error"); } return result; } @@ -213,7 +213,7 @@ void u32_table::merge( else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; } else { arr_c[c] = arr_b[b++]; } } - if (a != lim_a || b != lim_b) throw std::logic_error("merging error"); + if (a != lim_a || b != lim_b) { throw std::logic_error("merging error"); } } // In applications where the input array is already nearly sorted, diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp index 0aa9514c..87ee174e 100644 --- a/fi/include/frequent_items_sketch.hpp +++ b/fi/include/frequent_items_sketch.hpp @@ -44,6 +44,11 @@ enum frequent_items_error_type { * Based on Java implementation here: * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java * @author Alexander Saydakov + * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. */ template< typename T, @@ -74,6 +79,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item for which the weight should be increased (lvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. @@ -82,6 +89,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item for which the weight should be increased (rvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. @@ -91,6 +100,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (lvalue) */ void merge(const frequent_items_sketch& other); @@ -98,6 +109,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (rvalue) */ void merge(frequent_items_sketch&& other); diff --git a/fi/include/frequent_items_sketch_impl.hpp b/fi/include/frequent_items_sketch_impl.hpp index acbd2ee1..3eba188b 100644 --- a/fi/include/frequent_items_sketch_impl.hpp +++ b/fi/include/frequent_items_sketch_impl.hpp @@ -45,13 +45,13 @@ map( allocator ) { - if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size"); + if (lg_start_map_size > lg_max_map_size) { throw std::invalid_argument("starting size must not be greater than maximum size"); } } template void frequent_items_sketch::update(const T& item, W weight) { check_weight(weight); - if (weight == 0) return; + if (weight == 0) { return; } total_weight += weight; offset += map.adjust_or_insert(item, weight); } @@ -59,14 +59,14 @@ void frequent_items_sketch::update(const T& item, W weight) { template void frequent_items_sketch::update(T&& item, W weight) { check_weight(weight); - if (weight == 0) return; + if (weight == 0) { return; } total_weight += weight; offset += map.adjust_or_insert(std::move(item), weight); } template void frequent_items_sketch::merge(const frequent_items_sketch& other) { - if (other.is_empty()) return; + if (other.is_empty()) { return; } const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end for (auto it: other.map) { update(it.first, it.second); @@ -77,7 +77,7 @@ void frequent_items_sketch::merge(const frequent_items_sketch& ot template void frequent_items_sketch::merge(frequent_items_sketch&& other) { - if (other.is_empty()) return; + if (other.is_empty()) { return; } const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end for (auto it: other.map) { update(std::move(it.first), it.second); @@ -105,7 +105,7 @@ template W frequent_items_sketch::get_estimate(const T& item) const { // if item is tracked estimate = weight + offset, otherwise 0 const W weight = map.get(item); - if (weight > 0) return weight + offset; + if (weight > 0) { return weight + offset; } return 0; } @@ -210,7 +210,7 @@ void frequent_items_sketch::serialize(std::ostream& os, const Ser template template size_t frequent_items_sketch::get_serialized_size_bytes(const SerDe& sd) const { - if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t); + if (is_empty()) { return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t); } size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W); for (auto it: map) size += sd.size_of_item(it.first); return size; @@ -328,8 +328,7 @@ frequent_items_sketch frequent_items_sketch::deser sketch.total_weight = total_weight; sketch.offset = offset; } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return sketch; } diff --git a/fi/include/reverse_purge_hash_map.hpp b/fi/include/reverse_purge_hash_map.hpp index b75abc43..5d59c187 100644 --- a/fi/include/reverse_purge_hash_map.hpp +++ b/fi/include/reverse_purge_hash_map.hpp @@ -20,8 +20,9 @@ #ifndef REVERSE_PURGE_HASH_MAP_HPP_ #define REVERSE_PURGE_HASH_MAP_HPP_ -#include +#include #include +#include namespace datasketches { diff --git a/fi/include/reverse_purge_hash_map_impl.hpp b/fi/include/reverse_purge_hash_map_impl.hpp index fa2ad824..63909cf3 100644 --- a/fi/include/reverse_purge_hash_map_impl.hpp +++ b/fi/include/reverse_purge_hash_map_impl.hpp @@ -74,7 +74,7 @@ states_(nullptr) if (other.states_[i] > 0) { new (&keys_[i]) K(other.keys_[i]); values_[i] = other.values_[i]; - if (--num == 0) break; + if (--num == 0) { break; } } } } @@ -105,7 +105,7 @@ reverse_purge_hash_map::~reverse_purge_hash_map() { for (uint32_t i = 0; i < size; i++) { if (is_active(i)) { keys_[i].~K(); - if (--num_active_ == 0) break; + if (--num_active_ == 0) { break; } } } } @@ -166,7 +166,7 @@ V reverse_purge_hash_map::get(const K& key) const { const uint32_t mask = (1 << lg_cur_size_) - 1; uint32_t probe = fmix64(H()(key)) & mask; while (is_active(probe)) { - if (E()(keys_[probe], key)) return values_[probe]; + if (E()(keys_[probe], key)) { return values_[probe]; } probe = (probe + 1) & mask; } return 0; @@ -271,7 +271,7 @@ void reverse_purge_hash_map::hash_delete(uint32_t delete_index) { probe = (probe + 1) & mask; drift++; // only used for theoretical analysis - if (drift >= DRIFT_LIMIT) throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT"); + if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT"); } } } @@ -289,7 +289,7 @@ uint32_t reverse_purge_hash_map::internal_adjust_or_insert(const index = (index + 1) & mask; drift++; // only used for theoretical analysis - if (drift >= DRIFT_LIMIT) throw std::logic_error("drift limit reached"); + if (drift >= DRIFT_LIMIT) { throw std::logic_error("drift limit reached"); } } // adding the key and value to the table if (num_active_ > get_capacity()) { diff --git a/filters/include/bloom_filter.hpp b/filters/include/bloom_filter.hpp index f3c6a031..fd5816a1 100644 --- a/filters/include/bloom_filter.hpp +++ b/filters/include/bloom_filter.hpp @@ -624,7 +624,7 @@ class bloom_filter_alloc { uint64_t capacity_bits_; uint64_t num_bits_set_; uint8_t* bit_array_; // data backing bit_array_, regardless of ownership - uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr + uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr }; /** diff --git a/filters/test/bloom_filter_test.cpp b/filters/test/bloom_filter_test.cpp index 41b63e64..d8bcec8e 100644 --- a/filters/test/bloom_filter_test.cpp +++ b/filters/test/bloom_filter_test.cpp @@ -399,7 +399,7 @@ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { REQUIRE(bf_writable.query(-1.0)); // not good memory management to do this, but because we wrapped the same bytes as both - // read-only adn writable, that update should ahve changed the read-only version, too + // read-only and writable, that update should have changed the read-only version, too REQUIRE(bf_wrap.query(-1.0)); } diff --git a/hll/include/CouponHashSet-internal.hpp b/hll/include/CouponHashSet-internal.hpp index 7474cf2c..2ec4d6a8 100644 --- a/hll/include/CouponHashSet-internal.hpp +++ b/hll/include/CouponHashSet-internal.hpp @@ -176,8 +176,7 @@ CouponHashSet* CouponHashSet::newSet(std::istream& is, const A& allocator) read(is, sketch->coupons_.data(), sketch->coupons_.size() * sizeof(uint32_t)); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return ptr.release(); } diff --git a/hll/include/CouponList-internal.hpp b/hll/include/CouponList-internal.hpp index a240a000..c92820e2 100644 --- a/hll/include/CouponList-internal.hpp +++ b/hll/include/CouponList-internal.hpp @@ -162,8 +162,7 @@ CouponList* CouponList::newList(std::istream& is, const A& allocator) { read(is, sketch->coupons_.data(), numToRead * sizeof(uint32_t)); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return ptr.release(); } diff --git a/hll/include/CubicInterpolation-internal.hpp b/hll/include/CubicInterpolation-internal.hpp index c60ddab9..fb74c402 100644 --- a/hll/include/CubicInterpolation-internal.hpp +++ b/hll/include/CubicInterpolation-internal.hpp @@ -165,10 +165,10 @@ static int recursiveFindStraddle(const double xArr[], const int l, const int r, throw std::logic_error("target value invariant violated in search"); } - if (l+1 == r) return (l); + if (l+1 == r) { return (l); } m = l + ((r-l)/2); - if (xArr[m] <= x) return (recursiveFindStraddle(xArr, m, r, x)); - else return (recursiveFindStraddle(xArr, l, m, x)); + if (xArr[m] <= x) { return (recursiveFindStraddle(xArr, m, r, x)); } + else { return (recursiveFindStraddle(xArr, l, m, x)); } } @@ -191,7 +191,7 @@ double CubicInterpolation::usingXArrAndYStride(const double xArr[], const int const int xArrLenM1 = xArrLen - 1; if ((xArrLen < 4) || (x < xArr[0]) || (x > xArr[xArrLenM1])) { - throw std::logic_error("impossible values during interpolaiton"); + throw std::logic_error("impossible values during interpolation"); } if (x == xArr[xArrLenM1]) { /* corner case */ diff --git a/hll/include/Hll4Array-internal.hpp b/hll/include/Hll4Array-internal.hpp index 9d22006b..082f168f 100644 --- a/hll/include/Hll4Array-internal.hpp +++ b/hll/include/Hll4Array-internal.hpp @@ -131,7 +131,7 @@ uint8_t Hll4Array::getSlot(uint32_t slotNo) const { template uint8_t Hll4Array::adjustRawValue(uint32_t slot, uint8_t value) const { - if (value != hll_constants::AUX_TOKEN) return value + this->curMin_; + if (value != hll_constants::AUX_TOKEN) { return value + this->curMin_; } return auxHashMap_->mustFindValueFor(slot); } diff --git a/hll/include/HllArray-internal.hpp b/hll/include/HllArray-internal.hpp index c3c6b3f8..62ea7f78 100644 --- a/hll/include/HllArray-internal.hpp +++ b/hll/include/HllArray-internal.hpp @@ -142,15 +142,16 @@ HllArray* HllArray::newHll(const void* bytes, size_t len, const A& allocat HllArray* sketch = HllSketchImplFactory::newHll(lgK, tgtHllType, startFullSizeFlag, allocator); sketch->putCurMin(curMin); sketch->putOutOfOrderFlag(oooFlag); - if (!oooFlag) sketch->putHipAccum(hip); + if (!oooFlag) { sketch->putHipAccum(hip); } sketch->putKxQ0(kxq0); sketch->putKxQ1(kxq1); sketch->putNumAtCurMin(numAtCurMin); std::memcpy(sketch->hllByteArr_.data(), data + hll_constants::HLL_BYTE_ARR_START, arrayBytes); - if (auxHashMap != nullptr) + if (auxHashMap != nullptr) { ((Hll4Array*)sketch)->putAuxHashMap(auxHashMap); + } aux_ptr.release(); return sketch; @@ -173,7 +174,7 @@ HllArray* HllArray::newHll(std::istream& is, const A& allocator) { hll_mode mode = HllSketchImpl::extractCurMode(listHeader[hll_constants::MODE_BYTE]); if (mode != HLL) { - throw std::invalid_argument("Calling HLL construtor with non-HLL mode data"); + throw std::invalid_argument("Calling HLL constructor with non-HLL mode data"); } const target_hll_type tgtHllType = HllSketchImpl::extractTgtHllType(listHeader[hll_constants::MODE_BYTE]); @@ -193,7 +194,7 @@ HllArray* HllArray::newHll(std::istream& is, const A& allocator) { const auto hip = read(is); const auto kxq0 = read(is); const auto kxq1 = read(is); - if (!oooFlag) sketch->putHipAccum(hip); + if (!oooFlag) { sketch->putHipAccum(hip); } sketch->putKxQ0(kxq0); sketch->putKxQ1(kxq1); @@ -209,8 +210,7 @@ HllArray* HllArray::newHll(std::istream& is, const A& allocator) { ((Hll4Array*)sketch)->putAuxHashMap(auxHashMap); } - if (!is.good()) - throw std::runtime_error("error reading from std::istream"); + if (!is.good()) { throw std::runtime_error("error reading from std::istream"); } return sketch_ptr.release(); } @@ -545,7 +545,7 @@ template void HllArray::hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue) { const uint32_t configK = 1 << this->getLgConfigK(); // update hip BEFORE updating kxq - if (!oooFlag_) hipAccum_ += configK / (kxq0_ + kxq1_); + if (!oooFlag_) { hipAccum_ += configK / (kxq0_ + kxq1_); } // update kxq0 and kxq1; subtract first, then add if (oldValue < 32) { kxq0_ -= INVERSE_POWERS_OF_2[oldValue]; } else { kxq1_ -= INVERSE_POWERS_OF_2[oldValue]; } @@ -648,7 +648,7 @@ array_(array), array_size_(array_size), index_(index), hll_type_(hll_type), exce { while (index_ < array_size_) { value_ = get_value(array_, index_, hll_type_, exceptions_, offset_); - if (all_ || value_ != hll_constants::EMPTY) break; + if (all_ || value_ != hll_constants::EMPTY) { break; } ++index_; } } @@ -657,7 +657,7 @@ template typename HllArray::const_iterator& HllArray::const_iterator::operator++() { while (++index_ < array_size_) { value_ = get_value(array_, index_, hll_type_, exceptions_, offset_); - if (all_ || value_ != hll_constants::EMPTY) break; + if (all_ || value_ != hll_constants::EMPTY) { break; } } return *this; } diff --git a/hll/include/HllUnion-internal.hpp b/hll/include/HllUnion-internal.hpp index 3a5a926c..27adab74 100644 --- a/hll/include/HllUnion-internal.hpp +++ b/hll/include/HllUnion-internal.hpp @@ -44,13 +44,13 @@ hll_sketch_alloc hll_union_alloc::get_result(target_hll_type target_type) template void hll_union_alloc::update(const hll_sketch_alloc& sketch) { - if (sketch.is_empty()) return; + if (sketch.is_empty()) { return; } union_impl(sketch, lg_max_k_); } template void hll_union_alloc::update(hll_sketch_alloc&& sketch) { - if (sketch.is_empty()) return; + if (sketch.is_empty()) { return; } if (gadget_.is_empty() && sketch.get_target_type() == HLL_8 && sketch.get_lg_config_k() <= lg_max_k_) { if (sketch.get_current_mode() == HLL || sketch.get_lg_config_k() == lg_max_k_) { gadget_ = std::move(sketch); @@ -131,29 +131,33 @@ void hll_union_alloc::coupon_update(uint32_t coupon) { template double hll_union_alloc::get_estimate() const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_estimate(); } template double hll_union_alloc::get_composite_estimate() const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_composite_estimate(); } template double hll_union_alloc::get_lower_bound(uint8_t num_std_dev) const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_lower_bound(num_std_dev); } template double hll_union_alloc::get_upper_bound(uint8_t num_std_dev) const { - if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) + if (gadget_.sketch_impl->getCurMode() == hll_mode::HLL) { static_cast*>(gadget_.sketch_impl)->check_rebuild_kxq_cur_min(); + } return gadget_.get_upper_bound(num_std_dev); } diff --git a/hll/include/coupon_iterator-internal.hpp b/hll/include/coupon_iterator-internal.hpp index 84133ffb..356517ec 100644 --- a/hll/include/coupon_iterator-internal.hpp +++ b/hll/include/coupon_iterator-internal.hpp @@ -28,7 +28,7 @@ template coupon_iterator::coupon_iterator(const uint32_t* array, size_t array_size, size_t index, bool all): array_(array), array_size_(array_size), index_(index), all_(all) { while (index_ < array_size_) { - if (all_ || array_[index_] != hll_constants::EMPTY) break; + if (all_ || array_[index_] != hll_constants::EMPTY) { break; } ++index_; } } @@ -36,7 +36,7 @@ array_(array), array_size_(array_size), index_(index), all_(all) { template coupon_iterator& coupon_iterator::operator++() { while (++index_ < array_size_) { - if (all_ || array_[index_] != hll_constants::EMPTY) break; + if (all_ || array_[index_] != hll_constants::EMPTY) { break; } } return *this; } diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp index 9d5f78f1..5fc49629 100644 --- a/hll/include/hll.hpp +++ b/hll/include/hll.hpp @@ -160,7 +160,7 @@ class hll_sketch_alloc final { static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A()); //! Class destructor - virtual ~hll_sketch_alloc(); + ~hll_sketch_alloc(); /** * Copy assignment operator diff --git a/hll/test/HllSketchTest.cpp b/hll/test/HllSketchTest.cpp index 1ce21bbe..91197f13 100644 --- a/hll/test/HllSketchTest.cpp +++ b/hll/test/HllSketchTest.cpp @@ -298,7 +298,7 @@ TEST_CASE("hll sketch: deserialize list mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range); REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -325,7 +325,7 @@ TEST_CASE("hll sketch: deserialize set mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 7, 0), std::out_of_range); REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); @@ -355,7 +355,7 @@ TEST_CASE("hll sketch: deserialize HLL mode buffer overrun", "[hll_sketch]") { REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), 16420, 0), std::out_of_range); // before aux table REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), bytes.size() - 1, 0), std::out_of_range); - // ckeck for leaks on stream exceptions + // check for leaks on stream exceptions { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); diff --git a/hll/test/HllUnionTest.cpp b/hll/test/HllUnionTest.cpp index 41443786..ceaef12f 100644 --- a/hll/test/HllUnionTest.cpp +++ b/hll/test/HllUnionTest.cpp @@ -58,7 +58,7 @@ static void basicUnion(uint64_t n1, uint64_t n2, hll_sketch result = u.get_result(resultType); - // ensure we check a direct union estimate, without first caling get_result() + // ensure we check a direct union estimate, without first calling get_result() u.reset(); u.update(std::move(h1)); u.update(h2); diff --git a/kll/include/kll_helper_impl.hpp b/kll/include/kll_helper_impl.hpp index bb92bdc7..31534d9a 100644 --- a/kll/include/kll_helper_impl.hpp +++ b/kll/include/kll_helper_impl.hpp @@ -36,17 +36,17 @@ bool kll_helper::is_odd(uint32_t value) { } uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) { - if (denom > numer) return 0; + if (denom > numer) { return 0; } uint8_t count = 0; while (true) { denom <<= 1; - if (denom > numer) return count; + if (denom > numer) { return count; } count++; } } uint8_t kll_helper::ub_on_num_levels(uint64_t n) { - if (n == 0) return 1; + if (n == 0) { return 1; } return 1 + floor_of_log2_of_fraction(n, 1); } @@ -65,8 +65,8 @@ uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t heigh } uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) { - if (depth > 60) throw std::invalid_argument("depth > 60"); - if (depth <= 30) return int_cap_aux_aux(k, depth); + if (depth > 60) { throw std::invalid_argument("depth > 60"); } + if (depth <= 30) { return int_cap_aux_aux(k, depth); } const uint8_t half = depth / 2; const uint8_t rest = depth - half; const uint16_t tmp = int_cap_aux_aux(k, half); @@ -74,11 +74,11 @@ uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) { } uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) { - if (depth > 30) throw std::invalid_argument("depth > 30"); + if (depth > 30) { throw std::invalid_argument("depth > 30"); } const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2 const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]); const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2 - if (result > k) throw std::logic_error("result > k"); + if (result > k) { throw std::logic_error("result > k"); } return static_cast(result); } @@ -94,7 +94,7 @@ uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* template void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) { - if (!is_even(length)) throw std::invalid_argument("length must be even"); + if (!is_even(length)) { throw std::invalid_argument("length must be even"); } const uint32_t half_length = length / 2; #ifdef KLL_VALIDATION const uint32_t offset = deterministic_offset(); @@ -110,7 +110,7 @@ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) { template void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) { - if (!is_even(length)) throw std::invalid_argument("length must be even"); + if (!is_even(length)) { throw std::invalid_argument("length must be even"); } const uint32_t half_length = length / 2; #ifdef KLL_VALIDATION const uint32_t offset = deterministic_offset(); @@ -206,7 +206,7 @@ template kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items, uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted) { - if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed + if (num_levels_in == 0) { throw std::invalid_argument("num_levels_in == 0"); } // things are too weird if zero levels are allowed const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0]; uint8_t current_num_levels = num_levels_in; uint32_t current_item_count = starting_item_count; // decreases with each compaction diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index 904587a1..d672c419 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -46,6 +46,11 @@ namespace kll_constants { * and nearly optimal accuracy per retained item. * See Optimal Quantile Approximation in Streams. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

This is a stochastic streaming sketch that enables near real-time analysis of the * approximate distribution of items from a very large stream in a single pass, requiring only * that the items are comparable. @@ -56,7 +61,7 @@ namespace kll_constants { *

As of May 2020, this implementation produces serialized sketches which are binary-compatible * with the equivalent Java implementation only when template parameter T = float * (32-bit single precision values). - * + * *

Given an input stream of N items, the natural rank of any specific * item is defined as its index (1 to N) in inclusive mode * or (0 to N-1) in exclusive mode @@ -225,6 +230,8 @@ class kll_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -232,6 +239,8 @@ class kll_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index fde0a314..b12a39c8 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "conditional_forward.hpp" #include "count_zeros.hpp" @@ -198,7 +199,7 @@ void kll_sketch::update_min_max(const T& item) { template uint32_t kll_sketch::internal_update() { - if (levels_[0] == 0) compress_while_updating(); + if (levels_[0] == 0) { compress_while_updating(); } n_++; is_level_zero_sorted_ = false; return --levels_[0]; @@ -207,7 +208,7 @@ uint32_t kll_sketch::internal_update() { template template void kll_sketch::merge(FwdSk&& other) { - if (other.is_empty()) return; + if (other.is_empty()) { return; } if (m_ != other.m_) { throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_)); } @@ -223,9 +224,9 @@ void kll_sketch::merge(FwdSk&& other) { const uint32_t index = internal_update(); new (&items_[index]) T(conditional_forward(other.items_[i])); } - if (other.num_levels_ >= 2) merge_higher_levels(other, final_n); + if (other.num_levels_ >= 2) { merge_higher_levels(other, final_n); } n_ = final_n; - if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_); + if (other.is_estimation_mode()) { min_k_ = std::min(min_k_, other.min_k_); } assert_correct_total_weight(); reset_sorted_view(); } @@ -257,13 +258,13 @@ bool kll_sketch::is_estimation_mode() const { template T kll_sketch::get_min_item() const { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } return *min_item_; } template T kll_sketch::get_max_item() const { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } return *max_item_; } @@ -279,28 +280,28 @@ A kll_sketch::get_allocator() const { template double kll_sketch::get_rank(const T& item, bool inclusive) const { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_rank(item, inclusive); } template auto kll_sketch::get_PMF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_PMF(split_points, size, inclusive); } template auto kll_sketch::get_CDF(const T* split_points, uint32_t size, bool inclusive) const -> vector_double { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } setup_sorted_view(); return sorted_view_->get_CDF(split_points, size, inclusive); } template auto kll_sketch::get_quantile(double rank, bool inclusive) const -> quantile_return_type { - if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + if (is_empty()) { throw std::runtime_error("operation is undefined for an empty sketch"); } if ((rank < 0.0) || (rank > 1.0)) { throw std::invalid_argument("normalized rank cannot be less than zero or greater than 1.0"); } @@ -481,18 +482,22 @@ kll_sketch kll_sketch::deserialize(std::istream& is, const Ser read(is, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; @@ -565,18 +570,22 @@ kll_sketch kll_sketch::deserialize(const void* bytes, size_t s ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; diff --git a/kll/test/kll_sketch_deserialize_from_java_test.cpp b/kll/test/kll_sketch_deserialize_from_java_test.cpp index 795486ae..65efc3e5 100644 --- a/kll/test/kll_sketch_deserialize_from_java_test.cpp +++ b/kll/test/kll_sketch_deserialize_from_java_test.cpp @@ -100,4 +100,28 @@ TEST_CASE("kll string", "[serde_compat]") { } } +TEST_CASE("kll long", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "kll_long_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = kll_sketch::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K)); + REQUIRE(sketch.get_n() == n); + if (n > 0) { + REQUIRE(sketch.get_min_item() == 1); + REQUIRE(sketch.get_max_item() == static_cast(n)); + uint64_t weight = 0; + for (const auto pair: sketch) { + REQUIRE(pair.first >= sketch.get_min_item()); + REQUIRE(pair.first <= sketch.get_max_item()); + weight += pair.second; + } + REQUIRE(weight == sketch.get_n()); + } + } +} + } /* namespace datasketches */ diff --git a/kll/test/kll_sketch_serialize_for_java.cpp b/kll/test/kll_sketch_serialize_for_java.cpp index 00b8913d..22b75774 100644 --- a/kll/test/kll_sketch_serialize_for_java.cpp +++ b/kll/test/kll_sketch_serialize_for_java.cpp @@ -43,6 +43,16 @@ TEST_CASE("kll sketch double generate", "[serialize_for_java]") { } } +TEST_CASE("kll sketch long generate", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + kll_sketch sketch; + for (unsigned i = 1; i <= n; ++i) sketch.update(i); + std::ofstream os("kll_long_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.serialize(os); + } +} + struct compare_as_number { bool operator()(const std::string& a, const std::string& b) const { return std::stoi(a) < std::stoi(b); diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp index ab493c99..e995e3e3 100644 --- a/quantiles/include/quantiles_sketch.hpp +++ b/quantiles/include/quantiles_sketch.hpp @@ -47,6 +47,11 @@ namespace quantiles_constants { * The analysis is obtained using get_rank() and get_quantile() functions, * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF(). * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Consider a large stream of one million values such as packet sizes coming into a network node. * The natural rank of any specific size value is its index in the hypothetical sorted * array of values. @@ -206,6 +211,8 @@ class quantiles_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -213,6 +220,8 @@ class quantiles_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template @@ -537,10 +546,10 @@ class quantiles_sketch { static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator); template - static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); + static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator); template - static std::pair deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); + static std::pair deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const Allocator& allocator); static void check_k(uint16_t k); static void check_serial_version(uint8_t serial_version); diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp index 558c13c5..2dacf21e 100644 --- a/quantiles/include/quantiles_sketch_impl.hpp +++ b/quantiles/include/quantiles_sketch_impl.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "count_zeros.hpp" #include "conditional_forward.hpp" @@ -393,18 +394,22 @@ auto quantiles_sketch::deserialize(std::istream &is, const SerDe& serde const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - serde.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - serde.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { read(is); // no longer used @@ -507,18 +512,22 @@ auto quantiles_sketch::deserialize(const void* bytes, size_t size, cons const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { uint64_t unused_long; @@ -581,7 +590,7 @@ auto quantiles_sketch::deserialize_array(const void* bytes, size_t size // serde did not throw, enable destructors items.get_deleter().set_destroy(true); - // succesfully read, now put into a Level + // successfully read, now put into a Level Level level(allocator); level.reserve(capacity); level.insert(level.begin(), diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp index 21ccac0c..52295bd2 100755 --- a/req/include/req_sketch.hpp +++ b/req/include/req_sketch.hpp @@ -35,6 +35,11 @@ namespace datasketches { * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty, * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Reference: https://arxiv.org/abs/2004.01668

* *

This implementation differs from the algorithm described in the paper in the following:

@@ -179,6 +184,8 @@ class req_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -186,6 +193,8 @@ class req_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp index a28e74e2..7f0b4557 100755 --- a/req/include/req_sketch_impl.hpp +++ b/req/include/req_sketch_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace datasketches { @@ -292,7 +293,7 @@ double req_sketch::get_rank_upper_bound(double rank, uint8_t num_std_de template double req_sketch::get_RSE(uint16_t k, double rank, bool hra, uint64_t n) { - return get_rank_lb(k, 2, rank, 1, n, hra); + return get_rank_ub(k, 2, rank, 1, n, hra) - rank; } template @@ -461,7 +462,6 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -472,14 +472,19 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser uint64_t n = 1; if (num_levels > 1) { n = read(is); - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { @@ -537,7 +542,6 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -549,14 +553,19 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s if (num_levels > 1) { ensure_minimum_memory(end_ptr - ptr, sizeof(n)); ptr += copy_from_mem(ptr, n); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { diff --git a/req/test/req_sketch_test.cpp b/req/test/req_sketch_test.cpp index 2a338b8a..d9c9a16e 100755 --- a/req/test/req_sketch_test.cpp +++ b/req/test/req_sketch_test.cpp @@ -43,6 +43,7 @@ TEST_CASE("req sketch: empty", "[req_sketch]") { REQUIRE_FALSE(sketch.is_estimation_mode()); REQUIRE(sketch.get_n() == 0); REQUIRE(sketch.get_num_retained() == 0); + REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, true, 0) == 0); REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error); REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error); REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error); @@ -61,6 +62,7 @@ TEST_CASE("req sketch: single value, lra", "[req_sketch]") { REQUIRE_FALSE(sketch.is_estimation_mode()); REQUIRE(sketch.get_n() == 1); REQUIRE(sketch.get_num_retained() == 1); + REQUIRE(sketch.get_RSE(sketch.get_k(), 0.5, false, sketch.get_n()) == 0); REQUIRE(sketch.get_rank(1.0f, false) == 0); REQUIRE(sketch.get_rank(1.0f) == 1); REQUIRE(sketch.get_rank(1.1f, false) == 1); diff --git a/sampling/include/ebpps_sample_impl.hpp b/sampling/include/ebpps_sample_impl.hpp index 88a86ae0..c48b32aa 100644 --- a/sampling/include/ebpps_sample_impl.hpp +++ b/sampling/include/ebpps_sample_impl.hpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace datasketches { @@ -365,11 +366,15 @@ std::pair, size_t> ebpps_sample::deserialize(const uint optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } return std::pair, size_t>( @@ -400,11 +405,15 @@ ebpps_sample ebpps_sample::deserialize(std::istream& is, const SerDe optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - sd.deserialize(is, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } if (!is.good()) throw std::runtime_error("error reading from std::istream"); diff --git a/sampling/include/ebpps_sketch.hpp b/sampling/include/ebpps_sketch.hpp index 038b5a30..615d37b8 100644 --- a/sampling/include/ebpps_sketch.hpp +++ b/sampling/include/ebpps_sketch.hpp @@ -50,6 +50,11 @@ namespace ebpps_constants { * The sample may be smaller than k and the resulting size of the sample potentially includes * a probabilistic component, meaning the resulting sample size is not always constant. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * * @author Jon Malkin */ template< @@ -71,6 +76,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -79,6 +86,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -87,6 +96,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(const ebpps_sketch& sketch); @@ -94,6 +105,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(ebpps_sketch&& sketch); diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 1324883c..df080c6e 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -57,6 +57,11 @@ namespace var_opt_constants { * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for * subset sum estimation. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * * author Kevin Lang * author Jon Malkin */ @@ -111,6 +116,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -119,6 +126,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -263,7 +272,7 @@ class var_opt_sketch { typedef typename std::allocator_traits::template rebind_alloc AllocDouble; typedef typename std::allocator_traits::template rebind_alloc AllocBool; - static const uint32_t MIN_LG_ARR_ITEMS = 3; + static const uint32_t MIN_LG_ARR_ITEMS = 4; static const uint8_t PREAMBLE_LONGS_EMPTY = 1; static const uint8_t PREAMBLE_LONGS_WARMUP = 3; diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp index 7bf40958..30d526af 100644 --- a/sampling/include/var_opt_sketch_impl.hpp +++ b/sampling/include/var_opt_sketch_impl.hpp @@ -772,12 +772,11 @@ string var_opt_sketch::items_to_string(bool print_gap) const { template template void var_opt_sketch::update(O&& item, double weight, bool mark) { - if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) { - throw std::invalid_argument("Item weights must be nonnegative and finite. Found: " + if (weight <= 0.0 || std::isnan(weight) || std::isinf(weight)) { + throw std::invalid_argument("Item weights must be positive and finite. Found: " + std::to_string(weight)); - } else if (weight == 0.0) { - return; } + ++n_; if (r_ == 0) { @@ -1029,7 +1028,7 @@ void var_opt_sketch::transition_from_warmup() { total_wt_r_ = weights_[k_]; // only one item, known location weights_[k_] = -1.0; - // The two lightest items are ncessarily downsample-able to one item, + // The two lightest items are necessarily downsample-able to one item, // and are therefore a valid initial candidate set grow_candidate_set(weights_[k_ - 1] + total_wt_r_, 2); } @@ -1065,7 +1064,7 @@ void var_opt_sketch::restore_towards_leaves(uint32_t slot_in) { while (child <= last_slot) { uint32_t child2 = child + 1; // might also be invalid if ((child2 <= last_slot) && (weights_[child2] < weights_[child])) { - // siwtch to other child if it's both valid and smaller + // switch to other child if it's both valid and smaller child = child2; } @@ -1221,7 +1220,7 @@ uint32_t var_opt_sketch::choose_delete_slot(double wt_cands, uint32_t num_ if ((wt_cands * next_double_exclude_zero()) < ((num_cands - 1) * wt_m_cand)) { return pick_random_slot_in_r(); // keep item in M } else { - return h_; // indext of item in M + return h_; // index of item in M } } else { // general case diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp index 0e4f76d8..68d1ac4b 100644 --- a/sampling/include/var_opt_union.hpp +++ b/sampling/include/var_opt_union.hpp @@ -65,13 +65,17 @@ class var_opt_union { /** * Updates this union with the given sketch * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sk a sketch to add to the union */ void update(const var_opt_sketch& sk); - + /** * Updates this union with the given sketch * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sk a sketch to add to the union */ void update(var_opt_sketch&& sk); diff --git a/sampling/include/var_opt_union_impl.hpp b/sampling/include/var_opt_union_impl.hpp index 1d252245..d04be0cb 100644 --- a/sampling/include/var_opt_union_impl.hpp +++ b/sampling/include/var_opt_union_impl.hpp @@ -590,7 +590,7 @@ void var_opt_union::migrate_marked_items_by_decreasing_k(var_opt_sketch sk(100, resize_factor::X2); - REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument); - // should not throw but sketch should still be empty - sk.update("zero weight", 0.0); - REQUIRE(sk.is_empty()); + // Negative + REQUIRE_THROWS_AS(sk.update("invalid_weight", -1.0), std::invalid_argument); + // Zero + REQUIRE_THROWS_AS(sk.update("zero_weight", 0.0), std::invalid_argument); + // NaN + REQUIRE_THROWS_AS(sk.update("NaN_weight", std::numeric_limits::quiet_NaN()), std::invalid_argument); + // +Inf + REQUIRE_THROWS_AS(sk.update("positive_infinity", std::numeric_limits::infinity()), std::invalid_argument); + // -Inf + REQUIRE_THROWS_AS(sk.update("negative_infinity", -std::numeric_limits::infinity()), std::invalid_argument); } TEST_CASE("varopt sketch: corrupt serialized weight", "[var_opt_sketch]") { diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index d33084ed..095752e9 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -108,6 +108,7 @@ class tdigest { /** * Update this t-Digest with the given value + * NaN and infinity values are ignored * @param value to update the t-Digest with */ void update(T value); @@ -153,6 +154,7 @@ class tdigest { * Compute approximate normalized rank of the given value. * *

If the sketch is empty this throws std::runtime_error. + *

NaN value throw std::invalid_argument. * * @param value to be ranked * @return normalized rank (from 0 to 1 inclusive) @@ -257,16 +259,29 @@ class tdigest { */ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator()); + class const_iterator; + + /** + * Iterator pointing to the first centroid in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first centroid in the sketch + */ + const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end centroid in the sketch. + * It does not point to any centroid, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end centroid in the sketch + */ + const_iterator end() const; private: bool reverse_merge_; uint16_t k_; - uint16_t internal_k_; T min_; T max_; size_t centroids_capacity_; vector_centroid centroids_; uint64_t centroids_weight_; - size_t buffer_capacity_; vector_t buffer_; static const size_t BUFFER_MULTIPLIER = 4; @@ -297,6 +312,27 @@ class tdigest { static inline void check_split_points(const T* values, uint32_t size); }; +template +class tdigest::const_iterator { +public: + using iterator_category = std::input_iterator_tag; + using value_type = std::pair; + using difference_type = void; + using pointer = const return_value_holder; + using reference = const value_type; + + const_iterator& operator++(); + const_iterator& operator++(int); + bool operator==(const const_iterator& other) const; + bool operator!=(const const_iterator& other) const; + reference operator*() const; + pointer operator->() const; +private: + friend class tdigest; + uint32_t index_; + vector_centroid centroids_; + const_iterator(const tdigest& tdigest_, bool is_end); +}; } /* namespace datasketches */ #include "tdigest_impl.hpp" diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 6e3ae1a0..065e3ef1 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -23,12 +23,35 @@ #include #include #include +#include #include "common_defs.hpp" #include "memory_operations.hpp" namespace datasketches { +template +inline void check_not_nan(T value, const char* name) { + if (std::isnan(value)) { + throw std::invalid_argument(std::string(name) + " must not be NaN"); + } +} + +template +inline void check_not_infinite(T value, const char* name) { + if (std::isinf(value)) { + throw std::invalid_argument(std::string(name) + " must not be infinite"); + } +} + +template +inline void check_non_zero(T value, const char* name) { + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (value == 0) { + throw std::invalid_argument(std::string(name) + " must not be zero"); + } +} + template tdigest::tdigest(uint16_t k, const A& allocator): tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::infinity(), vector_centroid(allocator), 0, vector_t(allocator)) @@ -37,6 +60,7 @@ tdigest(false, k, std::numeric_limits::infinity(), -std::numeric_limits::i template void tdigest::update(T value) { if (std::isnan(value)) return; + if (std::isinf(value)) return; if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress(); buffer_.push_back(value); min_ = std::min(min_, value); @@ -193,7 +217,7 @@ T tdigest::get_quantile(double rank) const { } const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0; const double w2 = centroids_.back().get_weight() / 2.0 - w1; - return weighted_average(centroids_.back().get_weight(), w1, max_, w2); + return weighted_average(centroids_.back().get_mean(), w1, max_, w2); } template @@ -400,6 +424,8 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE); if (is_single_value) { const T value = read(is); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -408,12 +434,26 @@ tdigest tdigest::deserialize(std::istream& is, const A& allocator) { const T min = read(is); const T max = read(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -451,6 +491,8 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ensure_minimum_memory(end_ptr - ptr, sizeof(T)); T value; ptr += copy_from_mem(ptr, value); + check_not_nan(value, "single_value"); + check_not_infinite(value, "single_value"); return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator)); } @@ -465,12 +507,26 @@ tdigest tdigest::deserialize(const void* bytes, size_t size, const A ptr += copy_from_mem(ptr, min); T max; ptr += copy_from_mem(ptr, max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid)); vector_t buffer(num_buffered, 0, allocator); if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T)); uint64_t weight = 0; - for (const auto& c: centroids) weight += c.get_weight(); + for (const auto& c: centroids) { + check_not_nan(c.get_mean(), "centroid mean"); + check_not_infinite(c.get_mean(), "centroid mean"); + check_non_zero(c.get_weight(), "centroid weight"); + + weight += c.get_weight(); + } + for (const auto& value: buffer) { + check_not_nan(value, "buffered_value"); + check_not_infinite(value, "buffered_value"); + } return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer)); } @@ -487,13 +543,24 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc if (type == COMPAT_DOUBLE) { // compatibility with asBytes() const auto min = read_big_endian(is); const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); const auto num_centroids = read_big_endian(is); vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_double = read_big_endian(is); + check_not_nan(weight_double, "centroid weight"); + check_not_infinite(weight_double, "centroid weight"); + check_non_zero(weight_double, "centroid weight"); + const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_double); c = centroid(mean, weight); total_weight += weight; } @@ -502,6 +569,10 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc // COMPAT_FLOAT: compatibility with asSmallBytes() const auto min = read_big_endian(is); // reference implementation uses doubles for min and max const auto max = read_big_endian(is); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); const auto k = static_cast(read_big_endian(is)); // reference implementation stores capacities of the array of centroids and the buffer as shorts // they can be derived from k in the constructor @@ -510,8 +581,13 @@ tdigest tdigest::deserialize_compat(std::istream& is, const A& alloc vector_centroid centroids(num_centroids, centroid(0, 0), allocator); uint64_t total_weight = 0; for (auto& c: centroids) { - const W weight = static_cast(read_big_endian(is)); + const auto weight_float = read_big_endian(is); + check_not_nan(weight_float, "centroid weight"); + check_not_infinite(weight_float, "centroid weight"); const auto mean = read_big_endian(is); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); + const W weight = static_cast(weight_float); c = centroid(mean, weight); total_weight += weight; } @@ -538,6 +614,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); double k_double; ptr += copy_from_mem(ptr, k_double); const uint16_t k = static_cast(byteswap(k_double)); @@ -554,6 +634,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -567,6 +651,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, double max; ptr += copy_from_mem(ptr, max); max = byteswap(max); + check_not_nan(min, "min"); + check_not_infinite(min, "min"); + check_not_nan(max, "max"); + check_not_infinite(max, "max"); float k_float; ptr += copy_from_mem(ptr, k_float); const uint16_t k = static_cast(byteswap(k_float)); @@ -586,6 +674,10 @@ tdigest tdigest::deserialize_compat(const void* bytes, size_t size, float mean; ptr += copy_from_mem(ptr, mean); mean = byteswap(mean); + check_not_nan(weight, "centroid weight"); + check_not_infinite(weight, "centroid weight"); + check_not_nan(mean, "centroid mean"); + check_not_infinite(mean, "centroid mean"); c = centroid(mean, static_cast(weight)); total_weight += static_cast(weight); } @@ -627,6 +719,65 @@ void tdigest::check_split_points(const T* values, uint32_t size) { } } +template +typename tdigest::const_iterator tdigest::begin() const { + return tdigest::const_iterator(*this, false); +} + +template + typename tdigest::const_iterator tdigest::end() const { + return tdigest::const_iterator(*this, true); +} + +template +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): + centroids_(tdigest_.get_allocator()) +{ + // Create a copy of the tdigest to generate the centroids after processing the buffered values + tdigest tmp(tdigest_); + tmp.compress(); + centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); + + if (is_end) { + index_ = centroids_.size(); + } else { + index_ = 0; + } +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++() { + ++index_; + return *this; +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++(int) { + const_iterator tmp(*this); + operator++(); + return tmp; +} + +template +bool tdigest::const_iterator::operator==(const const_iterator& other) const { + return index_ == other.index_; +} + +template +bool tdigest::const_iterator::operator!=(const const_iterator& other) const { + return !operator==(other); +} + +template +auto tdigest::const_iterator::operator*() const -> reference { + return value_type(centroids_[index_].get_mean(), centroids_[index_].get_weight()); +} + +template +auto tdigest::const_iterator::operator->() const -> pointer { + return **this; +} + } /* namespace datasketches */ #endif // _TDIGEST_IMPL_HPP_ diff --git a/tdigest/test/CMakeLists.txt b/tdigest/test/CMakeLists.txt index 18bf3599..8dcfb4f0 100644 --- a/tdigest/test/CMakeLists.txt +++ b/tdigest/test/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(tdigest_test PRIVATE tdigest_test.cpp tdigest_custom_allocator_test.cpp + tdigest_iterator_test.cpp ) if (SERDE_COMPAT) diff --git a/tdigest/test/tdigest_iterator_test.cpp b/tdigest/test/tdigest_iterator_test.cpp new file mode 100644 index 00000000..e7c03205 --- /dev/null +++ b/tdigest/test/tdigest_iterator_test.cpp @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include "tdigest.hpp" + +namespace datasketches { + +TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") { + tdigest_double td(100); + + // Insert 10 distinct values + for (int i = 0; i < 10; i++) { + td.update(static_cast(i)); + } + + // Collect all centroids via iteration + std::map centroids; + for (const auto&& centroid : td) { + centroids[centroid.first] = centroid.second; + } + + // Should have collected all 10 distinct values + REQUIRE(centroids.size() == 10); + + // Verify each value was captured correctly + for (int i = 0; i < 10; i++) { + REQUIRE(centroids.count(static_cast(i)) == 1); + REQUIRE(centroids[static_cast(i)] == 1); + } +} + +TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") { + // This test reproduces the bug scenario found in ClickHouse + std::unique_ptr td(new tdigest_double(100)); + + // Insert distinct values + for (int i = 0; i < 10; i++) { + td->update(static_cast(i)); + } + + // Use explicit begin/end iterators + auto it = td->begin(); + auto end_it = td->end(); + + std::vector means; + std::vector weights; + + while (it != end_it) { + // Before the fix, accessing it->first would return garbage or same value repeatedly + double mean = it->first; + uint64_t weight = it->second; + means.push_back(mean); + weights.push_back(weight); + ++it; + } + + // Should have collected 10 centroids + REQUIRE(means.size() == 10); + REQUIRE(weights.size() == 10); + + // All means should be distinct (not all zeros or garbage) + std::set unique_means(means.begin(), means.end()); + REQUIRE(unique_means.size() == 10); + + // Verify all expected values are present + for (int i = 0; i < 10; i++) { + REQUIRE(unique_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") { + tdigest_double td(100); + + for (int i = 0; i < 5; i++) { + td.update(static_cast(i * 10)); + } + + std::vector> collected; + + // Test structured bindings + for (auto it = td.begin(); it != td.end(); ++it) { + const auto& centroid = *it; + collected.emplace_back(centroid.first, centroid.second); + } + + REQUIRE(collected.size() == 5); + + // Verify distinct values were collected + std::set means; + for (const auto& pair : collected) { + means.insert(pair.first); + REQUIRE(pair.second == 1); // Each value inserted once + } + + REQUIRE(means.size() == 5); + for (int i = 0; i < 5; i++) { + REQUIRE(means.count(static_cast(i * 10)) == 1); + } +} + +TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (int i = 1; i <= 10; i++) { + td.update(static_cast(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 + } + + // Access via operator-> + std::map centroids; + auto end_it = td.end(); + for (auto it = td.begin(); it != end_it; ++it) { + // operator-> should return valid values + centroids[it->first] = it->second; + } + + REQUIRE(centroids.size() == 10); + + // Verify the squared values + for (int i = 1; i <= 10; i++) { + double expected = static_cast(i * i); + REQUIRE(centroids.count(expected) == 1); + } +} + +TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (double d = 0.0; d < 10.0; d += 1.0) { + td.update(d); + } + + size_t count = 0; + std::set seen_means; + + // This pattern was working in simple tests but failing in optimized builds + for (const auto&& centroid : td) { + seen_means.insert(centroid.first); + count++; + } + + REQUIRE(count == 10); + REQUIRE(seen_means.size() == 10); + + // Verify all values from 0 to 9 are present + for (int i = 0; i < 10; i++) { + REQUIRE(seen_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") { + tdigest_double td(100); + + td.update(1.0); + td.update(2.0); + td.update(3.0); + + auto it = td.begin(); + + // Store the pair + auto pair1 = *it; + double mean1 = pair1.first; + + ++it; + + // Store another pair + auto pair2 = *it; + double mean2 = pair2.first; + + ++it; + + auto pair3 = *it; + double mean3 = pair3.first; + + // All three means should be distinct + REQUIRE(mean1 != mean2); + REQUIRE(mean2 != mean3); + REQUIRE(mean1 != mean3); + + // And they should match our input values + std::set means = {mean1, mean2, mean3}; + REQUIRE(means.count(1.0) == 1); + REQUIRE(means.count(2.0) == 1); + REQUIRE(means.count(3.0) == 1); +} + +TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") { + tdigest_double td(100); + + // Empty sketch should have begin() == end() + REQUIRE(td.begin() == td.end()); + + // Range-based for should not execute + size_t count = 0; + for (const auto&& centroid : td) { + (void)centroid; // Silence unused warning + count++; + } + REQUIRE(count == 0); +} + +TEST_CASE("tdigest iterator: single value", "[tdigest]") { + tdigest_double td(100); + td.update(42.0); + + size_t count = 0; + double captured_mean = 0.0; + uint64_t captured_weight = 0; + + for (const auto&& centroid : td) { + captured_mean = centroid.first; + captured_weight = centroid.second; + count++; + } + + REQUIRE(count == 1); + REQUIRE(captured_mean == 42.0); + REQUIRE(captured_weight == 1); +} + +TEST_CASE("tdigest iterator: large dataset", "[tdigest]") { + tdigest_double td(100); + + // Insert 1000 distinct values + for (int i = 0; i < 1000; i++) { + td.update(static_cast(i)); + } + + // Iterator should provide compressed centroids (not all 1000) + size_t centroid_count = 0; + std::set unique_means; + uint64_t total_weight = 0; + + for (const auto&& centroid : td) { + unique_means.insert(centroid.first); + total_weight += centroid.second; + centroid_count++; + } + + // Should have fewer centroids than input values due to compression + REQUIRE(centroid_count < 1000); + REQUIRE(centroid_count > 0); + + // Total weight should equal number of input values + REQUIRE(total_weight == 1000); + + // All means should be unique (no duplicates) + REQUIRE(unique_means.size() == centroid_count); +} + +} // namespace datasketches diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index fc3f5d1c..07d6185f 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -18,13 +18,36 @@ */ #include +#include #include #include +#include #include "tdigest.hpp" namespace datasketches { +namespace { +constexpr size_t header_size = 8; +constexpr size_t counts_size = 8; +constexpr size_t min_offset = header_size + counts_size; +constexpr size_t max_offset = min_offset + sizeof(double); +constexpr size_t first_centroid_mean_offset = min_offset + sizeof(double) * 2; +constexpr size_t first_centroid_weight_offset = first_centroid_mean_offset + sizeof(double); +constexpr size_t first_buffered_value_offset = first_centroid_mean_offset; +constexpr size_t single_value_offset = header_size; + +template +void write_bytes(std::vector& bytes, size_t offset, T value) { + std::memcpy(bytes.data() + offset, &value, sizeof(T)); +} + +template +void write_bytes(std::string& data, size_t offset, T value) { + std::memcpy(&data[offset], &value, sizeof(T)); +} +} // namespace + TEST_CASE("empty", "[tdigest]") { tdigest_double td(10); // std::cout << td.to_string(); @@ -453,4 +476,129 @@ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") REQUIRE(td.get_rank(n) == 1); } +TEST_CASE("iterate centroids", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; i++) { + td.update(i); + } + + auto centroid_count = 0; + uint64_t total_weight = 0; + for (const auto ¢roid: td) { + centroid_count++; + total_weight += centroid.second; + } + // Ensure that centroids are retrieved for a case where there is buffered values + REQUIRE(centroid_count == 10); + REQUIRE(td.get_total_weight() == total_weight); +} + +TEST_CASE("update rejects positive infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_max_value() == 2.0); +} + +TEST_CASE("update rejects negative infinity", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(-std::numeric_limits::infinity()); + REQUIRE(td.get_total_weight() == 2); + REQUIRE(td.get_min_value() == 1.0); +} + +TEST_CASE("deserialize bytes rejects NaN single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, single_value_offset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects infinity min", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + td.update(3.0); + auto bytes = td.serialize(); + std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, min_offset, std::numeric_limits::infinity()); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN centroid mean", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, first_centroid_mean_offset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN buffered value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects infinity single value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + auto bytes = td.serialize(); + write_bytes(bytes, single_value_offset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects NaN max", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, max_offset, std::numeric_limits::quiet_NaN()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects infinity max", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(); + write_bytes(bytes, max_offset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects infinity buffered value", "[tdigest]") { + tdigest_double td(100); + td.update(1.0); + td.update(2.0); + auto bytes = td.serialize(0, true); + write_bytes(bytes, first_buffered_value_offset, std::numeric_limits::infinity()); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize bytes rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + write_bytes(bytes, first_centroid_weight_offset, static_cast(0)); + REQUIRE_THROWS_AS(tdigest_double::deserialize(bytes.data(), bytes.size()), std::invalid_argument); +} + +TEST_CASE("deserialize stream rejects zero centroid weight", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; ++i) td.update(i); + auto bytes = td.serialize(); + std::string data(reinterpret_cast(bytes.data()), bytes.size()); + write_bytes(data, first_centroid_weight_offset, static_cast(0)); + std::istringstream is(data, std::ios::binary); + REQUIRE_THROWS_AS(tdigest_double::deserialize(is), std::invalid_argument); +} + } /* namespace datasketches */ diff --git a/theta/include/theta_set_difference_base_impl.hpp b/theta/include/theta_set_difference_base_impl.hpp index 02317816..40f94a2f 100644 --- a/theta/include/theta_set_difference_base_impl.hpp +++ b/theta/include/theta_set_difference_base_impl.hpp @@ -69,7 +69,7 @@ CS theta_set_difference_base::compute(FwdSketch&& a, const Sketch const uint64_t hash = EK()(entry); if (hash < theta) { auto result = table.find(hash); - if (!result.second) entries.push_back(conditional_forward(entry)); + if (!result.second) entries.emplace_back(conditional_forward(entry)); } else if (a.is_ordered()) { break; // early stop } diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp index ad6421e7..4aab4b92 100644 --- a/theta/include/theta_sketch.hpp +++ b/theta/include/theta_sketch.hpp @@ -609,9 +609,11 @@ class wrapped_compact_theta_sketch_alloc::const_iterator { uint32_t index_; uint64_t previous_; bool is_block_mode_; - uint8_t buf_i_; uint8_t offset_; uint64_t buffer_[8]; + + inline void unpack1(); + inline void unpack8(); }; } /* namespace datasketches */ diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp index b6a5d7ee..304ae64c 100644 --- a/theta/include/theta_sketch_impl.hpp +++ b/theta/include/theta_sketch_impl.hpp @@ -376,7 +376,7 @@ size_t compact_theta_sketch_alloc::get_compressed_serialized_size_bytes(uint8 template void compact_theta_sketch_alloc::serialize(std::ostream& os) const { - const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2; + const uint8_t preamble_longs = get_preamble_longs(false); write(os, preamble_longs); write(os, UNCOMPRESSED_SERIAL_VERSION); write(os, SKETCH_TYPE); @@ -459,7 +459,7 @@ uint8_t compact_theta_sketch_alloc::compute_entry_bits() const { template void compact_theta_sketch_alloc::serialize_version_4(std::ostream& os) const { - const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1; + const uint8_t preamble_longs = get_preamble_longs(true); const uint8_t entry_bits = compute_entry_bits(); const uint8_t num_entries_bytes = get_num_entries_bytes(); @@ -817,23 +817,15 @@ num_entries_(num_entries), index_(index), previous_(0), is_block_mode_(num_entries_ >= 8), -buf_i_(0), offset_(0) { if (entry_bits == 64) { // no compression ptr_ = reinterpret_cast(ptr) + index; } else if (index < num_entries) { if (is_block_mode_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + unpack8(); } else { - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } @@ -844,35 +836,41 @@ auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++() ptr_ = reinterpret_cast(ptr_) + 1; return *this; } - ++index_; - if (index_ < num_entries_) { + if (++index_ < num_entries_) { if (is_block_mode_) { - ++buf_i_; - if (buf_i_ == 8) { - buf_i_ = 0; - if (index_ + 8 < num_entries_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + if ((index_ & 7) == 0) { + if (num_entries_ - index_ >= 8) { + unpack8(); } else { is_block_mode_ = false; - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } else { - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } return *this; } +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack1() { + const uint32_t i = index_ & 7; + offset_ = unpack_bits(buffer_[i], entry_bits_, reinterpret_cast(ptr_), offset_); + buffer_[i] += previous_; + previous_ = buffer_[i]; +} + +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack8() { + unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); + ptr_ = reinterpret_cast(ptr_) + entry_bits_; + for (int i = 0; i < 8; ++i) { + buffer_[i] += previous_; + previous_ = buffer_[i]; + } +} + template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++(int) -> const_iterator { const_iterator tmp(*this); @@ -895,13 +893,13 @@ bool wrapped_compact_theta_sketch_alloc::const_iterator::operator==(c template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator*() const -> reference { if (entry_bits_ == 64) return *reinterpret_cast(ptr_); - return buffer_[buf_i_]; + return buffer_[index_ & 7]; } template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator->() const -> pointer { if (entry_bits_ == 64) return reinterpret_cast(ptr_); - return buffer_ + buf_i_; + return buffer_ + (index_ & 7); } } /* namespace datasketches */ diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp index b39f8996..0094f9fd 100644 --- a/theta/test/bit_packing_test.cpp +++ b/theta/test/bit_packing_test.cpp @@ -80,4 +80,54 @@ TEST_CASE("pack unpack blocks") { } } +TEST_CASE("pack bits unpack blocks") { + uint64_t value = 0; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + uint8_t offset = 0; + uint8_t* ptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = pack_bits(input[i], bits, ptr, offset); + } + std::vector output(8, 0); + unpack_bits_block8(output.data(), bytes.data(), bits); + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + +TEST_CASE("pack blocks unpack bits") { + uint64_t value = 111; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + pack_bits_block8(input.data(), bytes.data(), bits); + std::vector output(8, 0); + uint8_t offset = 0; + const uint8_t* cptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = unpack_bits(output[i], bits, cptr, offset); + } + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + } /* namespace datasketches */ diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt index 4b0a48c7..54df11ee 100644 --- a/tuple/CMakeLists.txt +++ b/tuple/CMakeLists.txt @@ -54,4 +54,6 @@ install(FILES include/array_tuple_intersection_impl.hpp include/array_tuple_a_not_b.hpp include/array_tuple_a_not_b_impl.hpp + include/array_of_strings_sketch.hpp + include/array_of_strings_sketch_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp new file mode 100644 index 00000000..296c0a87 --- /dev/null +++ b/tuple/include/array_of_strings_sketch.hpp @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_HPP_ + +#include +#include + +#include "array_tuple_sketch.hpp" +#include "xxhash64.h" + +namespace datasketches { + +using array_of_strings = array; + +// default update policy for an array of strings +class default_array_of_strings_update_policy { +public: + default_array_of_strings_update_policy() = default; + + array_of_strings create() const; + + void update(array_of_strings& array, const array_of_strings& input) const; + + void update(array_of_strings& array, const array_of_strings* input) const; +}; + +/** + * Serializer/deserializer for an array of strings. + * + * Requirements: + * - Array size must be <= 127. + * + * This serde does not perform UTF-8 validation. Callers must ensure strings + * are valid UTF-8 before serialization to guarantee interoperability with + * Java, Go, and Rust implementations. + */ +template> +struct default_array_of_strings_serde { + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); + + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; + size_t size_of_item(const array_of_strings& item) const; + +private: + summary_allocator summary_allocator_; + static void check_num_nodes(uint8_t num_nodes); + static uint32_t compute_total_bytes(const array_of_strings& item); +}; + +/** + * Hashes an array of strings using ArrayOfStrings-compatible hashing. + */ +uint64_t hash_array_of_strings_key(const array_of_strings& key); + +/** + * Extended class of compact_tuple_sketch for array of strings. + * + * Requirements: + * - Array size must be <= 127. + * + * UTF-8 compatibility: + * Serialized sketches are intended to be language and platform independent. + * Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings. + * This C++ implementation does not validate UTF-8; it is the caller's + * responsibility to ensure all strings are valid UTF-8 before calling update(). + * Non-UTF-8 strings may serialize successfully but will fail to deserialize + * in other language implementations. + */ +template> +class compact_array_of_strings_tuple_sketch: + public compact_tuple_sketch { +public: + using Base = compact_tuple_sketch; + using vector_bytes = typename Base::vector_bytes; + using Base::serialize; + + /** + * Copy constructor. + * Constructs a compact sketch from another sketch (update or compact) + * @param other sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ + template + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); + + /** + * This method deserializes a sketch from a given stream. + * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + + /** + * This method deserializes a sketch from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + +private: + explicit compact_array_of_strings_tuple_sketch(Base&& base); +}; + +/** + * Convenience alias for update_tuple_sketch for array of strings + */ +template, + typename Policy = default_array_of_strings_update_policy> +using update_array_of_strings_tuple_sketch = update_tuple_sketch< + array_of_strings, + array_of_strings, + Policy, + Allocator +>; + +/** + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered). + * @param sketch input sketch + * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ +template, typename Policy = default_array_of_strings_update_policy> +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); + +} /* namespace datasketches */ + +#include "array_of_strings_sketch_impl.hpp" + +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp new file mode 100644 index 00000000..400df477 --- /dev/null +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ + +#include + +#include "common_defs.hpp" + +namespace datasketches { + +inline array_of_strings default_array_of_strings_update_policy::create() const { + return array_of_strings(0, ""); +} + +inline void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings& input +) const { + const auto length = static_cast(input.size()); + array = array_of_strings(static_cast(length), ""); + for (size_t i = 0; i < length; ++i) array[i] = input[i]; +} + +inline void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings* input +) const { + if (input == nullptr) { + array = array_of_strings(0, ""); + return; + } + const auto length = static_cast(input->size()); + array = array_of_strings(static_cast(length), ""); + for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; +} + +inline uint64_t hash_array_of_strings_key(const array_of_strings& key) { + // Matches Java Util.PRIME for ArrayOfStrings key hashing. + static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; + XXHash64 hasher(STRING_ARR_HASH_SEED); + const auto size = static_cast(key.size()); + for (size_t i = 0; i < size; ++i) { + const auto& entry = key[i]; + hasher.add(entry.data(), entry.size()); + if (i + 1 < size) hasher.add(",", 1); + } + return hasher.hash(); +} + +template +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered +) { + return compact_array_of_strings_tuple_sketch(sketch, ordered); +} + +template +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + const Sketch& sketch, bool ordered +): Base(sketch, ordered) {} + +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + Base&& base +): Base(std::move(base)) {} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + auto base = Base::deserialize(is, seed, sd, allocator); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + auto base = Base::deserialize(bytes, size, seed, sd, allocator); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): + summary_allocator_(allocator) {} + +template +void default_array_of_strings_serde::serialize( + std::ostream& os, const array_of_strings* items, unsigned num +) const { + unsigned i = 0; + try { + for (; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } + } + } catch (std::runtime_error& e) { + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings stream write failed at item " + std::to_string(i)); + } +} + +template +void default_array_of_strings_serde::deserialize( + std::istream& is, array_of_strings* items, unsigned num +) const { + unsigned i = 0; + bool failure = false; + try { + for (; i < num; ++i) { + read(is); // total_bytes + if (!is) { failure = true; break; } + const uint8_t num_nodes = read(is); + if (!is) { failure = true; break; } + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, ""); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + if (!is) { failure = true; break; } + std::string value(length, '\0'); + if (length != 0) { + is.read(&value[0], length); + if (!is) { failure = true; break; } + } + array[j] = std::move(value); + } + if (failure) break; + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); + } + } catch (std::exception& e) { + summary_allocator alloc(summary_allocator_); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i)); + } + if (failure) { + summary_allocator alloc(summary_allocator_); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + throw std::runtime_error("array_of_strings stream read failed at item " + std::to_string(i)); + } +} + +template +size_t default_array_of_strings_serde::serialize( + void* ptr, size_t capacity, const array_of_strings* items, unsigned num +) const { + uint8_t* ptr8 = static_cast(ptr); + size_t bytes_written = 0; + + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + check_memory_size(bytes_written + total_bytes, capacity); + bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); + bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = static_cast(data[j].size()); + + bytes_written += copy_to_mem(length, ptr8 + bytes_written); + bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length); + } + } + return bytes_written; +} + +template +size_t default_array_of_strings_serde::deserialize( + const void* ptr, size_t capacity, array_of_strings* items, unsigned num +) const { + const uint8_t* ptr8 = static_cast(ptr); + size_t bytes_read = 0; + + unsigned i = 0; + + try { + for (; i < num; ++i) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); + const size_t item_start = bytes_read; + uint32_t total_bytes; + bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); + check_memory_size(item_start + total_bytes, capacity); + + check_memory_size(bytes_read + sizeof(uint8_t), capacity); + uint8_t num_nodes; + bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); + check_num_nodes(num_nodes); + + array_of_strings array(num_nodes, ""); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); + uint32_t length; + bytes_read += copy_from_mem(ptr8 + bytes_read, length); + + check_memory_size(bytes_read + length, capacity); + std::string value(length, '\0'); + if (length != 0) { + bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + } + array[j] = std::move(value); + } + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); + } + } catch (std::exception& e) { + summary_allocator alloc(summary_allocator_); + for (unsigned j = 0; j < i; ++j) { + std::allocator_traits::destroy(alloc, &items[j]); + } + if (std::string(e.what()).find("size exceeds 127") != std::string::npos) throw; + throw std::runtime_error("array_of_strings bytes read failed at item " + std::to_string(i)); + } + return bytes_read; +} + +template +size_t default_array_of_strings_serde::size_of_item(const array_of_strings& item) const { + return compute_total_bytes(item); +} + +template +void default_array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { + if (num_nodes > 127) { + throw std::runtime_error("array_of_strings size exceeds 127"); + } +} + +template +uint32_t default_array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { + const auto count = item.size(); + check_num_nodes(static_cast(count)); + size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); + const std::string* data = item.data(); + for (uint32_t j = 0; j < count; ++j) { + total += data[j].size(); + } + return static_cast(total); +} + +} /* namespace datasketches */ + +#endif diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 547b240c..d331f8b1 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -22,6 +22,8 @@ #include #include +#include +#include #include "serde.hpp" #include "tuple_sketch.hpp" @@ -34,17 +36,18 @@ class array { public: using value_type = T; using allocator_type = Allocator; + using alloc_traits = std::allocator_traits; - explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()): + explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - std::fill(array_, array_ + size_, value); + init_values(value, std::is_trivially_copyable()); } array(const array& other): allocator_(other.allocator_), size_(other.size_), array_(allocator_.allocate(size_)) { - std::copy(other.array_, other.array_ + size_, array_); + copy_from(other, std::is_trivially_copyable()); } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -52,9 +55,13 @@ class array { array_(other.array_) { other.array_ = nullptr; + other.size_ = 0; } ~array() { - if (array_ != nullptr) allocator_.deallocate(array_, size_); + if (array_ != nullptr) { + destroy_values(std::is_trivially_destructible()); + allocator_.deallocate(array_, size_); + } } array& operator=(const array& other) { array copy(other); @@ -75,10 +82,34 @@ class array { T* data() { return array_; } const T* data() const { return array_; } bool operator==(const array& other) const { + if (size_ != other.size_) return false; for (uint8_t i = 0; i < size_; ++i) if (array_[i] != other.array_[i]) return false; return true; } private: + void init_values(const T& value, std::true_type) { + std::fill(array_, array_ + size_, value); + } + void init_values(const T& value, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } + } + void copy_from(const array& other, std::true_type) { + std::copy(other.array_, other.array_ + size_, array_); + } + void copy_from(const array& other, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } + } + void destroy_values(std::true_type) {} + void destroy_values(std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + } + Allocator allocator_; uint8_t size_; T* array_; diff --git a/tuple/include/array_tuple_sketch_impl.hpp b/tuple/include/array_tuple_sketch_impl.hpp index 42b39216..ad0c999c 100644 --- a/tuple/include/array_tuple_sketch_impl.hpp +++ b/tuple/include/array_tuple_sketch_impl.hpp @@ -166,7 +166,7 @@ compact_array_tuple_sketch compact_array_tuple_sketch compact_array_tuple_sketch { /** * Update this sketch with a given string. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key string to update the sketch with * @param value to update the sketch with */ @@ -261,6 +269,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 64-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key uint64_t to update the sketch with * @param value to update the sketch with */ @@ -269,6 +280,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 64-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key int64_t to update the sketch with * @param value to update the sketch with */ @@ -277,6 +291,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint32_t to update the sketch with * @param value to update the sketch with @@ -286,6 +303,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int32_t to update the sketch with * @param value to update the sketch with @@ -295,6 +315,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint16_t to update the sketch with * @param value to update the sketch with @@ -304,6 +327,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int16_t to update the sketch with * @param value to update the sketch with @@ -313,6 +339,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint8_t to update the sketch with * @param value to update the sketch with @@ -322,6 +351,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int8_t to update the sketch with * @param value to update the sketch with @@ -331,6 +363,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given double-precision floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key double to update the sketch with * @param value to update the sketch with @@ -340,6 +375,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key float to update the sketch with * @param value to update the sketch with @@ -357,6 +395,9 @@ class update_tuple_sketch: public tuple_sketch { * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key pointer to the data * @param length of the data in bytes * @param value to update the sketch with diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp index e5bf8340..b3f2d0bd 100644 --- a/tuple/include/tuple_sketch_impl.hpp +++ b/tuple/include/tuple_sketch_impl.hpp @@ -315,7 +315,7 @@ entries_(other.get_allocator()) { entries_.reserve(other.get_num_retained()); for (uint64_t hash: other) { - entries_.push_back(Entry(hash, summary)); + entries_.emplace_back(hash, summary); } if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end(), comparator()); } @@ -518,7 +518,7 @@ compact_tuple_sketch compact_tuple_sketch::deserialize(std::istream& for (size_t i = 0; i < num_entries; ++i) { const auto key = read(is); sd.deserialize(is, summary.get(), 1); - entries.push_back(Entry(key, std::move(*summary))); + entries.emplace_back(key, std::move(*summary)); (*summary).~S(); } } @@ -585,7 +585,7 @@ compact_tuple_sketch compact_tuple_sketch::deserialize(const void* b uint64_t key; ptr += copy_from_mem(ptr, key); ptr += sd.deserialize(ptr, base + size - ptr, summary.get(), 1); - entries.push_back(Entry(key, std::move(*summary))); + entries.emplace_back(key, std::move(*summary)); (*summary).~S(); } } diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..3d7ccca3 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -44,6 +44,7 @@ target_sources(tuple_test tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp array_of_doubles_sketch_test.cpp + array_of_strings_sketch_test.cpp engagement_test.cpp ) @@ -52,6 +53,7 @@ target_sources(tuple_test PRIVATE aod_sketch_deserialize_from_java_test.cpp tuple_sketch_deserialize_from_java_test.cpp + aos_sketch_deserialize_from_java_test.cpp ) endif() @@ -60,5 +62,6 @@ target_sources(tuple_test PRIVATE aod_sketch_serialize_for_java.cpp tuple_sketch_serialize_for_java.cpp + aos_sketch_serialize_for_java.cpp ) endif() diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp new file mode 100644 index 00000000..af37d6c2 --- /dev/null +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -0,0 +1,283 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + // assume the binary sketches for this test have been generated by datasketches-java code + // in the subdirectory called "java" in the root directory of this project + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + + static std::vector read_binary_file(const std::string& path) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + is.seekg(0, std::ios::end); + const auto size = static_cast(is.tellg()); + is.seekg(0, std::ios::beg); + std::vector bytes(size); + if (size != 0) { + is.read(reinterpret_cast(bytes.data()), size); + } + return bytes; + } + + TEST_CASE("aos sketch one value", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch three values", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + } + } + + TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + } + + TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch unicode strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_unicode_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } + + TEST_CASE("aos sketch empty strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_empty_strings_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } +} diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp new file mode 100644 index 00000000..c6eb0dfc --- /dev/null +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using aos_sketch = update_array_of_strings_tuple_sketch<>; +using array_of_strings = array; + +static array_of_strings make_array(std::initializer_list items) { + array_of_strings array(static_cast(items.size()), ""); + size_t i = 0; + for (const auto& item: items) { + array[static_cast(i)] = item; + ++i; + } + return array; +} + +TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = "c" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { + auto sketch = aos_sketch::builder() + .set_lg_k(12) + .set_resize_factor(resize_factor::X8) + .set_p(0.01f) + .build(); + array_of_strings key(1, ""); + key[0] = "key1"; + array_of_strings value(1, ""); + value[0] = "value1"; + sketch.update(hash_array_of_strings_key(key), value); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})), + make_array({u8"밸류", u8"값"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})), + make_array({u8"📦", u8"🎁"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})), + make_array({u8"ценить1", u8"ценить2"}) + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"})); + sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""})); + sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""})); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +} /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp new file mode 100644 index 00000000..5507c071 --- /dev/null +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +TEST_CASE("aos update policy", "[tuple_sketch]") { + default_array_of_strings_update_policy policy; + + SECTION("create empty") { + auto values = policy.create(); + REQUIRE(values.size() == 0); + } + + SECTION("replace array") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "alpha"; + input[1] = "beta"; + policy.update(values, input); + REQUIRE(values.size() == 2); + REQUIRE(values[0] == "alpha"); + REQUIRE(values[1] == "beta"); + input[0] = "changed"; + REQUIRE(values[0] == "alpha"); + + array_of_strings input2(1, "", std::allocator()); + input2[0] = "gamma"; + policy.update(values, input2); + REQUIRE(values.size() == 1); + REQUIRE(values[0] == "gamma"); + } + + SECTION("nullptr clears") { + array_of_strings values(2, "", std::allocator()); + values[0] = "one"; + values[1] = "two"; + + policy.update(values, nullptr); + REQUIRE(values.size() == 0); + } + + SECTION("pointer input copies") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "first"; + input[1] = "second"; + policy.update(values, &input); + REQUIRE(values.size() == 2); + REQUIRE(values[1] == "second"); + input[1] = "changed"; + REQUIRE(values[1] == "second"); + } +} + +TEST_CASE("aos sketch update", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto* entry: entries) array[i++] = entry; + return array; + }; + + SECTION("same key replaces summary") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"first"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"second", "third"}) + ); + + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 2); + REQUIRE(it->second[0] == "second"); + REQUIRE(it->second[1] == "third"); + } + + SECTION("distinct keys retain multiple entries") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"a", "bc"})), + make_array({"one"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"ab", "c"})), + make_array({"two"}) + ); + + REQUIRE(sketch.get_num_retained() == 2); + + bool saw_one = false; + bool saw_two = false; + for (const auto& entry: sketch) { + REQUIRE(entry.second.size() == 1); + if (entry.second[0] == "one") saw_one = true; + if (entry.second[0] == "two") saw_two = true; + } + REQUIRE(saw_one); + REQUIRE(saw_two); + } + + SECTION("empty key") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"})); + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 1); + REQUIRE(it->second[0] == "value"); + } +} + +TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto& entry: entries) array[i++] = entry; + return array; + }; + + auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + typedef std::pair entry_type; + std::vector entries; + for (const auto& entry: sketch) entries.push_back(entry); + struct entry_less { + bool operator()(const entry_type& lhs, const entry_type& rhs) const { + return lhs.first < rhs.first; + } + }; + std::sort(entries.begin(), entries.end(), entry_less()); + return entries; + }; + + auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + compact_sketch.serialize(ss, default_array_of_strings_serde<>()); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize( + ss, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>()); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { + &deserialized_stream, + &deserialized_bytes + }; + for (int list_index = 0; list_index < 2; ++list_index) { + const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index]; + REQUIRE(compact_sketch.is_empty() == deserialized->is_empty()); + REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode()); + REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered()); + REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained()); + REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); + REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10)); + REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10)); + REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10)); + + auto original_entries = collect_entries(compact_sketch); + auto round_trip_entries = collect_entries(*deserialized); + REQUIRE(original_entries.size() == round_trip_entries.size()); + for (size_t i = 0; i < original_entries.size(); ++i) { + REQUIRE(original_entries[i].first == round_trip_entries[i].first); + REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size()); + for (size_t j = 0; j < original_entries[i].second.size(); ++j) { + REQUIRE(original_entries[i].second[static_cast(j)] == + round_trip_entries[i].second[static_cast(j)]); + } + } + } + }; + + auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = compact_array_of_strings_sketch(sketch, true); + auto unordered = compact_array_of_strings_sketch(sketch, false); + check_round_trip(ordered); + check_round_trip(unordered); + }; + + SECTION("empty sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + run_tests(sketch); + } + + SECTION("single entry sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"})); + run_tests(sketch); + } + + SECTION("multiple entries exact mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); + for (int i = 0; i < 50; ++i) { + sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + std::to_string(i), "extra"}) + ); + } + REQUIRE_FALSE(sketch.is_estimation_mode()); + run_tests(sketch); + } + + SECTION("multiple entries estimation mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + for (int i = 0; i < 10000; ++i) { + sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + std::to_string(i)}) + ); + } + REQUIRE(sketch.is_estimation_mode()); + run_tests(sketch); + } +} + +TEST_CASE("aos serde validation", "[tuple_sketch]") { + default_array_of_strings_serde<> serde; + + SECTION("too many nodes rejected") { + array_of_strings array(128, "", std::allocator()); + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("size exceeds 127") + ); + } +} + +} /* namespace datasketches */ diff --git a/version.cfg.in b/version.cfg.in index ad2e7b11..5ffad51a 100644 --- a/version.cfg.in +++ b/version.cfg.in @@ -1 +1 @@ -5.2.@DT@.@HHMM@ +5.3.@DT@.@HHMM@