diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..7bb4cf765
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# Dependabot configuration; option reference: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "github-actions"  # update the actions referenced by the workflows
+    directory: "/"
+    schedule:
+      interval: "weekly"  # look for new versions once a week
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
new file mode 100644
index 000000000..69c553908
--- /dev/null
+++ b/.github/workflows/coverage.yml
@@ -0,0 +1,54 @@
+name: Coverage Report
+
+on:
+  workflow_run:  # triggered whenever a run of the "Tests" workflow completes
+    workflows: [Tests]
+    types: [completed]
+
+permissions:
+  contents: read
+
+jobs:
+  coverage:  # the job-level condition below skips it unless the Tests run succeeded
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    steps:
+      - name: Checkout  # NOTE(review): under workflow_run this may check out the default branch rather than the tested commit -- confirm
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
+        with:
+          python-version: '3.x'
+          architecture: x64
+
+      - name: Install Dependencies  # lcov gathers the gcov data; lcov_cobertura converts it to XML
+        run: |-
+          sudo apt-get update; sudo apt-get install -y lcov
+          python -m pip install lcov_cobertura
+
+      - name: Run CMake  # instrumented build: the coverage flags are passed for both C and C++
+        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++-13 -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_FLAGS="-fprofile-arcs -ftest-coverage" -DCMAKE_C_FLAGS="-fprofile-arcs -ftest-coverage" -DBUILD_TESTING_FULL=ON -DBUILD_CFP=ON -DZFP_WITH_OPENMP=ON
+
+      - name: Build
+        run: cmake --build ${{github.workspace}}/build
+
+      - name: Run Tests
+        working-directory: ${{github.workspace}}/build
+        run: ctest -j 8
+
+      - name: Generate Coverage Report  # capture, drop test and system files, then convert to Cobertura XML
+        working-directory: ${{github.workspace}}/build
+        # for now, suppress mismatch errors (likely gcc/gcov bug)
+        run: |-
+          lcov -c --directory ${{github.workspace}}/build --output-file coverage.info --ignore-errors mismatch
+          lcov --remove coverage.info '${{github.workspace}}/build/tests/*' '${{github.workspace}}/tests/*' '/usr/include/*' -o coverage.info
+          lcov_cobertura ${{github.workspace}}/build/coverage.info -d -o ${{github.workspace}}/build/coverage.xml
+
+      - name: Upload Report to Codecov
+        uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2 # v5
+        with:
+          files: ${{github.workspace}}/build/coverage.xml
+          token: ${{secrets.CODECOV_TOKEN}}
+          fail_ci_if_error: true
+          verbose: true
diff --git a/.github/workflows/debug-linux.yml b/.github/workflows/debug-linux.yml
new file mode 100644
index 000000000..de77f45c7
--- /dev/null
+++ b/.github/workflows/debug-linux.yml
@@ -0,0 +1,32 @@
+name: Debug (Linux)
+
+on: [workflow_dispatch]  # manual trigger only
+
+permissions:
+  contents: read
+
+jobs:
+  debug:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Zfp
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
+        with:
+          python-version: "3.x"
+          architecture: "x64"
+
+      - name: Install Zfpy Dependencies  # Python packages installed one at a time with pip
+        run: |
+          python -m pip install cython
+          python -m pip install oldest-supported-numpy
+          python -m pip install setuptools
+
+      - name: Install OpenMP  # LLVM OpenMP runtime and headers from apt
+        run: |
+          sudo apt-get update; sudo apt-get install -y libomp5 libomp-dev
+
+      - name: Setup Tmate Session  # final step; runs the action-tmate action on the prepared runner
+        uses: mxschmitt/action-tmate@c0afd6f790e3a5564914980036ebf83216678101 # v3
diff --git a/.github/workflows/debug-macos.yml b/.github/workflows/debug-macos.yml
new file mode 100644
index 000000000..2197f68d2
--- /dev/null
+++ b/.github/workflows/debug-macos.yml
@@ -0,0 +1,32 @@
+name: Debug (MacOS)
+
+on: [workflow_dispatch]  # manual trigger only
+
+permissions:
+  contents: read
+
+jobs:
+  debug:
+    runs-on: macos-latest
+    steps:
+      - name: Checkout Zfp
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
+        with:
+          python-version: "3.x"
+          architecture: "x64"
+
+      - name: Install Zfpy Dependencies  # Python packages installed one at a time with pip
+        run: |
+          python -m pip install cython
+          python -m pip install oldest-supported-numpy
+          python -m pip install setuptools
+
+      - name: Install OpenMP  # libomp from Homebrew
+        run: |
+          brew install libomp
+
+      - name: Setup Tmate Session  # final step; runs the action-tmate action on the prepared runner
+        uses: mxschmitt/action-tmate@c0afd6f790e3a5564914980036ebf83216678101 # v3
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 000000000..1494c0e35
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,101 @@
+name: Tests
+
+on: [push, pull_request]
+
+permissions:
+  contents: read
+
+env:
+  BUILD_TYPE: Release  # used for the CMake configure, build, and ctest steps below
+
+jobs:
+  build:
+    runs-on: ${{matrix.os}}
+    strategy:
+      fail-fast: false  # keep running the other configurations when one fails
+      matrix:
+        include:
+          - os: ubuntu-latest
+            cxx_compiler: g++-14
+            c_compiler: gcc-14
+            omp: "ON"  # quoted so YAML never reads ON/OFF as a boolean
+            target: all
+
+          - os: ubuntu-latest
+            cxx_compiler: clang++
+            c_compiler: clang
+            omp: "ON"
+            target: all
+
+          - os: ubuntu-24.04-arm
+            cxx_compiler: g++-14
+            c_compiler: gcc-14
+            omp: "ON"
+            target: all
+            architecture: arm64  # forwarded to setup-python; entries without it use x64
+
+          - os: ubuntu-24.04-arm
+            cxx_compiler: clang++
+            c_compiler: clang
+            omp: "ON"
+            target: all
+            architecture: arm64
+
+# macos-latest = macos-15 is currently incompatible with Homebrew gcc;
+# see https://github.com/actions/runner-images/issues/12745.
+# For now, use macos-14.
+
+#         - os: macos-latest
+          - os: macos-14
+            cxx_compiler: g++-13
+            c_compiler: gcc-13
+            omp: "ON"
+            target: all
+
+          - os: macos-latest
+            cxx_compiler: clang++
+            c_compiler: clang
+            omp: "OFF"
+            target: all
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
+        with:
+          python-version: '3.x'
+          architecture: ${{ matrix.architecture || 'x64' }}
+
+      - name: Install zfpy dependencies
+        run: |
+          python -m pip install cython
+          python -m pip install oldest-supported-numpy
+          python -m pip install setuptools
+          python -m pip install packaging
+
+      - name: Setup OpenMP (Linux)  # only the clang configurations install libomp
+        if: ${{(matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-24.04-arm') && matrix.cxx_compiler == 'clang++'}}
+        run: sudo apt-get update; sudo apt-get install -y libomp5 libomp-dev
+
+      - name: Setup OpenMP (MacOS)  # also exports CC/CXX/LDFLAGS/CPPFLAGS pointing at Homebrew LLVM
+        if: ${{(matrix.os == 'macos-latest' || matrix.os == 'macos-14')}}
+        run: |
+          brew install libomp
+          echo "CC=$(brew --prefix llvm)/bin/clang" >> "$GITHUB_ENV"
+          echo "CXX=$(brew --prefix llvm)/bin/clang++" >> "$GITHUB_ENV"
+          echo "LDFLAGS=\"-L$(brew --prefix llvm)/lib\"" >> "$GITHUB_ENV"
+          echo "CPPFLAGS=\"-I$(brew --prefix llvm)/include\"" >> "$GITHUB_ENV"
+
+      - name: Run CMake  # Python paths come from the stdlib sysconfig module (distutils was removed in Python 3.12)
+        id: cmake
+        run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_CXX_COMPILER=${{matrix.cxx_compiler}} -DCMAKE_C_COMPILER=${{matrix.c_compiler}} -DBUILD_TESTING_FULL=ON -DZFP_WITH_OPENMP=${{matrix.omp}} -DBUILD_ZFPY=ON -DPYTHON_INCLUDE_DIR=$(python -c "import sysconfig; print(sysconfig.get_path('include'))") -DPYTHON_LIBRARY=$(python -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
+
+      - name: Build
+        id: build
+        run: cmake --build ${{github.workspace}}/build --target ${{matrix.target}} --config ${{env.BUILD_TYPE}}
+
+      - name: Run Tests
+        id: test
+        working-directory: ${{github.workspace}}/build
+        run: ctest -C ${{env.BUILD_TYPE}} -VV
diff --git a/.gitignore b/.gitignore
index 2f388bbf7..66f13148d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ lib
 dist
 wheelhouse
 zfpy.egg-info
-
+modules
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 000000000..6f2cf5dd9
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,35 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-24.04
+  tools:
+    python: "3.12"
+    # You can also specify other tool versions:
+    # nodejs: "20"
+    # rust: "1.70"
+    # golang: "1.20"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+  # builder: "dirhtml"
+  # Fail on all warnings to avoid broken references
+  # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+formats:
+  - pdf
+#   - epub
+
+# Optional but recommended: declare the Python requirements needed
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: docs/requirements.txt
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 19bbab21d..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,280 +0,0 @@
-language:
-  - generic
-
-matrix:
-  include:
-    - os: osx
-      osx_image: xcode8.3
-      env:
-        - MATRIX_EVAL="CC=clang && CXX=clang++ && PYTHON_VERSION=2.7"
-
-    - os: osx
-      osx_image: xcode8.3
-      env:
-        - MATRIX_EVAL="CC=clang && CXX=clang++ && PYTHON_VERSION=3.5"
-
-    - os: osx
-      osx_image: xcode8.3
-      env:
-        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9 && PYTHON_VERSION=2.7"
-
-    - os: osx
-      osx_image: xcode8.3
-      env:
-        - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9 && PYTHON_VERSION=3.5"
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' COVERAGE='ON'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: clang-3.6
-    #  addons: &clang36
-    #    apt:
-    #      sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty']
-    #      packages:
-    #        - clang-3.6
-    #        - g++-7
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='clang-3.6' CXX='clang++-3.6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: clang-4.0
-    #  before_install:
-    #    - export LD_LIBRARY_PATH=/usr/local/clang/lib:$LD_LIBRARY_PATH
-    #  addons: &clang40
-    #    apt:
-    #      sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-4.0']
-    #      packages:
-    #        - clang-4.0
-    #        - g++-7
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='clang-4.0' CXX='clang++-4.0' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-4.4
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-4.4
-    #        - g++-4.4
-    #        - gfortran-4.4
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-4.4' CXX='g++-4.4' FC='gfortran-4.4' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-4.7
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-4.7
-    #        - g++-4.7
-    #        - gfortran-4.7
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-4.7' CXX='g++-4.7' FC='gfortran-4.7' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-4.8
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-4.8
-    #        - g++-4.8
-    #        - gfortran-4.8
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-4.8' CXX='g++-4.8' FC='gfortran-4.8' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-4.9
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-4.9
-    #        - g++-4.9
-    #        - gfortran-4.9
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-4.9' CXX='g++-4.9' FC='gfortran-4.9' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' C_STANDARD='90'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' C_STANDARD='11'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5-dev
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' CXX_STANDARD='11'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' CXX_STANDARD='14'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-6
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-6
-    #        - g++-6
-    #        - gfortran-6
-    #        - libpython3.5
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2008' PYTHON_VERSION='3.5'
-
-    #- os: linux
-    #  dist: xenial
-    #  compiler: gcc-7
-    #  addons:
-    #    apt:
-    #      sources: ubuntu-toolchain-r-test
-    #      packages:
-    #        - gcc-7
-    #        - g++-7
-    #        - gfortran-7
-    #        - libpython3.5
-    #        - python3-numpy
-    #        - python3-pip
-    #  env: CC='gcc-7' CXX='g++-7' FC='gfortran-7' FORTRAN_STANDARD='2008' PYTHON_VERSION='3.5'
-
-before_install:
-  - eval "${MATRIX_EVAL}"
-
-script:
-  - if [ "$TRAVIS_OS_NAME" == "osx" ]; then pyenv root; fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "osx" ] && [ "$PYTHON_VERSION" = "2.7" ]; then
-        pyenv install 2.7.12;
-        export PYTHON_INCLUDE_DIR=$(pyenv root)/versions/2.7.12/include/python2.7;
-        export PYTHON_LIBRARY=$(pyenv root)/versions/2.7.12/lib/libpython2.7.dylib;
-        export PYTHON_EXECUTABLE=$(pyenv root)/versions/2.7.12/bin/python2.7;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "osx" ] && [ "$PYTHON_VERSION" = "3.5" ]; then
-        pyenv install 3.5.0;
-        export PYTHON_INCLUDE_DIR=$(pyenv root)/versions/3.5.0/include/python3.5m;
-        export PYTHON_LIBRARY=$(pyenv root)/versions/3.5.0/lib/libpython3.5m.a;
-        export PYTHON_EXECUTABLE=$(pyenv root)/versions/3.5.0/bin/python3.5m;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" == "osx" ]; then
-        $PYTHON_EXECUTABLE -m pip install --upgrade 'pip<21';
-        $PYTHON_EXECUTABLE -m pip install -r ${TRAVIS_BUILD_DIR}/python/requirements.txt;
-    fi
-
-  - |
-    if [ "$TRAVIS_OS_NAME" = "linux" ]; then
-        export PYTHON_EXECUTABLE=/usr/bin/python$PYTHON_VERSION;
-        source /etc/lsb-release;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$PYTHON_VERSION" = "2.7" ]; then
-        export PYTHON_INCLUDE_DIR=/usr/include/python2.7;
-        export PYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython2.7.so;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$PYTHON_VERSION" = "3.5" ]; then
-        export PYTHON_INCLUDE_DIR=/usr/include/python3.5m;
-        export PYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.5m.so;
-        sudo $PYTHON_EXECUTABLE -m pip install --upgrade 'pip<21';
-        sudo $PYTHON_EXECUTABLE -m pip install --upgrade cython;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$DISTRIB_CODENAME" = "trusty" ] && [ "$PYTHON_VERSION" = "2.7" ]; then
-        sudo $PYTHON_EXECUTABLE -m pip install --upgrade 'pip<21';
-        sudo $PYTHON_EXECUTABLE -m pip install -r ${TRAVIS_BUILD_DIR}/python/requirements.txt;
-    fi
-  - |
-    if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$DISTRIB_CODENAME" = "trusty" ] && [ "$PYTHON_VERSION" = "3.5" ]; then
-        echo "Python 3.5 not supported on Ubuntu Trusty";
-        exit 1;
-    fi
-
-  - printenv | grep PYTHON
-  - ./travis.sh
-
-after_success:
-  - if [[ -n "${COVERAGE}" ]]; then bash <(curl -s https://codecov.io/bash); fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000..7a6089b07
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,448 @@
+Change Log
+==========
+
+---
+
+## Unreleased
+
+### Added
+
+- A new code example, `chunk`, shows how to perform (de)compression in chunks.
+- A new utility function `zfp_block_maximum_size()` returns maximum block size
+  for given scalar type, dimensionality, and compression mode.
+- `zfpy.__version__` for straightforward access to the zfp version string in Python.
+
+### Fixed
+
+- #241: Signed left shifts, integer overflow invoke undefined behavior.
+- #270: Overflow of maximum stream size when `size_t` is 32 bits.
+
+---
+
+## 1.0.1 (2023-12-15)
+
+This patch release primarily addresses minor bug fixes and is needed to update
+the zfpy Python wheels.
+
+### Added
+
+- A new build macro, `BUILD_TESTING_FULL`, specifies that all unit tests be
+  built; `BUILD_TESTING` produces a smaller subset of tests.  Full tests and
+  documentation are now included in releases.
+
+### Fixed
+
+- #169: `libm` dependency is not always correctly detected.
+- #171: `ptrdiff_t` is not always imported in Cython.
+- #176: cfp API is not exposed via CMake configuration file.
+- #177: Full test suite is not included in release.
+- #181: `rpath` is not set correctly in executables.
+- #204: Array strides are not passed by value in zFORp.
+- #220: Errors reported with scikit-build when building zfpy.
+
+---
+
+## 1.0.0 (2022-08-01)
+
+This release is not ABI compatible with prior releases due to numerous changes
+to function signatures and data structures like `zfp_field`.  However, few of
+the API changes, other than to the cfp C API for compressed arrays, should
+impact existing code.  Note that numerous header files have been renamed or
+moved relative to prior versions.
+
+### Added
+
+- `zfp::const_array`: read-only variable-rate array that supports
+  fixed-precision, fixed-accuracy, and reversible modes.
+- Compressed-array classes for 4D data.
+- `const` versions of array references, pointers, and iterators.
+- A more complete API for pointers and iterators.
+- cfp support for proxy references and pointers, iterators, and
+  (de)serialization.
+- Support for pointers and iterators into array views.
+- `zfp::array::size_bytes()` allows querying the size of different components
+  of an array object (e.g., payload, cache, index, metadata, ...).
+- Templated C++ wrappers around the low-level C API.
+- A generic codec for storing blocks of uncompressed scalars in zfp's
+  C++ arrays.
+- Additional functions for querying `zfp_field` and `zfp_stream` structs.
+- `zfp_config`: struct that encapsulates compression mode and parameters.
+- Rounding modes for reducing bias in compression errors.
+- New examples: `array`, `iteratorC`, and `ppm`.
+
+### Changed
+
+- Headers from `array/`, `cfp/include/`, and `include/` have been renamed
+  and reorganized into a common `include/` directory.
+  - The libzfp API is now confined to `zfp.h`, `zfp.hpp`, and `zfp.mod`
+    for C, C++, and Fortran bindings, respectively.  These all appear in
+    the top-level `include/` directory upon installation.
+  - C++ headers now use a `.hpp` suffix; C headers use a `.h` suffix.
+  - C++ headers like `array/zfparray.h` have been renamed `zfp/array.hpp`.
+  - C headers like `cfp/include/cfparrays.h` have been renamed `zfp/array.h`.
+- `size_t` and `ptrdiff_t` replace `uint` and `int` for array sizes and
+  strides in the array classes and C/Fortran APIs.
+- `zfp_bool` replaces `int` as Boolean type in the C API.
+- `bitstream_offset` and `bitstream_size` replace `size_t` to ensure support
+  for 64-bit offsets into and lengths of bit streams.  Consequently, the
+  `bitstream` API has changed accordingly.
+- All array and view iterators are now random-access iterators.
+- Array inspectors now return `const_reference` rather than a scalar
+  type like `float` to allow obtaining a `const_pointer` to an element
+  of an immutable array.
+- `zfp::array::compressed_data()` now returns `void*` instead of `uchar*`.
+- The array (de)serialization API has been revised, resulting in new
+  `zfp::array::header` and `zfp::exception` classes with new exception
+  messages.
+- The array `codec` class is now responsible for all details regarding
+  compression.
+- The compressed-array C++ implementation has been completely refactored to
+  make it more modular, extensible, and reusable across array types.
+- Array block shapes are now computed on the fly rather than stored.
+- The cfp C API now wraps array objects in structs.
+- The zfpy Python API now supports the more general `memoryview` over
+  `bytes` objects for decompression.
+- The zFORp Fortran module name is now `zfp` instead of `zforp_module`.
+- Some command-line options for the `diffusion` example have changed.
+- CMake 3.9 or later is now required for CMake builds.
+
+### Removed
+
+- `zfp::array::get_header()` has been replaced with a `zfp::array::header`
+  constructor that accepts an array object.
+- `ZFP_VERSION_RELEASE` is no longer defined (use `ZFP_VERSION_PATCH`).
+
+### Fixed
+
+- #66: `make install` overwrites googletest.
+- #84: Incorrect order of parameters in CUDA `memset()`.
+- #86: C++ compiler warns when `__STDC_VERSION__` is undefined.
+- #87: `CXXFLAGS` is misspelled in `cfp/src/Makefile`.
+- #98: `zfp_stream_maximum_size()` underestimates size in reversible mode.
+- #99: Incorrect `private_view` reads due to missing write-back.
+- #109: Unused CPython array is incompatible with PyPy.
+- #112: PGI compiler bug causes issues with memory alignment.
+- #119: All-subnormal blocks may cause floating-point overflow.
+- #121: CUDA bit offsets are limited to 32 bits.
+- #122: `make install` does not install zfp command-line utility.
+- #125: OpenMP bit offsets are limited to 32 bits.
+- #126: `make install` does not install Fortran module.
+- #127: Reversible mode reports incorrect compressed block size.
+- #150: cmocka tests do not build on macOS.
+- #154: Thread safety is broken in `private_view` and `private_const_view`.
+- `ZFP_MAX_BITS` is off by one.
+- `diffusionC`, `iteratorC` are not being built with `gmake`.
+
+---
+
+## 0.5.5 (2019-05-05)
+
+### Added
+
+- Support for reversible (lossless) compression of floating-point and
+  integer data.
+- Methods for serializing and deserializing zfp's compressed arrays.
+- Python bindings for compressing NumPy arrays.
+- Fortran bindings to zfp's high-level C API.
+
+### Changed
+
+- The default compressed-array cache size is now a function of the total
+  number of array elements, irrespective of array shape.
+
+### Fixed
+
+- Incorrect handling of execution policy in zfp utility.
+- Incorrect handling of decompression via header in zfp utility.
+- Incorrect cleanup of device memory in CUDA decompress.
+- Missing tests for failing mallocs.
+- CMake does not install CFP when built.
+- `zfp_write_header()` and `zfp_field_metadata()` succeed even if array
+  dimensions are too large to fit in header.
+
+---
+
+## 0.5.4 (2018-10-01)
+
+### Added
+
+- Support for CUDA fixed-rate compression and decompression.
+- Views into compressed arrays for thread safety, nested array indexing,
+  slicing, and array subsetting.
+- C language bindings for compressed arrays.
+- Support for compressing and decompressing 4D data.
+
+### Changed
+
+- Execution policy now applies to both compression and decompression.
+- Compressed array accessors now return Scalar type instead of
+  `const Scalar&` to avoid stale references to evicted cache lines.
+
+### Fixed
+
+- Incorrect handling of negative strides.
+- Incorrect handling of arrays with more than 2^32 elements in zfp command-line
+  tool.
+- `bitstream` is not C++ compatible.
+- Minimum cache size request is not respected.
+
+---
+
+## 0.5.3 (2018-03-28)
+
+### Added
+
+- Support for OpenMP multithreaded compression (but not decompression).
+- Options for OpenMP execution in zfp command-line tool.
+- Compressed-array support for copy construction and assignment via deep
+  copies.
+- Virtual destructors to enable inheritance from zfp arrays.
+
+### Changed
+
+- `zfp_decompress()` now returns the number of compressed bytes processed so
+  far, i.e., the same value returned by `zfp_compress()`.
+
+---
+
+## 0.5.2 (2017-09-28)
+
+### Added
+
+- Iterators and proxy objects for pointers and references.
+- Example illustrating how to use iterators and pointers.
+
+### Changed
+
+- Diffusion example now optionally uses iterators.
+- Moved internal headers under array to `array/zfp`.
+- Modified 64-bit integer typedefs to avoid the C89 non-compliant `long long`
+  and allow for user-supplied types and literal suffixes.
+- Renamed compile-time macros that did not have a `ZFP` prefix.
+- Rewrote documentation in reStructuredText and added complete documentation
+  of all public functions, classes, types, and macros.
+
+### Fixed
+
+- Issue with setting stream word type via CMake.
+
+---
+
+## 0.5.1 (2017-03-28)
+
+This release primarily fixes a few minor issues but also includes changes in
+anticipation of a large number of planned future additions to the library.
+No changes have been made to the compressed format, which is backwards
+compatible with version 0.5.0.
+
+### Added
+
+- High-level API support for integer types.
+- Example that illustrates in-place compression.
+- Support for CMake builds.
+- Documentation that discusses common issues with using zfp.
+
+### Changed
+
+- Separated library version from CODEC version and added version string.
+- Corrected inconsistent naming of `BIT_STREAM` macros in code and
+  documentation.
+- Renamed some of the header bit mask macros.
+- `stream_skip()` and `stream_flush()` now return the number of bits skipped
+  or output.
+- Renamed `stream_block()` and `stream_delta()` to make it clear that they
+  refer to strided streams.  Added missing definition of
+  `stream_stride_block()`.
+- Changed `int` and `uint` types in places to use `ptrdiff_t` and `size_t`
+  where appropriate.
+- Changed API for `zfp_set_precision()` and `zfp_set_accuracy()` to not
+  require the scalar type.
+- Added missing `static` keyword in `decode_block()`.
+- Changed `testzfp` to allow specifying which tests to perform on the
+  command line.
+- Modified directory structure.
+
+### Fixed
+
+- Bug that prevented defining uninitialized arrays.
+- Incorrect computation of array sizes in `zfp_field_size()`.
+- Minor issues that prevented code from compiling on Windows.
+- Issue with fixed-accuracy headers that caused unnecessary storage.
+
+---
+
+## 0.5.0 (2016-02-29)
+
+This version introduces backwards incompatible changes to the CODEC.
+
+### Added
+
+- Modified CODEC to more efficiently encode blocks whose values are all
+  zero or are smaller in magnitude than the absolute error tolerance.
+  This allows representing "empty" blocks using only one bit each.
+- Added functions for compactly encoding the compression parameters
+  and field metadata, e.g., for producing self-contained compressed
+  streams.  Also added functions for reading and writing a header
+  containing these parameters.
+
+### Changed
+
+- Changed behavior of `zfp_compress()` and `zfp_decompress()` to not
+  automatically rewind the bit stream.  This makes it easier to concatenate
+  multiple compressed bit streams, e.g., when compressing vector fields or
+  multiple scalars together.
+- Changed the zfp example program interface to allow reading and writing
+  compressed streams, optionally with a header.  The zfp tool can now be
+  used to compress and decompress files as a standalone utility.
+
+---
+
+## 0.4.1 (2015-12-28)
+
+### Added
+
+- Added `simple.c` as a minimal example of how to call the compressor.
+
+### Changed
+
+- Changed compilation of diffusion example to output two executables:
+  one with and one without compression.
+
+### Fixed
+
+- Bug that caused segmentation fault when compressing 3D arrays whose
+  dimensions are not multiples of four.  Specifically, arrays of dimensions
+  nx * ny * nz, with ny not a multiple of four, were not handled correctly.
+- Modified `examples/fields.h` to ensure standard compliance.  Previously,
+  C99 support was needed to handle the hex float constants, which are
+  not supported in C++98.
+
+---
+
+## 0.4.0 (2015-12-05)
+
+This version contains substantial changes to the compression algorithm that
+improve PSNR by about 6 dB and speed by a factor of 2-3.  These changes are
+not backward compatible with previous versions of zfp.
+
+### Added
+
+- Support for 31-bit and 63-bit integer data, as well as shorter integer types.
+- New examples for evaluating the throughput of the (de)compressor and for
+  compressing grayscale images in the pgm format.
+- Frequently asked questions.
+
+### Changed
+
+- Rewrote compression codec entirely in C to make linking and calling
+  easier from other programming languages, and to expose the low-level
+  interface through C instead of C++.  This necessitated significant
+  changes to the API as well.
+- Minor changes to the C++ compressed array API, as well as major
+  implementation changes to support the C library.  The namespace and
+  public types are now all in lower case.
+
+### Removed
+
+- Support for general fixed-point decorrelating transforms.
+
+---
+
+## 0.3.2 (2015-12-03)
+
+### Fixed
+
+- Bug in `Array::get()` that caused the wrong cached block to be looked up,
+  thus occasionally copying incorrect values back to parts of the array.
+
+---
+
+## 0.3.1 (2015-05-06)
+
+### Fixed
+
+- Rare bug caused by exponent underflow in blocks with no normal and some
+  subnormal numbers.
+
+---
+
+## 0.3.0 (2015-03-03)
+
+This version modifies the default decorrelating transform to one that uses
+only additions and bit shifts.  This new transform, in addition to being
+faster, also has some theoretical optimality properties and tends to improve
+rate distortion.  This change is not backwards compatible.
+
+### Added
+
+- Compile-time support for parameterized transforms, e.g., to support other
+  popular transforms like DCT, HCT, and Walsh-Hadamard.
+- Floating-point traits to reduce the number of template parameters.  It is
+  now possible to declare a 3D array as `Array3<float>`, for example.
+- Functions for setting the array scalar type and dimensions.
+- `testzfp` for regression testing.
+
+### Changed
+
+- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1).
+  Consequently Q1.62 fixed point can be used throughout.
+- Changed the order in which bits are emitted within each bit plane to be more
+  intelligent.  Group tests are now deferred until they are needed, i.e., just
+  before the value bits for the group being tested.  This improves the quality
+  of fixed-rate encodings, but has no impact on compressed size.
+- Made several optimizations to improve performance.
+- Consolidated several header files.
+
+---
+
+## 0.2.1 (2014-12-12)
+
+### Added
+
+- Win64 support via Microsoft Visual Studio compiler.
+- Documentation of the expected output for the diffusion example.
+
+### Changed
+
+- Made several minor changes to suppress compiler warnings.
+
+### Fixed
+
+- Broken support for IBM's `xlc` compiler.
+
+---
+
+## 0.2.0 (2014-12-02)
+
+The compression interface from `zfpcompress` was relocated to a separate
+library, called `libzfp`, and modified to be callable from C.  This API now
+uses a parameter object (`zfp_params`) to specify array type and dimensions
+as well as compression parameters.
+
+### Added
+
+- Several utility functions were added to simplify `libzfp` usage:
+  * Functions for setting the rate, precision, and accuracy.
+    Corresponding functions were also added to the `Codec` class.
+  * A function for estimating the buffer size needed for compression.
+- The `Array` class functionality was expanded:
+  * Support for accessing the compressed bit stream stored with an array,
+    e.g., for offline compressed storage and for initializing an already
+    compressed array.
+  * Functions for dynamically specifying the cache size.
+  * The default cache is now direct-mapped instead of two-way associative.
+
+### Fixed
+
+- Corrected the value of the lowest possible bit plane to account for both
+  the smallest exponent and the number of bits in the significand.
+- Corrected inconsistent use of rate and precision.  The rate refers to the
+  number of compressed bits per floating-point value, while the precision
+  refers to the number of uncompressed bits.  The `Array` API was changed
+  accordingly.
+
+---
+
+## 0.1.0 (2014-11-12)
+
+Initial beta release.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9652a23f9..52ae1584e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,8 +20,16 @@ string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_MINOR[ \t]+([0-9]+).*"
     "\\1" ZFP_VERSION_MINOR ${_zfp_h_contents})
 string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_PATCH[ \t]+([0-9]+).*"
     "\\1" ZFP_VERSION_PATCH ${_zfp_h_contents})
-set(ZFP_VERSION
-  "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}")
+string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_TWEAK[ \t]+([0-9]+).*"
+    "\\1" ZFP_VERSION_TWEAK ${_zfp_h_contents})
+
+if(${ZFP_VERSION_TWEAK} EQUAL 0)
+  set(ZFP_VERSION
+    "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}")
+else()
+  set(ZFP_VERSION
+    "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}.${ZFP_VERSION_TWEAK}")
+endif()
 
 project(ZFP VERSION ${ZFP_VERSION})
 
@@ -43,6 +51,9 @@ if(NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
   set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${ZFP_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR})
 endif()
 
+# Setup RPath
+set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
+
 #------------------------------------------------------------------------------#
 # Top level options
 #------------------------------------------------------------------------------#
@@ -162,10 +173,18 @@ if(DEFINED ZFP_WITH_OPENMP)
   option(ZFP_WITH_OPENMP "Enable OpenMP parallel compression"
     ${ZFP_WITH_OPENMP})
   if(ZFP_WITH_OPENMP)
-    find_package(OpenMP COMPONENTS C REQUIRED)
+    if(BUILD_EXAMPLES)
+      find_package(OpenMP COMPONENTS C CXX REQUIRED)
+    else()
+      find_package(OpenMP COMPONENTS C REQUIRED)
+    endif()
   endif()
 else()
-  find_package(OpenMP COMPONENTS C)
+  if(BUILD_EXAMPLES)
+    find_package(OpenMP COMPONENTS C CXX)
+  else()
+    find_package(OpenMP COMPONENTS C)
+  endif()
   option(ZFP_WITH_OPENMP "Enable OpenMP parallel compression" ${OPENMP_FOUND})
 endif()
 
@@ -241,10 +260,10 @@ list(APPEND ppm_private_defs PPM_CHROMA=${PPM_CHROMA})
 
 # Link libm only if necessary
 include(CheckCSourceCompiles)
-check_c_source_compiles("#include<math.h>\nfloat f; int main(){sqrt(f);return 0;}" HAVE_MATH)
+check_c_source_compiles("#include<math.h>\nint main(int n,char*v[]){return sqrt(n);}" HAVE_MATH)
 if(NOT HAVE_MATH)
   set(CMAKE_REQUIRED_LIBRARIES m)
-  check_c_source_compiles("#include<math.h>\nfloat f; int main(){sqrt(f);return 0;}" HAVE_LIBM_MATH)
+  check_c_source_compiles("#include<math.h>\nint main(int n,char*v[]){return sqrt(n);}" HAVE_LIBM_MATH)
   unset(CMAKE_REQUIRED_LIBRARIES)
   if(NOT HAVE_LIBM_MATH)
     message(FATAL_ERROR "Unable to use C math library functions (with or without -lm)")
@@ -255,7 +274,7 @@ endif()
 # Add source code
 #------------------------------------------------------------------------------#
 include(CTest)
-if(BUILD_TESTING)
+if(BUILD_TESTING OR BUILD_TESTING_FULL)
   enable_testing()
 endif()
 
@@ -299,7 +318,7 @@ if(BUILD_EXAMPLES)
   add_subdirectory(examples)
 endif()
 
-if(BUILD_TESTING)
+if(BUILD_TESTING OR BUILD_TESTING_FULL)
   # Disable gtest install to prevent clobbering existing installations 
   option(INSTALL_GMOCK "Install Googlemock" OFF)
   option(INSTALL_GTEST "Install Googletest" OFF)
@@ -310,11 +329,11 @@ endif()
 #------------------------------------------------------------------------------#
 # Header install
 #------------------------------------------------------------------------------#
-install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-install(DIRECTORY array/   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-
 if(BUILD_CFP)
-  install(DIRECTORY cfp/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+  install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+else()
+  install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+          PATTERN "cfp" EXCLUDE)
 endif()
 #------------------------------------------------------------------------------#
 # Build type: one of None, Debug, Release, RelWithDebInfo, MinSizeRel
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..9bc8fa832
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,13 @@
+Contributing
+============
+
+The zfp project uses the
+[Gitflow](https://nvie.com/posts/a-successful-git-branching-model/)
+development model.  Contributions should be made as pull requests on the
+`develop` branch.  Although this branch is under continuous development,
+it should be robust enough to pass all regression tests.  For contributions
+that are not production ready, please [contact us](mailto:zfp@llnl.gov) to
+have a separate branch created.  The `master` branch is updated with each
+release and reflects the most recent official release of zfp.  See the
+[Releases Page](https://github.com/LLNL/zfp/releases) for a history
+of releases.
diff --git a/Config b/Config
index a2d7aba5f..cf0df65df 100644
--- a/Config
+++ b/Config
@@ -10,31 +10,47 @@ FC = gfortran
   CSTD = -std=c99
   CXXSTD = -std=c++98
 # CXXSTD = -std=c++11
-  FSTD = -std=f2003 -ffree-form -Wno-c-binding-type
+  FSTD = -std=f2018 -ffree-form -Wno-c-binding-type
 
 # common compiler options -----------------------------------------------------
 
 OPTFLAGS = -O3
 FLAGS = $(OPTFLAGS) -fPIC -pedantic -Wall -Wextra
+LDFLAGS =
 SOFLAGS =
 
 # OpenMP compiler options -----------------------------------------------------
 
-# do not uncomment; use "make ZFP_WITH_OPENMP=0" to disable OpenMP
+# do not comment out; use "make ZFP_WITH_OPENMP=0" to disable OpenMP
 OMPFLAGS = -fopenmp
 
-# optional compiler macros ----------------------------------------------------
+# Apple clang OpenMP options
+# OMPFLAGS = -Xclang -fopenmp
 
-# use long long for 64-bit types
-# DEFS += -DZFP_INT64='long long' -DZFP_INT64_SUFFIX='ll'
-# DEFS += -DZFP_UINT64='unsigned long long' -DZFP_UINT64_SUFFIX='ull'
+# optional compiler macros ----------------------------------------------------
 
-# use smaller bit stream word type for finer rate granularity
+# use smaller bit stream word type for finer rate granularity;
+# can be set on command line, e.g., "make BIT_STREAM_WORD_TYPE=uint8"
 # DEFS += -DBIT_STREAM_WORD_TYPE=uint8
 # DEFS += -DBIT_STREAM_WORD_TYPE=uint16
 # DEFS += -DBIT_STREAM_WORD_TYPE=uint32
 # DEFS += -DBIT_STREAM_WORD_TYPE=uint64
 
+# reduce bias and slack in errors; can be set on command line, e.g.,
+# "make ZFP_ROUNDING_MODE=ZFP_ROUND_FIRST"
+# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_NEVER
+# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_FIRST
+# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_LAST
+# DEFS += -DZFP_WITH_TIGHT_ERROR
+
+# treat subnormals as zero to avoid overflow; can be set on command line, e.g.,
+# "make ZFP_WITH_DAZ=1"
+# DEFS += -DZFP_WITH_DAZ
+
+# use long long for 64-bit types
+# DEFS += -DZFP_INT64='long long' -DZFP_INT64_SUFFIX='ll'
+# DEFS += -DZFP_UINT64='unsigned long long' -DZFP_UINT64_SUFFIX='ull'
+
 # cache alignment
 # DEFS += -DZFP_CACHE_LINE_SIZE=256
 
@@ -53,14 +69,6 @@ OMPFLAGS = -fopenmp
 # count cache misses
 # DEFS += -DZFP_WITH_CACHE_PROFILE
 
-# reduce bias and slack in errors
-# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_FIRST
-# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_LAST
-# DEFS += -DZFP_WITH_TIGHT_ERROR
-
-# treat subnormals as zero to avoid overflow
-# DEFS += -DZFP_WITH_DAZ
-
 # build targets ---------------------------------------------------------------
 
 # default targets
@@ -93,7 +101,7 @@ else
   LIBCFP = libcfp.a
 endif
 
-# conditionals ----------------------------------------------------------------
+# operating system and compiler dependent flags -------------------------------
 
 # macOS configuration; compile with "make OS=mac"
 ifeq ($(OS),mac)
@@ -105,6 +113,13 @@ ifeq ($(CSTD),-std=c89)
   FLAGS += -Wno-unused-function
 endif
 
+# process macros set on the command line --------------------------------------
+
+# bit stream word type
+ifdef BIT_STREAM_WORD_TYPE
+  DEFS += -DBIT_STREAM_WORD_TYPE=$(BIT_STREAM_WORD_TYPE)
+endif
+
 # enable OpenMP?
 ifdef ZFP_WITH_OPENMP
   ifneq ($(ZFP_WITH_OPENMP),0)
@@ -114,6 +129,13 @@ ifdef ZFP_WITH_OPENMP
   endif
 endif
 
+# treat subnormals as zero to avoid overflow
+ifdef ZFP_WITH_DAZ
+  ifneq ($(ZFP_WITH_DAZ),0)
+    FLAGS += -DZFP_WITH_DAZ
+  endif
+endif
+
 # rounding mode and slack in error
 ifdef ZFP_ROUNDING_MODE
   FLAGS += -DZFP_ROUNDING_MODE=$(ZFP_ROUNDING_MODE)
@@ -127,13 +149,6 @@ ifdef ZFP_ROUNDING_MODE
   endif
 endif
 
-# treat subnormals as zero to avoid overflow
-ifdef ZFP_WITH_DAZ
-  ifneq ($(ZFP_WITH_DAZ),0)
-    FLAGS += -DZFP_WITH_DAZ
-  endif
-endif
-
 # chroma mode for ppm example
 ifdef PPM_CHROMA
   PPM_FLAGS += -DPPM_CHROMA=$(PPM_CHROMA)
diff --git a/LICENSE b/LICENSE
index f371f8825..3c726d295 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC
+Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..a34c60c68
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+include python/zfpy.pxd
+include python/zfpy.pyx
+recursive-include include *.h
+recursive-include src *.c *.h
+include LICENSE
+include pyproject.toml
\ No newline at end of file
diff --git a/Makefile b/Makefile
index bddc72ae1..aacf77894 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ all:
 	@echo $(LIBRARY)
 	@cd src; $(MAKE) clean $(LIBRARY)
 ifneq ($(BUILD_CFP),0)
-	@cd cfp/src; $(MAKE) clean $(LIBRARY)
+	@cd cfp; $(MAKE) clean $(LIBRARY)
 endif
 ifneq ($(BUILD_ZFORP),0)
 	@cd fortran; $(MAKE) clean $(LIBRARY)
@@ -33,7 +33,7 @@ test:
 # clean all
 clean:
 	@cd src; $(MAKE) clean
-	@cd cfp/src; $(MAKE) clean
+	@cd cfp; $(MAKE) clean
 	@cd fortran; $(MAKE) clean
 	@cd utils; $(MAKE) clean
 	@cd tests; $(MAKE) clean
diff --git a/README.md b/README.md
index c9c3a1e92..4688473cb 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 ZFP
 ===
-[![Travis CI Build Status](https://travis-ci.com/LLNL/zfp.svg?branch=develop)](https://travis-ci.com/LLNL/zfp)
+[![GitHub Actions Test Status](https://github.com/LLNL/zfp/actions/workflows/tests.yml/badge.svg)](https://github.com/LLNL/zfp/actions/workflows/tests.yml)
 [![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/qb3ld7j11segy52k/branch/develop?svg=true)](https://ci.appveyor.com/project/lindstro/zfp)
-[![Documentation Status](https://readthedocs.org/projects/zfp/badge/?version=release0.5.5)](https://zfp.readthedocs.io/en/release0.5.5/?badge=release0.5.5)
-[![Code Coverage](https://codecov.io/gh/LLNL/zfp/branch/develop/graph/badge.svg)](https://codecov.io/gh/LLNL/zfp)
+[![Documentation Status](https://readthedocs.org/projects/zfp/badge/?version=release1.0.1)](https://zfp.readthedocs.io/en/release1.0.1/)
+[![codecov](https://codecov.io/gh/LLNL/zfp/branch/develop/graph/badge.svg?token=jqvMVvgRQ9)](https://codecov.io/gh/LLNL/zfp)
+[![R&D100 - Winner](https://img.shields.io/badge/R%26D100-Winner-gold)](https://www.rdworldonline.com/rd-100-winners-for-2023-are-announced-2/)
 
 zfp is a compressed format for representing multidimensional floating-point
 and integer arrays.  zfp provides compressed-array classes that support high
@@ -44,7 +45,8 @@ zfp may be built using either [CMake](https://cmake.org/) or
 
 This builds the zfp library in the `build/lib` directory and the zfp
 command-line executable in the `build/bin` directory.  It then runs
-the regression tests.
+the regression tests.  The full test suite may be run by enabling the
+`BUILD_TESTING_FULL` CMake option during the build step.
 
 zfp may also be built using GNU make:
 
@@ -56,29 +58,25 @@ Note: GNU builds are less flexible and do not support all available features,
 e.g., CUDA support.
 
 For further configuration and build instructions, please consult the
-[documentation](https://zfp.readthedocs.io/en/latest/installation.html).
+[documentation](https://zfp.readthedocs.io/en/release1.0.1/installation.html).
+For examples of how to call the C library and use the C++ array classes,
+see the [examples](https://zfp.readthedocs.io/en/release1.0.1/examples.html)
+section.
 
 
 Documentation
 -------------
 
-Full HTML [documentation](http://zfp.readthedocs.io/) is available online.
-A [PDF](http://readthedocs.org/projects/zfp/downloads/pdf/latest/) version
-is also available.
+Full HTML [documentation](https://zfp.readthedocs.io/en/release1.0.1) is
+available online.
+A [PDF](https://readthedocs.org/projects/zfp/downloads/pdf/release1.0.1/)
+version is also available.
 
+Further information on the zfp software is included in these files:
 
-Contributing
-------------
-
-The zfp project uses the
-[Gitflow](https://nvie.com/posts/a-successful-git-branching-model/)
-development model.  Contributions should be made as pull requests on the
-`develop` branch.  Although this branch is under continuous development,
-it should be robust enough to pass all regression tests.
-The `master` branch is updated with each release and reflects the most
-recent official release of zfp.  See the
-[Releases Page](https://github.com/LLNL/zfp/releases) for a history
-of releases.
+- Change log: see [CHANGELOG.md](./CHANGELOG.md).
+- Support and additional resources: see [SUPPORT.md](./SUPPORT.md).
+- Code contributions: see [CONTRIBUTING.md](./CONTRIBUTING.md).
 
 
 Authors
@@ -98,25 +96,21 @@ If you use zfp for scholarly research, please cite this paper:
   IEEE Transactions on Visualization and Computer Graphics, 20(12):2674-2683, December 2014.
   [doi:10.1109/TVCG.2014.2346458](http://doi.org/10.1109/TVCG.2014.2346458).
 
+The algorithm implemented in the current version of zfp is described in the
+[documentation](https://zfp.readthedocs.io/en/latest/algorithm.html) and in
+the following paper:
 
-Additional Resources
---------------------
-
-For more information on zfp, please see the
-[zfp website](https://computing.llnl.gov/casc/zfp/).
-For bug reports and feature requests, please consult the
-[GitHub issue tracker](https://github.com/LLNL/zfp/issues/).
-For questions and comments not answered here or in the
-[documentation](http://zfp.readthedocs.io),
-please send e-mail to [zfp@llnl.gov](mailto:zfp@llnl.gov).
+* James Diffenderfer, Alyson Fox, Jeffrey Hittinger, Geoffrey Sanders, Peter Lindstrom.
+  [Error Analysis of ZFP Compression for Floating-Point Data](https://www.researchgate.net/publication/324908266_Error_Analysis_of_ZFP_Compression_for_Floating-Point_Data).
+  SIAM Journal on Scientific Computing, 41(3):A1867-A1898, June 2019.
+  [doi:10.1137/18M1168832](http://doi.org/10.1137/18M1168832).
 
 
 License
 -------
 
-zfp is distributed under the terms of the BSD 3-Clause license.  See the
-files [LICENSE](https://github.com/LLNL/zfp/blob/develop/LICENSE) and
-[NOTICE](https://github.com/LLNL/zfp/blob/develop/NOTICE) for details.
+zfp is distributed under the terms of the BSD 3-Clause license.  See
+[LICENSE](./LICENSE) and [NOTICE](./NOTICE) for details.
 
 SPDX-License-Identifier: BSD-3-Clause
 
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..e92d7de6d
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,19 @@
+# Security Policy
+
+The zfp team takes software security and safety issues seriously.  We will
+work to resolve security issues in a timely manner as they become known.
+
+## Supported Versions
+
+Security updates to zfp are generally applied only to the latest release.
+
+## Reporting a Vulnerability
+
+If you have discovered a zfp vulnerability, please report it privately.
+**Do not disclose it as a public issue.**
+This gives us time to work with you to fix the issue before public exposure,
+reducing the chance that the vulnerability will be exploited before a patch
+is released.
+
+Please [report security issues](https://github.com/LLNL/zfp/security/advisories/new)
+on the GitHub Security tab.
diff --git a/SUPPORT.md b/SUPPORT.md
new file mode 100644
index 000000000..83a979317
--- /dev/null
+++ b/SUPPORT.md
@@ -0,0 +1,11 @@
+Support
+=======
+
+For more information on zfp, please see the
+[zfp website](https://zfp.llnl.gov).
+For bug reports and feature requests, please consult the
+[GitHub issue tracker](https://github.com/LLNL/zfp/issues/).
+For questions and comments not answered here or in the
+[documentation](http://zfp.readthedocs.io),
+please contact us by email at
+[zfp@llnl.gov](mailto:zfp@llnl.gov).
diff --git a/VERSIONS.md b/VERSIONS.md
deleted file mode 100644
index 2b7e0e72e..000000000
--- a/VERSIONS.md
+++ /dev/null
@@ -1,298 +0,0 @@
-# zfp Release Notes
-
-## 0.5.5 (May 5, 2019)
-
-- Added support for reversible (lossless) compression of floating-point and
-  integer data.
-
-- Added methods for serializing and deserializing zfp's compressed arrays.
-
-- Added Python bindings for compressing NumPy arrays.
-
-- Added Fortran bindings to zfp's high-level C API.
-
-- Change:
-  - The default compressed-array cache size is now a function of the total
-    number of array elements, irrespective of array shape.
-
-- Bug fixes:
-  - Incorrect handling of execution policy in zfp utility.
-  - Incorrect handling of decompression via header in zfp utility.
-  - Incorrect cleanup of device memory in CUDA decompress.
-  - Tests for failing mallocs.
-  - CMake installation of CFP when built.
-  - zfp\_write\_header and zfp\_field\_metadata now fail if array dimensions
-    are too large to fit in header.
-
-
-## 0.5.4 (October 1, 2018)
-
-- Added support for CUDA fixed-rate compression and decompression.
-
-- Added views into compressed arrays for thread safety, nested array
-  indexing, slicing, and array subsetting.
-
-- Added C language bindings for compressed arrays.
-
-- Added support for compressing and decompressing 4D data.
-
-- Changes:
-  - Execution policy now applies to both compression and decompression.
-  - Compressed array accessors now return Scalar type instead of
-    const Scalar& to avoid stale references to evicted cache lines.
-
-- Bug fixes:
-  - Handling of negative strides.
-  - Command line tool handling of arrays with more than 2^32 elements.
-  - bitstream C++ compatibility.  
-  - Respect minimum cache size request.
-
-
-## 0.5.3 (March 28, 2018)
-
-- Added support for OpenMP multithreaded compression (but not decompression).
-
-- Added options for OpenMP execution to zfp command-line tool.
-
-- Changed return value of zfp\_decompress to indicate the number of compressed
-  bytes processed so far (now returns same value as zfp\_compress on success).
-
-- Added compressed array support for copy construction and assignment via
-  deep copies.
-
-- Added virtual destructors to enable inheritance from zfp arrays.
-
-
-## 0.5.2 (September 28, 2017)
-
-- Added iterators and proxy objects for pointers and references.
-
-- Added example illustrating how to use iterators and pointers.
-
-- Modified diffusion example to optionally use iterators.
-
-- Moved internal headers under array to array/zfp.
-
-- Modified 64-bit integer typedefs to avoid the C89 non-compliant long long
-  and allow for user-supplied types and literal suffixes.
-
-- Renamed compile-time macros that did not have a ZFP prefix.
-
-- Fixed issue with setting stream word type via CMake.
-
-- Rewrote documentation in reStructuredText and added complete
-  documentation of all public functions, classes, types, and macros.
-  Removed ASCII documentation.
-
-
-## 0.5.1 (March 28, 2017)
-
-- This release primarily fixes a few minor issues but also includes
-  changes in anticipation of a large number of planned future additions
-  to the library.  No changes have been made to the compressed format,
-  which is backwards compatible with version 0.5.0.
-
-- Added high-level API support for integer types.
-
-- Separated library version from CODEC version and added version string.
-
-- Added example that illustrates in-place compression.
-
-- Added support for CMake builds.
-
-- Corrected inconsistent naming of BIT\_STREAM macros in code and
-  documentation.
-
-- Renamed some of the header bit mask macros.
-
-- Added return values to stream\_skip and stream\_flush to indicate the
-  number of bits skipped or output.
-
-- Renamed stream\_block and stream\_delta to make it clear that they refer
-  to strided streams.  Added missing definition of stream\_stride\_block.
-
-- Changed int/uint types in places to use ptrdiff\_t/size\_t where
-  appropriate.
-
-- Changed API for zfp\_set\_precision and zfp\_set\_accuracy to not require
-  the scalar type.
-
-- Added missing static keyword in decode\_block.
-
-- Changed testzfp to allow specifying which tests to perform on the
-  command line.
-
-- Fixed bug that prevented defining uninitialized arrays.
-
-- Fixed incorrect computation of array sizes in zfp\_field\_size.
-
-- Fixed minor issues that prevented code from compiling on Windows.
-
-- Fixed issue with fixed-accuracy headers that caused unnecessary storage.
-
-- Modified directory structure.
-
-- Added documentation that discusses common issues with using zfp.
-
-
-## 0.5.0 (February 29, 2016)
-
-- Modified CODEC to more efficiently encode blocks whose values are all
-  zero or are smaller in magnitude than the absolute error tolerance.
-  This allows representing "empty" blocks using only one bit each.  This
-  version is not backwards compatible with prior zfp versions.
-
-- Changed behavior of zfp\_compress and zfp\_decompress to not automatically
-  rewind the bit stream.  This makes it easier to concatenate multiple
-  compressed bit streams, e.g., when compressing vector fields or multiple
-  scalars together.
-
-- Added functions for compactly encoding the compression parameters
-  and field meta data, e.g., for producing self-contained compressed
-  streams.  Also added functions for reading and writing a header
-  containing these parameters.
-
-- Changed the zfp example program interface to allow reading and writing
-  compressed streams, optionally with a header.  The zfp tool can now be
-  used to compress and decompress files as a stand alone utility.
-
-
-## 0.4.1 (December 28, 2015)
-
-- Fixed bug that caused segmentation fault when compressing 3D arrays
-  whose dimensions are not multiples of four.  Specifically, arrays of
-  dimensions nx * ny * nz, with ny not a multiple of four, were not
-  handled correctly.
-
-- Modified examples/fields.h to ensure standard compliance.  Previously,
-  C99 support was needed to handle the hex float constants, which are
-  not supported in C++98.
-
-- Added simple.c as a minimal example of how to call the compressor.
-
-- Changed compilation of diffusion example to output two executables:
-  one with and one without compression.
-
-
-## 0.4.0 (December 5, 2015)
-
-- Substantial changes to the compression algorithm that improve PSNR
-  by about 6 dB and speed by a factor of 2-3.  These changes are not
-  backward compatible with previous versions of zfp.
-
-- Added support for 31-bit and 63-bit integer data, as well as shorter
-  integer types.
-
-- Rewrote compression codec entirely in C to make linking and calling
-  easier from other programming languages, and to expose the low-level
-  interface through C instead of C++.  This necessitated significant
-  changes to the API as well.
-
-- Minor changes to the C++ compressed array API, as well as major
-  implementation changes to support the C library.  The namespace and
-  public types are now all in lower case.
-
-- Deprecated support for general fixed-point decorrelating transforms
-  and slimmed down implementation.
-
-- Added new examples for evaluating the throughput of the (de)compressor
-  and for compressing grayscale images in the pgm format.
-
-- Added FAQ.
-
-
-## 0.3.2 (December 3, 2015)
-
-- Fixed bug in Array::get() that caused the wrong cached block to be
-  looked up, thus occasionally copying incorrect values back to parts
-  of the array.
-
-
-## 0.3.1 (May 6, 2015)
-
-- Fixed rare bug caused by exponent underflow in blocks with no normal
-  and some denormal numbers.
-
-
-## 0.3.0 (March 3, 2015)
-
-- Modified the default decorrelating transform to one that uses only
-  additions and bit shifts.  This new transform, in addition to being
-  faster, also has some theoretical optimality properties and tends to
-  improve rate distortion.
-
-- Added compile-time support for parameterized transforms, e.g., to
-  support other popular transforms like DCT, HCT, and Walsh-Hadamard.
-
-- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1).
-  Consequently Q1.62 fixed point can be used throughout.
-
-- Changed the order in which bits are emitted within each bit plane
-  to be more intelligent.  Group tests are now deferred until they
-  are needed, i.e., just before the value bits for the group being
-  tested.  This improves the quality of fixed-rate encodings, but
-  has no impact on compressed size.
-
-- Made several optimizations to improve performance.
-
-- Added floating-point traits to reduce the number of template
-  parameters.  It is now possible to declare a 3D array as
-  Array3<float>, for example.
-
-- Added functions for setting the array scalar type and dimensions.
-
-- Consolidated several header files.
-
-- Added testzfp for regression testing.
-
-
-## 0.2.1 (December 12, 2014)
-
-- Added Win64 support via Microsoft Visual Studio compiler.
-
-- Fixed broken support for IBM's xlc compiler.
-
-- Made several minor changes to suppress compiler warnings.
-
-- Documented expected output for the diffusion example.
-
-
-## 0.2.0 (December 2, 2014)
-
-- The compression interface from zfpcompress was relocated to a
-  separate library, called libzfp, and modified to be callable from C.
-  This API now uses a parameter object (zfp\_params) to specify array
-  type and dimensions as well as compression parameters.
-
-- Several utility functions were added to simplify libzfp usage:
-
-  * Functions for setting the rate, precision, and accuracy.
-    Corresponding functions were also added to the Codec class.
-
-  * A function for estimating the buffer size needed for compression.
-
-- The Array class functionality was expanded:
-
-  * Support for accessing the compressed bit stream stored with an
-    array, e.g., for offline compressed storage and for initializing
-    an already compressed array.
-
-  * Functions for dynamically specifying the cache size.
-
-  * The default cache is now direct-mapped instead of two-way
-    associative.
-
-- Minor bug fixes:
-
-  * Corrected the value of the lowest possible bit plane to account for
-    both the smallest exponent and the number of bits in the significand.
-
-  * Corrected inconsistent use of rate and precision.  The rate refers
-    to the number of compressed bits per floating-point value, while
-    the precision refers to the number of uncompressed bits.  The Array
-    API was changed accordingly.
-
-
-## 0.1.0 (November 12, 2014)
-
-- Initial beta release.
diff --git a/appveyor.yml b/appveyor.yml
index 18963b02c..661fd1a37 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,4 @@
-version: 0.5.5-{build}
+version: 1.0.1-{build}
 
 environment:
   # zfpy only build for Release builds (otherwise need debug python libs python27_d.lib)
@@ -8,26 +8,14 @@ environment:
       APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
       PLATFORM: x64
       BUILD_TYPE: Release
-      PYTHON_VERSION: 35
+      PYTHON_VERSION: 38
 
     - COMPILER: msvc
       GENERATOR: Visual Studio 15 2017
       APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
       PLATFORM: Win32
       BUILD_TYPE: Release
-      PYTHON_VERSION: 35
-
-    - COMPILER: msvc
-      GENERATOR: Visual Studio 14 2015 Win64
-      PLATFORM: x64
-      BUILD_TYPE: Release
-      PYTHON_VERSION: 35
-
-    - COMPILER: msvc
-      GENERATOR: Visual Studio 14 2015
-      PLATFORM: Win32
-      BUILD_TYPE: Release
-      PYTHON_VERSION: 27
+      PYTHON_VERSION: 38
 
     - COMPILER: mingw
       GENERATOR: MinGW Makefiles
@@ -63,5 +51,8 @@ install:
   - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" pip install -r python\requirements.txt
   - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" python --version
 
+
 build_script:
   - sh appveyor.sh
+  # uncomment to enable interactive remote desktop mode
+  #- ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1'))
diff --git a/array/ieeecodec.h b/array/ieeecodec.h
deleted file mode 100644
index be2626889..000000000
--- a/array/ieeecodec.h
+++ /dev/null
@@ -1,260 +0,0 @@
-#ifndef ZFP_IEEE_CODEC_H
-#define ZFP_IEEE_CODEC_H
-
-#include <algorithm>
-#include <climits>
-#include <cstring>
-#include "zfp.h"
-#include "zfpcpp.h"
-#include "zfp/traits.h"
-
-namespace zfp {
-
-// base class for IEEE-754 coding of {float, double} x {1D, 2D, 3D} data
-template <typename Scalar, uint dims>
-class ieee_codec_base {
-protected:
-  // constructor takes pre-allocated buffer of compressed blocks
-  ieee_codec_base(void* data, size_t size) :
-    data(data),
-    size(size)
-  {}
-
-public:
-  // destructor
-  ~ieee_codec_base()
-  {}
-
-  // return nearest rate supported
-  static double nearest_rate(double target_rate)
-  {
-    size_t block_bits = static_cast<size_t>(target_rate * block_size);
-    size_t word_bits = stream_alignment();
-    size_t words = std::max((block_bits + word_bits - 1) / word_bits, size_t(1));
-    return static_cast<double>(words * word_bits) / block_size;
-  }
-
-  // rate in bits/value
-  double rate() const { return double(zfp->maxbits) / block_size; }
-
-  // set rate in bits/value
-  double set_rate(double rate) { return zfp_stream_set_rate(zfp, rate, type, dims, zfp_true); }
-
-  static const zfp_type type = zfp::trait<Scalar>::type; // scalar type
-
-  // zfp::ieee_codec_base::header class for array (de)serialization
-  #include "zfp/ieeeheader.h"
-
-protected:
-  // encode full contiguous block
-  size_t encode_block(size_t offset, const Scalar* block)
-  {
-    ptrdiff_t = offset / sizeof();
-    // copy here and optionally convert
-    return block_size * rate;
-
-    stream_wseek(zfp->stream, offset);
-    size_t size = zfp::encode_block<Scalar, dims>(zfp, block);
-    size += zfp_stream_flush(zfp);
-    return size;
-  }
-
-  // decode full contiguous block
-  size_t decode_block(size_t offset, Scalar* block)
-  {
-    stream_rseek(zfp->stream, offset);
-    size_t size = zfp::decode_block<Scalar, dims>(zfp, block);
-    size += zfp_stream_align(zfp);
-    return size;
-  }
-
-  static const size_t block_size = 1u << (2 * dims); // block size in number of scalars
-
-  void* data;
-  size_t size;
-};
-
-// zfp codec templated on scalar type and number of dimensions
-template <typename Scalar, uint dims>
-class zfp_codec;
-
-// 1D codec
-template <typename Scalar>
-class zfp_codec<Scalar, 1> : public zfp_codec_base<Scalar, 1> {
-public:
-  // constructor takes pre-allocated buffer of compressed blocks
-  zfp_codec(void* data, size_t size) : zfp_codec_base<Scalar, 1>(data, size) {}
-
-  // encode contiguous 1D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block)
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1)
-                 : zfp_codec_base<Scalar, 1>::encode_block(offset, block);
-  }
-
-  // encode 1D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx)
-  {
-    size_t size;
-    stream_wseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, sx);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(zfp, p, sx);
-    size += zfp_stream_flush(zfp);
-    return size;
-  }
-
-  // decode contiguous 1D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block)
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1)
-                 : decode_block(offset, block);
-  }
-
-  // decode 1D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx)
-  {
-    size_t size;
-    stream_rseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, sx);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(zfp, p, sx);
-    size += zfp_stream_align(zfp);
-    return size;
-  }
-
-protected:
-  using zfp_codec_base<Scalar, 1>::encode_block;
-  using zfp_codec_base<Scalar, 1>::decode_block;
-  using zfp_codec_base<Scalar, 1>::zfp;
-};
-
-// 2D codec
-template <typename Scalar>
-class zfp_codec<Scalar, 2> : public zfp_codec_base<Scalar, 2> {
-public:
-  // constructor takes pre-allocated buffer of compressed blocks
-  zfp_codec(void* data, size_t size) : zfp_codec_base<Scalar, 2>(data, size) {}
-
-  // encode contiguous 2D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block)
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1, 4)
-                 : encode_block(offset, block);
-  }
-
-  // encode 2D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy)
-  {
-    size_t size;
-    stream_wseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, ny, sx, sy);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(zfp, p, sx, sy);
-    size += zfp_stream_flush(zfp);
-    return size;
-  }
-
-  // decode contiguous 2D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block)
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1, 4)
-                 : decode_block(offset, block);
-  }
-
-  // decode 2D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy)
-  {
-    size_t size;
-    stream_rseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, ny, sx, sy);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(zfp, p, sx, sy);
-    size += zfp_stream_align(zfp);
-    return size;
-  }
-
-protected:
-  using zfp_codec_base<Scalar, 2>::encode_block;
-  using zfp_codec_base<Scalar, 2>::decode_block;
-  using zfp_codec_base<Scalar, 2>::zfp;
-};
-
-// 3D codec
-template <typename Scalar>
-class zfp_codec<Scalar, 3> : public zfp_codec_base<Scalar, 3> {
-public:
-  // constructor takes pre-allocated buffer of compressed blocks
-  zfp_codec(void* data, size_t size) : zfp_codec_base<Scalar, 3>(data, size) {}
-
-  // encode contiguous 3D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block)
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16)
-                 : encode_block(offset, block);
-  }
-
-  // encode 3D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz)
-  {
-    size_t size;
-    stream_wseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, sx, sy, sz);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(zfp, p, sx, sy, sz);
-    size += zfp_stream_flush(zfp);
-    return size;
-  }
-
-  // decode contiguous 3D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block)
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16)
-                 : decode_block(offset, block);
-  }
-
-  // decode 3D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz)
-  {
-    size_t size;
-    stream_rseek(zfp->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, sx, sy, sz);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(zfp, p, sx, sy, sz);
-    size += zfp_stream_align(zfp);
-    return size;
-  }
-
-protected:
-  using zfp_codec_base<Scalar, 3>::encode_block;
-  using zfp_codec_base<Scalar, 3>::decode_block;
-  using zfp_codec_base<Scalar, 3>::zfp;
-};
-
-}
-
-#endif
diff --git a/array/zfpcodec.h b/array/zfpcodec.h
deleted file mode 100644
index 18cbe1f70..000000000
--- a/array/zfpcodec.h
+++ /dev/null
@@ -1,381 +0,0 @@
-#ifndef ZFP_ZFP_CODEC_H
-#define ZFP_ZFP_CODEC_H
-
-#include <algorithm>
-#include <climits>
-#include <cstring>
-#include "zfp.h"
-#include "zfpcpp.h"
-#include "zfp/memory.h"
-#include "zfp/traits.h"
-
-namespace zfp {
-namespace codec {
-
-// abstract base class for zfp coding of {float, double} x {1D, 2D, 3D, 4D} data
-template <uint dims, typename Scalar>
-class zfp_base {
-protected:
-  // default constructor
-  zfp_base() :
-    stream(zfp_stream_open(0))
-  {}
-
-  // destructor
-  ~zfp_base()
-  {
-    close();
-    zfp_stream_close(stream);
-  }
-
-public:
-  // assignment operator--performs deep copy
-  zfp_base& operator=(const zfp_base& codec)
-  {
-    if (this != &codec)
-      deep_copy(codec);
-    return *this;
-  }
-
-  // conservative buffer size for current codec settings
-  size_t buffer_size(const zfp_field* field) const
-  {
-    // empty field case
-    if (!field->nx && !field->ny && !field->nz && !field->nw)
-      return 0;
-    // variable-rate case
-    if (zfp_stream_compression_mode(stream) != zfp_mode_fixed_rate)
-      return zfp_stream_maximum_size(stream, field);
-    // fixed-rate case: exclude header
-    size_t bx = (std::max(field->nx, size_t(1)) + 3) / 4;
-    size_t by = (std::max(field->ny, size_t(1)) + 3) / 4;
-    size_t bz = (std::max(field->nz, size_t(1)) + 3) / 4;
-    size_t bw = (std::max(field->nw, size_t(1)) + 3) / 4;
-    size_t blocks = bx * by * bz * bw;
-    return zfp::round_up(blocks * stream->maxbits, stream_alignment()) / CHAR_BIT;
-  }
-
-  // open bit stream
-  void open(void* data, size_t size)
-  {
-    zfp_stream_set_bit_stream(stream, stream_open(data, size));
-  }
-
-  // close bit stream
-  void close()
-  {
-    stream_close(zfp_stream_bit_stream(stream));
-    zfp_stream_set_bit_stream(stream, 0);
-  }
-
-  // compression mode
-  zfp_mode mode() const { return zfp_stream_compression_mode(stream); }
-
-  // rate in compressed bits/value (fixed-rate mode only)
-  double rate() const { return zfp_stream_rate(stream, dims); }
-
-  // precision in uncompressed bits/value (fixed-precision mode only)
-  uint precision() const { return zfp_stream_precision(stream); }
-
-  // accuracy as absolute error tolerance (fixed-accuracy mode only)
-  double accuracy() const { return zfp_stream_accuracy(stream); }
-
-  // compression parameters (all compression modes)
-  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { zfp_stream_params(stream, minbits, maxbits, maxprec, minexp); }
-
-  // enable reversible (lossless) mode
-  void set_reversible() { zfp_stream_set_reversible(stream); }
-
-  // set rate in compressed bits/value
-  double set_rate(double rate, bool align) { return zfp_stream_set_rate(stream, rate, type, dims, align); }
-
-  // set precision in uncompressed bits/value
-  uint set_precision(uint precision) { return zfp_stream_set_precision(stream, precision); }
-
-  // set accuracy as absolute error tolerance
-  double set_accuracy(double tolerance) { return zfp_stream_set_accuracy(stream, tolerance); }
-
-  // set expert mode parameters
-  bool set_params(uint minbits, uint maxbits, uint maxprec, int maxexp) { return zfp_stream_set_params(stream, minbits, maxbits, maxprec, maxexp) == zfp_true; }
-
-  // byte size of codec data structure components indicated by mask
-  size_t size_bytes(uint mask = ZFP_DATA_ALL) const
-  {
-    size_t size = 0;
-    if (mask & ZFP_DATA_META) {
-      size += sizeof(*stream);
-      size += sizeof(*this);
-    }
-    return size;
-  }
-
-  // unit of allocated data in bytes
-  static size_t alignment() { return stream_alignment() / CHAR_BIT; }
-
-  static const zfp_type type = zfp::trait<Scalar>::type; // scalar type
-
-  // zfp::codec::zfp_base::header class for array (de)serialization
-  #include "zfp/zfpheader.h"
-
-protected:
-  // deep copy
-  void deep_copy(const zfp_base& codec)
-  {
-    stream = zfp_stream_open(0);
-    *stream = *codec.stream;
-    stream->stream = 0;
-  }
-
-  // encode full contiguous block
-  size_t encode_block(size_t offset, const Scalar* block) const
-  {
-    stream_wseek(stream->stream, offset);
-    size_t size = zfp::encode_block<Scalar, dims>(stream, block);
-    zfp_stream_flush(stream);
-    return size;
-  }
-
-  // decode full contiguous block
-  size_t decode_block(size_t offset, Scalar* block) const
-  {
-    stream_rseek(stream->stream, offset);
-    size_t size = zfp::decode_block<Scalar, dims>(stream, block);
-    zfp_stream_align(stream);
-    return size;
-  }
-
-  zfp_stream* stream; // compressed zfp stream
-};
-
-// 1D codec
-template <typename Scalar>
-class zfp1 : public zfp_base<1, Scalar> {
-public:
-  // encode contiguous 1D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block) const
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1)
-                 : zfp_base<1, Scalar>::encode_block(offset, block);
-  }
-
-  // encode 1D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx) const
-  {
-    size_t size;
-    stream_wseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(stream, p, nx, sx);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(stream, p, sx);
-    zfp_stream_flush(stream);
-    return size;
-  }
-
-  // decode contiguous 1D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block) const
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1)
-                 : decode_block(offset, block);
-  }
-
-  // decode 1D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx) const
-  {
-    size_t size;
-    stream_rseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(stream, p, nx, sx);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(stream, p, sx);
-    zfp_stream_align(stream);
-    return size;
-  }
-
-protected:
-  using zfp_base<1, Scalar>::encode_block;
-  using zfp_base<1, Scalar>::decode_block;
-  using zfp_base<1, Scalar>::stream;
-};
-
-// 2D codec
-template <typename Scalar>
-class zfp2 : public zfp_base<2, Scalar> {
-public:
-  // encode contiguous 2D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block) const
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1, 4)
-                 : encode_block(offset, block);
-  }
-
-  // encode 2D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
-  {
-    size_t size;
-    stream_wseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(stream, p, nx, ny, sx, sy);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(stream, p, sx, sy);
-    zfp_stream_flush(stream);
-    return size;
-  }
-
-  // decode contiguous 2D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block) const
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1, 4)
-                 : decode_block(offset, block);
-  }
-
-  // decode 2D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
-  {
-    size_t size;
-    stream_rseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(stream, p, nx, ny, sx, sy);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(stream, p, sx, sy);
-    zfp_stream_align(stream);
-    return size;
-  }
-
-protected:
-  using zfp_base<2, Scalar>::encode_block;
-  using zfp_base<2, Scalar>::decode_block;
-  using zfp_base<2, Scalar>::stream;
-};
-
-// 3D codec
-template <typename Scalar>
-class zfp3 : public zfp_base<3, Scalar> {
-public:
-  // encode contiguous 3D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block) const
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16)
-                 : encode_block(offset, block);
-  }
-
-  // encode 3D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
-  {
-    size_t size;
-    stream_wseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(stream, p, nx, ny, nz, sx, sy, sz);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(stream, p, sx, sy, sz);
-    zfp_stream_flush(stream);
-    return size;
-  }
-
-  // decode contiguous 3D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block) const
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16)
-                 : decode_block(offset, block);
-  }
-
-  // decode 3D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
-  {
-    size_t size;
-    stream_rseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(stream, p, nx, ny, nz, sx, sy, sz);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(stream, p, sx, sy, sz);
-    zfp_stream_align(stream);
-    return size;
-  }
-
-protected:
-  using zfp_base<3, Scalar>::encode_block;
-  using zfp_base<3, Scalar>::decode_block;
-  using zfp_base<3, Scalar>::stream;
-};
-
-// 4D codec
-template <typename Scalar>
-class zfp4 : public zfp_base<4, Scalar> {
-public:
-  // encode contiguous 4D block
-  size_t encode_block(size_t offset, uint shape, const Scalar* block) const
-  {
-    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16, 64)
-                 : encode_block(offset, block);
-  }
-
-  // encode 4D block from strided storage
-  size_t encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
-  {
-    size_t size;
-    stream_wseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      uint nw = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::encode_partial_block_strided<Scalar>(stream, p, nx, ny, nz, nw, sx, sy, sz, sw);
-    }
-    else
-      size = zfp::encode_block_strided<Scalar>(stream, p, sx, sy, sz, sw);
-    zfp_stream_flush(stream);
-    return size;
-  }
-
-  // decode contiguous 4D block
-  size_t decode_block(size_t offset, uint shape, Scalar* block) const
-  {
-    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16, 64)
-                 : decode_block(offset, block);
-  }
-
-  // decode 4D block to strided storage
-  size_t decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
-  {
-    size_t size;
-    stream_rseek(stream->stream, offset);
-    if (shape) {
-      uint nx = 4 - (shape & 3u); shape >>= 2;
-      uint ny = 4 - (shape & 3u); shape >>= 2;
-      uint nz = 4 - (shape & 3u); shape >>= 2;
-      uint nw = 4 - (shape & 3u); shape >>= 2;
-      size = zfp::decode_partial_block_strided<Scalar>(stream, p, nx, ny, nz, nw, sx, sy, sz, sw);
-    }
-    else
-      size = zfp::decode_block_strided<Scalar>(stream, p, sx, sy, sz, sw);
-    zfp_stream_align(stream);
-    return size;
-  }
-
-protected:
-  using zfp_base<4, Scalar>::encode_block;
-  using zfp_base<4, Scalar>::decode_block;
-  using zfp_base<4, Scalar>::stream;
-};
-
-} // codec
-} // zfp
-
-#endif
diff --git a/cfp/CMakeLists.txt b/cfp/CMakeLists.txt
index febd4f0ab..3d8af6ecf 100644
--- a/cfp/CMakeLists.txt
+++ b/cfp/CMakeLists.txt
@@ -1 +1,36 @@
-add_subdirectory(src)
+add_library(cfp cfp.cpp)
+
+if(DEFINED CFP_NAMESPACE)
+  list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}")
+endif()
+
+list(APPEND cfp_private_defs ${zfp_compressed_array_defs})
+
+if(WIN32 AND BUILD_SHARED_LIBS)
+  # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL
+  list(APPEND cfp_public_defs ZFP_SHARED_LIBS)
+  list(APPEND cfp_private_defs ZFP_SOURCE)
+endif()
+
+target_compile_definitions(cfp
+  PUBLIC ${cfp_public_defs}
+  PRIVATE ${cfp_private_defs})
+
+target_include_directories(cfp
+  PUBLIC
+    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PRIVATE
+    ${ZFP_SOURCE_DIR}/src
+)
+
+target_link_libraries(cfp zfp)
+
+set_property(TARGET cfp PROPERTY VERSION ${ZFP_VERSION})
+set_property(TARGET cfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR})
+set_property(TARGET cfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}cfp)
+
+install(TARGETS cfp EXPORT cfp-targets
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/cfp/src/Makefile b/cfp/Makefile
similarity index 74%
rename from cfp/src/Makefile
rename to cfp/Makefile
index 37cb00adf..37881a76c 100644
--- a/cfp/src/Makefile
+++ b/cfp/Makefile
@@ -1,9 +1,9 @@
-include ../../Config
+include ../Config
 
-LIBDIR = ../../lib
+LIBDIR = ../lib
 TARGETS = $(LIBDIR)/libcfp.a $(LIBDIR)/libcfp.so
-OBJECTS = cfparray.o
-INCS = -I../include -I../../include -I../../array -I../../src
+OBJECTS = cfp.o
+INCS = -I../include -I../src
 
 static: $(LIBDIR)/libcfp.a
 
diff --git a/cfp/src/cfparray.cpp b/cfp/cfp.cpp
similarity index 98%
rename from cfp/src/cfparray.cpp
rename to cfp/cfp.cpp
index 9c4de0bba..b360760f4 100644
--- a/cfp/src/cfparray.cpp
+++ b/cfp/cfp.cpp
@@ -1,5 +1,5 @@
 #include "cfpheader.cpp"
-#include "cfparray.h"
+#include "zfp/array.h"
 
 #include "cfparray1f.cpp"
 #include "cfparray1d.cpp"
@@ -27,6 +27,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array1f_set_cache_size,
     cfp_array1f_clear_cache,
     cfp_array1f_flush_cache,
+    cfp_array1f_size_bytes,
     cfp_array1f_compressed_size,
     cfp_array1f_compressed_data,
     cfp_array1f_size,
@@ -129,6 +130,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array1d_set_cache_size,
     cfp_array1d_clear_cache,
     cfp_array1d_flush_cache,
+    cfp_array1d_size_bytes,
     cfp_array1d_compressed_size,
     cfp_array1d_compressed_data,
     cfp_array1d_size,
@@ -231,6 +233,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array2f_set_cache_size,
     cfp_array2f_clear_cache,
     cfp_array2f_flush_cache,
+    cfp_array2f_size_bytes,
     cfp_array2f_compressed_size,
     cfp_array2f_compressed_data,
     cfp_array2f_size,
@@ -336,6 +339,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array2d_set_cache_size,
     cfp_array2d_clear_cache,
     cfp_array2d_flush_cache,
+    cfp_array2d_size_bytes,
     cfp_array2d_compressed_size,
     cfp_array2d_compressed_data,
     cfp_array2d_size,
@@ -441,6 +445,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array3f_set_cache_size,
     cfp_array3f_clear_cache,
     cfp_array3f_flush_cache,
+    cfp_array3f_size_bytes,
     cfp_array3f_compressed_size,
     cfp_array3f_compressed_data,
     cfp_array3f_size,
@@ -548,6 +553,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array3d_set_cache_size,
     cfp_array3d_clear_cache,
     cfp_array3d_flush_cache,
+    cfp_array3d_size_bytes,
     cfp_array3d_compressed_size,
     cfp_array3d_compressed_data,
     cfp_array3d_size,
@@ -655,6 +661,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array4f_set_cache_size,
     cfp_array4f_clear_cache,
     cfp_array4f_flush_cache,
+    cfp_array4f_size_bytes,
     cfp_array4f_compressed_size,
     cfp_array4f_compressed_data,
     cfp_array4f_size,
@@ -764,6 +771,7 @@ const cfp_api CFP_NAMESPACE = {
     cfp_array4d_set_cache_size,
     cfp_array4d_clear_cache,
     cfp_array4d_flush_cache,
+    cfp_array4d_size_bytes,
     cfp_array4d_compressed_size,
     cfp_array4d_compressed_data,
     cfp_array4d_size,
diff --git a/cfp/src/cfparray1d.cpp b/cfp/cfparray1d.cpp
similarity index 86%
rename from cfp/src/cfparray1d.cpp
rename to cfp/cfparray1d.cpp
index 35e18ace2..3a76b65f9 100644
--- a/cfp/src/cfparray1d.cpp
+++ b/cfp/cfparray1d.cpp
@@ -1,5 +1,5 @@
-#include "cfparray1d.h"
-#include "zfparray1.h"
+#include "zfp/internal/cfp/array1d.h"
+#include "zfp/array1.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray1f.cpp b/cfp/cfparray1f.cpp
similarity index 86%
rename from cfp/src/cfparray1f.cpp
rename to cfp/cfparray1f.cpp
index dd6859bb1..2df705301 100644
--- a/cfp/src/cfparray1f.cpp
+++ b/cfp/cfparray1f.cpp
@@ -1,5 +1,5 @@
-#include "cfparray1f.h"
-#include "zfparray1.h"
+#include "zfp/internal/cfp/array1f.h"
+#include "zfp/array1.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray2d.cpp b/cfp/cfparray2d.cpp
similarity index 86%
rename from cfp/src/cfparray2d.cpp
rename to cfp/cfparray2d.cpp
index f35acd2c8..fa3051b12 100644
--- a/cfp/src/cfparray2d.cpp
+++ b/cfp/cfparray2d.cpp
@@ -1,5 +1,5 @@
-#include "cfparray2d.h"
-#include "zfparray2.h"
+#include "zfp/internal/cfp/array2d.h"
+#include "zfp/array2.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray2f.cpp b/cfp/cfparray2f.cpp
similarity index 86%
rename from cfp/src/cfparray2f.cpp
rename to cfp/cfparray2f.cpp
index 893710c7d..ebfd1d9db 100644
--- a/cfp/src/cfparray2f.cpp
+++ b/cfp/cfparray2f.cpp
@@ -1,5 +1,5 @@
-#include "cfparray2f.h"
-#include "zfparray2.h"
+#include "zfp/internal/cfp/array2f.h"
+#include "zfp/array2.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray3d.cpp b/cfp/cfparray3d.cpp
similarity index 86%
rename from cfp/src/cfparray3d.cpp
rename to cfp/cfparray3d.cpp
index aea61b0aa..100d639a0 100644
--- a/cfp/src/cfparray3d.cpp
+++ b/cfp/cfparray3d.cpp
@@ -1,5 +1,5 @@
-#include "cfparray3d.h"
-#include "zfparray3.h"
+#include "zfp/internal/cfp/array3d.h"
+#include "zfp/array3.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray3f.cpp b/cfp/cfparray3f.cpp
similarity index 86%
rename from cfp/src/cfparray3f.cpp
rename to cfp/cfparray3f.cpp
index f3220f656..b5cafb71b 100644
--- a/cfp/src/cfparray3f.cpp
+++ b/cfp/cfparray3f.cpp
@@ -1,5 +1,5 @@
-#include "cfparray3f.h"
-#include "zfparray3.h"
+#include "zfp/internal/cfp/array3f.h"
+#include "zfp/array3.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray4d.cpp b/cfp/cfparray4d.cpp
similarity index 86%
rename from cfp/src/cfparray4d.cpp
rename to cfp/cfparray4d.cpp
index 0854fa85b..bf1a2b06d 100644
--- a/cfp/src/cfparray4d.cpp
+++ b/cfp/cfparray4d.cpp
@@ -1,5 +1,5 @@
-#include "cfparray4d.h"
-#include "zfparray4.h"
+#include "zfp/internal/cfp/array4d.h"
+#include "zfp/array4.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/src/cfparray4f.cpp b/cfp/cfparray4f.cpp
similarity index 86%
rename from cfp/src/cfparray4f.cpp
rename to cfp/cfparray4f.cpp
index 4e52600f2..ca6bf0ddd 100644
--- a/cfp/src/cfparray4f.cpp
+++ b/cfp/cfparray4f.cpp
@@ -1,5 +1,5 @@
-#include "cfparray4f.h"
-#include "zfparray4.h"
+#include "zfp/internal/cfp/array4f.h"
+#include "zfp/array4.hpp"
 
 #include "template/template.h"
 
diff --git a/cfp/cfpheader.cpp b/cfp/cfpheader.cpp
new file mode 100644
index 000000000..b4b66e09e
--- /dev/null
+++ b/cfp/cfpheader.cpp
@@ -0,0 +1,21 @@
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/internal/codec/zfpheader.hpp"
+#include "zfp/internal/cfp/header.h"
+#include "zfp/internal/cfp/array1f.h"
+#include "zfp/internal/cfp/array1d.h"
+#include "zfp/internal/cfp/array2f.h"
+#include "zfp/internal/cfp/array2d.h"
+#include "zfp/internal/cfp/array3f.h"
+#include "zfp/internal/cfp/array3d.h"
+#include "zfp/internal/cfp/array4f.h"
+#include "zfp/internal/cfp/array4d.h"
+
+#include "template/template.h"
+
+#define CFP_HEADER_TYPE cfp_header
+#define ZFP_HEADER_TYPE zfp::array::header
+
+#include "template/cfpheader.cpp"
diff --git a/cfp/include/cfparray.h b/cfp/include/cfparray.h
deleted file mode 100644
index 5ced8b7c5..000000000
--- a/cfp/include/cfparray.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef CFP_ARRAY
-#define CFP_ARRAY
-
-#include <stddef.h>
-#include "cfpheader.h"
-#include "cfparray1f.h"
-#include "cfparray1d.h"
-#include "cfparray2f.h"
-#include "cfparray2d.h"
-#include "cfparray3f.h"
-#include "cfparray3d.h"
-#include "cfparray4f.h"
-#include "cfparray4d.h"
-
-typedef struct {
-  cfp_array1f_api array1f;
-  cfp_array1d_api array1d;
-  cfp_array2f_api array2f;
-  cfp_array2d_api array2d;
-  cfp_array3f_api array3f;
-  cfp_array3d_api array3d;
-  cfp_array4f_api array4f;
-  cfp_array4d_api array4d;
-} cfp_api;
-
-#ifndef CFP_NAMESPACE
-  #define CFP_NAMESPACE cfp
-#endif
-
-extern_ const cfp_api CFP_NAMESPACE;
-
-#endif
diff --git a/cfp/include/cfparrays.h b/cfp/include/cfparrays.h
deleted file mode 100644
index 39b267243..000000000
--- a/cfp/include/cfparrays.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#warning "deprecated header; use cfparray.h"
-#include "cfparray.h"
diff --git a/cfp/src/CMakeLists.txt b/cfp/src/CMakeLists.txt
deleted file mode 100644
index 922710133..000000000
--- a/cfp/src/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-add_library(cfp cfparray.cpp)
-
-if(DEFINED CFP_NAMESPACE)
-  list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}")
-endif()
-
-list(APPEND cfp_private_defs ${zfp_compressed_array_defs})
-
-if(WIN32 AND BUILD_SHARED_LIBS)
-  # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL
-  list(APPEND cfp_public_defs ZFP_SHARED_LIBS)
-  list(APPEND cfp_private_defs ZFP_SOURCE)
-endif()
-
-target_compile_definitions(cfp
-  PUBLIC ${cfp_public_defs}
-  PRIVATE ${cfp_private_defs})
-
-target_include_directories(cfp
-  PUBLIC
-    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/include>
-    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/cfp/include>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-  PRIVATE
-    ${ZFP_SOURCE_DIR}/array
-    ${ZFP_SOURCE_DIR}/src
-)
-
-target_link_libraries(cfp zfp)
-
-set_property(TARGET cfp PROPERTY VERSION ${ZFP_VERSION})
-set_property(TARGET cfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR})
-set_property(TARGET cfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}cfp)
-
-install(TARGETS cfp EXPORT cfp-targets
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/cfp/src/cfpheader.cpp b/cfp/src/cfpheader.cpp
deleted file mode 100644
index 667e63852..000000000
--- a/cfp/src/cfpheader.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "zfparray1.h"
-#include "zfparray2.h"
-#include "zfparray3.h"
-#include "zfparray4.h"
-#include "zfp/zfpheader.h"
-#include "cfpheader.h"
-#include "cfparray1f.h"
-#include "cfparray1d.h"
-#include "cfparray2f.h"
-#include "cfparray2d.h"
-#include "cfparray3f.h"
-#include "cfparray3d.h"
-#include "cfparray4f.h"
-#include "cfparray4d.h"
-
-#include "template/template.h"
-
-#define CFP_HEADER_TYPE cfp_header
-#define ZFP_HEADER_TYPE zfp::array::header
-
-#include "template/cfpheader.cpp"
diff --git a/cfp/src/template/cfparray.cpp b/cfp/template/cfparray.cpp
similarity index 95%
rename from cfp/src/template/cfparray.cpp
rename to cfp/template/cfparray.cpp
index b3b3e0760..70bb3c2dc 100644
--- a/cfp/src/template/cfparray.cpp
+++ b/cfp/template/cfparray.cpp
@@ -42,6 +42,12 @@ _t1(CFP_ARRAY_TYPE, set_rate)(CFP_ARRAY_TYPE self, double rate)
   return static_cast<ZFP_ARRAY_TYPE*>(self.object)->set_rate(rate);
 }
 
+static size_t
+_t1(CFP_ARRAY_TYPE, size_bytes)(CFP_ARRAY_TYPE self, uint mask)
+{
+  return static_cast<const ZFP_ARRAY_TYPE*>(self.object)->size_bytes(mask);
+}
+
 static size_t
 _t1(CFP_ARRAY_TYPE, compressed_size)(CFP_ARRAY_TYPE self)
 {
diff --git a/cfp/src/template/cfparray1.cpp b/cfp/template/cfparray1.cpp
similarity index 100%
rename from cfp/src/template/cfparray1.cpp
rename to cfp/template/cfparray1.cpp
diff --git a/cfp/src/template/cfparray2.cpp b/cfp/template/cfparray2.cpp
similarity index 100%
rename from cfp/src/template/cfparray2.cpp
rename to cfp/template/cfparray2.cpp
diff --git a/cfp/src/template/cfparray3.cpp b/cfp/template/cfparray3.cpp
similarity index 100%
rename from cfp/src/template/cfparray3.cpp
rename to cfp/template/cfparray3.cpp
diff --git a/cfp/src/template/cfparray4.cpp b/cfp/template/cfparray4.cpp
similarity index 100%
rename from cfp/src/template/cfparray4.cpp
rename to cfp/template/cfparray4.cpp
diff --git a/cfp/src/template/cfpheader.cpp b/cfp/template/cfpheader.cpp
similarity index 99%
rename from cfp/src/template/cfpheader.cpp
rename to cfp/template/cfpheader.cpp
index ac6319aa1..b9f619179 100644
--- a/cfp/src/template/cfpheader.cpp
+++ b/cfp/template/cfpheader.cpp
@@ -2,6 +2,8 @@ static CFP_HEADER_TYPE
 _t1(CFP_HEADER_TYPE, ctor_buffer)(const void* data, size_t bytes)
 {
   CFP_HEADER_TYPE h;
+  h.object = 0;
+
   try {
     // construct generic header and query array type
     header hdr(data, bytes);
@@ -35,9 +37,7 @@ _t1(CFP_HEADER_TYPE, ctor_buffer)(const void* data, size_t bytes)
         break;
     }
   }
-  catch (...) {
-    h.object = 0;
-  }
+  catch (...) {}
   return h;
 }
 
diff --git a/cmake/appveyor.cmake b/cmake/appveyor.cmake
index 1e803713e..29cc79069 100644
--- a/cmake/appveyor.cmake
+++ b/cmake/appveyor.cmake
@@ -12,6 +12,7 @@ set(CTEST_CMAKE_GENERATOR "${GENERATOR}")
 set(CTEST_BUILD_NAME "$ENV{APPVEYOR_REPO_BRANCH}-${job_details}")
 set(cfg_options
   -DCMAKE_BUILD_TYPE=$ENV{BUILD_TYPE}
+  -DBUILD_TESTING_FULL=ON
   -DBUILD_CFP=${BUILD_CFP}
   -DBUILD_ZFPY=${BUILD_ZFPY}
   -DZFP_WITH_OPENMP=${BUILD_OPENMP}
diff --git a/cmake/travis.cmake b/cmake/travis.cmake
deleted file mode 100644
index f2bf844b7..000000000
--- a/cmake/travis.cmake
+++ /dev/null
@@ -1,87 +0,0 @@
-
-set(CTEST_SOURCE_DIRECTORY "$ENV{TRAVIS_BUILD_DIR}")
-set(CTEST_BINARY_DIRECTORY "$ENV{TRAVIS_BUILD_DIR}/build")
-
-set(CTEST_COMMAND ctest)
-include(${CTEST_SOURCE_DIRECTORY}/CTestConfig.cmake)
-set(CTEST_SITE "travis")
-set(CTEST_CMAKE_GENERATOR "Unix Makefiles")
-set(CTEST_BUILD_NAME "$ENV{TRAVIS_BRANCH}-#$ENV{TRAVIS_JOB_NUMBER}")
-set(cfg_options
-  -DCMAKE_C_STANDARD=${C_STANDARD}
-  -DCMAKE_CXX_STANDARD=${CXX_STANDARD}
-  -DBUILD_CFP=${BUILD_CFP}
-  -DBUILD_ZFPY=${BUILD_ZFPY}
-  -DBUILD_ZFORP=${BUILD_ZFORP}
-  -DZFP_WITH_OPENMP=${BUILD_OPENMP}
-  -DZFP_WITH_CUDA=${BUILD_CUDA}
-  )
-
-# Add the variants to the testers name so that we can report multiple
-# times from the same CI builder
-if(BUILD_OPENMP)
-  set(CTEST_SITE "${CTEST_SITE}_openmp")
-endif()
-
-if(BUILD_CUDA)
-  set(CTEST_SITE "${CTEST_SITE}_cuda")
-endif()
-
-if(BUILD_CFP)
-  set(CTEST_SITE "${CTEST_SITE}_cfp")
-
-  if(CFP_NAMESPACE)
-    list(APPEND cfg_options
-      -DCFP_NAMESPACE=${CFP_NAMESPACE}
-      )
-    set(CTEST_SITE "${CTEST_SITE}namespace")
-  endif()
-endif()
-
-if(BUILD_ZFPY)
-  set(CTEST_SITE "${CTEST_SITE}_zfpy$ENV{PYTHON_VERSION}")
-  list(APPEND cfg_options
-    -DPYTHON_INCLUDE_DIR=$ENV{PYTHON_INCLUDE_DIR}
-    -DPYTHON_LIBRARY=$ENV{PYTHON_LIBRARY}
-    -DPYTHON_EXECUTABLE=$ENV{PYTHON_EXECUTABLE}
-    )
-endif()
-
-if(BUILD_ZFORP)
-  set(CTEST_SITE "${CTEST_SITE}_zforp$ENV{FORTRAN_STANDARD}")
-  list(APPEND cfg_options
-    -DCMAKE_FORTRAN_FLAGS='-std=f$ENV{FORTRAN_STANDARD}'
-    )
-endif()
-
-if(WITH_COVERAGE)
-  list(APPEND cfg_options
-    -DCMAKE_C_FLAGS=-coverage
-    -DCMAKE_CXX_FLAGS=-coverage
-    -DCMAKE_Fortran_FLAGS=-coverage
-    )
-  set(CTEST_SITE "${CTEST_SITE}_coverage")
-endif()
-
-if(OMP_TESTS_ONLY)
-  list(APPEND cfg_options
-    -DZFP_OMP_TESTS_ONLY=1
-    )
-endif()
-
-ctest_start(Experimental TRACK Travis)
-ctest_configure(OPTIONS "${cfg_options}")
-ctest_submit(PARTS Update Notes Configure)
-ctest_build(FLAGS -j1)
-ctest_submit(PARTS Build)
-ctest_test(PARALLEL_LEVEL 6 RETURN_VALUE rv)
-ctest_submit(PARTS Test)
-
-if(WITH_COVERAGE)
-  ctest_coverage()
-  ctest_submit(PARTS Coverage)
-endif()
-
-if(NOT rv EQUAL 0)
-  message(FATAL_ERROR "Test failures occurred.")
-endif()
diff --git a/docs/Makefile b/docs/Makefile
index 22a0d97ec..987761f1f 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -3,18 +3,26 @@
 
 # You can set these variables from the command line.
 SPHINXOPTS    =
-SPHINXBUILD   = /usr/bin/python3 -msphinx
+SPHINXBUILD   = /usr/bin/python3 -m sphinx
 SPHINXPROJ    = zfp
 SOURCEDIR     = source
 BUILDDIR      = build
 
+# Needed for spell checking on macOS
+PYENCHANT_LIBRARY_PATH = /opt/homebrew/lib/libenchant-2.dylib
+
+# Build HTML by default
 all:
 	@$(MAKE) html
 
-# Put it first so that "make" without argument is like "make help".
+# List targets
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
+# Run spell checker
+spell:
+	@PYENCHANT_LIBRARY_PATH=$(PYENCHANT_LIBRARY_PATH) $(SPHINXBUILD) -b spelling -d "$(BUILDDIR)/doctrees" "$(SOURCEDIR)" "$(BUILDDIR)/spelling"
+
 .PHONY: help Makefile
 
 # Catch-all target: route all unknown targets to Sphinx using the new
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 000000000..f0c7424af
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,4 @@
+sphinx-fortran==1.1.1
+
+# required by sphinx-fortran but not installed on RTD
+six
diff --git a/docs/source/algorithm.rst b/docs/source/algorithm.rst
index 826bc6fbd..fd4c72f38 100644
--- a/docs/source/algorithm.rst
+++ b/docs/source/algorithm.rst
@@ -102,6 +102,12 @@ purposes):
      |4powd| |minus| *n* bits of the bit plane are run-length encoded as
      described above, which potentially results in *n* being increased.
 
+     As an example, *x* = 000001001101000 with *m* = 10 is encoded as
+     **0**\ 100\ **1**\ 1\ **1**\ 10\ **1**\ 1000\ **1**, where the bits in
+     boldface indicate "group tests" that determine if the remainder of *x*
+     (to the left) contains any one-bits.  Again, this variable-length code
+     is generated and parsed from right to left.
+
   8. The embedded coder emits one bit at a time, with each successive bit
      potentially improving the accuracy of the approximation.  The early
      bits are most important and have the greatest impact on accuracy,
diff --git a/docs/source/arrays.rst b/docs/source/arrays.rst
index 297674cb9..2a0999085 100644
--- a/docs/source/arrays.rst
+++ b/docs/source/arrays.rst
@@ -163,7 +163,7 @@ in the base class.
 .. cpp:function:: double array::set_rate(double rate)
 
   Set desired compression rate in bits per value.  Return the closest rate
-  supported.  See :ref:`FAQ #12 <q-granularity>` and :ref:`FAQ #18 <q-rate>`
+  supported.  See FAQ :ref:`#12 <q-granularity>` and FAQ :ref:`#18 <q-rate>`
   for discussions of the rate granularity.  This method destroys the previous
   contents of the array.
 
@@ -243,7 +243,7 @@ in the base class.
 
   Initialize array by copying and compressing data stored at *p*.  The
   uncompressed data is assumed to be stored as in the :cpp:func:`get`
-  method.
+  method.  If *p* = 0, then the array is zero-initialized.
 
 ----
 
@@ -271,28 +271,28 @@ in the base class.
 
 .. cpp:function:: iterator array::begin()
 
-  Return mutable iterator to beginning of array.
+  Return random-access mutable iterator to beginning of array.
 
 ----
 
 .. cpp:function:: iterator array::end()
 
-  Return mutable iterator to end of array.  As with STL iterators, the end
-  points to a virtual element just past the last valid array element.
+  Return random-access mutable iterator to end of array.  As with STL iterators,
+  the end points to a virtual element just past the last valid array element.
 
 ----
 
 .. cpp:function:: const_iterator array::begin() const
 .. cpp:function:: const_iterator array::cbegin() const
 
-  Return const iterator to beginning of array.
+  Return random-access const iterator to beginning of array.
 
 ----
 
 .. cpp:function:: const_iterator array::end() const
 .. cpp:function:: const_iterator array::cend() const
 
-  Return const iterator to end of array.
+  Return random-access const iterator to end of array.
 
 .. note::
   Const :ref:`references <references>`, :ref:`pointers <pointers>`, and
@@ -502,23 +502,23 @@ with only a few differences:
 - All methods other than those that specify array-wide settings, such as
   compression mode and parameters, array dimensions, and array contents,
   are :code:`const` qualified.  There are, thus, no methods for obtaining
-  a writeable reference, pointer, or iterator.  Consequently, one may not
+  a writable reference, pointer, or iterator.  Consequently, one may not
   initialize such arrays one element at a time.  Rather, the user initializes
   the whole array by passing a pointer to uncompressed data.
 
 - Whereas the constructors for fixed-rate arrays accept a *rate* parameter,
   the read-only arrays allow specifying any compression mode and
-  corresponding parameters (if any) via a :c:struct:`zfp_config` object.
+  corresponding parameters (if any) via a :c:type:`zfp_config` object.
 
 - Additional methods are available for setting and querying compression
   mode and parameters after construction.
 
 - Read-only arrays are templated on a block index class that encodes the
   bit offset to each block of data.  Multiple index classes are available
-  that trade compactness and speed of access.  The default index represents
-  64-bit offsets using only 16 bits of amortized storage per block.  An
-  "implicit" index is available for fixed-rate read-only arrays, which
-  computes rather than stores offsets to equal-sized blocks.
+  that trade compactness and speed of access.  The default :cpp:class:`hybrid4`
+  index represents 64-bit offsets using only 24 bits of amortized storage per
+  block.  An "implicit" index is available for fixed-rate read-only arrays,
+  which computes rather than stores offsets to equal-sized blocks.
 
 .. note::
   Whereas variable-rate compression almost always improves accuracy per bit
@@ -716,7 +716,7 @@ Additional methods are documented below.
   Whereas the :ref:`read-write fixed-rate arrays <array_classes>`
   (:cpp:class:`zfp::array`) require that block storage is word aligned, the
   read-only arrays (:cpp:class:`zfp::const_array`) are not subject to such
-  restrictions and thefore support finer rate granularity.  For a
+  restrictions and therefore support finer rate granularity.  For a
   *d*-dimensional :cpp:class:`const_array`, the rate granularity is
   4\ :sup:`-d` bits/value, e.g., a quarter bit/value for 1D arrays.
 
@@ -738,10 +738,11 @@ Additional methods are documented below.
 
 ----
 
-.. cpp:function:: double const_array::set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
+.. cpp:function:: bool const_array::set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
 
   Set :ref:`expert mode <mode-expert>` parameters.  This method destroys the
-  previous contents of the array.
+  previous contents of the array.  Return whether the codec supports the
+  combination of parameters.
 
 ----
 
diff --git a/docs/source/bit-stream.rst b/docs/source/bit-stream.rst
index 19c5054a2..f46e6ae23 100644
--- a/docs/source/bit-stream.rst
+++ b/docs/source/bit-stream.rst
@@ -14,7 +14,7 @@ against.
 
 From an implementation standpoint, bit streams are read from and written
 to memory in increments of *words* of bits.  The constant power-of-two
-word size is configured at :ref:`compile time <config>`, and is limited
+:ref:`word size <word-size>` is configured at compile time, and is limited
 to 8, 16, 32, or 64 bits.
 
 The bit stream API is publicly exposed and may be used to write additional
@@ -66,7 +66,7 @@ section.
 Types
 -----
 
-.. c:type:: word
+.. c:type:: bitstream_word
 
   Bits are buffered and read/written in units of words.  By default, the
   bit stream word type is 64 bits, but may be set to 8, 16, or 32 bits
@@ -75,6 +75,36 @@ Types
   tend to give higher throughput, while 8-bit words are needed to ensure
   endian independence (see FAQ :ref:`#11 <q-portability>`).
 
+.. note::
+  To avoid potential name clashes, this type was renamed in
+  |zfp| |64bitrelease| from the shorter and more ambiguous type name
+  :code:`word`.
+
+----
+
+.. c:type:: bitstream_offset
+
+  Type holding the offset, measured in number of bits, into the bit stream
+  where the next bit will be read or written.  This type allows referencing
+  bits in streams at least 2\ :sup:`64` bits long.  Note that it is possible
+  that :code:`sizeof(bitstream_offset) > sizeof(size_t)` since a stream may
+  be as long as :code:`SIZE_MAX * CHAR_BIT` bits.
+
+----
+
+.. c:type:: bitstream_size
+
+  Alias for :c:type:`bitstream_offset` that signifies the bit length of a
+  stream or substream rather than an offset into it.
+
+----
+
+.. c:type:: bitstream_count
+
+  Type sufficient to count the number of bits read or written in functions
+  like :c:func:`stream_read_bits` and :c:func:`stream_write_bits`.
+  :code:`sizeof(bitstream_count) <= sizeof(bitstream_size)`.
+
 ----
 
 .. c:type:: bitstream
@@ -85,13 +115,13 @@ Types
   ::
 
     struct bitstream {
-      uint bits;       // number of buffered bits (0 <= bits < word size)
-      word buffer;     // buffer for incoming/outgoing bits (buffer < 2^bits)
-      word* ptr;       // pointer to next word to be read/written
-      word* begin;     // beginning of stream
-      word* end;       // end of stream (currently unused)
-      size_t mask;     // one less the block size in number of words (if BIT_STREAM_STRIDED)
-      ptrdiff_t delta; // number of words between consecutive blocks (if BIT_STREAM_STRIDED)
+      bitstream_count bits;  // number of buffered bits (0 <= bits < word size)
+      bitstream_word buffer; // incoming/outgoing bits (buffer < 2^bits)
+      bitstream_word* ptr;   // pointer to next word to be read/written
+      bitstream_word* begin; // beginning of stream
+      bitstream_word* end;   // end of stream (not enforced)
+      size_t mask;           // one less the block size in number of words (if BIT_STREAM_STRIDED)
+      ptrdiff_t delta;       // number of words between consecutive blocks (if BIT_STREAM_STRIDED)
     };
 
 .. _bs-data:
@@ -129,7 +159,7 @@ Functions
 
 ----
 
-.. c:function:: size_t stream_alignment()
+.. c:function:: bitstream_count stream_alignment()
 
   Word size in bits.  This is a functional form of the constant
   :c:var:`stream_word_bits` and returns the same value.
@@ -153,7 +183,8 @@ Functions
 
 .. c:function:: size_t stream_capacity(const bitstream* stream)
 
-  Return byte size of memory buffer associated with *stream*.
+  Return byte size of memory buffer associated with *stream* specified
+  in :c:func:`stream_open`.
 
 ----
 
@@ -166,29 +197,30 @@ Functions
 .. c:function:: uint stream_write_bit(bitstream* stream, uint bit)
 
   Write single *bit* to *stream*.  *bit* must be one of 0 or 1.
+  The value of *bit* is returned.
 
 ----
 
-.. c:function:: uint64 stream_read_bits(bitstream* stream, uint n)
+.. c:function:: uint64 stream_read_bits(bitstream* stream, bitstream_count n)
 
   Read and return 0 |leq| *n* |leq| 64 bits from *stream*.
 
 ----
 
-.. c:function:: uint64 stream_write_bits(bitstream* stream, uint64 value, uint n)
+.. c:function:: uint64 stream_write_bits(bitstream* stream, uint64 value, bitstream_count n)
 
   Write 0 |leq| *n* |leq| 64 low bits of *value* to *stream*.  Return any
   remaining bits from *value*, i.e., *value* >> *n*.
 
 ----
 
-.. c:function:: size_t stream_rtell(const bitstream* stream)
+.. c:function:: bitstream_offset stream_rtell(const bitstream* stream)
 
   Return bit offset to next bit to be read.
 
 ----
 
-.. c:function:: size_t stream_wtell(const bitstream* stream)
+.. c:function:: bitstream_offset stream_wtell(const bitstream* stream)
 
   Return bit offset to next bit to be written.
 
@@ -201,41 +233,41 @@ Functions
 
 ----
 
-.. c:function:: void stream_rseek(bitstream* stream, size_t offset)
+.. c:function:: void stream_rseek(bitstream* stream, bitstream_offset offset)
 
   Position stream for reading at given bit offset.  This places the
   stream in read mode.
 
 ----
 
-.. c:function:: void stream_wseek(bitstream* stream, size_t offset)
+.. c:function:: void stream_wseek(bitstream* stream, bitstream_offset offset)
 
   Position stream for writing at given bit offset.  This places the
   stream in write mode.
 
 ----
 
-.. c:function:: void stream_skip(bitstream* stream, uint n)
+.. c:function:: void stream_skip(bitstream* stream, bitstream_count n)
 
   Skip over the next *n* bits, i.e., without reading them.
 
 ----
 
-.. c:function:: void stream_pad(bitstream* stream, uint n)
+.. c:function:: void stream_pad(bitstream* stream, bitstream_count n)
 
   Append *n* zero-bits to *stream*.
 
 ----
 
-.. c:function:: size_t stream_align(bitstream* stream)
+.. c:function:: bitstream_count stream_align(bitstream* stream)
 
-  Align stream on next word boundary by skipping bits.  No skipping is
-  done if the stream is already word aligned.  Return the number of
-  skipped bits, if any.
+  Align stream on next word boundary by skipping bits, i.e., without reading
+  them.  No skipping is done if the stream is already word aligned.  Return
+  the number of skipped bits, if any.
 
 ----
 
-.. c:function:: size_t stream_flush(bitstream* stream)
+.. c:function:: bitstream_count stream_flush(bitstream* stream)
 
   Write out any remaining buffered bits.  When one or more bits are
   buffered, append zero-bits to the stream to align it on a word boundary.
@@ -243,7 +275,7 @@ Functions
 
 ----
 
-.. c:function:: void stream_copy(bitstream* dst, bitstream* src, size_t n)
+.. c:function:: void stream_copy(bitstream* dst, bitstream* src, bitstream_size n)
 
   Copy *n* bits from *src* to *dst*, advancing both bit streams.
 
@@ -267,4 +299,5 @@ Functions
 .. c:function:: int stream_set_stride(bitstream* stream, size_t block, ptrdiff_t delta)
 
   Set block size, *block*, in number of words and spacing, *delta*, in number
-  of blocks for strided access.  Requires :c:macro:`BIT_STREAM_STRIDED`.
+  of blocks for :ref:`strided access <bs-strides>`.  Return nonzero upon
+  success.  Requires :c:macro:`BIT_STREAM_STRIDED`.
diff --git a/docs/source/caching.inc b/docs/source/caching.inc
index 9243b3c5e..374963e01 100644
--- a/docs/source/caching.inc
+++ b/docs/source/caching.inc
@@ -16,13 +16,13 @@ best choice varies from one application to another, we suggest allocating
 at least two "layers" of blocks, e.g., 2 |times| (*nx* / 4) |times| (*ny* / 4)
 blocks for 3D arrays, for applications that stream through the array and
 perform stencil computations such as gathering data from neighboring elements.
-This allows limiting the cache misses to compulsory ones.  If the *csize*
+This allows limiting the cache misses to compulsory ones.  If the *cache_size*
 parameter provided to the constructor is set to zero bytes, then a default
 cache size of at least |sqrt|\ *n* blocks is used, where *n* is the total
 number of blocks contained in the array.
 
-The cache size can be set during construction, or can be set at a later
-time via :cpp:func:`array::set_cache_size`.  Note that if *csize* = 0, then
+The cache size can be set during construction, or can be set at a later time
+via :cpp:func:`array::set_cache_size`.  Note that if *cache_size* = 0, then
 the array dimensions must have already been specified for the default size
 to be computed correctly.  When the cache is resized, it is first flushed
 if not already empty.  The cache can also be flushed explicitly if desired
diff --git a/docs/source/cfp.rst b/docs/source/cfp.rst
index 78465a573..87f474e98 100644
--- a/docs/source/cfp.rst
+++ b/docs/source/cfp.rst
@@ -346,6 +346,12 @@ not actually part of the |cfp| API.
 
 ----
 
+.. c:function:: size_t cfp.array.size_bytes(const cfp_array self, uint mask)
+
+  See :cpp:func:`array::size_bytes`.
+
+----
+
 .. c:function:: size_t cfp.array.compressed_size(const cfp_array self)
 
   See :cpp:func:`array::compressed_size`.
@@ -789,7 +795,7 @@ and are themselves not modified by these functions.
 Iterators
 ---------
 
-|cfp| iterators wrap the C++ :ref:`iterator <iterators>` classes.
+|cfp| random-access iterators wrap the C++ :ref:`iterator <iterators>` classes.
 All iterators are :ref:`passed by value <cfp_rpi_value_semantics>` and
 are themselves not modified by these functions. Iterators are constructed 
 similar to C++ iterators via :c:func:`cfp.array.begin` and 
diff --git a/docs/source/codec.inc b/docs/source/codec.inc
index be9c7b252..136e42a39 100644
--- a/docs/source/codec.inc
+++ b/docs/source/codec.inc
@@ -24,6 +24,14 @@ codec does not support a certain compression mode, it should throw an
 Codecs reside in the :code:`zfp::codec` namespace, e.g.,
 :code:`zfp::codec::zfp3<Scalar>` is the default codec for 3D arrays.
 
+As of |zfp| |cpprelease|, there is, in addition to the default |zfp| codec,
+a "generic" codec that allows storing data in |zfp| arrays in "uncompressed"
+form using any scalar type (specified as a template parameter).  This
+"internal" scalar type may differ from the "external" scalar type exposed
+to the user through the :cpp:class:`zfp::array` API.  For instance, the
+internal type may be :code:`float` while the external type is :code:`double`,
+which provides for 2:1 fixed-rate "compression" using IEEE 754 floating point.
+
 .. cpp:namespace:: zfp::codec
 
 .. cpp:class:: codec
@@ -146,6 +154,18 @@ Codecs reside in the :code:`zfp::codec` namespace, e.g.,
 
 ----
 
+.. cpp:function:: bool codec::set_thread_safety(bool safety)
+
+  Enable or disable thread safety.  This function is called whenever |zfp|
+  is built with OpenMP support and when the number of mutable or immutable
+  :ref:`private views <private_immutable_view>` of an array changes.  When
+  two or more private views of an array are accessed by separate threads,
+  multiple blocks may be compressed or decompressed simultaneously.  The
+  codec then has to take care that there are no race conditions on the data
+  structures (e.g., :c:type:`bitstream`) used for (de)compression.
+
+----
+
 .. cpp:function:: size_t codec::size_bytes(uint mask = ZFP_DATA_ALL) const
 
   Return storage size of components of codec data structure indicated by
@@ -164,52 +184,75 @@ Codecs reside in the :code:`zfp::codec` namespace, e.g.,
 
 ----
 
-.. cpp:function:: size_t codec::encode_block(size_t offset, uint shape, const Scalar* block) const
+.. cpp:function:: size_t codec::encode_block(bitstream_offset offset, const Scalar* block) const
+
+  Encode contiguous *block* of |4powd| scalars and store at specified bit
+  *offset* within compressed-data buffer.  Return the number of bits of
+  compressed storage for the block, excluding any necessary padding.  This
+  method must flush any buffered compressed data without counting any padding
+  (e.g., for byte alignment) in the compressed size (unless the codec requires
+  alignment of the bit offsets).
+
+----
+
+.. cpp:function:: size_t codec::decode_block(bitstream_offset offset, Scalar* block) const
+
+  Decode contiguous *block* of |4powd| scalars from specified bit *offset*
+  within compressed-data buffer (see :cpp:func:`codec::encode_block`).
+  Return number of bits of compressed data decoded, excluding any padding
+  bits, i.e., the same value reported in encoding.
+
+----
+
+.. cpp:function:: size_t codec1::encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+.. cpp:function:: size_t codec2::encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+.. cpp:function:: size_t codec3::encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+.. cpp:function:: size_t codec4::encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
 
   Encode contiguous *block* of data of given *shape* and store at specified
   bit *offset* within compressed-data buffer.  Return the number of bits of
-  compressed storage for the block.
+  compressed storage for the block (see also :cpp:func:`codec::encode_block`).
 
   The *shape* is a (2 |times| *d*)-bit encoding of the size of the
   *d*-dimensional block.  For each successive pair of bits *s* of *shape*,
   the block size in the corresponding dimension is *n* = 4 - *s*, where
   0 |leq| *s* |leq| 3.  Thus, *shape* = 0 implies a full block of |4powd|
   values.  The size of the fastest varying dimension is specified in the
-  least significant bits of *shape*.  These methods must flush any buffered
-  compressed data without counting any padding (e.g., for byte alignment)
-  in the compressed size (unless the codec requires alignment of the bit
-  offsets).
+  least significant bits of *shape*.
 
 ----
 
-.. cpp:function:: size_t codec::decode_block(size_t offset, uint shape, Scalar* block) const
+.. cpp:function:: size_t codec1::decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+.. cpp:function:: size_t codec2::decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+.. cpp:function:: size_t codec3::decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+.. cpp:function:: size_t codec4::decode_block(bitstream_offset offset, uint shape, Scalar* block) const
 
   Decode contiguous *block* of data of given *shape* from specified bit
-  *offset* within compressed-data buffer (see
+  *offset* within compressed-data buffer (see also
   :cpp:func:`codec1::encode_block`).  Return number of bits of compressed
   data decoded, excluding any padding bits, i.e., the same value reported
   in encoding.
 
 ----
 
-.. cpp:function:: size_t codec1::encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx) const
-.. cpp:function:: size_t codec2::encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
-.. cpp:function:: size_t codec3::encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
-.. cpp:function:: size_t codec4::encode_block_strided(size_t offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+.. cpp:function:: size_t codec1::encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx) const
+.. cpp:function:: size_t codec2::encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
+.. cpp:function:: size_t codec3::encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+.. cpp:function:: size_t codec4::encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
 
   Encode block of data stored at *p* with strides *sx*, *sy*, *sz*,
   and *sw*.  See :c:type:`zfp_field` for information on strided storage.
   The *shape*, *offset*, and return value are as in
-  :cpp:func:`codec::encode_block`.
+  :cpp:func:`codec1::encode_block`.
 
 ----
 
-.. cpp:function:: size_t codec1::decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx) const
-.. cpp:function:: size_t codec2::decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
-.. cpp:function:: size_t codec3::decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
-.. cpp:function:: size_t codec4::decode_block_strided(size_t offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+.. cpp:function:: size_t codec1::decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx) const
+.. cpp:function:: size_t codec2::decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
+.. cpp:function:: size_t codec3::decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+.. cpp:function:: size_t codec4::decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
 
   Decode block to strided storage pointed to by *p* with strides *sx*, *sy*,
-  *sz*, and *sw.  See :c:type:`zfp_field` for information on strided storage.
+  *sz*, and *sw*.  See :c:type:`zfp_field` for information on strided storage.
   The *shape*, *offset*, and return value are as in 
-  :cpp:func:`codec::decode_block`.
+  :cpp:func:`codec1::decode_block`.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e44cef318..21bccdc5d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,10 +27,23 @@
 #
 # needs_sphinx = '1.0'
 
+import sys
+
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['sphinx.ext.imgmath', 'sphinxfortran.fortran_domain']
+extensions = [
+  'sphinx.ext.imgmath',
+  'sphinx.ext.imgconverter',
+  'sphinxfortran.fortran_domain'
+]
+
+# Require sphinxcontrib.spelling only when running spell checker.
+if 'spelling' in sys.argv:
+    extensions += ['sphinxcontrib.spelling']
+
+# Ensure rasterization of vector graphics uses sufficient DPI
+image_converter_args = ['-density', '300', '-geometry', '50%']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -46,24 +59,27 @@
 
 # General information about the project.
 project = u'zfp'
-copyright = u'2014-2019, LLNL-CODE-663824'
-author = u'Peter Lindstrom'
+copyright = u'2014-2023, LLNL-CODE-663824'
+author = u'Peter Lindstrom, Danielle Asher'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = u'0.5'
+version = u'1.0'
 # The full version, including alpha/beta/rc tags.
-release = u'0.5.5'
+release = u'1.0.1'
+
+# The release date (as the RTD server is in another time zone).
+today = u'Dec 15, 2023'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
 
 # Enable automatic numbering of figures referenced by :numref:.
 numfig = True
@@ -134,7 +150,12 @@
 
     # Additional stuff for the LaTeX preamble.
     #
-    # 'preamble': '',
+    # Unicode definitions needed for TeX Live 2024
+    'preamble': ('\\DeclareUnicodeCharacter{2212}{\\ensuremath{-}}'
+                 '\\DeclareUnicodeCharacter{2264}{\\ensuremath{\\leq}}'
+                 '\\DeclareUnicodeCharacter{2265}{\\ensuremath{\\geq}}'
+                 '\\DeclareUnicodeCharacter{221A}{\\ensuremath{\\sqrt{}}}'
+                 '\\DeclareUnicodeCharacter{2248}{\\ensuremath{\\approx}}'),
 
     # Latex figure (float) alignment
     #
@@ -149,7 +170,7 @@
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
     (master_doc, 'zfp.tex', u'zfp Documentation',
-     u'\shortstack[l]{Peter Lindstrom\\\\Markus Salasoo\\\\Matt Larsen\\\\Stephen Herbein}', 'manual'),
+     u'\\shortstack[l]{Peter Lindstrom\\\\Danielle Asher}', 'manual'),
 ]
 
 
@@ -173,6 +194,3 @@
      author, 'zfp', 'One line description of project.',
      'Miscellaneous'),
 ]
-
-
-
diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst
new file mode 100644
index 000000000..5b00fb845
--- /dev/null
+++ b/docs/source/configuration.rst
@@ -0,0 +1,399 @@
+.. include:: defs.rst
+
+.. index::
+   single: Software Configuration
+.. _config:
+
+Configuration
+=============
+
+The :ref:`installation <installation>` section describes compile-time
+options available to configure the |zfp| software.  This section provides
+additional, more detailed documentation of the rationale for and potential
+impact of these settings, including portability of |zfp| compressed streams
+across builds with different configuration settings.
+
+Unfortunately, |zfp| streams do not currently embed any information with
+regard to the settings configured for the stream producer, though some
+settings for a given |zfp| build can be determined programmatically at
+run time.  We hope to rectify this in future versions of |zfp|.
+
+The following sections discuss configuration settings in detail:
+
+* :ref:`Word Size <word-size>`: :c:macro:`BIT_STREAM_WORD_TYPE`, :c:macro:`ZFP_BIT_STREAM_WORD_SIZE`
+* :ref:`Rounding Mode <rounding>`: :c:macro:`ZFP_ROUNDING_MODE`
+* :ref:`Subnormals <subnormals>`: :c:macro:`ZFP_WITH_DAZ`
+
+.. index::
+   single: Word Size
+.. _word-size:
+
+Word Size
+---------
+
+|zfp| bit streams are read and written one *word* at a time.  The size of a
+word is a user-configurable parameter (see :c:macro:`BIT_STREAM_WORD_TYPE`
+and :c:macro:`ZFP_BIT_STREAM_WORD_SIZE`) set at compile time, and may be one
+of 8, 16, 32, and 64 bits.  By default, it is set to 64 bits as longer words
+tend to improve performance.
+
+Regardless of the word size, the |zfp| :ref:`bitstream <bs-api>` buffers one
+word of input or output, and each call to :c:func:`stream_write_bits` to
+output 1 |leq| *n* |leq| 64 bits conceptually appends those *n* bits to the
+buffered word one at a time, from least to most significant bit.  As soon as
+the buffered word is full, it is written to the output as a whole word in the
+native endian byte order of the hardware platform.  Analogously, when reading
+a bit stream, one word is fetched and buffered at a time, and bits are
+returned by :c:func:`stream_read_bits` by consuming bits from the buffered
+word from least to most significant bit.  This process is illustrated in
+:numref:`word-size-fig`.
+
+.. _word-size-fig:
+.. figure:: word-size.pdf
+  :figwidth: 90 %
+  :align: center
+  :alt: "bit stream word size"
+
+  Top: Bit stream written as (from right to left) five sequences of length
+  12 + 1 + 25 + 5 + 64 = 107 bits.
+  Bottom: Bit stream written as 8-bit and 32-bit words in little and big
+  endian byte order.  The two little endian streams differ only in the
+  amount of padding appended to fill out the last (leftmost) word.
+
+Determining Word Size
+^^^^^^^^^^^^^^^^^^^^^
+
+After |zfp| has been built, it is possible to query the word size that was
+chosen at compile time.  Programmatically, the constant
+:c:var:`stream_word_bits` as well as the function :c:func:`stream_alignment`
+give the word size in bits.  One may also glean this information from the
+command line using the :program:`testzfp` executable.
+
+Unfortunately, |zfp| currently does not embed in the compressed stream any
+information regarding the word size used.  If :ref:`headers` are used,
+one may at best infer little- versus big-endian byte order by inspecting
+the bytes stored one at a time, which begin with the characters 'z', 'f',
+'p'.  On big-endian machines with word sizes greater than 8 bits, those first
+bytes will appear in a different order.
+
+Rate Granularity
+^^^^^^^^^^^^^^^^
+
+The word size dictates the granularity of rates (in bits/value) supported
+by |zfp|'s :ref:`compressed-array classes <arrays>`.  Each *d*-dimensional
+compressed block of |4powd| values is represented as a whole number of words.
+Thus, smaller words result in finer rate granularity.  See also FAQ
+:ref:`#12 <q-granularity>`.
+
+Performance
+^^^^^^^^^^^
+
+Performance is improved by larger word sizes due to fewer reads from and
+writes to memory, as well as fewer loop iterations to process the up to
+64 bits read or written.  If portability across different-endian platforms
+is not necessary (e.g., for persistent storage of compressed streams), then
+we suggest using as word size the widest integer size supported by the
+hardware (usually 32 or 64 bits).
+
+Execution Policy
+^^^^^^^^^^^^^^^^
+
+The CUDA back-end currently ignores the word size specified at compile time
+and always uses 64-bit words.  This impacts portability of streams compressed
+or decompressed using this execution policy.  We expect future support for
+user-configurable word sizes for CUDA.  In contrast, both the serial and
+OpenMP back-ends respect word size.
+
+Portability
+^^^^^^^^^^^
+
+When the chosen word size is larger than one byte (8 bits), the byte order
+employed by the hardware architecture affects the sequence of bytes written
+to and read from the stream, as each read or written word is broken down
+into a set of bytes.  Two common conventions are used: *little endian*
+order, where the least significant byte of a word appears first, and
+*big endian* order, where the most significant byte appears first.  Therefore,
+a stream written on a little-endian platform with a word size greater than
+8 bits will not be properly read on a big-endian platform and vice versa.
+We say that such |zfp| streams are endian-dependent and not portable.
+
+When the word size is one byte (8 bits), on the other hand, each word read
+or written is one byte, and endianness does not matter.  Such |zfp| streams
+are portable.
+
+.. warning::
+  For compressed streams to be portable across platforms with different byte
+  order, |zfp| must be built with a word size of 8 bits.
+
+When using the |zfp| :ref:`bitstream API <bs-api>`, it is possible to write
+up to 64 bits at a time.  When the word size is 8 bits and more than 8 bits
+are written at a time, |zfp| appends bits to the output in little-endian
+order, from least to most significant bit, regardless of the endianness of
+the hardware architecture.  This ensures portability across machines with
+different byte order, and should be the preferred configuration when
+cross-platform portability is needed.  For this reason, the |zfp| compression
+plugin for the HDF5 file format, `H5Z-ZFP <https://github.com/LLNL/H5Z-ZFP>`__,
+requires |zfp| to be built with an 8-bit word size.
+
+On little-endian hardware platforms, the order of bytes read and written is
+independent of word size.  While readers and writers may in principle employ
+different word sizes, it is rarely safe to do so.  High-level API functions
+like :c:func:`zfp_compress` and :c:func:`zfp_decompress` always align the
+stream on a word boundary before returning.  The consequences of this are
+twofold:
+
+* If a stream is read with a larger word size than the word size used when
+  the stream was written, then the last word read may extend beyond the
+  memory buffer allocated for the stream, resulting in a *buffer over-read*
+  memory access violation error.
+
+* When multiple fields are compressed back-to-back to the same stream through
+  a sequence of :c:func:`zfp_compress` calls, padding is potentially inserted
+  between consecutive fields.  The amount of padding is dependent on word
+  size.  That is, :c:func:`zfp_compress` flushes up to a word of buffered bits
+  if the stream does not already end on a word boundary.  Similarly,
+  :c:func:`zfp_decompress` positions the stream on the same word boundary
+  (when the word size is fixed) so that compression and decompression are
+  synchronized.  Because of such padding, subsequent :c:func:`zfp_decompress`
+  calls may not read from the correct bit stream offset if word sizes do not
+  agree between reader and writer.  For portability, the user may have to
+  manually insert additional padding (using :c:func:`stream_wtell` and
+  :c:func:`stream_pad` on writes, :c:func:`stream_rtell` and
+  :c:func:`stream_skip` on reads) to align the stream on a whole 64-bit word
+  boundary.
+
+..
+  [figure showing overread and padding for two different word sizes]
+
+.. warning::
+  Even though |zfp| uses little-endian byte order, the word alignment imposed
+  by the high-level API functions :c:func:`zfp_compress` and
+  :c:func:`zfp_decompress` may result in differences in padding when different
+  word sizes are used.  To guarantee portability of |zfp| streams, we recommend
+  using a word size of 8 bits (one byte).
+
+On big-endian platforms, it is not possible to ensure portability unless the
+word size is 8 bits.  Thus, for full portability when compressed data is
+exchanged between different platforms, we suggest using 8-bit words.
+
+Testing
+^^^^^^^
+
+The |zfp| unit tests have been designed only for the default 64-bit word
+size.  Thus, most tests will fail if a smaller word size is used.  We plan
+to address this shortcoming in the near future.
+
+.. index::
+   single: Rounding Mode
+.. _rounding:
+
+Rounding Mode
+-------------
+
+In |zfp|'s lossy compression modes, quantization is usually employed to
+discard some number of least significant bits of transform coefficients.
+By default, such bits are simply replaced with zeros, which is analogous
+to *truncation*, or rounding towards zero.  (Because |zfp| represents
+coefficients in *negabinary*, or base minus two, the actual effect of
+such truncation is more complicated.)  The net effect is that compression
+errors are usually biased in one direction or another, and this bias
+further depends on a value's location within a block (see FAQ
+:ref:`#30 <q-err-dist>`).  To mitigate this bias, other rounding
+modes can be selected at compile time via :c:macro:`ZFP_ROUNDING_MODE`.
+
+Supported Rounding Modes
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As of |zfp| |roundingrelease|, the following three rounding modes are
+available:
+
+.. c:macro:: ZFP_ROUND_NEVER
+
+  This is the default rounding mode, which simply zeros trailing bits
+  analogous to truncation, as described above.
+
+.. c:macro:: ZFP_ROUND_FIRST
+
+  This mode applies rounding during *compression* by first offsetting values
+  by an amount proportional to the quantization step before truncation,
+  causing errors to cancel on average.  This rounding mode is essentially a
+  form of *mid-tread quantization*.
+
+  Although this is the preferred rounding mode as far as error bias
+  cancellation is concerned, it relies on knowing in advance the precision of
+  each coefficient and is available only in
+  :ref:`fixed-precision <mode-fixed-precision>` and
+  :ref:`-accuracy <mode-fixed-accuracy>` compression modes.
+
+.. note::
+  :c:macro:`ZFP_ROUND_FIRST` impacts both the bits stored in the compressed
+  stream and the decompressed values.
+
+
+.. c:macro:: ZFP_ROUND_LAST
+
+  This mode applies rounding during *decompression* by offsetting decoded
+  values by an amount proportional to the quantization step.  This rounding
+  mode is essentially a form of *mid-riser quantization*.
+
+  This rounding mode is available in all compression modes but tends to be
+  less effective at reducing error bias than :c:macro:`ZFP_ROUND_FIRST`,
+  though more effective than :c:macro:`ZFP_ROUND_NEVER`.
+
+.. note::
+  As :c:macro:`ZFP_ROUND_LAST` is applied only during decompression, it has
+  no impact on the compressed stream.  Only the values returned from
+  decompression are affected.
+
+The rounding mode must be selected at compile time by setting
+:c:macro:`ZFP_ROUNDING_MODE`, e.g., using GNU make or CMake commands
+::
+
+    make ZFP_ROUNDING_MODE=ZFP_ROUND_NEVER
+    cmake -DZFP_ROUNDING_MODE=ZFP_ROUND_NEVER ..
+
+In general, the same rounding mode ought to be used by data producer and
+consumer, though since :c:macro:`ZFP_ROUND_NEVER` and
+:c:macro:`ZFP_ROUND_FIRST` decode values the same way, and since
+:c:macro:`ZFP_ROUND_NEVER` and :c:macro:`ZFP_ROUND_LAST` encode values the
+same way, there really is only one combination of rounding modes that should
+be avoided:
+
+.. warning::
+  Do not compress data with :c:macro:`ZFP_ROUND_FIRST` and then decompress
+  with :c:macro:`ZFP_ROUND_LAST`.  This will apply bias correction twice and
+  cause errors to be larger than necessary, perhaps even exceeding any
+  specified error tolerance.
+
+Error Bounds and Distributions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The centering of errors implied by :c:macro:`ZFP_ROUND_FIRST` and
+:c:macro:`ZFP_ROUND_LAST` reduces not only the bias but also the maximum
+absolute error for a given quantization level (or precision).  In fact, the
+reduction in maximum error is so large that it is possible to reduce precision
+of transform coefficients by one bit in
+:ref:`fixed-accuracy mode <mode-fixed-accuracy>` while staying within the
+prescribed error tolerance.  (Note that the same precision reduction applies
+to :ref:`expert mode <mode-expert>` when :c:member:`zfp_stream.minexp` is
+specified.)  In other words, one may boost the compression ratio for a given
+error tolerance.  Viewed differently, the error bound can be tightened such
+that observed errors are closer to the tolerance.
+
+To take advantage of such a tighter error bound and improvement in compression
+ratio, one should enable :c:macro:`ZFP_WITH_TIGHT_ERROR` at compile time.
+This macro, which should only be used in conjunction with
+:c:macro:`ZFP_ROUND_FIRST` or :c:macro:`ZFP_ROUND_LAST`, reduces precision
+by one bit in :ref:`fixed-accuracy mode <mode-fixed-accuracy>`, thus
+increasing error while decreasing compressed size without violating the
+error tolerance.
+
+.. warning::
+  Both producer and consumer must use the same setting of
+  :c:macro:`ZFP_WITH_TIGHT_ERROR`.  Also note that this setting makes
+  compressed streams incompatible with the default settings of |zfp| and
+  existing compressed formats built on top of |zfp|, such as the
+  `H5Z-ZFP <https://github.com/LLNL/H5Z-ZFP>`__ HDF5 plugin.
+
+For more details on how rounding modes and tight error bounds impact error,
+see FAQ :ref:`#30 <q-err-dist>`.
+
+Performance
+^^^^^^^^^^^
+
+The rounding mode has only a small impact on performance.  As both
+:c:macro:`ZFP_ROUND_FIRST` and :c:macro:`ZFP_ROUND_LAST` require an offset to
+be applied to transform coefficients, they incur a small overhead relative to
+:c:macro:`ZFP_ROUND_NEVER`, where no such corrections are needed.
+
+Execution Policy
+^^^^^^^^^^^^^^^^
+
+:c:macro:`ZFP_WITH_TIGHT_ERROR` applies only to
+:ref:`fixed-accuracy <mode-fixed-accuracy>` and :ref:`expert <mode-expert>`
+mode, neither of which is currently supported by the CUDA execution policy.
+Therefore, this setting is currently ignored in CUDA but will be supported
+in the next |zfp| release.
+
+Portability
+^^^^^^^^^^^
+
+As :c:macro:`ZFP_WITH_TIGHT_ERROR` determines the number of bits to write
+per block in :ref:`fixed-accuracy mode <mode-fixed-accuracy>`, the producer
+and consumer of compressed streams must be compiled with the same setting
+for streams to be portable in this compression mode.
+
+Testing
+^^^^^^^
+
+The |zfp| unit tests have been designed for the default rounding mode,
+:c:macro:`ZFP_ROUND_NEVER`.  These tests will in general fail when another
+rounding mode is chosen.
+
+.. index::
+   single: Subnormals
+.. _subnormals:
+
+Subnormals
+----------
+
+Subnormal numbers (a.k.a. denormals) are extremely small floating-point numbers
+(on the order of 10\ :sup:`-308` for double precision) that have a special
+IEEE 754 floating-point representation.  Because such numbers are exceptions
+that deviate from the usual floating-point representation, some hardware
+architectures do not even allow them but rather replace such numbers
+with zero whenever they occur.  Such treatment of subnormals is commonly
+referred to as a *denormals-are-zero* (DAZ) policy.  And while some
+architectures handle subnormals, they do so only in software or microcode
+and at a substantial performance penalty.
+
+The default (lossy) |zfp| implementation might struggle with blocks composed
+of all-subnormal numbers, as the numeric transformations involved in
+compression and decompression might then cause values to overflow and invoke
+undefined behavior (see
+`Issue #119 <https://github.com/LLNL/zfp/issues/119>`__).  Although such
+blocks are in practice reconstructed as all-subnormals, precision might be
+completely lost, and the resulting decompressed values are undefined.
+
+One way to resolve this issue is to manually force all-subnormal blocks
+to all-zeros (assuming the floating-point hardware did not already do this).
+This denormals-are-zero policy is enforced when enabling
+:c:macro:`ZFP_WITH_DAZ` at compile time.
+
+.. warning::
+  :c:macro:`ZFP_WITH_DAZ` can mitigate difficulties with most but not all
+  subnormal numbers.  A more general solution has been identified that will
+  become available in a future release.
+
+.. note::
+  |zfp|'s :ref:`reversible-mode <mode-reversible>` compression algorithm
+  handles subnormals correctly, without loss.
+
+Performance
+^^^^^^^^^^^
+
+There is a negligible compression performance penalty associated with
+:c:macro:`ZFP_WITH_DAZ`.
+
+Execution Policy
+^^^^^^^^^^^^^^^^
+
+All execution policies support :c:macro:`ZFP_WITH_DAZ`.
+
+Portability
+^^^^^^^^^^^
+
+Because subnormals are modified before compression, the compressed stream
+could in principle change when forcing blocks to be encoded as all-zeros.
+While compressed streams with and without this setting may not match
+bit-for-bit, the impact of :c:macro:`ZFP_WITH_DAZ` tends to be benign.
+In particular, this setting has no impact on decompression.  Thus, all
+combinations of :c:macro:`ZFP_WITH_DAZ` between producer and consumer are
+safe.
+
+Testing
+^^^^^^^
+
+:c:macro:`ZFP_WITH_DAZ` affects only extremely rare subnormal values that
+do not partake in the vast majority of |zfp| unit tests.  Tests are unlikely
+to be impacted by enabling this setting.
diff --git a/docs/source/contributors.rst b/docs/source/contributors.rst
index 5c1067dea..45e945c24 100644
--- a/docs/source/contributors.rst
+++ b/docs/source/contributors.rst
@@ -4,19 +4,21 @@
 Contributors
 ============
 
-* LLNL |zfp| team
+* |zfp| development team
 
   - Peter Lindstrom
-  - Garrett Morrison
-  - Markus Salasoo
-  - Matt Larsen
+  - Danielle Asher
+
+* Major contributors
+
+  - Chuck Atkins
   - Stephen Herbein
+  - Mark Kim
+  - Matt Larsen
   - Mark Miller
+  - Markus Salasoo
+  - David Wade
+  - Haiying Xu
 
-* External contributors
-
-  - Chuck Atkins, Kitware (CMake support)
-  - Stephen Hamilton, Johns Hopkins University (VTK plugin)
-  - Mark Kim, ORNL (original CUDA port)
-  - Amik St-Cyr, Shell (OpenMP compressor)
-  - Eric Suchyta, ORNL (ADIOS plugin)
+For a full list of contributors, see the
+`GitHub Contributors <https://github.com/LLNL/zfp/graphs/contributors>`__ page.
diff --git a/docs/source/defs.rst b/docs/source/defs.rst
index 6d6a82dd3..f78886edc 100644
--- a/docs/source/defs.rst
+++ b/docs/source/defs.rst
@@ -6,6 +6,7 @@
 .. |sqrt| unicode:: 0x221a
 .. |check| unicode:: 0x2713
 .. |reg| unicode:: 0x00ae
+.. |tm| unicode:: 0x2122
 .. |zfp| replace:: zfp
 .. |cfp| replace:: cfp
 .. |zforp| replace:: zFORp
@@ -30,11 +31,14 @@
 .. |zforprelease| replace:: 0.5.5
 .. |zfpyrelease| replace:: 0.5.5
 .. |csizerelease| replace:: 0.5.5
-.. |crpirelease| replace:: 0.5.6
-.. |raiterrelease| replace:: 0.5.6
-.. |64bitrelease| replace:: 0.5.6
-.. |boolrelease| replace:: 0.5.6
-.. |4darrrelease| replace:: 0.5.6
-.. |fieldrelease| replace:: 0.5.6
-.. |carrrelease| replace:: 0.5.6
-.. |cpprelease| replace:: 0.5.6
+.. |crpirelease| replace:: 1.0.0
+.. |raiterrelease| replace:: 1.0.0
+.. |64bitrelease| replace:: 1.0.0
+.. |boolrelease| replace:: 1.0.0
+.. |4darrrelease| replace:: 1.0.0
+.. |fieldrelease| replace:: 1.0.0
+.. |carrrelease| replace:: 1.0.0
+.. |cpprelease| replace:: 1.0.0
+.. |verrelease| replace:: 1.0.0
+.. |roundingrelease| replace:: 1.0.0
+.. |nextrelease| replace:: 1.1.0
diff --git a/docs/source/directions.rst b/docs/source/directions.rst
index 16c139d42..adff6246c 100644
--- a/docs/source/directions.rst
+++ b/docs/source/directions.rst
@@ -17,7 +17,7 @@ important features, including:
   values as missing or indeterminate.  Current solutions often rely on tagging
   missing values as NaNs or special, often very large sentinel values outside
   the normal range, which can lead to poor compression and complete loss of
-  accuracy in nearby valid values.  See :ref:`FAQ #7 <q-missing>`.
+  accuracy in nearby valid values.  See FAQ :ref:`#7 <q-missing>`.
 
 - **Support for NaNs and infinities**.  Similar to missing values, some
   applications store special IEEE floating-point values that are supported
@@ -44,7 +44,7 @@ important features, including:
   at reduced precision over the entire domain, with quality increasing
   progressively as more data arrives.  The low-level bit stream interface
   already supports progressive access by interleaving bits across blocks
-  (see :ref:`FAQ #13 <q-progressive>`), but |zfp| lacks a high-level API
+  (see FAQ :ref:`#13 <q-progressive>`), but |zfp| lacks a high-level API
   for generating and accessing progressive streams.
 
 - **Parallel compression**.  |zfp|'s data partitioning into blocks invites
@@ -56,14 +56,15 @@ important features, including:
   decompression on the GPU via CUDA.  However, only fixed-rate mode is
   so far supported.
 
-- **Variable-rate arrays**.  |zfp| currently supports only fixed-rate
-  compressed arrays, which wastes bits in smooth regions with little
-  information content while too few bits may be allocated to accurately
-  preserve sharp features such as shocks and material interfaces, which
-  tend to drive the physics in numerical simulations.  Two candidate
-  solutions have been identified for read-only and read-write access
-  to variable-rate arrays with very modest storage overhead.  These
-  arrays will support both fixed precision and accuracy.
+- **Variable-rate arrays**.  |zfp| currently offers only fixed-rate
+  compressed arrays with random-access write support; |zfp| |carrrelease|
+  further provides read-only variable-rate arrays.  Fixed-rate arrays waste
+  bits in smooth regions with little information content while too few bits
+  may be allocated to accurately preserve sharp features such as shocks and
+  material interfaces, which tend to drive the physics in numerical
+  simulations.  A candidate solution has been developed for variable-rate
+  arrays that support read-write random access with modest storage overhead.
+  We expect to release this capability in the near future.
 
 - **Array operations**.  |zfp|'s compressed arrays currently support basic
   indexing and initialization, but lack array-wise operations such as
diff --git a/docs/source/disclaimer.inc b/docs/source/disclaimer.inc
index 4745f5edc..58a69ba02 100644
--- a/docs/source/disclaimer.inc
+++ b/docs/source/disclaimer.inc
@@ -5,4 +5,5 @@
   map to the innermost (rightmost) array dimension in a C array and to the
   leftmost dimension in a Fortran array.  Getting the order of dimensions
   right is crucial for good compression and accuracy.  See the discussion of
-  :ref:`dimensions and strides <indexing>` for further information.
+  :ref:`dimensions and strides <indexing>` and FAQ :ref:`#0 <q-layout>` for
+  further information.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 14feeedde..9391b8415 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -3,7 +3,7 @@
 Code Examples
 =============
 
-The :file:`examples` directory includes five programs that make use of the
+The :file:`examples` directory includes ten programs that make use of the
 compressor.
 
 .. _ex-simple:
@@ -15,8 +15,7 @@ The :program:`simple` program is a minimal example that shows how to call
 the compressor and decompressor on a double-precision 3D array.  Without
 the :code:`-d` option, it will compress the array and write the compressed
 stream to standard output.  With the :code:`-d` option, it will instead
-read the compressed stream from standard input and decompress the
-array::
+read the compressed stream from standard input and decompress the array::
 
     simple > compressed.zfp
     simple -d < compressed.zfp
@@ -24,6 +23,52 @@ array::
 For a more elaborate use of the compressor, see the
 :ref:`zfp utility <zfpcmd>`.
 
+.. _ex-array:
+
+Compressed-Array C++ Classes
+----------------------------
+
+The :program:`array` program shows how to declare, write to, and read from
+|zfp|'s compressed-array C++ objects (in this case, 2D double-precision
+arrays), which is essentially as straightforward as working with STL vectors.
+This example initializes a 2D array with a linear ramp of 12 |times| 8 = 96
+values using only four bits of storage per value, which using uncompressed
+storage would not be enough to distinguish more than 16 different values.
+For more advanced compressed-array features, see the
+:ref:`tutorial <tut-arrays>`.
+
+.. _ex-chunk:
+
+Chunked (De)compression
+-----------------------
+
+The :program:`chunk` program is an example of how to perform chunked
+(de)compression, where the compressed stream for a 3D array is produced or
+consumed in multiple chunks.  Chunking slices the array along the *z*
+direction (the slowest varying dimension) into slabs that are (de)compressed
+independently.  Assuming the chosen array dimensions, rate, and number of
+chunks admit (de)compression by satisfying certain constraints (see FAQ
+:ref:`#32 <q-chunked>`), (de)compression in chunks should result in the same
+output as if the entire array were (de)compressed all at once.
+
+The array dimensions are specified as :code:`-3 nx ny nz` (default is
+125 |times| 100 |times| 240); the rate as :code:`-r rate` (default is
+16 bits/value); and the number of chunks as :code:`-n chunks` (default is one
+chunk).  Without :code:`-d`, a synthetic array is generated and compressed to
+standard output.  Using :code:`-d`, standard input is decompressed and written
+to standard output.  For example::
+
+    chunk -n 1 > single.zfp
+    chunk -n 4 > quadruple.zfp
+    diff single.zfp quadruple.zfp
+
+    chunk -n 1 -d < single.zfp > single.f64
+    chunk -n 4 -d < single.zfp > quadruple.f64
+    diff single.f64 quadruple.f64
+
+Here :program:`diff` should report no differences.  See FAQ
+:ref:`#32 <q-chunked>` for further discussion of chunked (de)compression.
+
 .. _ex-diffusion:
 
 Diffusion Solver
@@ -39,8 +84,11 @@ solution and solution time.  The usage is::
       -a <tolerance> : absolute error tolerance (requires -c)
       -b <blocks> : cache size in number of 4x4 blocks
       -c : use read-only arrays (needed for -a, -p, -R)
+      -d : use double-precision tiled arrays
+      -f : use single-precision tiled arrays
+      -h : use half-precision tiled arrays
       -i : traverse arrays using iterators instead of integer indices
-      -j : use OpenMP parallel execution
+      -j : use OpenMP parallel execution (requires -r)
       -n <nx> <ny> : grid dimensions
       -p <precision> : precision in uncompressed bits/value (requires -c)
       -r <rate> : rate in compressed bits/value
@@ -48,12 +96,11 @@ solution and solution time.  The usage is::
       -t <nt> : number of time steps
 
 Here *rate* specifies the exact number of compressed bits to store per
-double-precision floating-point value (default = 64); *nx* and *ny*
-specify the grid size (default = 100 |times| 100); *nt* specifies the number
-of time steps to take (the default is to run until time *t* = 1); and *blocks*
-is the number of uncompressed blocks to cache
-(default = (|sqrt| *n*) / 4, where *n* = *nx* |times| *ny*).
-The :code:`-i` option enables array traversal via iterators instead of indices.
+double-precision floating-point value; *nx* and *ny* specify the grid size
+(default = 128 |times| 128); *nt* specifies the number of time steps to take
+(the default is to run until time *t* = 1); and *blocks* is the number of
+uncompressed blocks to cache (default = *nx* / 2).  The :code:`-i` option
+enables array traversal via iterators instead of indices.
 
 The :code:`-j` option enables OpenMP parallel execution, which makes use
 of both mutable and immutable :ref:`private views <private_immutable_view>`
@@ -66,22 +113,33 @@ This example also illustrates how :ref:`read-only arrays <carray_classes>`
 fixed-precision (:code:`-p`), fixed-accuracy (:code:`-a`),
 or reversible (:code:`-R`) mode.
 
-Running diffusion with the following arguments::
+The output lists for each time step the current rate of the state array and
+in parentheses any additional storage, e.g., for the block
+:ref:`cache <caching>` and :ref:`index <index>` data structures, both in bits
+per array element.  Running diffusion with the following arguments::
 
     diffusion -r 8
     diffusion -r 12
-    diffusion -r 20
-    diffusion -r 64
+    diffusion -r 16
+    diffusion -r 24
+    diffusion
 
-should result in this output::
+should result in this final output::
 
-    sum=0.996442 error=4.813938e-07
-    sum=0.998338 error=1.967777e-07
-    sum=0.998326 error=1.967952e-07
-    sum=0.998326 error=1.967957e-07
+    sum=0.995170 error=4.044954e-07
+    sum=0.998151 error=1.237837e-07
+    sum=0.998345 error=1.212734e-07
+    sum=0.998346 error=1.212716e-07
+    sum=0.998346 error=1.212716e-07
 
 For speed and quality comparison, the solver solves the same problem using
-uncompressed double-precision arrays when compression parameters are omitted.
+uncompressed double-precision row-major arrays when compression parameters
+are omitted.  If one of :code:`-h`, :code:`-f`, :code:`-d` is specified,
+uncompressed tiled arrays are used.  These arrays are based on the |zfp|
+array classes but make use of the :ref:`generic codec <codec>`, which
+stores blocks as uncompressed scalars of the specified type (:code:`half`,
+:code:`float`, or :code:`double`) while utilizing a double-precision block
+cache (like |zfp|'s compressed arrays).
 
 The :program:`diffusionC` program is the same example written entirely
 in C using the |cfp| :ref:`wrappers <cfp>` around the C++ compressed array
@@ -159,7 +217,7 @@ stream.  The program takes one optional argument::
     inplace [tolerance]
 
 which specifies the fixed-accuracy absolute tolerance to use during
-compression.  Please see :ref:`FAQ #19 <q-inplace>` for more on the
+compression.  Please see FAQ :ref:`#19 <q-inplace>` for more on the
 limitations of in-place compression.
 
 .. _ex-iterators:
diff --git a/docs/source/execution.rst b/docs/source/execution.rst
index 4ef8db5cb..02c0137d5 100644
--- a/docs/source/execution.rst
+++ b/docs/source/execution.rst
@@ -27,9 +27,13 @@ responsible for compressing a *chunk* of several contiguous blocks.
 
 This section describes the |zfp| parallel compression algorithm and explains
 how to configure |libzfp| and enable parallel compression at run time via
-its :ref:`high-level C API <hl-api>`.  Parallel compression is not supported
-via the :ref:`low-level API <ll-api>`.
+its :ref:`high-level C API <hl-api>`.
 
+.. note::
+  Parallel compression is not supported via the :ref:`low-level API <ll-api>`,
+  which ignores all execution policy settings and always executes in serial.
+
+.. _exec-policies:
 
 Execution Policies
 ------------------
@@ -186,18 +190,30 @@ stream is copied back to the host and device memory is deallocated.  If both
 pointers are device pointers, then no copies are made.  Additionally, any
 combination of mixing host and device pointers is supported.
 
-Additional Requirements
-^^^^^^^^^^^^^^^^^^^^^^^
+.. _cuda-limitations:
+
+CUDA Limitations
+^^^^^^^^^^^^^^^^
+
+The CUDA implementation has a number of limitations:
+
+* Only the :ref:`fixed-rate mode <mode-fixed-rate>` is supported.
+  Other modes will be supported in a future release.
+* 4D arrays are not supported.
+* :ref:`Headers <headers>` are not supported.  Any header already present in
+  the stream will be silently overwritten on compression.
+* |zfp| must be built with a :c:macro:`ZFP_BIT_STREAM_WORD_SIZE` of 64 bits.
+* Although :ref:`strides <field>` are supported, fields must be contiguous
+  when stored in host memory, i.e., with no unused memory addresses between
+  the minimum and maximum address spanned by the field (see
+  :c:func:`zfp_field_is_contiguous`).  This requirement avoids having to copy
+  and allocate more temporary memory than needed to hold the array if it were
+  not strided.  Note that the strides can still be arbitrary as long as they
+  serve only to permute the array elements.  Moreover, this restriction
+  applies only to the CUDA execution policy and the case where the
+  uncompressed field resides on the host.
 
-The CUDA implementation supports strided fields.  However, when the field
-is stored in host memory, it must occupy contiguous storage, i.e., with
-no unused memory addresses between the minimum and maximum address spanned
-by the field.  This requirement avoids having to copy and allocate more
-temporary memory than needed to hold the array if it were not strided.
-Note that the strides can still be arbitrary as long as they serve only to
-permute the array elements.  Moreover, this restriction applies only to the
-CUDA execution policy and the case where the uncompressed field resides on
-the host.
+We expect to address these limitations over time.
 
 
 Setting the Execution Policy
@@ -280,12 +296,15 @@ fixed storage, and therefore the decompressor needs to be instructed
 where each compressed block resides in the bit stream to enable
 parallel decompression.  Because the |zfp| bit stream does not currently
 store such information, variable-rate parallel decompression is not yet
-supported.
+supported, though plans are to make such functionality available in the
+near future.
 
 The CUDA implementation supports fixed-rate decompression.  OpenMP
-fixed-rate decompression will be added in the near future.
+fixed-rate decompression has been implemented and will be released in the
+near future.
 
 Future versions of |zfp| will allow efficient encoding of block sizes and/or
 offsets to allow each thread to quickly locate the blocks it is responsible
 for decompressing, which will allow for variable-rate compression and
-decompression.
+decompression.  Such capabilities are already present in the implementation
+of the |zfp| :ref:`read-only arrays <carray_classes>`.
diff --git a/docs/source/faq.rst b/docs/source/faq.rst
index 4a953d973..92cd8b270 100644
--- a/docs/source/faq.rst
+++ b/docs/source/faq.rst
@@ -40,6 +40,10 @@ Questions answered in this FAQ:
   #. :ref:`Why are compressed arrays so slow? <q-1d-speed>`
   #. :ref:`Do compressed arrays use reference counting? <q-ref-count>`
   #. :ref:`How large a buffer is needed for compressed storage? <q-max-size>`
+  #. :ref:`How can I print array values? <q-printf>`
+  #. :ref:`What is known about zfp compression errors? <q-err-dist>`
+  #. :ref:`Why are zfp blocks 4 * 4 * 4 values? <q-block-size>`
+  #. :ref:`Can zfp (de)compress a single array in chunks? <q-chunked>`
 
 -------------------------------------------------------------------------------
 
@@ -139,7 +143,7 @@ of dimensions *nx* |times| *ny*.  Can I use a 3D |zfp| array to store this as::
   array3d velocity(2, nx, ny, rate);
 
 A: Although this could be done, zfp assumes that consecutive values are
-related.  The two velocity components (*vx*, *vy*) are almost suredly
+related.  The two velocity components (*vx*, *vy*) are almost assuredly
 independent and would not be correlated.  This will severely hurt the
 compression rate or quality.  Instead, consider storing *vx* and *vy* as
 two separate 2D scalar arrays::
@@ -257,6 +261,10 @@ A: Yes, but your mileage may vary.  Dense matrices, unlike smooth scalar
 fields, rarely exhibit correlation between adjacent rows and columns.  Thus,
 the quality or compression ratio may suffer.
 
+For examples of dense linear solvers that use |zfp| for matrix storage,
+see `STRUMPACK <https://portal.nersc.gov/project/sparse/strumpack/>`__
+and `ButterflyPACK <https://portal.nersc.gov/project/sparse/butterflypack/>`__.
+
 -------------------------------------------------------------------------------
 
 .. _q-structured:
@@ -470,6 +478,20 @@ implies that after every *m* 64-bit words have been decoded, the bit stream
 is advanced by *m* |times| *n* words to the next set of m 64-bit words
 associated with the block.
 
+Rather than overcoming the technical challenges discussed here, we advocate a
+simpler, compressor-agnostic approach to progressive-precision access:
+
+#. V. Magri, P. Lindstrom,
+   "`A General Framework for Progressive Data Compression and Retrieval <https://doi.org/10.1109/TVCG.2023.3327186>`__,"
+   IEEE Transactions on Visualization and Computer Graphics, 2024.
+
+Additional benefits of this framework include:
+
+- No need to perform surgery on |zfp|.
+- Support for arbitrarily small |zfp| error tolerances (see
+  :ref:`Q17 <q-tolerance>`) and even lossless compression in the limit.
+- Easy integration with current file formats and I/O libraries.
+
 -------------------------------------------------------------------------------
 
 .. _q-init:
@@ -496,31 +518,49 @@ information independently.
 
 Q15: *Must I use the same parameters during compression and decompression?*
 
-A: Not necessarily.  When decompressing one block at a time, it is possible
-to use more tightly constrained :c:type:`zfp_stream` parameters during
-decompression than were used during compression.  For instance, one may use a
-larger :c:member:`zfp_stream.minbits`, smaller :c:member:`zfp_stream.maxbits`,
-smaller :c:member:`zfp_stream.maxprec`, or larger :c:member:`zfp_stream.minexp`
-during decompression to process fewer compressed bits than are stored, and to
-decompress the array more quickly at a lower precision.  This may be useful
-in situations where the precision and accuracy requirements are not known a
-priori, thus forcing conservative settings during compression, or when the
-compressed stream is used for multiple purposes.  For instance, visualization
-usually has less stringent precision requirements than quantitative data
-analysis.  This feature of decompressing to a lower precision is particularly
-useful when the stream is stored progressively (see :ref:`Q13 <q-progressive>`).
-
-Note that one may not use less constrained parameters during decompression,
-e.g., one cannot ask for more than :c:member:`zfp_stream.maxprec` bits of
-precision when decompressing.  Furthermore, the parameters must agree between
-compression and decompression when calling the high-level API function
-:c:func:`zfp_decompress`.
-
-Currently float arrays have a different compressed representation from
-compressed double arrays due to differences in exponent width.  It is not
-possible to compress a double array and then decompress (demote) the result
-to floats, for instance.  Future versions of the |zfp| codec may use a unified
-representation that does allow this.
+A: Usually, but there are exceptions.  When decompressing one block at a time
+using the :ref:`low-level API <ll-api>`, it is possible to use more tightly
+constrained :c:type:`zfp_stream` parameters during decompression than were
+used during compression.  For instance, one may use a smaller
+:c:member:`zfp_stream.maxbits`, smaller :c:member:`zfp_stream.maxprec`, or
+larger :c:member:`zfp_stream.minexp` during decompression to process fewer
+compressed bits than are stored, and to decompress the array more quickly at
+a lower precision.  This may be useful in situations where the precision and
+accuracy requirements are not known a priori, thus forcing conservative
+settings during compression, or when the compressed stream is used for
+multiple purposes.  For instance, visualization usually has less stringent
+precision requirements than quantitative data analysis.  This feature of
+decompressing to a lower precision is particularly useful when the stream is
+stored progressively (see :ref:`Q13 <q-progressive>`).
+
+Note, however, that when doing so, the caller must manually fast-forward
+the stream (using :c:func:`stream_rseek`) to the beginning of the next block
+before decompressing it, which may require extra bookkeeping.
+
+Also note that one may not use less constrained parameters during
+decompression, e.g., one cannot ask for more than
+:c:member:`zfp_stream.maxprec` bits of precision when decompressing.
+Furthermore, the parameters must agree between compression and decompression
+when calling the high-level API function :c:func:`zfp_decompress`.
+
+With regard to the :c:type:`zfp_field` struct passed to
+:c:func:`zfp_compress` and :c:func:`zfp_decompress`, field dimensions must
+generally match between compression and decompression, though see
+:ref:`Q32 <q-chunked>` on chunked (de)compression.  Strides, however, need
+not match; see :ref:`Q16 <q-strides>`.  Additionally, the scalar type,
+:c:type:`zfp_type`, must match.  For example, float arrays currently have a
+compressed representation different from compressed double arrays due to
+differences in exponent width.  It is not possible to compress a double array
+and then decompress (demote) the result to floats, for instance.  Future
+versions of the |zfp| codec may use a unified representation that does allow
+this.
+
+By default, compression parameters and array metadata are not stored in the
+compressed stream, as often such information is recorded separately.
+However, the user may optionally record both compression parameters and array
+metadata in a header at the beginning of the compressed stream; see
+:c:func:`zfp_write_header`, :c:func:`zfp_read_header`, and further discussion
+:ref:`here <field-match>`.
 
 -------------------------------------------------------------------------------
 
@@ -542,6 +582,33 @@ scalar fields can later be decompressed as non-interleaved fields::
 using strides *sx* = 1, *sy* = *nx* and pointers :code:`&out[0][0][0]`
 and :code:`&out[1][0][0]`.
 
+Another use case is when a compressed array is to be decompressed into a
+larger surrounding array.  For example, a 3D subarray with dimensions
+*mx* |times| *my* |times| *mz* may be decompressed into a larger array
+with dimensions *nx* |times| *ny* |times| *nz*, with *mx* |leq| *nx*,
+*my* |leq| *ny*, *mz* |leq| *nz*.  This can be achieved by setting the strides
+to *sx* = 1, *sy* = *nx*, *sz* = *nx* |times| *ny* upon decompression (using
+:c:func:`zfp_field_set_stride_3d`), while specifying *mx*, *my*, and *mz* as
+the field dimensions (using :c:func:`zfp_field_3d` or
+:c:func:`zfp_field_set_size_3d`).  In this case, one may also wish to offset
+the decompressed subarray to (*ox*, *oy*, *oz*) within the larger array
+using::
+
+  float* data = new float[nx * ny * nz];
+  float* pointer = data + ox * sx + oy * sy + oz * sz;
+
+where *data* specifies the beginning of the larger array.  *pointer* rather
+than *data* would then be passed to :c:func:`zfp_field_3d` or
+:c:func:`zfp_field_set_pointer` before calling :c:func:`zfp_decompress`.
+
+.. note::
+  Strides are a property of the in-memory layout of an uncompressed array
+  and have no meaning with respect to the compressed bit stream representation
+  of the array.  As such, |zfp| provides no mechanism for storing information
+  about the original strides in the compressed stream.  If strides are to be
+  retained upon decompression, then the user needs to record them as auxiliary
+  metadata and initialize :c:type:`zfp_field` with them.
+
 -------------------------------------------------------------------------------
 
 .. _q-tolerance:
@@ -551,17 +618,40 @@ Q17: *Why does zfp sometimes not respect my error tolerance?*
 A: First, |zfp| does not support
 :ref:`fixed-accuracy mode <mode-fixed-accuracy>` for integer data and
 will ignore any tolerance requested via :c:func:`zfp_stream_set_accuracy`
-or associated :ref:`expert mode <mode-expert>` parameter settings.
-
-For floating-point data, |zfp| does not store each scalar value independently
-but represents a group of values (4, 16, 64, or 256 values, depending on
-dimensionality) as linear combinations like averages by evaluating arithmetic
-expressions.  Just like in uncompressed IEEE floating-point arithmetic, both
-representation error and roundoff error in the least significant bit(s) often
-occur.
+or associated :ref:`expert mode <mode-expert>` parameter settings.  So this
+FAQ pertains to floating-point data only.
+
+The short answer is that, given finite precision, the |zfp| and IEEE
+floating-point number systems represent distinct subsets of the reals
+(or, in the case of |zfp|, blocks of reals).  Although these subsets have
+significant overlap, they are not equal.  Consequently, there are some
+combinations of floating-point values that |zfp| cannot represent exactly;
+conversely, there are some |zfp| blocks that cannot be represented exactly
+as IEEE floating point.  If the user-specified tolerance is smaller than the
+difference between the IEEE floating-point representation to be compressed
+and its closest |zfp| representation, then the tolerance necessarily will
+be violated (except in :ref:`reversible mode <mode-reversible>`).  In
+practice, absolute tolerances have to be extremely small relative to the
+numbers being compressed for this issue to occur, however.
+
+Note that this issue is not particular to |zfp| but occurs in the conversion
+between any two number systems of equal precision; we may just as well fault
+IEEE floating point for not being able to represent all |zfp| blocks
+accurately enough!  By analogy, not all 32-bit integers can be represented
+exactly in 32-bit floating point.  The integer 123456789 is one example; the
+closest float is 123456792.  And, obviously, not all floats (e.g., 0.5) can
+be represented exactly as integers.
+
+To further demonstrate this point, let us consider a concrete example.  |zfp|
+does not store each floating-point scalar value independently but represents
+a group of values (4, 16, 64, or 256 values, depending on dimensionality) as
+linear combinations like averages by evaluating arithmetic expressions.
+Just like in uncompressed IEEE floating-point arithmetic, both representation
+error and roundoff error in the least significant bit(s) often occur.
 
 To illustrate this, consider compressing the following 1D array of four
-floats::
+floats
+::
 
   float f[4] = { 1, 1e-1, 1e-2, 1e-3 };
 
@@ -579,20 +669,23 @@ is even smaller: 5.424e-9.  This reconstruction error is primarily due to
 |zfp|'s block-floating-point representation, which expresses the four values
 in a block relative to a single, common binary exponent.  Such exponent
 alignment occurs also in regular IEEE floating-point operations like addition.
-For instance,::
+For instance,
+::
 
   float x = (f[0] + f[3]) - 1;
 
 should of course result in :code:`x = f[3] = 1e-3`, but due to exponent
 alignment a few of the least significant bits of f[3] are lost in the
-addition, giving a result of :code:`x = 1.0000467e-3` and a roundoff error
-of 4.668e-8.  Similarly,::
+rounded result of the addition, giving :code:`x = 1.0000467e-3` and a
+roundoff error of 4.668e-8.  Similarly,
+::
 
   float sum = f[0] + f[1] + f[2] + f[3];
 
 should return :code:`sum = 1.111`, but is computed as 1.1110000610.  Moreover,
 the value 1.111 cannot even be represented exactly in (radix-2) floating-point;
-the closest float is 1.1109999.  Thus the computed error::
+the closest float is 1.1109999.  Thus the computed error
+::
 
   float error = sum - 1.111f;
 
@@ -600,7 +693,7 @@ which itself has some roundoff error, is 1.192e-7.
 
 *Phew*!  Note how the error introduced by |zfp| (5.472e-9) is in fact one to
 two orders of magnitude smaller than the roundoff errors (4.668e-8 and
-1.192e-7) introduced by IEEE floating-point in these computations.  This lower
+1.192e-7) introduced by IEEE floating point in these computations.  This lower
 error is in part due to |zfp|'s use of 30-bit significands compared to IEEE's
 24-bit single-precision significands.  Note that data sets with a large dynamic
 range, e.g., where adjacent values differ a lot in magnitude, are more
@@ -610,9 +703,9 @@ The moral of the story is that error tolerances smaller than machine epsilon
 (relative to the data range) cannot always be satisfied by |zfp|.  Nor are such
 tolerances necessarily meaningful for representing floating-point data that
 originated in floating-point arithmetic expressions, since accumulated
-roundoff errors are likely to swamp compression errors.  Because such roundoff
-errors occur frequently in floating-point arithmetic, insisting on lossless
-compression on the grounds of accuracy is tenuous at best.
+roundoff errors are likely to swamp compression errors.  Because such
+roundoff errors occur frequently in floating-point arithmetic, insisting on
+lossless compression on the grounds of accuracy is tenuous at best.
 
 -------------------------------------------------------------------------------
 
@@ -640,8 +733,8 @@ the number of blocks *bx* |times| *by*::
 
   bitsize = (4 * bx) * (4 * by) * rate
 
-where *nx* |leq| 4 |times| bx < *nx* + 4 and
-*ny* |leq| 4 |times| *by* < *ny* + 4.  When amortizing bitsize over the
+where *nx* |leq| 4 |times| *bx* < *nx* + 4 and
+*ny* |leq| 4 |times| *by* < *ny* + 4.  When amortizing *bitsize* over the
 *nx* |times| *ny* values, a slightly higher rate than requested may result.
 
 Third, to support updating compressed blocks, as is needed by |zfp|'s
@@ -677,8 +770,8 @@ uncompressed array to avoid having to allocate separate storage for the
 compressed stream.  |zfp| does allow for the possibility of such in-place
 compression, but with several caveats and restrictions:
 
-  1. A bitstream must be created whose buffer points to the beginning of
-     uncompressed (and to be compressed) storage.
+  1. A :c:type:`bitstream` must be created whose buffer points to the beginning
+     of uncompressed (and to be compressed) storage.
 
   2. The array must be compressed using |zfp|'s low-level API.  In particular,
      the data must already be partitioned and organized into contiguous blocks
@@ -690,8 +783,8 @@ compression, but with several caveats and restrictions:
      This is usually easily accomplished in fixed-rate mode, although the
      expert interface also allows guarding against this in all modes using the
      :c:member:`zfp_stream.maxbits` parameter.  This parameter should be set to
-     :code:`maxbits = 4^d * 8 * sizeof(type)`, where *d* is the array
-     dimensionality (1, 2, or 3) and where *type* is the scalar type of the
+     :code:`maxbits = 4^d * sizeof(type) * 8`, where *d* is the array
+     dimensionality (1, 2, 3, or 4) and where *type* is the scalar type of the
      uncompressed data.
 
   4. No header information may be stored in the compressed stream.
@@ -735,8 +828,11 @@ floating-point values and then losslessly compressing the result.  The
 *q* least significant bits of *n*-bit floating-point numbers (*n* = 32
 for floats and *n* = 64 for doubles) are truncated by |zfp| by specifying a
 maximum precision of *p* = *n* |minus| *q*.  The resulting point-wise relative
-error is then at most 2\ :sup:`q - 23` (for floats) or 2\ :sup:`q - 52`
-(for doubles).
+error is then at most 2\ :sup:`3 d + q - 23` for floats and
+2\ :sup:`3 d + q - 52` for doubles, where *d* is the dimensionality of
+the data (1 |leq| *d* |leq| 4).  Expressed in terms of *p*, the relative error
+is at most 2\ :sup:`3 (d + 3) - p` for floats and 2\ :sup:`3 (d + 4) - p`
+for doubles.
 
 .. note::
   For large enough *q*, floating-point exponent bits will be discarded,
@@ -745,15 +841,21 @@ error is then at most 2\ :sup:`q - 23` (for floats) or 2\ :sup:`q - 52`
   for subnormals; however, such values are likely too small for relative
   errors to be meaningful.
 
+.. warning::
+  For the bound to hold, |zfp| must be modified to avoid the non-reversible
+  code path when less than full precision is used.  This issue will be
+  addressed in the next |zfp| release.
+
 To bound the relative error, set the expert mode parameters to::
 
-  minbits = 0
-  maxbits = 0
+  minbits = ZFP_MIN_BITS
+  maxbits = ZFP_MAX_BITS
   maxprec = p
   minexp = ZFP_MIN_EXP - 1 = -1075
 
 For example, using the |zfpcmd| command-line tool, set the parameters using
-:option:`-c` :code:`0 0 p -1075`.
+:option:`-c` :code:`0 0 p -1075` (|zfpcmd| will replace the zeros with
+defaults).
 
 Note that while the above approach respects the error bound when the
 above conditions are met, it uses |zfp| for a purpose it was not designed
@@ -940,7 +1042,7 @@ and *q* = 55 + 4 |times| *d* |leq| 64 bits of precision for double-precision
 data.  Of course, the constraint imposed by the available integer precision
 *n* implies that lossless compression of such data is possible only in 1D for
 single-precision data and only in 1D and 2D for double-precision data.
-Finally, to preserve special values such as negative zero, plus and minues
+Finally, to preserve special values such as negative zero, plus and minus
 infinity, and NaNs, reversible mode is needed.
 
 -------------------------------------------------------------------------------
@@ -976,9 +1078,10 @@ for 3D data, while the difference is smaller for 2D and 1D data.
 We recommend experimenting with tolerances and evaluating what error levels
 are appropriate for each application, e.g., by starting with a low,
 conservative tolerance and successively doubling it.  The distribution of
-errors produced by |zfp| is approximately Gaussian, so even if the maximum
-error may seem large at an individual grid point, most errors tend to be
-much smaller and tightly clustered around zero.
+errors produced by |zfp| is approximately Gaussian (see
+:ref:`Q30 <q-err-dist>`), so even if the maximum error may seem large at
+an individual grid point, most errors tend to be much smaller and tightly
+clustered around zero.
 
 -------------------------------------------------------------------------------
 
@@ -1007,7 +1110,7 @@ Q24: *Are zfp's compressed arrays and other data structures thread-safe?*
 A: Yes, compressed arrays can be made thread-safe; no, data structures
 like :c:type:`zfp_stream` and :c:type:`bitstream` are not necessarily
 thread-safe.  As of |zfp| |viewsrelease|, thread-safe read and write access
-to compressed arrays is provided through the use of
+to compressed arrays via OpenMP threads is provided through the use of
 :ref:`private views <private_immutable_view>`, although these come with
 certain restrictions and requirements such as the need for the user to
 enforce cache coherence.  Please see the documentation on
@@ -1092,7 +1195,7 @@ switching from uncompressed to compressed arrays.
 
 Q27: *Do compressed arrays use reference counting?*
 
-A: It is possible to reference compressed  array elements via proxy
+A: It is possible to reference compressed-array elements via proxy
 :ref:`references <references>` and :ref:`pointers <pointers>`, through
 :ref:`iterators <iterators>`, and through :ref:`views <views>`.  Such
 indirect references are valid only during the lifetime of the underlying
@@ -1113,16 +1216,19 @@ and not known a priori.  The function :c:func:`zfp_stream_maximum_size`
 returns a buffer size that is guaranteed to be large enough.  This function,
 which should be called *after* setting the desired compression mode and
 parameters, computes the largest possible compressed data size based on the
-current settings and array size.  Note that by the pigeonhole principle, any
-(lossless) compressor must expand at least one input, so this buffer size may
-be larger than the size of the uncompressed input data.  :c:func:`zfp_compress`
-returns the actual number of bytes of compressed storage.
+current compression settings and array size.  Note that by the pigeonhole
+principle, any (lossless) compressor must expand at least one input, so this
+buffer size may be larger than the size of the uncompressed input data.
+:c:func:`zfp_compress` returns the actual number of bytes of compressed
+storage.
 
 When compressing individual blocks using the :ref:`low-level API <ll-api>`,
 it is useful to know the maximum number of bits that a compressed block
 can occupy.  In addition to the :c:macro:`ZFP_MAX_BITS` macro, the following
 table lists the maximum block size (in bits) for each scalar type, whether
 :ref:`reversible mode <mode-reversible>` is used, and block dimensionality.
+Note that these sizes are upper bounds that are independent of compression
+parameter settings, which may further constrain the storage size.
 
   +--------+---------+-------+-------+-------+-------+
   | type   | rev.    |   1D  |   2D  |   3D  |   4D  |
@@ -1143,3 +1249,279 @@ table lists the maximum block size (in bits) for each scalar type, whether
   | double +---------+-------+-------+-------+-------+
   |        | |check| |   278 |  1058 |  4178 | 16658 |
   +--------+---------+-------+-------+-------+-------+
+
+The function :c:func:`zfp_block_maximum_size` returns the block sizes encoded
+in this table.
+
+-------------------------------------------------------------------------------
+
+.. _q-printf:
+
+Q29: *How can I print array values?*
+
+A: Consider the following seemingly reasonable piece of code::
+
+  #include <cstdio>
+  #include "zfp/array1.hpp"
+
+  int main()
+  {
+    zfp::array1<double> a(100, 16.0);
+    printf("%f\n", a[0]); // does not compile
+    return 0;
+  }
+
+The compiler will complain about :code:`a[0]` being a non-POD object.  This
+is because :code:`a[0]` is a :ref:`proxy reference <references>` object
+rather than a :code:`double`.  To make this work, :code:`a[0]` must be
+explicitly converted to :code:`double`, e.g., using a cast::
+
+    printf("%f\n", (double)a[0]);
+
+For similar reasons, one may not use :code:`scanf` to initialize the value
+of :code:`a[0]` because :code:`&a[0]` is a :ref:`proxy pointer <pointers>`
+object, not a :code:`double*`.  Rather, one must use a temporary variable,
+e.g.
+::
+
+  double t;
+  scanf("%lf", &t);
+  a[0] = t;
+
+Note that using :code:`iostream`, expressions like
+::
+
+  std::cout << a[0] << std::endl;
+
+do work, but
+::
+
+  std::cin >> a[0];
+
+does not.
+
+-------------------------------------------------------------------------------
+
+.. _q-err-dist:
+
+Q30: *What is known about zfp compression errors?*
+
+A: Significant effort has been spent on characterizing compression errors
+resulting from |zfp|, as detailed in the following publications:
+
+#. P. Lindstrom,
+   "`Error Distributions of Lossy Floating-Point Compressors <https://www.osti.gov/servlets/purl/1526183>`__,"
+   JSM 2017 Proceedings.
+#. J. Diffenderfer, A. Fox, J. Hittinger, G. Sanders, P. Lindstrom,
+   "`Error Analysis of ZFP Compression for Floating-Point Data <https://doi.org/10.1137/18M1168832>`__,"
+   SIAM Journal on Scientific Computing, 2019.
+#. D. Hammerling, A. Baker, A. Pinard, P. Lindstrom,
+   "`A Collaborative Effort to Improve Lossy Compression Methods for Climate Data <https://doi.org/10.1109/DRBSD-549595.2019.00008>`__,"
+   5\ :sup:`th` International Workshop on Data Analysis and Reduction for Big Scientific Data, 2019.
+#. A. Fox, J. Diffenderfer, J. Hittinger, G. Sanders, P. Lindstrom,
+   "`Stability Analysis of Inline ZFP Compression for Floating-Point Data in Iterative Methods <https://doi.org/10.1137/19M126904X>`__,"
+   SIAM Journal on Scientific Computing, 2020.
+#. P. Lindstrom, J. Hittinger, J. Diffenderfer, A. Fox, D. Osei-Kuffuor, J. Banks,
+   "`ZFP: A Compressed Array Representation for Numerical Computations <https://doi.org/10.1177/10943420241284023>`__,"
+   International Journal of High-Performance Computing Applications, 2025.
+#. A. Fox, P. Lindstrom,
+   "`Enhancing ZFP: A Statistical Approach to Understanding and Reducing Error Bias in a Lossy Floating-Point Compression Algorithm <https://doi.org/10.48550/arXiv.2407.01826>`__,"
+   SIAM Journal on Scientific Computing, to appear.
+
+In short, |zfp| compression errors are roughly normally distributed as a
+consequence of the central limit theorem, and can be bounded.  Because the
+error distribution is normal and because the worst-case error is often much
+larger than errors observed in practice, it is common that measured errors
+are far smaller than the absolute error tolerance specified in
+:ref:`fixed-accuracy mode <mode-fixed-accuracy>`
+(see :ref:`Q22 <q-abserr>`).
+
+It is known that |zfp| errors can be slightly biased and correlated (see
+:numref:`zfp-rounding` and papers #3 and #6 above).  Recent work has been done
+to combat such issues by supporting optional :ref:`rounding modes <rounding>`.
+
+.. _zfp-rounding:
+.. figure:: zfp-rounding.pdf
+  :figwidth: 90 %
+  :align: center
+  :alt: "zfp rounding modes"
+
+  |zfp| errors are normally distributed.  This figure illustrates the
+  agreement between theoretical (lines) and observed (dots) error
+  distributions (*X*, *Y*, *Z*, *W*) for 1D blocks.  Without proper rounding
+  (left), errors are biased and depend on the relative location within a |zfp|
+  block, resulting in errors not centered on zero.  With proper rounding
+  (right), errors are both smaller and unbiased.
+
+It is also known how |zfp| compression errors behave as a function of grid
+spacing, *h*.  In particular, regardless of dimensionality, the compression
+error *decreases* with finer grids (smaller *h*) for a given rate (i.e.,
+fixed compressed storage size).  The |zfp| compression error decay is fast
+enough that the corresponding error in partial derivative estimates based on
+finite differences, which *increases* with smaller *h* when using conventional
+floating point, instead *decreases* with finer grids when using |zfp|.  See
+paper #5 for details.
+
+-------------------------------------------------------------------------------
+
+.. _q-block-size:
+
+Q31: *Why are zfp blocks 4* |times| *4* |times| *4 values?*
+
+A: One might ask why |zfp| uses *d*-dimensional blocks of |4powd| values and not
+some other, perhaps configurable block size, *n*\ :sup:`d`.  There are several
+reasons why *n* = 4 was chosen:
+
+* For good performance, *n* should be an integer power of two so that indexing
+  can be done efficiently using bit masks and shifts rather than more
+  expensive division and modulo operations.  As nontrivial compression demands
+  *n* > 1, possible choices for *n* are 2, 4, 8, 16, ...
+
+* When *n* = 2, blocks are too small to exhibit significant redundancy; there
+  simply is insufficient spatial correlation to exploit for sufficient data
+  reduction.  Additionally, excessive software cache thrashing would likely
+  occur for stencil computations, as even the smallest centered difference
+  stencil would span more than one block.  Finally, per-block overhead in
+  storage (e.g., shared exponent, bit offset) and computation (e.g., software
+  cache lookup) could be amortized over only a few values.  Such small blocks
+  were immediately dismissed.
+
+* When *n* = 8, blocks are too large, for several reasons:
+
+  * Each uncompressed block occupies a large number of hardware cache lines.
+    For example, a single 3D block of double-precision values would occupy
+    4,096 bytes, which would represent a significant fraction of L1 cache.
+    |zfp| reduces data movement in computations by ensuring that repeated
+    accesses are to cached data rather than to main memory.
+
+  * A generalization of the |zfp| :ref:`decorrelating transform <algorithm>`
+    to *n* = 8 would require many more operations as well as "arbitrary"
+    numerical constants in need of expensive multiplication instead of cheap
+    bit shifts.  The number of operations in this more general case scales as
+    *d* |times| *n*\ :sup:`d+1`.  For *d* = 4, *n* = 8, this implies
+    2\ :sup:`17` = 131,072 multiplications and 114,688 additions per block.
+    Contrast this with the algorithm optimized for *n* = 4, which uses only
+    1,536 bit shifts and 2,560 additions or subtractions per 4D block.
+
+  * The additional computational cost would also significantly increase the
+    latency of decoding a single block or filling a pipeline of concurrently
+    (de)compressed blocks, as in existing |zfp| hardware implementations.
+
+  * The computational and cache storage overhead of accessing a single value
+    in a block would be very large: 4,096 values in *d* = 4 dimensions would
+    have to be decoded even if only one value were requested.
+
+  * "Skinny" arrays would have to be padded to multiples of *n* = 8, which
+    could introduce an unacceptable storage overhead.  For instance, a
+    30 |times| 20 |times| 3 array of 1,800 values would be padded to
+    32 |times| 24 |times| 8 = 6,144 values, an increase of about 3.4 times.
+    In contrast, when *n* = 4, only 32 |times| 20 |times| 4 = 2,560 values
+    would be needed, representing a 42% overhead.
+
+  * The opportunity for data parallelism would be reduced by a factor of
+    2\ :sup:`d` compared to using *n* = 4.  The finer granularity and larger
+    number of blocks provided by *n* = 4 helps with load balancing and maps
+    well to today's GPUs that can concurrently process thousands of blocks.
+
+  * With blocks comprising as many as 8\ :sup:`4` = 4,096 values, register
+    spillage would be substantial in GPU kernels for compression and
+    decompression.
+
+The choice *n* = 4 seems to be a sweet spot that well balances all of the
+above factors.  Additionally, *n* = 4 has these benefits:
+
+  * *n* = 4 admits a very simple lifted implementation of the decorrelating
+    transform that can be performed using only integer addition, subtraction,
+    and bit shifts.
+
+  * *n* = 4 allows taking advantage of AVX/SSE instructions designed for
+    vectors of length four, both in the (de)compression algorithm and
+    application code.
+
+  * For 2D and 3D data, a block is 16 and 64 values, respectively, which
+    either equals or is close to the warp size on current GPU hardware.  This
+    allows multiple cooperating threads to execute the same instruction on one
+    value in 1-4 blocks (either during (de)compression or in the numerical
+    application code).
+
+  * Using a rate of 16 bits/value (a common choice for numerical computations),
+    a compressed 3D block occupies 128 bytes, or 1-2 hardware cache lines on
+    contemporary computers.  Hence, a fair number of *compressed* blocks can
+    also fit in hardware cache.
+
+-------------------------------------------------------------------------------
+
+.. _q-chunked:
+
+Q32: *Can zfp (de)compress a single array in chunks?*
+
+Yes, but there are restrictions.
+
+First, one can trivially partition any array into subarrays and (de)compress
+those independently using separate matching :c:func:`zfp_compress` and
+:c:func:`zfp_decompress` calls for each chunk.  Via subarray dimensions,
+strides, and pointers into the larger array, one can thus (de)compress the
+full array in pieces; see also :ref:`Q16 <q-strides>`.  This approach to
+chunked (de)compression incurs no constraints on compression mode, compression
+parameters, or array dimensions, though producer and consumer must agree on
+chunk size.  This type of chunking is employed by the |zfp| HDF5 filter
+`H5Z-ZFP <https://github.com/LLNL/H5Z-ZFP>`__ for I/O.
+
+A more restricted form of chunked (de)compression is to produce (compress) or
+consume (decompress) a single compressed stream for the whole array in chunks
+in a manner compatible with producing/consuming the entire stream all at once.
+Such chunked (de)compression divides the array into slabs along the slowest
+varying dimension (e.g., along *z* for 3D arrays), (de)compresses one slab at
+a time, and produces or consumes consecutive pieces of the sequential
+compressed stream.  This approach, too, is possible, though only when these
+requirements are met:
+
+* The size of each chunk (except the last) must be a whole multiple of four
+  along the slowest varying dimension; other dimensions are not subject to this
+  constraint.  For example, a 3D array with *nz* = 120 can be (de)compressed
+  in two or three equal-size chunks, but not four, since 120/2 = 60 and
+  120/3 = 40 are both divisible by four, but 120/4 = 30 is not.  Other viable
+  chunk sizes are 120/5 = 24, 120/6 = 20, 120/10 = 12, 120/15 = 8, and
+  120/30 = 4.  Note that other chunk sizes may be possible by relaxing the
+  constraint that they all be equal, as exploited by the
+  :ref:`chunk <ex-chunk>` code example, e.g., *nz* = 120 can be partitioned
+  into three chunks of size 32 and one of size 24.
+
+  The reason for this requirement is that |zfp| always pads each compressed
+  (sub)array to fill out whole blocks of size 4 in each dimension, and such
+  interior padding would not occur if the whole array were compressed as a
+  single chunk.
+
+* The length of the compressed substream for each chunk must be a multiple of
+  the :ref:`word size <word-size>`.  The reason for this is that each
+  :c:func:`zfp_compress` and :c:func:`zfp_decompress` call aligns the stream
+  on a word boundary upon completion.  One may avoid this requirement by using
+  the low-level API, which does not automatically perform such alignment.
+
+.. note::
+
+  When using the :ref:`high-level API <hl-api>`, the requirement on stream
+  alignment essentially limits chunked (de)compression to
+  :ref:`fixed-rate mode <mode-fixed-rate>`, as it is the only one that can
+  guarantee that the size of each compressed chunk is a multiple of the word
+  size.  To support other compression modes, use the
+  :ref:`low-level API <ll-api>`.
+
+Chunked (de)compression requires the user to set the :c:type:`zfp_field`
+dimensions to match the current chunk size and to set the
+:ref:`field pointer <zfp_field_set>` to the beginning of each uncompressed
+chunk before (de)compressing it.  The user may also have to position the
+compressed stream so that it points to the beginning of each compressed
+chunk.  See the :ref:`code example <ex-chunk>` for how one may implement
+chunked (de)compression.
+
+Note that the chunk size used for compression need not match the size used for
+decompression; e.g., the array may be compressed in a single sweep but
+decompressed in chunks, or vice versa.  Any combination of chunk sizes that
+respect the above constraints is valid.
+
+Chunked (de)compression makes it possible to perform, for example, windowed
+streaming computations on smaller subsets of the decompressed array at a time,
+i.e., without having to allocate enough space to hold the entire uncompressed
+array.  It also can be useful for overlapping or interleaving computation with
+(de)compression in a producer/consumer model.
diff --git a/docs/source/high-level-api.rst b/docs/source/high-level-api.rst
index 110b8fbbb..f3f59c425 100644
--- a/docs/source/high-level-api.rst
+++ b/docs/source/high-level-api.rst
@@ -5,15 +5,14 @@
 High-Level C API
 ================
 
-The C API is broken down into a :ref:`high-level API <hl-api>`,
-which handles compression of entire arrays, and a
-:ref:`low-level-api <ll-api>` for processing individual blocks
-and managing the underlying bit stream.
-
-The high-level API should be the API of choice for applications that
-compress and decompress entire arrays.  A :ref:`low-level API <ll-api>`
-exists for processing individual, possibly partial blocks as well as
-reduced-precision integer data less than 32 bits wide.
+The |libzfp| C API provides functionality for sequentially compressing and
+decompressing whole integer and floating-point arrays or single blocks.  It
+is broken down into a :ref:`high-level API <hl-api>` and a
+:ref:`low-level API <ll-api>`.  The high-level API handles compression of
+entire arrays and supports a variety of back-ends (e.g., serial, OpenMP).
+The low-level API exists for processing individual, possibly partial blocks
+as well as reduced-precision integer data less than 32 bits wide.
+Both C APIs are declared in :file:`zfp.h`.
 
 The following sections are available:
 
@@ -27,6 +26,7 @@ The following sections are available:
   * :ref:`hl-func-exec`
   * :ref:`hl-func-config`
   * :ref:`hl-func-field`
+  * :ref:`hl-func-headers`
   * :ref:`hl-func-codec`
 
 .. _hl-macros:
@@ -34,16 +34,84 @@ The following sections are available:
 Macros
 ------
 
+.. _version-id:
+
 .. c:macro:: ZFP_VERSION_MAJOR
 .. c:macro:: ZFP_VERSION_MINOR
 .. c:macro:: ZFP_VERSION_PATCH
+.. c:macro:: ZFP_VERSION_TWEAK
+
+  Macros identifying the |zfp| library version
+  (*major*.\ *minor*.\ *patch*.\ *tweak*).  :c:macro:`ZFP_VERSION_TWEAK`
+  is new as of |zfp| |verrelease| and is used to mark intermediate develop
+  versions (unofficial releases).
+
+----
+
+.. c:macro:: ZFP_VERSION_DEVELOP
+
+  Macro signifying that the current version is an intermediate version that
+  differs from the last official release.  This macro is undefined for
+  official releases; when defined, its value equals 1.  Note that this
+  macro may be defined even if the four :ref:`version identifiers <version-id>`
+  have not changed.  Available as of |zfp| |verrelease|.
+
+----
+
 .. c:macro:: ZFP_VERSION
+
+  A single integer constructed from the four
+  :ref:`version identifiers <version-id>`.  This integer can be generated by
+  :c:macro:`ZFP_MAKE_VERSION` or :c:macro:`ZFP_MAKE_FULLVERSION`.  Its value
+  equals the global constant :c:data:`zfp_library_version`.
+
+.. note::
+  Although :c:macro:`ZFP_VERSION` increases monotonically with release date
+  and with the four :ref:`version identifiers <version-id>` it depends on,
+  the mapping to :c:macro:`ZFP_VERSION` changed with the introduction of
+  :c:macro:`ZFP_VERSION_TWEAK` in |zfp| |verrelease|.
+
+  Going forward, we recommend using :c:macro:`ZFP_MAKE_VERSION` or
+  :c:macro:`ZFP_MAKE_FULLVERSION` in conditional code that depends on
+  :c:macro:`ZFP_VERSION`, e.g.,
+  :code:`#if ZFP_VERSION >= ZFP_MAKE_VERSION(1, 0, 0)`.
+  Note that such constructions should not be used with older versions of
+  |zfp|, e.g., :code:`if (zfp_library_version == ZFP_MAKE_VERSION(0, 5, 5))`
+  will not give the expected result with binary versions of |libzfp| before
+  version |verrelease|.
+
+----
+
 .. c:macro:: ZFP_VERSION_STRING
 
-  Macros identifying the |zfp| library version.  :c:macro:`ZFP_VERSION` is
-  a single integer constructed from the previous three macros.
-  :c:macro:`ZFP_VERSION_STRING` is a string literal.  See also
-  :c:data:`zfp_library_version` and :c:data:`zfp_version_string`.
+  :c:macro:`ZFP_VERSION_STRING` is a string literal composed of the three
+  to four :ref:`version identifiers <version-id>`.  The string does not
+  include the fourth identifier, :c:macro:`ZFP_VERSION_TWEAK`, if it is
+  zero.  For example, version ``1.2.3.0`` is identified as ``"1.2.3"``.  This
+  macro is one of the components of :c:data:`zfp_version_string`.
+
+----
+
+.. c:macro:: ZFP_MAKE_VERSION(major, minor, patch)
+.. c:macro:: ZFP_MAKE_VERSION_STRING(major, minor, patch)
+
+  Utility macros for constructing :c:macro:`ZFP_VERSION` and
+  :c:macro:`ZFP_VERSION_STRING`, respectively.  Available as of
+  |zfp| |verrelease|, these macros may be used by applications to test
+  for a certain |zfp| version number, e.g.,
+  :code:`#if ZFP_VERSION >= ZFP_MAKE_VERSION(1, 0, 0)`.
+
+----
+
+.. c:macro:: ZFP_MAKE_FULLVERSION(major, minor, patch, tweak)
+.. c:macro:: ZFP_MAKE_FULLVERSION_STRING(major, minor, patch, tweak)
+
+  Utility macros for constructing :c:macro:`ZFP_VERSION` and
+  :c:macro:`ZFP_VERSION_STRING`, respectively, that also include the tweak
+  version used by intermediate develop versions.  Available as of
+  |zfp| |verrelease|, these macros may be used by applications to test
+  for a certain |zfp| version number, e.g.,
+  :code:`#if ZFP_VERSION >= ZFP_MAKE_FULLVERSION(1, 0, 0, 2)`.
 
 ----
 
@@ -103,6 +171,20 @@ for how to read and write header information.
 
   Full header information (bitwise OR of all :code:`ZFP_HEADER` constants).
 
+----
+
+.. c:macro:: ZFP_MAGIC_BITS
+.. c:macro:: ZFP_META_BITS
+.. c:macro:: ZFP_MODE_SHORT_BITS
+.. c:macro:: ZFP_MODE_LONG_BITS
+.. c:macro:: ZFP_HEADER_MAX_BITS
+.. c:macro:: ZFP_MODE_SHORT_MAX
+
+  Number of bits used by each portion of the header.  These macros are
+  primarily informational and should not be accessed by the user through
+  the high-level API.  For most common compression parameter settings,
+  only :c:macro:`ZFP_MODE_SHORT_BITS` bits of header information are stored
+  to encode the mode (see :c:func:`zfp_stream_mode`).
 
 ----
 
@@ -148,29 +230,6 @@ bitwise ORed together.  Use :c:macro:`ZFP_DATA_ALL` to count all storage used.
 
   All storage (bitwise OR of all :code:`ZFP_DATA` constants).
 
-----
-
-.. c:macro:: ZFP_MAGIC_BITS
-.. c:macro:: ZFP_META_BITS
-.. c:macro:: ZFP_MODE_SHORT_BITS
-.. c:macro:: ZFP_MODE_LONG_BITS
-.. c:macro:: ZFP_HEADER_MAX_BITS
-.. c:macro:: ZFP_MODE_SHORT_MAX
-
-  Number of bits used by each portion of the header.  These macros are
-  primarily informational and should not be accessed by the user through
-  the high-level API.  For most common compression parameter settings,
-  only :c:macro:`ZFP_MODE_SHORT_BITS` bits of header information are stored
-  to encode the mode (see :c:func:`zfp_stream_mode`).
-
-.. c:macro: ZFP_ROUND_FIRST
-.. c:macro: ZFP_ROUND_NEVER
-.. c:macro: ZFP_ROUND_LAST
-
-  Available rounding modes for :c:macro:`ZFP_ROUNDING_MODE`, which
-  specifies at build time how |zfp| performs rounding in lossy compression
-  mode.
-
 
 .. _hl-types:
 
@@ -201,14 +260,21 @@ Types
   The :c:type:`zfp_stream` also stores information about how to execute
   compression, e.g., sequentially or in parallel.  The execution is determined
   by the policy and any policy-specific parameters such as number of
-  threads.
+  threads. 
   ::
 
     typedef struct {
-      zfp_exec_policy policy; // execution policy (serial, omp, ...)
-      zfp_exec_params params; // execution parameters
+      zfp_exec_policy policy; // execution policy (serial, omp, cuda, ...)
+      void* params;           // execution parameters
     } zfp_execution;
 
+.. warning::
+    As of |zfp| |verrelease|, :c:type:`zfp_execution` replaces the former
+    :code:`zfp_exec_params` union with a :code:`void*` that points to the
+    policy-specific parameters (e.g., :c:type:`zfp_exec_params_omp`) to
+    limit ABI-breaking changes due to future extensions to |zfp| execution
+    policies.
+
 ----
 
 .. c:type:: zfp_exec_policy
@@ -225,29 +291,17 @@ Types
 
 ----
 
-.. c:type:: zfp_exec_params
-
-  Execution parameters are shared among policies in a union.  Currently
-  the only parameters available are for OpenMP.
-  ::
-
-    typedef union {
-      zfp_exec_params_omp omp; // OpenMP parameters
-    } zfp_exec_params;
-
-----
-
 .. c:type:: zfp_exec_params_omp
 
   Execution parameters for OpenMP parallel compression.  These are
   initialized to default values.  When nonzero, they indicate the number
-  of threads to request for parallel compression and the number of 1D
-  blocks to assign to each thread when compressing 1D arrays.
+  of threads to request for parallel compression and the number of
+  consecutive blocks to assign to each thread.
   ::
 
     typedef struct {
       uint threads;    // number of requested threads
-      uint chunk_size; // number of blocks per chunk (1D only)
+      uint chunk_size; // number of blocks per chunk
     } zfp_exec_params_omp;
 
 ----
@@ -294,10 +348,10 @@ Types
 
 .. c:type:: zfp_type
 
-  Enumerates the scalar types supported by the compressor, and is used to
-  describe the uncompressed array.  The compressor and decompressor must use
-  the same :c:type:`zfp_type`, e.g., one cannot compress doubles and decompress
-  to floats or integers.
+  Enumerates the scalar types supported by the compressor and describes the
+  uncompressed array.  The compressor and decompressor must use the same
+  :c:type:`zfp_type`, e.g., one cannot compress doubles and decompress to
+  floats or integers.
   ::
 
     typedef enum {
@@ -342,16 +396,26 @@ Types
   The strides, when nonzero, specify how the array is laid out in memory.
   Strides can be used in case multiple fields are stored interleaved via
   "array of struct" (AoS) rather than "struct of array" (SoA) storage,
-  or if the dimensions should be transposed during (de)compression.
-  Strides may even be negative, allowing one or more dimensions to be
-  traversed in reverse order.  Given 4D array indices (*x*, *y*, *z*, *w*),
-  the corresponding array element is stored at
-  ::
+  or if the dimensions should be transposed during (de)compression; see
+  :ref:`this FAQ <q-strides>`.  Strides may even be negative, allowing one
+  or more dimensions to be traversed in reverse order.  Given 4D array
+  indices (*x*, *y*, *z*, *w*), the corresponding array element is stored
+  at::
 
     data[x * sx + y * sy + z * sz + w * sw]
 
   where :code:`data` is a pointer to the first array element.
 
+.. _field-match:
+.. warning::
+  The user must ensure that the scalar type and field dimensions *nx*, *ny*,
+  *nz*, and *nw* used during compression are matched during decompression.
+  Such metadata must either be maintained separately by the user or be
+  embedded in the compressed stream using :c:func:`zfp_write_header` and
+  recovered during decompression using :c:func:`zfp_read_header`.  Strides,
+  on the other hand, need not match, which can be exploited, for example, to
+  decompress the array into a larger array.  See :ref:`this FAQ <q-same>` and
+  the :ref:`tutorial <tut-hl>` for more details.
 
 .. _new-field:
 .. warning::
@@ -426,9 +490,8 @@ Constants
 .. c:var:: const char* const zfp_version_string
 
   A constant string representing the |zfp| library version and release date.
-  One can search for this string in executables and libraries that use |zfp|
-  to determine which version of the library the application was compiled
-  against.
+  One can search for this string in executables and libraries that link to
+  |libzfp| when built as a static library.
 
 .. _hl-functions:
 
@@ -495,6 +558,11 @@ Compressed Stream
   parameters stored in *stream* and the array whose scalar type and dimensions
   are given by *field*.  This function may be used to determine how large a
   memory buffer to allocate to safely hold the entire compressed array.
+  The buffer may then be resized (using :code:`realloc()`) after the actual
+  number of bytes is known, as returned by :c:func:`zfp_compress`.
+
+  This function returns zero if the size exceeds what can be represented in
+  a :code:`size_t`.
 
 
 .. _hl-func-stream:
@@ -551,7 +619,7 @@ Compression Parameters
   and indirectly governs the relative error.  The actual precision is
   returned, e.g., in case the desired precision is out of range.  To
   preserve a certain floating-point mantissa or integer precision in the
-  decompressed data, see :ref:`FAQ #21 <q-lossless>`.
+  decompressed data, see FAQ :ref:`#21 <q-lossless>`.
 
 ----
 
@@ -569,7 +637,7 @@ Compression Parameters
   :ref:`fixed-accuracy mode <mode-fixed-accuracy>`.  The tolerance ensures
   that values in the decompressed array differ from the input array by no
   more than this tolerance (in all but exceptional circumstances; see
-  :ref:`FAQ #17 <q-tolerance>`).  This compression mode should be used only
+  FAQ :ref:`#17 <q-tolerance>`).  This compression mode should be used only
   with floating-point (not integer) data.
 
 ----
@@ -584,7 +652,7 @@ Compression Parameters
   variable-length encoding can be used to economically encode and decode
   the compression parameters, which is especially important if the parameters
   are to vary spatially over small regions.  Such spatially adaptive coding
-  would have to be implemented via the low-level API.
+  would have to be implemented via the :ref:`low-level API <ll-api>`.
 
 ----
 
@@ -619,7 +687,7 @@ Execution Policy
 
 .. c:function:: zfp_exec_policy zfp_stream_execution(const zfp_stream* stream)
 
-  Return current execution policy.
+  Return current :ref:`execution policy <execution>`.
 
 ----
 
@@ -639,9 +707,9 @@ Execution Policy
 
 .. c:function:: zfp_bool zfp_stream_set_execution(zfp_stream* stream, zfp_exec_policy policy)
 
-  Set execution policy.  If different from the previous policy, initialize
-  the execution parameters to their default values.  :code:`zfp_true` is
-  returned if the execution policy is supported.
+  Set :ref:`execution policy <execution>`.  If different from the previous
+  policy, initialize the execution parameters to their default values.
+  :code:`zfp_true` is returned if the execution policy is supported.
 
 ----
 
@@ -818,6 +886,14 @@ Array Metadata
 
 ----
 
+.. c:function:: size_t zfp_field_blocks(const zfp_field* field)
+
+  Return total number of *d*-dimensional blocks (whether partial or whole)
+  spanning the array.  Each whole block consists of |4powd| scalars.
+  Available since |zfp| |fieldrelease|.
+
+----
+
 .. c:function:: zfp_bool zfp_field_stride(const zfp_field* field, ptrdiff_t* stride)
 
   Return :code:`zfp_false` if the array is stored contiguously as
@@ -927,6 +1003,105 @@ Array Metadata
   Return :code:`zfp_true` upon success.  See :c:func:`zfp_field_metadata` for
   how to encode *meta*.
 
+.. index::
+   single: Header
+
+.. _headers:
+.. _hl-func-headers:
+
+Stream Headers
+^^^^^^^^^^^^^^
+
+When decompressing data, |zfp| needs sufficient information to determine
+:ref:`compression mode <modes>` and settings (like rate, precision, or
+accuracy) as well as the underlying type and dimensions of the uncompressed
+array.  The compressed stream itself does not store this information unless
+the user encodes it in a short, optional header.  The functions below allow
+writing such headers on compression and later reading them during
+decompression.
+
+|zfp| headers have been designed to be very concise representations of
+:ref:`compression parameters <hl-func-stream>` and
+:ref:`array metadata <hl-func-field>` to accommodate compression of many
+small (sub)arrays or spatially varying compression settings with low storage
+overhead.  In most cases, 64 bits of header data suffice to describe both
+array and compression parameters.
+
+Headers are divided into three separate fields, each of which is optional
+and included in the header by passing a :ref:`bit mask <header-macros>` to
+one of the functions below:
+
+* :c:macro:`ZFP_HEADER_MAGIC`: A 32-bit "magic" constant for identifying the
+  stream as |zfp| data.
+
+* :c:macro:`ZFP_HEADER_META`: A 52-bit metadata field that specifies the type
+  and shape of the uncompressed array.  This field encodes one of four scalar
+  types (32- and 64-bit integer and floating-point types), one of four
+  dimensionalities 1 |leq| *d* |leq| 4, and the array dimensions (or shape).
+  The array dimensions supported are limited to 48 / *d* bits each.  For
+  example, when *d* = 2, 24 bits each are used to encode the number of rows
+  and columns, each of which must be at most 2\ :sup:`24`.  If dimensions
+  exceed these limits, an error code is generated.
+
+* :c:macro:`ZFP_HEADER_MODE`: A 12- or 64-bit compression settings field.
+  This variable-length encoding has been designed to support most common
+  compression parameters through only 12 bits, which when combined with the
+  52-bit metadata supports magic-less headers of 64 bits (perhaps for many
+  small subarrays with their own sizes and compression settings).  The
+  following compression modes and parameters are supported in the short 12-bit
+  encoding:
+
+  - :ref:`Fixed-rate mode <mode-fixed-rate>`: 1 to 2048 bits per block of
+    |4powd| values.  For example, all possible rates up to
+    2048 / 4\ :sup:`3` = 32 bits/value are supported for 3D arrays.
+
+  - :ref:`Fixed-precision mode <mode-fixed-precision>`: 1 to 128 bits of
+    precision, which covers all precisions currently supported by |zfp|.
+
+  - :ref:`Fixed-accuracy mode <mode-fixed-accuracy>`: All tolerances in
+    the range [2\ :sup:`-1074`, 2\ :sup:`843`] |approx|
+    [5 |times| 10\ :sup:`-324`, 6 |times| 10\ :sup:`253`].
+    Note that |zfp| error tolerances are limited to integer powers of two
+    (|zfp| will round down other tolerances).
+
+  - :ref:`Reversible mode <mode-reversible>`: Takes no compression parameters.
+
+  If compression parameters do not respect these constraints, then a longer
+  64-bit encoding is used that supports all |zfp| compression modes and
+  parameter settings, including any :ref:`expert-mode <mode-expert>` settings
+  not covered by this list.  A reserved 12-bit code is used to denote that
+  52 additional bits follow for a full 64-bit encoding.
+
+.. note::
+  Stream headers are not necessarily comprised of a whole number of
+  :ref:`words <word-size>`.  Hence, the payload compressed data may not be
+  word aligned when headers are prepended.
+
+The |zfp| high-level API provides the following two functions for writing and
+reading headers.  The header should be written before compressing data and
+read before decompressing data so that the :c:struct:`zfp_stream` and
+:c:struct:`zfp_field` objects can be properly initialized for decompression.
+
+.. c:function:: size_t zfp_write_header(zfp_stream* stream, const zfp_field* field, uint mask)
+
+  Write an optional variable-length header to the stream that encodes
+  compression parameters, array metadata, etc.  The header information written
+  is determined by the bit *mask* (see :c:macro:`macros <ZFP_HEADER_MAGIC>`).
+  Unlike in :c:func:`zfp_compress`, no word alignment is enforced.  See the
+  :ref:`limitations <limitations>` section for limits on the maximum array
+  size supported by the header.  The return value is the number of bits
+  written, or zero upon failure.
+
+----
+
+.. c:function:: size_t zfp_read_header(zfp_stream* stream, zfp_field* field, uint mask)
+
+  Read header previously written using :c:func:`zfp_write_header`.  The
+  *stream* and *field* data structures are populated with the information
+  stored in the header, as specified by the bit *mask* (see
+  :c:macro:`macros <ZFP_HEADER_MAGIC>`).  The caller must ensure that *mask*
+  agrees between header read and write calls.  The return value is the number
+  of bits read, or zero upon failure.
 
 .. _hl-func-codec:
 
@@ -952,26 +1127,10 @@ Compression and Decompression
   i.e., the current byte offset or the number of compressed bytes consumed.
   Zero is returned if decompression failed.
 
-----
-
-.. _zfp-header:
-.. c:function:: size_t zfp_write_header(zfp_stream* stream, const zfp_field* field, uint mask)
-
-  Write an optional variable-length header to the stream that encodes
-  compression parameters, array metadata, etc.  The header information written
-  is determined by the bit *mask* (see :c:macro:`macros <ZFP_HEADER_MAGIC>`).
-  Unlike in :c:func:`zfp_compress`, no word alignment is enforced.  See the
-  :ref:`limitations <limitations>` section for limits on the maximum array
-  size supported by the header.  The return value is the number of bits
-  written, or zero upon failure.
-
-----
-
-.. c:function:: size_t zfp_read_header(zfp_stream* stream, zfp_field* field, uint mask)
-
-  Read header if one was previously written using :c:func:`zfp_write_header`.
-  The *stream* and *field* data structures are populated with the information
-  stored in the header, as specified by the bit *mask* (see
-  :c:macro:`macros <ZFP_HEADER_MAGIC>`).  The caller must ensure that *mask*
-  agrees between header read and write calls.  The return value is the number
-  of bits read, or zero upon failure.
+  The field dimensions and scalar type used during decompression must match
+  those used during compression (see :c:type:`zfp_field`).  This function
+  further requires that :c:type:`zfp_stream` is initialized with the same
+  :ref:`compression parameters <modes>` used during compression.  To assist
+  with this, an optional :ref:`header <headers>` may be prepended that
+  encodes such metadata, which must be explicitly read using
+  :c:func:`zfp_read_header` to initialize *stream* and *field*.
diff --git a/docs/source/index.inc b/docs/source/index.inc
index 40e354eb1..ede2b27cf 100644
--- a/docs/source/index.inc
+++ b/docs/source/index.inc
@@ -13,9 +13,9 @@ is constant, and the bit offset to each block can be quickly computed.  For
 variable-rate arrays, the compressed block size is data dependent, and
 additional information must be stored to index the blocks.  Toward this end,
 |zfp| arrays make use of an index class that reports the offset and
-size (in number of bits) of each block.  The |zfp| :cpp:class:`zfp::array`
-and :cpp:class:`zfp::const_array` take such an index class as a template
-parameter.  This index class is new as of |zfp| |carrrelease|, which
+size (in number of bits) of each block.  The :cpp:class:`zfp::array`
+and :cpp:class:`zfp::const_array` classes take such an index class as a
+template parameter.  This index class is new as of |zfp| |carrrelease|, which
 introduced variable-rate arrays.
 
 Because |zfp| is designed primarily for very large arrays, the bit offset
@@ -44,7 +44,7 @@ and speed of access:
   The top 32 bits of a 44-bit base offset are stored, with the 12 least
   significant bits of this base set to zero.  Four unsigned 16-bit deltas
   from the base offset complete the representation.  The default for
-  variable-rate arrays, this index offers a good tradeoff between storage,
+  variable-rate arrays, this index offers a good trade-off between storage,
   offset range, and speed.
 
 * :cpp:class:`hybrid8`: Eight consecutive offsets are encoded together
@@ -95,7 +95,7 @@ interface with the |zfp| compressed-array classes.
 
 ----
 
-.. cpp:function:: size_t index::range() const
+.. cpp:function:: bitstream_size index::range() const
 
   Range of bit offsets spanned by index.  This equals the total number of
   bits of compressed-array data.
@@ -108,7 +108,7 @@ interface with the |zfp| compressed-array classes.
 
 ----
 
-.. cpp:function:: size_t index::block_offset(size_t block_index) const
+.. cpp:function:: bitstream_offset index::block_offset(size_t block_index) const
 
   Bit offset to compressed block data.
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0a423ea59..d979a4950 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -10,6 +10,7 @@
    introduction
    license
    installation
+   configuration
    algorithm
    modes
    execution
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index e7ae4c2d4..56e6d4b69 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -15,19 +15,12 @@ they call the compression library, applications must link with |libzfp|.
 
 |zfp| is preferably built using `CMake <https://cmake.org>`__, although the
 core library can also be built using GNU make on Linux, macOS, and MinGW.
-|zfp| has successfully been built and tested using these compilers:
-
-* gcc versions 4.4.7, 4.7.3, 4.8.5, 4.9.4, 5.5.0, 6.1.0, 6.4.0, 7.1.0, 7.3.0, 8.1.0
-* icc versions 14.0.3, 15.0.6, 16.0.4, 17.0.2, 18.0.2, 19.0.3
-* clang versions 3.9.1, 4.0.0, 5.0.0, 6.0.0
-* MinGW version 5.3.0
-* Visual Studio versions 14 (2015), 15 (2017)
 
 |zfp| conforms to various language standards, including C89, C99, C++98,
 C++11, and C++14.
 
 .. note::
-  |zfp| requires 64-bit compiler and operating system support.
+  |zfp| requires compiler support for 64-bit integers.
 
 .. _cmake_builds:
 
@@ -37,7 +30,7 @@ CMake Builds
 To build |zfp| using `CMake <https://cmake.org>`__ on Linux or macOS, start
 a Unix shell and type::
 
-    cd zfp-0.5.5
+    cd zfp-1.0.1
     mkdir build
     cd build
     cmake ..
@@ -55,7 +48,7 @@ By default, CMake builds will attempt to locate and use
 To build |zfp| using Visual Studio on Windows, start a DOS shell
 and type::
 
-    cd zfp-0.5.5
+    cd zfp-1.0.1
     mkdir build
     cd build
     cmake ..
@@ -74,7 +67,7 @@ GNU Builds
 To build |zfp| using `gcc <https://gcc.gnu.org>`__ without
 `OpenMP <http://www.openmp.org>`__, type::
 
-    cd zfp-0.5.5
+    cd zfp-1.0.1
     gmake
 
 This builds |libzfp| as a static library as well as the |zfp|
@@ -138,7 +131,7 @@ Regardless of the settings below, |libzfp| will always be built.
 
 .. c:macro:: BUILD_CFP
 
-  Build |libcfp| for C bindings to compressed arrays.
+  Build |libcfp| for C bindings to the compressed-array classes.
   Default: off.
 
 
@@ -161,7 +154,7 @@ Regardless of the settings below, |libzfp| will always be built.
 .. c:macro:: BUILD_ZFORP
 
   Build |libzforp| for Fortran bindings to the C API.  Requires Fortran
-  standard 2003 or later.  GNU make users may specify the Fortran compiler
+  standard 2018 or later.  GNU make users may specify the Fortran compiler
   to use via
   ::
 
@@ -184,26 +177,31 @@ Regardless of the settings below, |libzfp| will always be built.
 
 .. c:macro:: BUILD_TESTING
 
-  Build |testzfp| and (when on the GitHub
-  `develop branch <https://github.com/LLNL/zfp/tree/develop>`__) unit tests.
+  Build |testzfp| tests.
   Default: on.
 
 
+.. c:macro:: BUILD_TESTING_FULL
+
+  Build all unit tests.
+  Default: off.
+
+
 .. c:macro:: BUILD_SHARED_LIBS
 
   Build shared objects (:file:`.so`, :file:`.dylib`, or :file:`.dll` files).
-  On macOS, the :code:`SOFLAGS` line in the :file:`Config` file may have
-  to be uncommented when building with GNU make.
   CMake default: on.
   GNU make default: off.
 
+.. note::
+  On macOS, add :code:`OS=mac` when building shared libraries with GNU make.
 
 .. index::
    single: Configuration
-.. _config:
+.. _settings:
 
 
-Configuration
+Build Options
 -------------
 
 The behavior of |zfp| can be configured at compile time via a set of macros
@@ -212,6 +210,9 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
 
     cmake -DZFP_WITH_OPENMP=OFF ..
 
+Some of the settings that impact |zfp|'s behavior and what ultimately is
+stored in the compressed stream are discussed in greater detail in
+the :ref:`config` section.
 
 .. c:macro:: ZFP_INT64
 .. c:macro:: ZFP_INT64_SUFFIX
@@ -234,7 +235,9 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
   0 or OFF to disable OpenMP support.  For GNU builds, OpenMP is disabled by
   default.  Set this macro to 1 or ON to enable OpenMP support.  See also
   OMPFLAGS in :file:`Config` in case the compiler does not recognize
-  :code:`-fopenmp`.  NOTE: clang currently does not support OpenMP on macOS.
+  ``-fopenmp``.  For example, Apple clang requires
+  ``OMPFLAGS=-Xclang -fopenmp``, ``LDFLAGS=-lomp``, and an installation of
+  ``libomp``.
   CMake default: on.
   GNU make default: off.
 
@@ -252,28 +255,32 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
   CMake default: off.
   GNU make default: off and ignored.
 
+.. _rounding-parameter:
+
 .. c:macro:: ZFP_ROUNDING_MODE
 
   **Experimental feature**.  By default, |zfp| coefficients are truncated,
-  not rounded, which can result in biased errors.  To counter this, two
-  rounding modes are available: :code:`ZFP_ROUND_FIRST` (round during
-  compression; analogous to mid-tread quantization) and :code:`ZFP_ROUND_LAST`
-  (round during decompression; analogous to mid-riser quantization).
-  With :code:`ZFP_ROUND_LAST`, the values returned on decompression are
-  slightly modified (and usually closer to the original values) without
-  impacting the compressed data itself.  This rounding mode works with all
+  not rounded, which can result in biased errors (see
+  FAQ :ref:`#30 <q-err-dist>`).  To counter this, two rounding modes are
+  available: :c:macro:`ZFP_ROUND_FIRST` (round during compression; analogous
+  to mid-tread quantization) and :c:macro:`ZFP_ROUND_LAST` (round during
+  decompression; analogous to mid-riser quantization).  With
+  :c:macro:`ZFP_ROUND_LAST`, the values returned on decompression are slightly
+  modified (and usually closer to the original values) without impacting the
+  compressed data itself.  This rounding mode works with all
   :ref:`compression modes <modes>`.
-  With :code:`ZFP_ROUND_FIRST`, the values are modified before compression,
+  With :c:macro:`ZFP_ROUND_FIRST`, the values are modified before compression,
   thus impacting the compressed stream.  This rounding mode tends to be more
   effective at reducing bias, but is invoked only with
   :ref:`fixed-precision <mode-fixed-precision>` and
   :ref:`fixed-accuracy <mode-fixed-accuracy>` compression modes.
   Both of these rounding modes break the regression tests since they alter
   the compressed or decompressed representation, but they may be used with
-  libraries built with the default rounding mode, :code:`ZFP_ROUND_NEVER`,
+  libraries built with the default rounding mode, :c:macro:`ZFP_ROUND_NEVER`,
   and versions of |zfp| that do not support a rounding mode with no adverse
-  effects.
-  Default: :code:`ZFP_ROUND_NEVER`.
+  effects.  For additional information, see the detailed :ref:`rounding`
+  section.
+  Default: :c:macro:`ZFP_ROUND_NEVER`.
 
 .. c:macro:: ZFP_WITH_TIGHT_ERROR
 
@@ -284,7 +291,9 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
   to be satisfied using fewer bits of compressed data.  As a result, when
   enabled, the observed maximum absolute error is closer to the tolerance and
   the compression ratio is increased.  This feature requires the rounding mode
-  to be :code:`ZFP_ROUND_FIRST` or :code:`ZFP_ROUND_LAST`.
+  to be :c:macro:`ZFP_ROUND_FIRST` or :c:macro:`ZFP_ROUND_LAST` and is
+  supported only by the :code:`serial` and :code:`omp`
+  :ref:`execution policies <execution>`.
   Default: undefined/off.
 
 .. c:macro:: ZFP_WITH_DAZ
@@ -301,6 +310,9 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
   results in "random" subnormals upon decompression.  When enabled, compressed
   streams may differ slightly but are decompressed correctly by libraries
   built without this option.  This option may break some regression tests.
+  Note: :c:macro:`ZFP_WITH_DAZ` is currently ignored by all
+  :ref:`execution policies <execution>` other than :code:`serial` and
+  :code:`omp`.
   Default: undefined/off.
 
 .. c:macro:: ZFP_WITH_ALIGNED_ALLOC
@@ -331,11 +343,14 @@ in the same manner that :ref:`build targets <targets>` are specified, e.g.,
   Default: undefined/off.
 
 
+.. _word-size-parameter:
+
 .. c:macro:: BIT_STREAM_WORD_TYPE
 
   Unsigned integer type used for buffering bits.  Wider types tend to give
-  higher performance at the expense of lower bit rate granularity.  For
-  portability of compressed files between little and big endian platforms,
+  higher performance at the expense of lower
+  :ref:`bit rate granularity <q-granularity>`.  For portability of compressed
+  files between little and big endian platforms,
   :c:macro:`BIT_STREAM_WORD_TYPE` should be set to :c:type:`uint8`.
   Default: :c:type:`uint64`.
 
@@ -427,4 +442,4 @@ The necessary dependencies can be installed using ``pip`` and the |zfp|
 Fortran
 ^^^^^^^
 
-The optional Fortran bindings require a Fortran 2003 compiler.
+The optional Fortran bindings require a Fortran 2018 compiler.
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 8e9852747..2757eab0f 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -41,6 +41,8 @@ exactly, is also supported.
 `Lawrence Livermore National Laboratory <https://www.llnl.gov>`__
 and is supported by the U.S. Department of Energy's
 `Exascale Computing Project <https://www.exascaleproject.org>`__.
+|zfp| is a
+`2023 R&D 100 Award Winner <https://www.rdworldonline.com/2023-rd-100-award-winners/>`__.
 
 
 Availability
@@ -55,12 +57,14 @@ source code is recommended for users who wish to configure the internals of
 bindings) to install.
 
 |zfp| is also available through several package managers, including
-`Conda <https://anaconda.org/conda-forge/zfpy>`__,
-`PIP <https://pypi.org/project/zfpy>`__, and
-`Spack <https://spack.readthedocs.io/en/latest/package_list.html#zfp>`__.
-`RPM packages <https://repology.org/project/zfp/versions>`__ are available
-for several Linux distributions and may be installed using :code:`apt` or
-:code:`yum`.
+Conda (both `C/C++ <https://anaconda.org/conda-forge/zfp>`__ and
+`Python <https://anaconda.org/conda-forge/zfpy>`__ packages are available),
+`PIP <https://pypi.org/project/zfpy>`__,
+`Spack <https://packages.spack.io/package.html?name=zfp>`__, and
+`MacPorts <https://ports.macports.org/port/zfp/details/>`__.
+`Linux packages <https://repology.org/project/zfp/versions>`__ are available
+for several distributions and may be installed, for example, using :code:`apt`
+or :code:`yum`.
 
 
 .. _app-support:
@@ -71,18 +75,27 @@ Application Support
 |zfp| has been incorporated into several independently developed applications,
 plugins, and formats, such as
 
-* `Compressed file I/O <https://adios2.readthedocs.io/en/latest/api_high/api_high.html?highlight=zfp#_CPPv4I0EN6adios27fstream5writeEvRKNSt6stringEPK1TRKN6adios24DimsERKN6adios24DimsERKN6adios24DimsERKN6adios27vParamsEKb>`__
+* `Compressed file I/O <https://adios2.readthedocs.io/en/latest/operators/CompressorZFP.html>`__
   in `ADIOS <https://www.olcf.ornl.gov/center-projects/adios/>`__.
 
+* `Compression codec <https://www.blosc.org/posts/support-lossy-zfp/>`__
+  in the `BLOSC <https://www.blosc.org>`__ meta compressor.
+
 * `H5Z-ZFP <https://github.com/LLNL/H5Z-ZFP>`__ plugin for
   `HDF5 <https://www.hdfgroup.org/solutions/hdf5/>`__\ |reg|.  |zfp| is also one of the
   select compressors shipped with
   `HDF5 binaries <https://www.hdfgroup.org/downloads/hdf5/>`__.
 
-* `Compression functions <https://software.intel.com/en-us/ipp-dev-reference-zfp-compression-functions>`__
+* `Compression functions <https://www.intel.com/content/www/us/en/developer/articles/technical/parallel-compression-and-decompression-in-intel-integrated-performance-primitives-zfp-.html>`__
   for Intel\ |reg| `Integrated Performance Primitives <https://software.intel.com/en-us/intel-ipp>`__.
 
-* `Compression CODEC <https://community.opengroup.org/osdu/platform/domain-data-mgmt-services/seismic/open-zgy/-/raw/master/doc/compress.html>`__
+* `Compressed MPI messages <https://doi.org/10.1109/IPDPS49936.2021.00053>`__
+  in `MVAPICH2-GDR <https://mvapich.cse.ohio-state.edu/userguide/gdr/>`__.
+
+* `Compressed file I/O <https://www.openinventor.com/en/features/oil-gas-geoscience/zfp-compression/>`__
+  in `OpenInventor <https://www.openinventor.com>`__\ |tm|.
+
+* `Compression codec <https://community.opengroup.org/osdu/platform/domain-data-mgmt-services/seismic/open-zgy/-/raw/master/doc/compress.html>`__
   underlying the
   `OpenZGY <https://community.opengroup.org/osdu/platform/domain-data-mgmt-services/seismic/open-zgy>`__
   format.
@@ -96,6 +109,8 @@ plugins, and formats, such as
 * `Compression worklet <http://m.vtk.org/documentation/namespacevtkm_1_1worklet_1_1zfp.html>`__
   in `VTK-m <http://m.vtk.org>`__.
 
+* `Compression codec <https://numcodecs.readthedocs.io/en/stable/zfpy.html>`__ in `Zarr <https://github.com/zarr-developers/zarr-python>`__ via `numcodecs <https://github.com/zarr-developers/numcodecs>`__.
+
 See
 `this list <https://computing.llnl.gov/projects/floating-point-compression/related-projects>`__
 for other software products that support |zfp|.
diff --git a/docs/source/issues.rst b/docs/source/issues.rst
index ecd12d8ee..fdef39d26 100644
--- a/docs/source/issues.rst
+++ b/docs/source/issues.rst
@@ -74,7 +74,7 @@ as two scalar fields::
 
   vfield[2][ny][nx]
 
-or by using strides (see also :ref:`FAQ #1 <q-vfields>`).  Note that in all
+or by using strides (see also FAQ :ref:`#1 <q-vfields>`).  Note that in all
 these cases |zfp| will still compress the data, but if the dimensionality is
 not correct then the compression ratio will suffer.
 
@@ -296,7 +296,7 @@ the exponent of the largest (in magnitude) value within a block, but produces
 unspecified behavior if that value is not finite.  
 
 |zfp| currently has no independent mechanism for handling fill values.  Ideally
-such special values would be signalled separately, e.g., using a bit mask, 
+such special values would be signaled separately, e.g., using a bit mask, 
 and then replaced with zeros to ensure that they both compress well and do
 not pollute actual data.
 
diff --git a/docs/source/iterators.inc b/docs/source/iterators.inc
index 47235aff1..dc80a71d7 100644
--- a/docs/source/iterators.inc
+++ b/docs/source/iterators.inc
@@ -28,7 +28,7 @@ consequently, larger compression errors than when the entire block is
 initialized as a whole.  Note that the iterator traversal order differs in
 this respect from traversal by :ref:`pointers <pointers>`.
 
-Blocks are visited in raster order similarly to how indidivual array
+Blocks are visited in raster order similarly to how individual array
 elements are indexed, that is, first by *x*, then by *y*, then by *z*,
 etc.  Within each block, elements are visited in the same raster
 order.  All |4powd| values in a block are visited before moving on to the
@@ -41,7 +41,8 @@ forward iterators).  |zfp| iterators are
 be used in STL algorithms that support random access iterators.
 
 |zfp| |crpirelease| adds :code:`const` qualified versions of iterators,
-given by the :code:`const_iterator` class.
+given by the :code:`const_iterator` class.  Such iterators are available
+also for :ref:`read-only arrays <carray_classes>`.
 
 Per STL mandate, the iterators define several types:
 
@@ -134,8 +135,9 @@ The following operations are defined on iterators:
 .. cpp:function:: difference_type const_iterator::operator-(const const_iterator& it) const
 
   Return difference between this iterator and *it* in number of elements.
-  The difference *p* |minus| *q* is negative if *p* < *q*.  The iterators must
-  refer to elements in the same array.
+  The difference *p* |minus| *q* between two iterators, *p* and *q*, is
+  negative if *p* < *q*.  The iterators must refer to elements in the same
+  array.
 
 ----
 
diff --git a/docs/source/license.rst b/docs/source/license.rst
index 6ac2a7a93..1129b833f 100644
--- a/docs/source/license.rst
+++ b/docs/source/license.rst
@@ -3,15 +3,9 @@
 License
 =======
 
-| Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
-| Produced at the Lawrence Livermore National Laboratory.
-| Written by Peter Lindstrom, Markus Salasoo, Matt Larsen, and Stephen Herbein.
-| LLNL-CODE-663824.
+| Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC
 | All rights reserved.
 
-This file is part of the zfp library.
-For details, see http://computing.llnl.gov/casc/zfp/.
-
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
@@ -39,24 +33,27 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-Additional BSD Notice
+Notice
+------
+
+This work was produced under the auspices of the U.S. Department of
+Energy by Lawrence Livermore National Laboratory under Contract
+DE-AC52-07NA27344.
 
-1. This notice is required to be provided under our contract with the U.S.
-Department of Energy (DOE).  This work was produced at Lawrence Livermore
-National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+This work was prepared as an account of work sponsored by an agency of
+the United States Government. Neither the United States Government nor
+Lawrence Livermore National Security, LLC, nor any of their employees
+makes any warranty, expressed or implied, or assumes any legal liability
+or responsibility for the accuracy, completeness, or usefulness of any
+information, apparatus, product, or process disclosed, or represents that
+its use would not infringe privately owned rights.
 
-2. Neither the United States Government nor Lawrence Livermore National
-Security, LLC nor any of their employees, makes any warranty, express or
-implied, or assumes any liability or responsibility for the accuracy,
-completeness, or usefulness of any information, apparatus, product, or
-process disclosed, or represents that its use would not infringe
-privately-owned rights.
+Reference herein to any specific commercial product, process, or service
+by trade name, trademark, manufacturer, or otherwise does not necessarily
+constitute or imply its endorsement, recommendation, or favoring by the
+United States Government or Lawrence Livermore National Security, LLC.
 
-3. Also, reference herein to any specific commercial products, process, or
-services by trade name, trademark, manufacturer or otherwise does not
-necessarily constitute or imply its endorsement, recommendation, or
-favoring by the United States Government or Lawrence Livermore National
-Security, LLC.  The views and opinions of authors expressed herein do not
-necessarily state or reflect those of the United States Government or
-Lawrence Livermore National Security, LLC, and shall not be used for
-advertising or product endorsement purposes.
+The views and opinions of authors expressed herein do not necessarily
+state or reflect those of the United States Government or Lawrence
+Livermore National Security, LLC, and shall not be used for advertising
+or product endorsement purposes.
diff --git a/docs/source/limitations.rst b/docs/source/limitations.rst
index 8f36f781c..c3ee71dc2 100644
--- a/docs/source/limitations.rst
+++ b/docs/source/limitations.rst
@@ -5,8 +5,8 @@
 Limitations
 ===========
 
-|zfp| has evolved from a research prototype to a library that is approaching
-production readiness.  However, the API and even the compression codec are
+|zfp| has evolved over the years from a research prototype to a
+production-quality library.  However, the API and even the compression codec are
 still undergoing changes as new important features are added.
 
 Below is a list of known limitations of the current version of |zfp|.
@@ -20,7 +20,7 @@ that will address some of these limitations.
   extensions to other floating-point formats should be possible with
   minor effort.
 
-- The optional |zfp| :ref:`header <zfp-header>` supports arrays with at
+- The optional |zfp| :ref:`header <headers>` supports arrays with at
   most 2\ :sup:`48` elements.  The |zfp| header limits each dimension
   to 2\ :sup:`48/d` elements in a *d*-dimensional array, i.e.,
   2\ :sup:`48`, 2\ :sup:`24`, 2\ :sup:`16`, and 2\ :sup:`12` for 1D through
@@ -47,10 +47,8 @@ that will address some of these limitations.
   are implemented).  These proxy references and pointers can, however, safely
   be passed to functions and used where regular references and pointers can.
 
-- Although the current version of |zfp| supports :ref:`iterators <iterators>`,
-  :ref:`pointers <pointers>`, and :ref:`references <references>` to array
-  elements, 'const' versions of these accessors are not yet available for
-  read-only access.
+- The :ref:`read-only array classes <carray_classes>` do not yet support
+  (de)serialization.
 
 - |zfp| can potentially provide higher precision than conventional float
   and double arrays, but the interface currently does not expose this.
@@ -63,6 +61,8 @@ that will address some of these limitations.
   instance, compressed 64-bit-per-value storage of 128-bit quad-precision
   numbers could greatly improve the accuracy of double-precision
   floating-point computations using the same amount of storage.
+  The |zfp| compressed-array classes do not yet support integer scalar
+  types.
 
 - Complex-valued arrays are not directly supported.  Real and imaginary
   components must be stored as separate arrays, which may result in lost
@@ -74,16 +74,14 @@ that will address some of these limitations.
 
 - Version |cudarelease| adds support for CUDA compression and decompression.
   However, only the fixed-rate compression mode is so far supported.
+  The CUDA implementation is further subject to
+  :ref:`additional limitations <cuda-limitations>`.
 
-- As of version |4drelease|, |zfp| supports compression and decompression
-  of 4D arrays.  However, |zfp| does not yet implement a 4D compressed
-  array C++ class.  This will be added in the near future.
-
-- The :ref:`C wrappers <cfp>` for |zfp|'s compressed arrays support only
-  a subset of the C++ API.  |zfp| |4darrrelease| adds support for proxy
+- The |cfp| :ref:`C wrappers <cfp>` for |zfp|'s compressed arrays support
+  only a subset of the C++ API.  |zfp| |4darrrelease| adds support for proxy
   references, pointers, and iterators, but views and read-only arrays are
-  not yet supported,
+  not yet supported.  Furthermore, |cfp| works only with the |zfp| codec.
 
-- The Python and Fortran bindings do not yet support compressed arrays.
-  Moreover, only a select subset of the :ref:`high-level API <hl-api>`
-  is available via Python.
+- The Python and Fortran bindings do not yet support |zfp|'s compressed-array
+  classes.  Moreover, only a select subset of the
+  :ref:`high-level API <hl-api>` is available via Python.
diff --git a/docs/source/low-level-api.rst b/docs/source/low-level-api.rst
index c149c875f..796fa1489 100644
--- a/docs/source/low-level-api.rst
+++ b/docs/source/low-level-api.rst
@@ -5,14 +5,23 @@
 Low-Level C API
 ===============
 
-The low-level C API provides functionality for compressing individual
-*d*-dimensional blocks of up to |4powd| values.  If a block is not
-complete, i.e., contains fewer than |4powd| values, then |zfp|'s partial
+The |libzfp| low-level C API provides functionality for compressing individual
+*d*-dimensional blocks of up to |4powd| values.  If a block is not complete,
+i.e., contains fewer than |4powd| values, then |zfp|'s partial
 block support should be favored over padding the block with, say, zeros
 or other fill values.  The blocks (de)compressed need not be contiguous
 and can be gathered from or scattered to a larger array by setting
 appropriate strides.  As of |zfp| |cpprelease|, templated C++ wrappers
 are also available to simplify calling the low-level API from C++.
+The C API is declared in :file:`zfp.h`; the C++ wrappers are found in
+:file:`zfp.hpp`.
+
+.. note::
+  Because the unit of parallel work in |zfp| is a *block*, and because the
+  low-level API operates on individual blocks, this API supports only the
+  serial :ref:`execution policy <exec-policies>`.  Any other execution
+  policy set in :c:type:`zfp_stream` is silently ignored.  For parallel
+  execution, see the :ref:`high-level API <hl-api>`.
 
 The following topics are available:
 
@@ -356,6 +365,22 @@ appropriate bias for unsigned integer data.
   Convert *dims*-dimensional contiguous block from 32-bit integer type.
   Use *dims* = 0 to demote a single value.
 
+----
+
+.. c:function:: size_t zfp_block_maximum_size(zfp_type type, uint dims, zfp_bool reversible)
+
+  Maximum compressed size in bits of a single *dims*-dimensional block of
+  the specified scalar type.  Use *reversible* = :code:`zfp_true` with
+  :ref:`reversible mode <mode-reversible>`.  Note that this bound does not
+  include any potential padding at the end of the stream to fill out a whole
+  number of words of length :c:var:`stream_word_bits`, which is inserted
+  when calling :c:func:`zfp_stream_flush` or :c:func:`stream_flush`.
+  Similarly, it includes no storage for the optional header.  The storage
+  bounds returned by this function are known to be loose, and it is
+  possible that they will be tightened in future releases.  This function
+  returns zero if any of the arguments are invalid.  See also
+  :ref:`Q28 <q-max-size>`.
+
 .. _ll-cpp-wrappers:
 
 C++ Wrappers
@@ -364,14 +389,14 @@ C++ Wrappers
 .. cpp:namespace:: zfp
 
 To facilitate calling the low-level API from C++, a number of wrappers are
-available that are templated on scalar type and dimensionality.  Each function
-of the form :code:`zfp_function_type_dims`, where *type* denotes scalar type
-and *dims* denotes dimensionality, has a corresponding C++ wrapper
-:code:`zfp::function<type, dims>`.  For example, the C function
-:c:func:`zfp_encode_block_float_2` has a C++ wrapper
+available (as of |zfp| |cpprelease|) that are templated on scalar type and
+dimensionality.  Each function of the form :code:`zfp_function_type_dims`,
+where *type* denotes scalar type and *dims* denotes dimensionality, has a
+corresponding C++ wrapper :code:`zfp::function<type, dims>`.  For example,
+the C function :c:func:`zfp_encode_block_float_2` has a C++ wrapper
 :cpp:func:`zfp::encode_block\<float, 2>`.  Often *dims* can be inferred from
 the parameters of overloaded functions, in which case it is omitted as
-template parameter.  The C++ wrappers are defined in :code:`zfpcpp.h`.
+template parameter.  The C++ wrappers are defined in :file:`zfp.hpp`.
 
 Encoder
 ^^^^^^^
diff --git a/docs/source/modes.rst b/docs/source/modes.rst
index 72d46c802..4810c6b37 100644
--- a/docs/source/modes.rst
+++ b/docs/source/modes.rst
@@ -58,7 +58,7 @@ the block.  The four constraints are as follows:
 .. c:member:: uint zfp_stream.maxbits
 
   The maximum number of bits used to represent a block.  This parameter
-  sets a hard upper bound on compressed block size, and governs the rate
+  sets a hard upper bound on compressed block size and governs the rate
   in :ref:`fixed-rate mode <mode-fixed-rate>`.  It may also be used as an
   upper storage limit to guard against buffer overruns in combination with
   the accuracy constraints given by :c:member:`zfp_stream.maxprec` and
@@ -92,13 +92,24 @@ the block.  The four constraints are as follows:
   Note that to achieve a certain accuracy in the decompressed values, the
   :c:member:`zfp_stream.minexp` value has to be conservatively lowered since
   |zfp|'s inverse transform may magnify the error (see also
-  :ref:`FAQs #20-22 <q-relerr>`).
+  FAQs :ref:`#20-22 <q-relerr>`).
 
 Care must be taken to allow all constraints to be met, as encoding
 terminates as soon as a single constraint is violated (except
 :c:member:`zfp_stream.minbits`, which is satisfied at the end of encoding by
 padding zeros).
 
+.. warning::
+
+  For floating-point data, the :c:member:`zfp_stream.maxbits` parameter must
+  be large enough to allow the common block exponent and any control bits to
+  be encoded.  This implies *maxbits* |geq| 9 for single-precision data and
+  *maxbits* |geq| 12 for double-precision data.  Choosing a smaller value is
+  of no use as it would prevent any fraction (value) bits from being encoded,
+  resulting in an all-zero decompressed block.  More importantly, such a
+  constraint will not be respected by |zfp| for performance reasons, which,
+  if not accounted for, could potentially lead to buffer overruns.
+
 As mentioned above, other combinations of constraints can be used.
 For example, to ensure that the compressed stream is not larger than
 the uncompressed one, or that it fits within the amount of memory
@@ -153,7 +164,7 @@ modes.
 
 .. note::
   Use fixed-rate mode only if you have to bound the compressed size
-  or need random access to blocks.
+  or need read and write random access to blocks.
 
 .. _mode-fixed-precision:
 .. index::
diff --git a/docs/source/pointers.inc b/docs/source/pointers.inc
index f4d8da956..aceb71ad6 100644
--- a/docs/source/pointers.inc
+++ b/docs/source/pointers.inc
@@ -60,6 +60,7 @@ and manipulated there, for instance, by passing the pointer by reference via
 
 As of |zfp| |crpirelease|, const qualified pointers :code:`const_pointer`
 are available, and conceptually are equivalent to :code:`const Scalar*`.
+Pointers are available for :ref:`read-only arrays <carray_classes>` also.
 
 The following operators are defined for proxy pointers.  Below *p* refers
 to the pointer being acted upon.
diff --git a/docs/source/python.rst b/docs/source/python.rst
index 92373c814..48dbd2d65 100644
--- a/docs/source/python.rst
+++ b/docs/source/python.rst
@@ -13,8 +13,17 @@ floating-point arrays.  The |zfpy| implementation is based on
 `Cython <https://cython.org>`_ and requires both NumPy and Cython
 to be installed.  Currently, |zfpy| supports only serial execution.
 
-The |zfpy| API is limited to two functions, for compression and
-decompression, which are described below.
+The |zfpy| API is limited to two functions, for compression and decompression,
+and a version identifier, which are described below.
+
+Constants
+---------
+
+.. py:data:: __version__
+
+  Python string identical to :c:macro:`ZFP_VERSION_STRING` representing the
+  |zfpy| library version, e.g., ``'1.1.0'``.  Available as of |zfp|
+  |nextrelease|.
 
 Compression
 -----------
diff --git a/docs/source/references.inc b/docs/source/references.inc
index 8834bd9c9..6812b5a24 100644
--- a/docs/source/references.inc
+++ b/docs/source/references.inc
@@ -36,13 +36,17 @@ take the address of a reference, which yields a
 :ref:`proxy pointer <pointers>`.  When a reference appears as an rvalue in
 an expression, it is implicitly converted to a value.
 
-|zfp| |crpirelease| adds :code:`const` qualified versions of references,
+|zfp| |crpirelease| adds ``const`` qualified versions of references,
 pointers, and iterators to support const correctness and potential performance
 improvements when only read access is needed.  As with STL containers, the
-corresponding types are prefixed by :code:`const_`, e.g.,
-:code:`const_reference`.  The mutable versions of these classes inherit
+corresponding types are prefixed by ``const_``, e.g.,
+``const_reference``.  The mutable versions of these classes inherit
 the read-only API from the corresponding const versions.
 
+Only references into :ref:`read-write arrays <array_classes>` are discussed
+here; the :ref:`read-only arrays <carray_classes>` support the same
+``const_reference`` API.
+
 .. note::
   Do not confuse :code:`const_reference` and :code:`const reference`.  The
   former is a reference to an immutable array element, while the latter means
diff --git a/docs/source/serialization.inc b/docs/source/serialization.inc
index 86928f7c5..7f5da8b51 100644
--- a/docs/source/serialization.inc
+++ b/docs/source/serialization.inc
@@ -7,8 +7,8 @@ Serialization
 
 .. cpp:namespace:: zfp
 
-|zfp|'s compressed arrays can be serialized to sequential, contiguous
-storage and later recovered back into an object, e.g., to support
+|zfp|'s read-write compressed arrays can be serialized to sequential,
+contiguous storage and later recovered back into an object, e.g., to support
 I/O of compressed-array objects.  Two pieces of information are needed
 to describe a |zfp| array: the raw compressed data, obtained via
 :cpp:func:`array::compressed_data` and :cpp:func:`array::compressed_size`,
@@ -38,10 +38,10 @@ and via a generic :ref:`factory function <array_factory>`:
   :cpp:func:`array::scalar_type` and :cpp:func:`array::dimensionality`.
 
   The (static) factory function is made available by including
-  :file:`zfpfactory.h`.  This header must be included *after* first
+  :file:`zfp/factory.hpp`.  This header must be included *after* first
   including the header files associated with the compressed arrays, i.e.,
-  :file:`zfparray1.h`, :file:`zfparray2.h`, :file:`zfparray3.h`, and
-  :file:`zfparray4.h`.  Only those arrays whose header files are included
+  :file:`zfp/array1.hpp`, :file:`zfp/array2.hpp`, :file:`zfp/array3.hpp`, and
+  :file:`zfp/array4.hpp`.  Only those arrays whose header files are included
   can be constructed by the factory function.  This design decouples the
   array classes so that they may be included independently, for example,
   to reduce compilation time.
@@ -80,6 +80,8 @@ copy::
   assert(p->dimensionality() == 3 && p->scalar_type() == zfp_type_double);
   zfp::array3d& a = *dynamic_cast<zfp::array3d*>(p);
 
+When the array is no longer in use, call :code:`delete p;` to deallocate it.
+
 .. note::
   The array serialization API changed significantly in |zfp| |crpirelease|.
   The :cpp:func:`array::get_header` function is now deprecated and has been
diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt
new file mode 100644
index 000000000..942b01221
--- /dev/null
+++ b/docs/source/spelling_wordlist.txt
@@ -0,0 +1,124 @@
+ABI
+accessor
+accessors
+AoS
+asinh
+atomics
+bitwise
+blockwise
+cfp
+chroma
+cmake
+CMake
+cmocka
+codec
+codecs
+codestream
+compressibility
+Conda
+const
+CPython
+curvilinear
+de
+deallocate
+deallocated
+deallocates
+deallocating
+decompressor
+decorrelate
+decorrelated
+decorrelating
+decorrelation
+denormals
+dereference
+dereferencing
+deserialization
+deserialize
+deserialized
+deserializing
+destructor
+destructors
+dimensionalities
+dimensionality
+endian
+endianness
+enum
+enums
+equidimensional
+executables
+exponentiating
+fortran
+googletest
+grayscale
+Hadamard
+headerless
+libzfp
+linearizing
+lossy
+losslessly
+lvalue
+macOS
+mallocs
+multicore
+multithreaded
+multithreading
+mutator
+mutators
+namespace
+namespaces
+NaN
+negabinary
+optimality
+ORed
+partitioner
+piecewise
+pointwise
+postfix
+pre
+precisions
+prepended
+prepending
+preprocessor
+priori
+programmatically
+quantized
+radix
+redistributions
+representable
+reStructuredText
+roundoff
+rvalue
+scikit
+significand
+significands
+SoA
+strided
+struct
+structs
+subarray
+subarrays
+subdirectories
+suboptimal
+subnormals
+subsampling
+subsetting
+substream
+templated
+typedefs
+uncategorized
+unpromoted
+zag
+zFORp
+zfp
+zfPy
+zig
+Asher
+Diffenderfer
+Haiying
+Hammerling
+Hittinger
+Kuffuor
+Magri
+Osei
+Pinard
+Xu
diff --git a/docs/source/testing.rst b/docs/source/testing.rst
index 7d20ab767..cb7e6a608 100644
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -4,7 +4,7 @@ Regression Tests
 ================
 
 The :program:`testzfp` program performs basic regression testing by exercising
-a small but important subset of :file:`libzfp` and the compressed array
+a small but important subset of |libzfp| and the compressed-array
 classes.  It serves as a sanity check that |zfp| has been built properly.
 These tests assume the default compiler settings, i.e., with none of the
 settings in :file:`Config` or :file:`CMakeLists.txt` modified.  By default,
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index c1c114516..4846774b7 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -97,7 +97,7 @@ are needed::
 
   // allocate buffer for compressed data
   size_t bufsize = zfp_stream_maximum_size(zfp, field);
-  uchar* buffer = new uchar[bufsize];
+  void* buffer = malloc(bufsize);
 
 Note that :c:func:`zfp_stream_maximum_size` returns the smallest buffer
 size necessary to safely compress the data---the *actual* compressed size
@@ -148,7 +148,7 @@ the compressed data, or via the :c:func:`zfp_write_header` and
 :c:func:`zfp_compress` and :c:func:`zfp_decompress` calls, respectively.
 These calls allow the user to specify what information to store in the header,
 including a 'magic' format identifier, the field type and dimensions, and the
-compression parameters (see the :ref:`ZFP_HEADER <header-macros>` macros).
+compression parameters (see the :ref:`headers` section).
 
 In addition to this initialization, the bit stream has to be rewound to
 the beginning (before reading the header and decompressing the data)::
@@ -235,7 +235,7 @@ The block above could also have been compressed as follows using strides::
   ptrdiff_t sx = &block[0][0][1] - &block[0][0][0]; // x stride =  1
   ptrdiff_t sy = &block[0][1][0] - &block[0][0][0]; // y stride =  4
   ptrdiff_t sz = &block[1][0][0] - &block[0][0][0]; // z stride = 16
-  size_t bits = zfp_encode_block_strided_double_3(zfp, block, sx, sy, sz);
+  size_t bits = zfp_encode_block_strided_double_3(zfp, &block[0][0][0], sx, sy, sz);
 
 The strides are measured in number of array elements, not in bytes.
 
@@ -281,17 +281,16 @@ Compressed C++ Arrays
 
 .. cpp:namespace:: zfp
 
-The |zfp| compressed array API, which currently supports 1D, 2D, and 3D
-(but not 4D) arrays, has been designed to facilitate integration with existing
-applications.  After initial array declaration, a |zfp| array can often
-be used in place of a regular C/C++ array or STL vector, e.g., using flat
-indexing via :code:`a[index]`, nested indexing :code:`a[k][j][i]` (via
-:ref:`nested views <nested_view>`), or using multidimensional indexing via
-:code:`a(i)`, :code:`a(i, j)`, or :code:`a(i, j, k)`.  There are,
-however, some important differences.  For instance, applications that
-rely on addresses or references to array elements may have to be
-modified to use special proxy classes that implement pointers and
-references; see :ref:`limitations`.
+The |zfp| compressed-array API has been designed to facilitate integration
+with existing applications.  After initial array declaration, a |zfp| array
+can often be used in place of a regular C/C++ array or STL vector, e.g.,
+using flat indexing via :code:`a[index]`, nested indexing :code:`a[k][j][i]`
+(via :ref:`nested views <nested_view>`), or using multidimensional indexing
+via :code:`a(i)`, :code:`a(i, j)`, :code:`a(i, j, k)`, or
+:code:`a(i, j, k, l)`.  There are, however, some important differences.  For
+instance, applications that rely on addresses or references to array elements
+may have to be modified to use special proxy classes that implement pointers
+and references; see :ref:`limitations`.
 
 |zfp|'s compressed arrays do not support special floating-point values like
 infinities and NaNs, although subnormal numbers are handled correctly.
@@ -305,8 +304,8 @@ The |zfp| C++ classes are implemented entirely as header files and make
 extensive use of C++ templates to reduce code redundancy.  These classes
 are wrapped in the :cpp:any:`zfp` namespace.
 
-Currently, there are six array classes for 1D, 2D, and 3D arrays, each of
-which can represent single- or double-precision values.  Although these
+Currently, there are eight array classes for 1D, 2D, 3D, and 4D arrays, each
+of which can represent single- or double-precision values.  Although these
 arrays store values in a form different from conventional single- and
 double-precision floating point, the user interacts with the arrays via
 floats and doubles.
@@ -346,6 +345,7 @@ increments of 64 / |4powd| bits in *d* dimensions, i.e.
   1D arrays: 16-bit granularity
   2D arrays: 4-bit granularity
   3D arrays: 1-bit granularity
+  4D arrays: 1/4-bit granularity
 
 For finer granularity, the :c:macro:`BIT_STREAM_WORD_TYPE` macro needs to
 be set to a type narrower than 64 bits during compilation of |libzfp|,
@@ -356,6 +356,7 @@ bits in *d* dimensions, or
   1D arrays: 2-bit granularity
   2D arrays: 1/2-bit granularity
   3D arrays: 1/8-bit granularity
+  4D arrays: 1/32-bit granularity
 
 Note that finer granularity usually implies slightly lower performance.
 Also note that because the arrays are stored compressed, their effective
@@ -403,7 +404,7 @@ directly without having to convert to/from its floating-point representation::
 
 The array can through this pointer be initialized from offline compressed
 storage, but only after its dimensions and rate have been specified (see
-above).  For this to work properly, the cache must first be emptied via a
+above).  For this to work properly, the cache must first be emptied via an
 :cpp:func:`array::clear_cache` call (see below).
 
 Through operator overloading, the array can be accessed in one of two ways.
@@ -497,14 +498,14 @@ for references and pointers that guarantee persistent access by referencing
 elements by array object and index.  These classes perform decompression on
 demand, much like how Boolean vector references are implemented in the STL.
 
-Iterators for 1D arrays support random access, while 2D and 3D array iterators
-are merely forward (sequential) iterators.  All iterators ensure that array
-values are visited one block at a time, and are the preferred way of looping
-over array elements.  Such block-by-block access is especially useful when
-performing write accesses since then complete blocks are updated one at a
-time, thus reducing the likelihood of a partially updated block being evicted
-from the cache and compressed, perhaps with some values in the block being
-uninitialized.  Here is an example of initializing a 3D array::
+As of |zfp| |raiterrelease|, all iterators for 1D-4D arrays support random
+access.  Iterators ensure that array values are visited one block at a time,
+and are the preferred way of looping over array elements.  Such block-by-block
+access is especially useful when performing write accesses since then complete
+blocks are updated one at a time, thus reducing the likelihood of a partially
+updated block being evicted from the cache and compressed, perhaps with some
+values in the block being uninitialized.  Here is an example of initializing
+a 3D array::
 
   for (zfp::array3d::iterator it = a.begin(); it != a.end(); it++) {
     size_t i = it.i();
@@ -553,9 +554,8 @@ row-major order, i.e.
 where :code:`&a(i, j, k)` and :code:`&a[0]` are both of type
 :cpp:class:`array3d::pointer`.  Thus, iterators and pointers do not
 visit arrays in the same order, except for the special case of 1D arrays.
-Unlike iterators, pointers support random access for arrays of all
-dimensions and behave very much like :code:`float*` and :code:`double*`
-built-in pointers.
+Like iterators, pointers support random access for arrays of all dimensions
+and behave very much like :code:`float*` and :code:`double*` built-in pointers.
 
 Proxy objects for array element references have been supported since the
 first release of |zfp|, and may for instance be used in place of
@@ -591,9 +591,9 @@ at least two layers of blocks (2 |times| (*nx* / 4) |times| (*ny* / 4)
 blocks) for applications that stream through the array and perform stencil
 computations such as gathering data from neighboring elements.  This allows
 limiting the cache misses to compulsory ones.  If the *cache_size* parameter
-is set to zero bytes, then a default size of |sqrt|\ *n* blocks is used,
-where *n* is the total number of blocks in the array.
-
+is set to zero bytes, then a default size of |sqrt|\ *n* blocks (rounded
+up to the next integer power of two) is used, where *n* is the total number
+of blocks in the array.
 
 The cache size can be set during construction, or can be set at a later
 time via
diff --git a/docs/source/versions.rst b/docs/source/versions.rst
index 389fd5210..d82c3f229 100644
--- a/docs/source/versions.rst
+++ b/docs/source/versions.rst
@@ -3,303 +3,454 @@
 Release Notes
 =============
 
-zfp 0.5.5, May 5, 2019
+1.0.1 (2023-12-15)
+------------------
+
+This patch release primarily addresses minor bug fixes and is needed to update
+the zfpy Python wheels.
+
+**Added**
+
+- A new build macro, ``BUILD_TESTING_FULL``, specifies that all unit tests be
+  built; ``BUILD_TESTING`` produces a smaller subset of tests.  Full tests and
+  documentation are now included in releases.
+
+**Fixed**
+
+- #169: ``libm`` dependency is not always correctly detected.
+- #171: ``ptrdiff_t`` is not always imported in Cython.
+- #176: cfp API is not exposed via CMake configuration file.
+- #177: Full test suite is not included in release.
+- #181: ``rpath`` is not set correctly in executables.
+- #204: Array strides are not passed by value in zFORp.
+- #220: Errors reported with scikit-build when building zfpy.
+
+----
+
+1.0.0 (2022-08-01)
+------------------
+
+This release is not ABI compatible with prior releases due to numerous changes
+to function signatures and data structures like ``zfp_field``.  However, few of
+the API changes, other than to the |cfp| C API for compressed arrays, should
+impact existing code.  Note that numerous header files have been renamed or
+moved relative to prior versions.
+
+**Added**
+
+- ``zfp::const_array``: read-only variable-rate array that supports
+  fixed-precision, fixed-accuracy, and reversible modes.
+- Compressed-array classes for 4D data.
+- ``const`` versions of array references, pointers, and iterators.
+- A more complete API for pointers and iterators.
+- |cfp| support for proxy references and pointers, iterators, and
+  (de)serialization.
+- Support for pointers and iterators into array views.
+- ``zfp::array::size_bytes()`` allows querying the size of different components
+  of an array object (e.g., payload, cache, index, metadata, ...).
+- Templated C++ wrappers around the low-level C API.
+- A generic codec for storing blocks of uncompressed scalars in |zfp|'s
+  C++ arrays.
+- Additional functions for querying ``zfp_field`` and ``zfp_stream`` structs.
+- ``zfp_config``: struct that encapsulates compression mode and parameters.
+- Rounding modes for reducing bias in compression errors.
+- New examples: ``array``, ``iteratorC``, and ``ppm``.
+
+**Changed**
+
+- Headers from ``array/``, ``cfp/include/``, and ``include/`` have been renamed
+  and reorganized into a common ``include/`` directory.
+
+  * The libzfp API is now confined to ``zfp.h``, ``zfp.hpp``, and ``zfp.mod``
+    for C, C++, and Fortran bindings, respectively.  These all appear in
+    the top-level ``include/`` directory upon installation.
+  * C++ headers now use a ``.hpp`` suffix; C headers use a ``.h`` suffix.
+  * C++ headers like ``array/zfparray.h`` have been renamed ``zfp/array.hpp``.
+  * C headers like ``cfp/include/cfparrays.h`` have been renamed
+    ``zfp/array.h``.
+
+- ``size_t`` and ``ptrdiff_t`` replace ``uint`` and ``int`` for array sizes and
+  strides in the array classes and C/Fortran APIs.
+- ``zfp_bool`` replaces ``int`` as Boolean type in the C API.
+- ``bitstream_offset`` and ``bitstream_size`` replace ``size_t`` to ensure
+  support for 64-bit offsets into and lengths of bit streams.  Consequently,
+  the ``bitstream`` API has changed accordingly.
+- All array and view iterators are now random-access iterators.
+- Array inspectors now return ``const_reference`` rather than a scalar
+  type like ``float`` to allow obtaining a ``const_pointer`` to an element
+  of an immutable array.
+- ``zfp::array::compressed_data()`` now returns ``void*`` instead of
+  ``uchar*``.
+- The array (de)serialization API has been revised, resulting in new
+  ``zfp::array::header`` and ``zfp::exception`` classes with new exception
+  messages.
+- The array ``codec`` class is now responsible for all details regarding
+  compression.
+- The compressed-array C++ implementation has been completely refactored to
+  make it more modular, extensible, and reusable across array types.
+- Array block shapes are now computed on the fly rather than stored.
+- The |cfp| C API now wraps array objects in structs.
+- The |zfpy| Python API now supports the more general ``memoryview`` over
+  ``bytes`` objects for decompression.
+- The zFORp Fortran module name is now ``zfp`` instead of ``zforp_module``.
+- Some command-line options for the ``diffusion`` example have changed.
+- CMake 3.9 or later is now required for CMake builds.
+
+**Removed**
+
+- ``zfp::array::get_header()`` has been replaced with a ``zfp::array::header``
+  constructor that accepts an array object.
+- ``ZFP_VERSION_RELEASE`` is no longer defined (use ``ZFP_VERSION_PATCH``).
+
+**Fixed**
+
+- #66: ``make install`` overwrites googletest.
+- #84: Incorrect order of parameters in CUDA ``memset()``.
+- #86: C++ compiler warns when ``__STDC_VERSION__`` is undefined.
+- #87: ``CXXFLAGS`` is misspelled in ``cfp/src/Makefile``.
+- #98: ``zfp_stream_maximum_size()`` underestimates size in reversible mode.
+- #99: Incorrect ``private_view`` reads due to missing write-back.
+- #109: Unused CPython array is incompatible with PyPy.
+- #112: PGI compiler bug causes issues with memory alignment.
+- #119: All-subnormal blocks may cause floating-point overflow.
+- #121: CUDA bit offsets are limited to 32 bits.
+- #122: ``make install`` does not install |zfp| command-line utility.
+- #125: OpenMP bit offsets are limited to 32 bits.
+- #126: ``make install`` does not install Fortran module.
+- #127: Reversible mode reports incorrect compressed block size.
+- #150: cmocka tests do not build on macOS.
+- #154: Thread safety is broken in ``private_view`` and ``private_const_view``.
+- ``ZFP_MAX_BITS`` is off by one.
+- ``diffusionC`` and ``iteratorC`` are not being built with ``gmake``.
+
+----
+
+0.5.5 (2019-05-05)
+------------------
+
+**Added**
+
+- Support for reversible (lossless) compression of floating-point and
+  integer data.
+- Methods for serializing and deserializing |zfp|'s compressed arrays.
+- Python bindings for compressing NumPy arrays.
+- Fortran bindings to |zfp|'s high-level C API.
+
+**Changed**
+
+- The default compressed-array cache size is now a function of the total
+  number of array elements, irrespective of array shape.
+
+**Fixed**
+
+- Incorrect handling of execution policy in |zfp| utility.
+- Incorrect handling of decompression via header in |zfp| utility.
+- Incorrect cleanup of device memory in CUDA decompress.
+- Missing tests for failing mallocs.
+- CMake does not install CFP when built.
+- ``zfp_write_header()`` and ``zfp_field_metadata()`` succeed even if array
+  dimensions are too large to fit in header.
+
+----
+
+0.5.4 (2018-10-01)
+------------------
+
+**Added**
+
+- Support for CUDA fixed-rate compression and decompression.
+- Views into compressed arrays for thread safety, nested array indexing,
+  slicing, and array subsetting.
+- C language bindings for compressed arrays.
+- Support for compressing and decompressing 4D data.
+
+**Changed**
+
+- Execution policy now applies to both compression and decompression.
+- Compressed array accessors now return Scalar type instead of
+  ``const Scalar&`` to avoid stale references to evicted cache lines.
+
+**Fixed**
+
+- Incorrect handling of negative strides.
+- Incorrect handling of arrays with more than 2\ :sup:`32` elements in |zfp|
+  command-line tool.
+- ``bitstream`` is not C++ compatible.
+- Minimum cache size request is not respected.
+
+----
+
+0.5.3 (2018-03-28)
+------------------
 
-  - Added support for reversible (lossless) compression of floating-point and
-    integer data.
+**Added**
 
-  - Added methods for serializing and deserializing zfp's compressed arrays.
+- Support for OpenMP multithreaded compression (but not decompression).
+- Options for OpenMP execution in |zfp| command-line tool.
+- Compressed-array support for copy construction and assignment via deep
+  copies.
+- Virtual destructors to enable inheritance from |zfp| arrays.
 
-  - Added Python bindings for compressing NumPy arrays.
+**Changed**
 
-  - Added Fortran bindings to zfp's high-level C API.
+- ``zfp_decompress()`` now returns the number of compressed bytes processed so
+  far, i.e., the same value returned by ``zfp_compress()``.
 
-  - Change:
+----
 
-    - The default compressed-array cache size is now a function of the total 
-      number of array elements, irrespective of array shape.
+0.5.2 (2017-09-28)
+------------------
 
-  - Bug fixes:
+**Added**
 
-    - Incorrect handling of execution policy in zfp utility.
-    - Incorrect handling of decompression via header in zfp utility.
-    - Incorrect cleanup of device memory in CUDA decompress.
-    - Tests for failing mallocs.
-    - CMake installation of CFP when built.
-    - zfp_write_header and zfp_field_metadata now fail if array dimensions
-      are too large to fit in header.
+- Iterators and proxy objects for pointers and references.
+- Example illustrating how to use iterators and pointers.
 
+**Changed**
 
-zfp 0.5.4, October 1, 2018
+- Diffusion example now optionally uses iterators.
+- Moved internal headers under array to ``array/zfp``.
+- Modified 64-bit integer typedefs to avoid the C89 non-compliant ``long long``
+  and allow for user-supplied types and literal suffixes.
+- Renamed compile-time macros that did not have a ``ZFP`` prefix.
+- Rewrote documentation in reStructuredText and added complete documentation
+  of all public functions, classes, types, and macros.
 
-  - Added support for CUDA fixed-rate compression and decompression.
+**Fixed**
 
-  - Added views into compressed arrays for thread safety, nested array
-    indexing, slicing, and array subsetting.
+- Issue with setting stream word type via CMake.
 
-  - Added C language bindings for compressed arrays.
+----
 
-  - Added support for compressing and decompressing 4D data.
+0.5.1 (2017-03-28)
+------------------
 
-  - Changes:
+This release primarily fixes a few minor issues but also includes changes in
+anticipation of a large number of planned future additions to the library.
+No changes have been made to the compressed format, which is backwards
+compatible with version 0.5.0.
 
-    - Execution policy now applies to both compression and decompression.
-    - Compressed array accessors now return Scalar type instead of
-      const Scalar& to avoid stale references to evicted cache lines.
+**Added**
 
-  - Bug fixes:
+- High-level API support for integer types.
+- Example that illustrates in-place compression.
+- Support for CMake builds.
+- Documentation that discusses common issues with using |zfp|.
 
-    - Handling of negative strides.
-    - Command line tool handling of arrays with more than 2\ :sup:`32` elements.
-    - bitstream C++ compatibility.
-    - Respect minimum cache size request.
+**Changed**
 
+- Separated library version from CODEC version and added version string.
+- Corrected inconsistent naming of ``BIT_STREAM`` macros in code and
+  documentation.
+- Renamed some of the header bit mask macros.
+- ``stream_skip()`` and ``stream_flush()`` now return the number of bits
+  skipped or output.
+- Renamed ``stream_block()`` and ``stream_delta()`` to make it clear that they
+  refer to strided streams.  Added missing definition of
+  ``stream_stride_block()``.
+- Changed ``int`` and ``uint`` types in places to use ``ptrdiff_t`` and
+  ``size_t`` where appropriate.
+- Changed API for ``zfp_set_precision()`` and ``zfp_set_accuracy()`` to not
+  require the scalar type.
+- Added missing ``static`` keyword in ``decode_block()``.
+- Changed ``testzfp`` to allow specifying which tests to perform on the
+  command line.
+- Modified directory structure.
 
-zfp 0.5.3, March 28, 2018
+**Fixed**
 
-  - Added support for OpenMP multithreaded compression (but not decompression).
+- Bug that prevented defining uninitialized arrays.
+- Incorrect computation of array sizes in ``zfp_field_size()``.
+- Minor issues that prevented code from compiling on Windows.
+- Issue with fixed-accuracy headers that caused unnecessary storage.
 
-  - Added options for OpenMP execution to zfp command-line tool.
+----
 
-  - Changed return value of zfp_decompress to indicate the number of compressed
-    bytes processed so far (now returns same value as zfp_compress on success).
+0.5.0 (2016-02-29)
+------------------
 
-  - Added compressed array support for copy construction and assignment via
-    deep copies.
+This version introduces backwards-incompatible changes to the CODEC.
 
-  - Added virtual destructors to enable inheritance from zfp arrays.
+**Added**
 
+- Modified CODEC to more efficiently encode blocks whose values are all
+  zero or are smaller in magnitude than the absolute error tolerance.
+  This allows representing "empty" blocks using only one bit each.
+- Functions for compactly encoding the compression parameters
+  and field metadata, e.g., for producing self-contained compressed
+  streams.  Also added functions for reading and writing a header
+  containing these parameters.
 
-zfp 0.5.2, September 28, 2017
+**Changed**
 
-  - Added iterators and proxy objects for pointers and references.
+- Changed behavior of ``zfp_compress()`` and ``zfp_decompress()`` to not
+  automatically rewind the bit stream.  This makes it easier to concatenate
+  multiple compressed bit streams, e.g., when compressing vector fields or
+  multiple scalars together.
+- Changed the |zfp| example program interface to allow reading and writing
+  compressed streams, optionally with a header.  The |zfp| tool can now be
+  used to compress and decompress files as a stand-alone utility.
 
-  - Added example illustrating how to use iterators and pointers.
+----
 
-  - Modified diffusion example to optionally use iterators.
+0.4.1 (2015-12-28)
+------------------
 
-  - Moved internal headers under array to array/zfp.
+**Added**
 
-  - Modified 64-bit integer typedefs to avoid the C89 non-compliant long long
-    and allow for user-supplied types and literal suffixes.
+- ``simple.c`` as a minimal example of how to call the compressor.
 
-  - Renamed compile-time macros that did not have a ZFP prefix.
+**Changed**
 
-  - Fixed issue with setting stream word type via CMake.
+- Changed compilation of diffusion example to output two executables:
+  one with and one without compression.
 
-  - Rewrote documentation in reStructuredText and added complete
-    documentation of all public functions, classes, types, and macros.
-    Removed ASCII documentation.
+**Fixed**
 
+- Bug that caused segmentation fault when compressing 3D arrays whose
+  dimensions are not multiples of four.  Specifically, arrays of dimensions
+  *nx* |times| *ny* |times| *nz*, with *ny* not a multiple of four, were not
+  handled correctly.
+- Modified ``examples/fields.h`` to ensure standard compliance.  Previously,
+  C99 support was needed to handle the hex float constants, which are
+  not supported in C++98.
 
-zfp 0.5.1, March 28, 2017
+----
 
-  - This release primarily fixes a few minor issues but also includes
-    changes in anticipation of a large number of planned future additions
-    to the library.  No changes have been made to the compressed format,
-    which is backwards compatible with version 0.5.0.
+0.4.0 (2015-12-05)
+------------------
 
-  - Added high-level API support for integer types.
+This version contains substantial changes to the compression algorithm that
+improve PSNR by about 6 dB and speed by a factor of 2-3.  These changes are
+not backward compatible with previous versions of |zfp|.
 
-  - Separated library version from CODEC version and added version string.
+**Added**
 
-  - Added example that illustrates in-place compression.
+- Support for 31-bit and 63-bit integer data, as well as shorter integer types.
+- New examples for evaluating the throughput of the (de)compressor and for
+  compressing grayscale images in the ``pgm`` format.
+- Frequently asked questions.
 
-  - Added support for CMake builds.
+**Changed**
 
-  - Corrected inconsistent naming of BIT_STREAM macros in code and
-    documentation.
+- Rewrote compression codec entirely in C to make linking and calling
+  easier from other programming languages, and to expose the low-level
+  interface through C instead of C++.  This necessitated significant
+  changes to the API as well.
+- Minor changes to the C++ compressed array API, as well as major
+  implementation changes to support the C library.  The namespace and
+  public types are now all in lower case.
 
-  - Renamed some of the header bit mask macros.
+**Removed**
 
-  - Added return values to stream_skip and stream_flush to indicate the
-    number of bits skipped or output.
+- Support for general fixed-point decorrelating transforms.
 
-  - Renamed stream_block and stream_delta to make it clear that they refer
-    to strided streams.  Added missing definition of stream_stride_block.
+----
 
-  - Changed int/uint types in places to use ptrdiff_t/size_t where
-    appropriate.
+0.3.2 (2015-12-03)
+------------------
 
-  - Changed API for zfp_set_precision and zfp_set_accuracy to not require
-    the scalar type.
+**Fixed**
 
-  - Added missing static keyword in decode_block.
+- Bug in ``Array::get()`` that caused the wrong cached block to be looked up,
+  thus occasionally copying incorrect values back to parts of the array.
 
-  - Changed testzfp to allow specifying which tests to perform on the
-    command line.
+----
 
-  - Fixed bug that prevented defining uninitialized arrays.
+0.3.1 (2015-05-06)
+------------------
 
-  - Fixed incorrect computation of array sizes in zfp_field_size.
+**Fixed**
 
-  - Fixed minor issues that prevented code from compiling on Windows.
+- Rare bug caused by exponent underflow in blocks with no normal and some
+  subnormal numbers.
 
-  - Fixed issue with fixed-accuracy headers that caused unnecessary storage.
+----
 
-  - Modified directory structure.
+0.3.0 (2015-03-03)
+------------------
 
-  - Added documentation that discusses common issues with using zfp.
+This version modifies the default decorrelating transform to one that uses
+only additions and bit shifts.  This new transform, in addition to being
+faster, also has some theoretical optimality properties and tends to improve
+rate distortion.  This change is not backwards compatible.
 
+**Added**
 
-zfp 0.5.0, February 29, 2016
+- Compile-time support for parameterized transforms, e.g., to support other
+  popular transforms like DCT, HCT, and Walsh-Hadamard.
+- Floating-point traits to reduce the number of template parameters.  It is
+  now possible to declare a 3D array as ``Array3<float>``, for example.
+- Functions for setting the array scalar type and dimensions.
+- ``testzfp`` for regression testing.
 
-  - Modified CODEC to more efficiently encode blocks whose values are all
-    zero or are smaller in magnitude than the absolute error tolerance.
-    This allows representing "empty" blocks using only one bit each.  This
-    version is not backwards compatible with prior zfp versions.
+**Changed**
 
-  - Changed behavior of zfp_compress and zfp_decompress to not automatically
-    rewind the bit stream.  This makes it easier to concatenate multiple
-    compressed bit streams, e.g., when compressing vector fields or multiple
-    scalars together.
+- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1).
+  Consequently Q1.62 fixed point can be used throughout.
+- Changed the order in which bits are emitted within each bit plane to be more
+  intelligent.  Group tests are now deferred until they are needed, i.e., just
+  before the value bits for the group being tested.  This improves the quality
+  of fixed-rate encodings, but has no impact on compressed size.
+- Made several optimizations to improve performance.
+- Consolidated several header files.
 
-  - Added functions for compactly encoding the compression parameters
-    and field meta data, e.g., for producing self-contained compressed
-    streams.  Also added functions for reading and writing a header
-    containing these parameters.
+----
 
-  - Changed the zfp example program interface to allow reading and writing
-    compressed streams, optionally with a header.  The zfp tool can now be
-    used to compress and decompress files as a stand alone utility.
+0.2.1 (2014-12-12)
+------------------
 
+**Added**
 
-zfp 0.4.1, December 28, 2015
+- Win64 support via Microsoft Visual Studio compiler.
+- Documentation of the expected output for the diffusion example.
 
-  - Fixed bug that caused segmentation fault when compressing 3D arrays
-    whose dimensions are not multiples of four.  Specifically, arrays of
-    dimensions nx * ny * nz, with ny not a multiple of four, were not
-    handled correctly.
+**Changed**
 
-  - Modified examples/fields.h to ensure standard compliance.  Previously,
-    C99 support was needed to handle the hex float constants, which are
-    not supported in C++98.
+- Made several minor changes to suppress compiler warnings.
 
-  - Added simple.c as a minimal example of how to call the compressor.
+**Fixed**
 
-  - Changed compilation of diffusion example to output two executables:
-    one with and one without compression.
+- Broken support for IBM's ``xlc`` compiler.
 
+----
 
-zfp 0.4.0, December 5, 2015
+0.2.0 (2014-12-02)
+------------------
 
-  - Substantial changes to the compression algorithm that improve PSNR
-    by about 6 dB and speed by a factor of 2-3.  These changes are not
-    backward compatible with previous versions of zfp.
+The compression interface from ``zfpcompress`` was relocated to a separate
+library, called ``libzfp``, and modified to be callable from C.  This API now
+uses a parameter object (``zfp_params``) to specify array type and dimensions
+as well as compression parameters.
 
-  - Added support for 31-bit and 63-bit integer data, as well as shorter
-    integer types.
+**Added**
 
-  - Rewrote compression codec entirely in C to make linking and calling
-    easier from other programming languages, and to expose the low-level
-    interface through C instead of C++.  This necessitated significant
-    changes to the API as well.
+- Several utility functions were added to simplify ``libzfp`` usage:
 
-  - Minor changes to the C++ compressed array API, as well as major
-    implementation changes to support the C library.  The namespace and
-    public types are now all in lower case.
+  * Functions for setting the rate, precision, and accuracy.
+    Corresponding functions were also added to the ``Codec`` class.
+  * A function for estimating the buffer size needed for compression.
 
-  - Deprecated support for general fixed-point decorrelating transforms
-    and slimmed down implementation.
+- The ``Array`` class functionality was expanded:
 
-  - Added new examples for evaluating the throughput of the (de)compressor
-    and for compressing grayscale images in the pgm format.
+  * Support for accessing the compressed bit stream stored with an array,
+    e.g., for offline compressed storage and for initializing an already
+    compressed array.
+  * Functions for dynamically specifying the cache size.
+  * The default cache is now direct-mapped instead of two-way associative.
 
-  - Added FAQ.
+**Fixed**
 
+- Corrected the value of the lowest possible bit plane to account for both
+  the smallest exponent and the number of bits in the significand.
+- Corrected inconsistent use of rate and precision.  The rate refers to the
+  number of compressed bits per floating-point value, while the precision
+  refers to the number of uncompressed bits.  The ``Array`` API was changed
+  accordingly.
 
-zfp 0.3.2, December 3, 2015
+----
 
-  - Fixed bug in Array::get() that caused the wrong cached block to be
-    looked up, thus occasionally copying incorrect values back to parts
-    of the array.
+0.1.0 (2014-11-12)
+------------------
 
-
-zfp 0.3.1, May 6, 2015
-
-  - Fixed rare bug caused by exponent underflow in blocks with no normal
-    and some subnormal numbers.
-
-
-zfp 0.3.0, March 3, 2015
-
-  - Modified the default decorrelating transform to one that uses only
-    additions and bit shifts.  This new transform, in addition to being
-    faster, also has some theoretical optimality properties and tends to
-    improve rate distortion.
-
-  - Added compile-time support for parameterized transforms, e.g., to
-    support other popular transforms like DCT, HCT, and Walsh-Hadamard.
-
-  - Made forward transform range preserving: (-1, 1) is mapped to (-1, 1).
-    Consequently Q1.62 fixed point can be used throughout.
-
-  - Changed the order in which bits are emitted within each bit plane
-    to be more intelligent.  Group tests are now deferred until they
-    are needed, i.e., just before the value bits for the group being
-    tested.  This improves the quality of fixed-rate encodings, but
-    has no impact on compressed size.
-
-  - Made several optimizations to improve performance.
-
-  - Added floating-point traits to reduce the number of template
-    parameters.  It is now possible to declare a 3D array as
-    Array3<float>, for example.
-
-  - Added functions for setting the array scalar type and dimensions.
-
-  - Consolidated several header files.
-
-  - Added testzfp for regression testing.
-
-
-zfp 0.2.1, December 12, 2014
-
-  - Added Win64 support via Microsoft Visual Studio compiler.
-
-  - Fixed broken support for IBM's xlc compiler.
-
-  - Made several minor changes to suppress compiler warnings.
-
-  - Documented expected output for the diffusion example.
-
-
-zfp 0.2.0, December 2, 2014
-
-  - The compression interface from zfpcompress was relocated to a
-    separate library, called libzfp, and modified to be callable from C.
-    This API now uses a parameter object (zfp_params) to specify array
-    type and dimensions as well as compression parameters.
-
-  - Several utility functions were added to simplify libzfp usage:
-
-    * Functions for setting the rate, precision, and accuracy.
-      Corresponding functions were also added to the Codec class.
-
-    * A function for estimating the buffer size needed for compression.
-
-  - The Array class functionality was expanded:
-
-    * Support for accessing the compressed bit stream stored with an
-      array, e.g., for offline compressed storage and for initializing
-      an already compressed array.
-
-    * Functions for dynamically specifying the cache size.
-
-    * The default cache is now direct-mapped instead of two-way
-      associative.
-
-  - Minor bug fixes:
-
-    * Corrected the value of the lowest possible bit plane to account for
-      both the smallest exponent and the number of bits in the significand.
-
-    * Corrected inconsistent use of rate and precision.  The rate refers
-      to the number of compressed bits per floating-point value, while
-      the precision refers to the number of uncompressed bits.  The Array
-      API was changed accordingly.
-
-
-zfp 0.1.0, November 12, 2014
-
-  - Initial beta release.
+Initial beta release.
diff --git a/docs/source/view-indexing.pdf b/docs/source/view-indexing.pdf
index d6e2edb7f..af4376973 100644
Binary files a/docs/source/view-indexing.pdf and b/docs/source/view-indexing.pdf differ
diff --git a/docs/source/views.inc b/docs/source/views.inc
index e5e081df0..4a14df297 100644
--- a/docs/source/views.inc
+++ b/docs/source/views.inc
@@ -54,6 +54,11 @@ iterators.
   traversed using pointers and iterators.  We have
   :code:`view(10, 7) == (&view(0, 0))[87] == view.begin()[97] == view.end()[-2]`.
 
+With the |zfp| |carrrelease| release of
+:ref:`read-only arrays <carray_classes>`, such arrays also support the two
+kinds of immutable views (:code:`const_view` and :code:`private_const_view`).
+The documentation below applies to views into read-only arrays as well.
+
 .. note::
   Like iterators and proxy references and pointers, a view is valid only
   during the lifetime of the array that it references.  **No reference
@@ -192,7 +197,7 @@ the arrays that they reference.
 .. cpp:function:: array3::const_view::const_iterator array3::const_view::cbegin() const
 .. cpp:function:: array4::const_view::const_iterator array4::const_view::cbegin() const
 
-  Const iterator to first element of view.
+  Random-access const iterator to first element of view.
 
 ----
 
@@ -205,7 +210,7 @@ the arrays that they reference.
 .. cpp:function:: array3::const_view::const_iterator array3::const_view::cend() const
 .. cpp:function:: array4::const_view::const_iterator array4::const_view::cend() const
 
-  Const iterator to end of view.
+  Random-access const iterator to end of view.
 
 There are a number of common methods inherited from a base class,
 :code:`preview`, further up the class hierarchy.
@@ -579,7 +584,7 @@ nested view to in effect provide nested array indexing::
 .. cpp:function:: reference array4::nested_view3::operator()(size_t i, size_t j, size_t k)
 .. cpp:function:: reference array4::nested_view4::operator()(size_t i, size_t j, size_t k, size_t l)
 
-  Return reference to a scalar element of a 2D or 3D array.
+  Return reference to a scalar element of a 2D, 3D, or 4D array.
 
 
 .. _slicing:
@@ -649,6 +654,13 @@ that it maintains its own private cache rather than sharing the
 cache owned by the array.  Multiple threads may thus access the
 same array in parallel through their own private views.
 
+.. note::
+  Thread safety is ensured only for OpenMP threads, and the |zfp|
+  views must be compiled by an OpenMP-compliant compiler.  As the
+  |zfp| compressed-array class implementation is defined in headers,
+  the application code using |zfp| must also be compiled with OpenMP
+  enabled if multithreaded access to |zfp| arrays is desired.
+
 .. note::
   Private views **do not guarantee cache coherence**.  If, for example,
   the array is modified, then already cached data in a private view is
diff --git a/docs/source/word-size.pdf b/docs/source/word-size.pdf
new file mode 100644
index 000000000..c34e1644a
Binary files /dev/null and b/docs/source/word-size.pdf differ
diff --git a/docs/source/zforp.rst b/docs/source/zforp.rst
index 25ae7613d..aee51715a 100644
--- a/docs/source/zforp.rst
+++ b/docs/source/zforp.rst
@@ -9,9 +9,11 @@ Fortran Bindings
 
 |zfp| |zforprelease| adds |zforp|: a Fortran API providing wrappers around
 the :ref:`high-level C API <hl-api>`. Wrappers for
-:ref:`compressed arrays <arrays>` will arrive in a future release.
+:ref:`compressed-array classes <arrays>` will arrive in a future release.
 The |zforp| implementation is based on the standard :code:`iso_c_binding`
-module available since Fortran 2003.
+module available since Fortran 2003.  The use of :code:`ptrdiff_t` in
+the |zfp| |fieldrelease| C API, however, requires the corresponding
+:code:`c_ptrdiff_t` kind, which is available only since Fortran 2018.
 
 Every high-level C API function can be called from a Fortran wrapper function.
 C structs are wrapped as Fortran derived types, each containing a single C
@@ -28,9 +30,13 @@ for how the Fortran API is used to compress and decompress data.
 .. _zforp_changes:
 .. note::
 
-  |zfp| |fieldrelease| simplifies the |zforp| module name from zforp_module to zforp.
-  This will likely require changing associated use statements within existing code when 
-  updating from prior versions of zFORp.
+  |zfp| |fieldrelease| simplifies the |zforp| module name from
+  ``zforp_module`` to ``zfp``.  This will likely require changing
+  associated use statements within existing code when updating
+  from prior versions of zFORp.
+
+  Furthermore, as outlined above, the |zfp| |fieldrelease| API requires
+  a Fortran 2018 compiler.
 
 
 Types
@@ -109,6 +115,12 @@ Non-Enum Constants
 
 ----
 
+.. f:variable:: integer zFORp_version_tweak
+
+  Wraps :c:macro:`ZFP_VERSION_TWEAK`
+
+----
+
 .. f:variable:: integer zFORp_codec_version
 
   Wraps :c:data:`zfp_codec_version`
@@ -668,6 +680,16 @@ Array Metadata
 
 ----
 
+.. f:function:: zFORp_field_blocks(field)
+
+  Wrapper for :c:func:`zfp_field_blocks`
+
+  :p zFORp_field field [in]: Field metadata
+  :r blocks: Total number of blocks spanned by field
+  :rtype blocks: integer (kind=8)
+
+----
+
 .. f:function:: zFORp_field_stride(field, stride_arr)
 
   Wrapper for :c:func:`zfp_field_stride`
diff --git a/docs/source/zfp-rounding.pdf b/docs/source/zfp-rounding.pdf
new file mode 100644
index 000000000..6cda53b3f
Binary files /dev/null and b/docs/source/zfp-rounding.pdf differ
diff --git a/docs/source/zfpcmd.rst b/docs/source/zfpcmd.rst
index 3f93a41e5..6b6b5582a 100644
--- a/docs/source/zfpcmd.rst
+++ b/docs/source/zfpcmd.rst
@@ -115,10 +115,10 @@ General options
 
   Evaluate and print the following error statistics:
 
-  * rmse: The root mean square error.
-  * nrmse: The root mean square error normalized to the range.
-  * maxe: The maximum absolute pointwise error.
-  * psnr: The peak signal to noise ratio in decibels.
+  * ``rmse``: The root mean square error.
+  * ``nrmse``: The root mean square error normalized to the range.
+  * ``maxe``: The maximum absolute pointwise error.
+  * ``psnr``: The peak signal to noise ratio in decibels.
 
 Input and output
 ^^^^^^^^^^^^^^^^
@@ -246,11 +246,11 @@ Examples
 
   * :code:`-i file` : read uncompressed file and compress to memory
   * :code:`-z file` : read compressed file and decompress to memory
-  * :code:`-i ifile -z zfile` : read uncompressed ifile, write compressed zfile
-  * :code:`-z zfile -o ofile` : read compressed zfile, write decompressed ofile
-  * :code:`-i ifile -o ofile` : read ifile, compress, decompress, write ofile
+  * :code:`-i ifile -z zfile` : read uncompressed ``ifile``, write compressed ``zfile``
+  * :code:`-z zfile -o ofile` : read compressed ``zfile``, write decompressed ``ofile``
+  * :code:`-i ifile -o ofile` : read ``ifile``, compress, decompress, write ``ofile``
   * :code:`-i file -s` : read uncompressed file, compress to memory, print stats
-  * :code:`-i - -o - -s` : read stdin, compress, decompress, write stdout, print stats
+  * :code:`-i - -o - -s` : read ``stdin``, compress, decompress, write ``stdout``, print stats
   * :code:`-f -3 100 100 100 -r 16` : 2x fixed-rate compression of 100 |times| 100 |times| 100 floats
   * :code:`-d -1 1000000 -r 32` : 2x fixed-rate compression of 1,000,000 doubles
   * :code:`-d -2 1000 1000 -p 32` : 32-bit precision compression of 1000 |times| 1000 doubles
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 565e929a1..0bc9c5676 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,6 +1,17 @@
+add_executable(array array.cpp)
+target_compile_definitions(array PRIVATE ${zfp_compressed_array_defs})
+target_link_libraries(array zfp)
+
+add_executable(chunk chunk.c)
+target_link_libraries(chunk zfp)
+
 add_executable(diffusion diffusion.cpp)
-target_link_libraries(diffusion zfp)
 target_compile_definitions(diffusion PRIVATE ${zfp_compressed_array_defs})
+if(ZFP_WITH_OPENMP)
+  target_link_libraries(diffusion zfp OpenMP::OpenMP_CXX)
+else()
+  target_link_libraries(diffusion zfp)
+endif()
 
 if(BUILD_CFP)
   add_executable(diffusionC diffusionC.c)
@@ -33,6 +44,7 @@ add_executable(speed speed.c)
 target_link_libraries(speed zfp)
 
 if(HAVE_LIBM_MATH)
+  target_link_libraries(array m)
   target_link_libraries(diffusion m)
 
   if(BUILD_CFP)
diff --git a/examples/Makefile b/examples/Makefile
index 2dc6213f5..6b4b1d100 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,7 +1,9 @@
 include ../Config
 
 BINDIR = ../bin
-TARGETS = $(BINDIR)/diffusion\
+TARGETS = $(BINDIR)/array\
+	  $(BINDIR)/chunk\
+	  $(BINDIR)/diffusion\
 	  $(BINDIR)/inplace\
 	  $(BINDIR)/iterator\
 	  $(BINDIR)/pgm\
@@ -10,31 +12,43 @@ TARGETS = $(BINDIR)/diffusion\
 	  $(BINDIR)/speed
 INCS = -I../include
 LIBS = -L../lib -lzfp
-CLIBS = $(LIBS) -lm
-CXXLIBS = $(LIBS)
+CLIBS = $(LIBS) $(LDFLAGS) -lm
+CXXLIBS = $(LIBS) $(LDFLAGS)
 
 # add cfp examples when BUILD_CFP is enabled
 ifneq ($(BUILD_CFP),0)
-  TARGETS += $(BINDIR)/diffusionC
+  TARGETS += $(BINDIR)/diffusionC $(BINDIR)/iteratorC
 endif
 
 
 all: $(TARGETS)
 
+$(BINDIR)/array: array.cpp ../lib/$(LIBZFP)
+	$(CXX) $(CXXFLAGS) $(INCS) array.cpp $(CXXLIBS) -o $@
+
+$(BINDIR)/chunk: chunk.c ../lib/$(LIBZFP)
+	$(CC) $(CFLAGS) $(INCS) chunk.c $(CLIBS) -o $@
+
 $(BINDIR)/diffusion: diffusion.cpp ../lib/$(LIBZFP)
-	$(CXX) $(CXXFLAGS) $(INCS) -I../array diffusion.cpp $(CXXLIBS) -o $@
+	$(CXX) $(CXXFLAGS) $(INCS) diffusion.cpp $(CXXLIBS) -o $@
 
 $(BINDIR)/diffusionC: diffusionC.o ../lib/$(LIBZFP) ../lib/$(LIBCFP)
 	$(CXX) $(CXXFLAGS) diffusionC.o -lcfp $(CLIBS) -o $@
 
 diffusionC.o: diffusionC.c
-	$(CC) $(CFLAGS) $(INCS) -I../cfp/include -c diffusionC.c
+	$(CC) $(CFLAGS) $(INCS) -c diffusionC.c
 
 $(BINDIR)/inplace: inplace.c ../lib/$(LIBZFP)
 	$(CC) $(CFLAGS) $(INCS) inplace.c $(CLIBS) -o $@
 
 $(BINDIR)/iterator: iterator.cpp ../lib/$(LIBZFP)
-	$(CXX) $(CXXFLAGS) $(INCS) -I../array iterator.cpp $(CXXLIBS) -o $@
+	$(CXX) $(CXXFLAGS) $(INCS) iterator.cpp $(CXXLIBS) -o $@
+
+$(BINDIR)/iteratorC: iteratorC.o ../lib/$(LIBZFP) ../lib/$(LIBCFP)
+	$(CXX) $(CXXFLAGS) iteratorC.o -lcfp $(CLIBS) -o $@
+
+iteratorC.o: iteratorC.c
+	$(CC) $(CFLAGS) $(INCS) -c iteratorC.c
 
 $(BINDIR)/pgm: pgm.c ../lib/$(LIBZFP)
 	$(CC) $(CFLAGS) $(INCS) pgm.c $(CLIBS) -o $@
@@ -49,4 +63,4 @@ $(BINDIR)/speed: speed.c ../lib/$(LIBZFP)
 	$(CC) $(CFLAGS) $(INCS) speed.c $(CLIBS) -o $@
 
 clean:
-	rm -f $(TARGETS) diffusionC.o
+	rm -f $(TARGETS) $(BINDIR)/diffusionC $(BINDIR)/iteratorC diffusionC.o iteratorC.o
diff --git a/examples/array.cpp b/examples/array.cpp
new file mode 100644
index 000000000..233cb36ee
--- /dev/null
+++ b/examples/array.cpp
@@ -0,0 +1,42 @@
+// simple example that shows how to work with zfp's compressed-array classes
+
+#include <iostream>
+#include <vector>
+#include "zfp/array2.hpp"
+
+int main()
+{
+  // array dimensions (can be arbitrary) and zfp storage budget in bits per value
+  const size_t nx = 12;
+  const size_t ny = 8;
+  const double bits_per_value = 4.0;
+
+  // declare 2D arrays using STL (flat vector indexed as x + nx * y) and zfp
+  std::vector<double> vec(nx * ny);
+  zfp::array2<double> arr(nx, ny, bits_per_value);
+
+  // initialize both arrays to the same linear ramp 0, 1, ..., nx * ny - 1
+  for (size_t y = 0; y < ny; y++)
+    for (size_t x = 0; x < nx; x++)
+      arr(x, y) = vec[x + nx * y] = x + nx * y;
+
+  // alternative initialization of entire array, arr, from the flat vector:
+  // arr.set(&vec[0]);
+
+  // optional: force compression of cached data
+  arr.flush_cache();
+
+  // print each uncompressed value next to the corresponding zfp array value
+  for (size_t y = 0; y < ny; y++)
+    for (size_t x = 0; x < nx; x++)
+      std::cout << vec[x + nx * y] << " " << arr(x, y) << std::endl;
+
+  // alternative using printf(); note the necessary cast:
+  // printf("%g %g\n", vec[x + nx * y], (double)arr(x, y));
+
+  // print storage size of payload data (vec: allocated capacity in bytes)
+  std::cout << "vec bytes = " << vec.capacity() * sizeof(vec[0]) << std::endl;
+  std::cout << "zfp bytes = " << arr.size_bytes(ZFP_DATA_PAYLOAD) << std::endl;
+
+  return 0;
+}
diff --git a/examples/array2d.h b/examples/array2d.hpp
similarity index 98%
rename from examples/array2d.h
rename to examples/array2d.hpp
index 429da5a93..c349328b5 100644
--- a/examples/array2d.h
+++ b/examples/array2d.hpp
@@ -1,5 +1,5 @@
-#ifndef ARRAY2D_H
-#define ARRAY2D_H
+#ifndef ARRAY2D_HPP
+#define ARRAY2D_HPP
 
 #include <climits>
 #include <vector>
diff --git a/examples/chunk.c b/examples/chunk.c
new file mode 100644
index 000000000..4da611a8c
--- /dev/null
+++ b/examples/chunk.c
@@ -0,0 +1,192 @@
+/* code example showing how to (de)compress a 3D array in chunks */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "zfp.h"
+
+/* open fixed-rate compressed stream with a malloc'ed buffer sized for field's current dimensions */
+static zfp_stream*
+stream(const zfp_field* field, double rate)
+{
+  const size_t bx = (field->nx + 3) / 4; /* # blocks along x (nx / 4, rounded up) */
+  const size_t by = (field->ny + 3) / 4; /* # blocks along y (ny / 4, rounded up) */
+  const size_t bz = (field->nz + 3) / 4; /* # blocks along z (nz / 4, rounded up) */
+
+  zfp_stream* zfp;   /* compressed stream */
+  size_t words;      /* size of compressed buffer in words */
+  size_t bytes;      /* size of compressed buffer in bytes */
+  void* buffer;      /* storage for compressed stream */
+  bitstream* stream; /* bit stream to write to or read from */
+
+  /* allocate meta data for a compressed stream (bit stream is attached below) */
+  zfp = zfp_stream_open(NULL);
+
+  /* set fixed-rate mode with no alignment; scalar type is hard-coded to double */
+  zfp_stream_set_rate(zfp, rate, zfp_type_double, zfp_field_dimensionality(field), zfp_false);
+
+  /* determine compressed size in words: blocks * bits per block, rounded up to whole words */
+  words = (bx * by * bz * zfp->maxbits + stream_word_bits - 1) / stream_word_bits;
+
+  /* allocate buffer for single chunk of compressed data; NOTE(review): malloc result is not checked */
+  bytes = words * stream_word_bits / CHAR_BIT;
+  buffer = malloc(bytes);
+
+  /* associate bit stream with allocated buffer; main() later frees both */
+  stream = stream_open(buffer, bytes);
+  zfp_stream_set_bit_stream(zfp, stream);
+
+  return zfp;
+}
+
+/* compress chunk described by field and write it to stdout; returns zfp_false if zfp_compress returns zero */
+static zfp_bool
+compress(zfp_stream* zfp, const zfp_field* field)
+{
+  void* buffer = stream_data(zfp_stream_bit_stream(zfp));
+
+  /* compress chunk into the stream's buffer, then output size bytes of compressed data */
+  size_t size = zfp_compress(zfp, field);
+  if (!size)
+    return zfp_false;
+  fwrite(buffer, 1, size, stdout); /* NOTE(review): fwrite result is not checked */
+
+  return zfp_true;
+}
+
+/* read compressed chunk from stdin, decompress it into field, and write the doubles to stdout */
+static zfp_bool
+decompress(zfp_stream* zfp, zfp_field* field)
+{
+  void* buffer = stream_data(zfp_stream_bit_stream(zfp));
+
+  /* read up to one buffer of compressed data; fail if zfp_decompress's return value differs from the bytes read */
+  size_t size = fread(buffer, 1, stream_capacity(zfp_stream_bit_stream(zfp)), stdin);
+  if (zfp_decompress(zfp, field) != size)
+    return zfp_false;
+  fwrite(zfp_field_pointer(field), sizeof(double), zfp_field_size(field, NULL), stdout);
+
+  return zfp_true;
+}
+
+/* print command usage to stderr; returns EXIT_FAILURE so that main can "return usage();" */
+static int
+usage(void)
+{
+  fprintf(stderr, "chunk [options] <input >output\n");
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "-3 <nx> <ny> <nz> : array dimensions\n");
+  fprintf(stderr, "-d : decompress (from stdin to stdout); else compress\n");
+  fprintf(stderr, "-n <count> : number of chunks along z dimension\n");
+  fprintf(stderr, "-r <rate> : rate in bits/value\n");
+
+  return EXIT_FAILURE;
+}
+
+int main(int argc, char* argv[])
+{
+  /* command-line arguments and their defaults */
+  zfp_bool decode = zfp_false;
+  double rate = 16;
+  int nx = 125;
+  int ny = 100;
+  int nz = 240;
+  int chunks = 1;
+
+  /* local variables */
+  double* array;
+  double* ptr;
+  zfp_field* field;
+  zfp_stream* zfp;
+  int i, x, y, z, mz;
+
+  /* process command line; any malformed or out-of-range option prints usage and exits */
+  for (i = 1; i < argc; i++)
+    if (!strcmp(argv[i], "-3")) {
+      if (++i == argc || sscanf(argv[i], "%d", &nx) != 1 || nx < 1 ||
+          ++i == argc || sscanf(argv[i], "%d", &ny) != 1 || ny < 1 ||
+          ++i == argc || sscanf(argv[i], "%d", &nz) != 1 || nz < 1)
+        return usage();
+    }
+    else if (!strcmp(argv[i], "-d"))
+      decode = zfp_true;
+    else if (!strcmp(argv[i], "-r")) {
+      if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
+        return usage();
+    }
+    else if (!strcmp(argv[i], "-n")) {
+      if (++i == argc || sscanf(argv[i], "%d", &chunks) != 1 || chunks < 1)
+        return usage();
+    }
+    else
+      return usage();
+
+  /* compute chunk size (must be a multiple of four); chunks >= 1 is guaranteed above */
+  mz = 4 * ((nz + 4 * chunks - 1) / (4 * chunks));
+  if ((chunks - 1) * mz >= nz) {
+    fprintf(stderr, "cannot partition nz=%d into %d chunks\n", nz, chunks);
+    return EXIT_FAILURE;
+  }
+
+  /* allocate whole nx * ny * nz array of doubles (size computed in size_t to avoid int overflow) */
+  array = malloc((size_t)nx * ny * nz * sizeof(double));
+
+  if (!decode) {
+    /* initialize array to be compressed with 1 / (1 + linear index) */
+    for (z = 0; z < nz; z++)
+      for (y = 0; y < ny; y++)
+        for (x = 0; x < nx; x++)
+          array[x + (size_t)nx * (y + (size_t)ny * z)] = 1. / (1 + x + (size_t)nx * (y + (size_t)ny * z));
+  }
+
+  /* initialize field, stream, and compressed buffer, all sized for a full chunk of mz slices */
+  field = zfp_field_3d(array, zfp_type_double, nx, ny, mz);
+  zfp = stream(field, rate);
+
+  /* warn if compressed size is not a multiple of word size */
+  if (chunks > 1 && (zfp_field_blocks(field) * zfp->maxbits) % stream_word_bits)
+    fprintf(stderr, "warning: compressed size (%ld) is not a multiple of word size (%ld)\n", (long)(zfp_field_blocks(field) * zfp->maxbits), (long)stream_word_bits);
+
+  /* (de)compress array in chunks of mz slices along z */
+  ptr = array;
+  for (z = 0; z < nz; z += mz) {
+    /* compute current chunk size as min(mz, nz - z); only the last chunk may be smaller */
+    int cz = mz < nz - z ? mz : nz - z;
+
+    /* set chunk size and pointer into uncompressed array */
+    zfp_field_set_pointer(field, ptr);
+    zfp_field_set_size_3d(field, nx, ny, cz);
+
+    /* reuse compressed buffer by rewinding compressed stream */
+    zfp_stream_rewind(zfp);
+
+    if (decode) {
+      /* decompress current chunk from stdin to stdout */
+      if (!decompress(zfp, field)) {
+        fprintf(stderr, "decompression failed\n");
+        return EXIT_FAILURE;
+      }
+    }
+    else {
+      /* compress current chunk to stdout */
+      if (!compress(zfp, field)) {
+        fprintf(stderr, "compression failed\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    /* advance pointer to next chunk of uncompressed data (offset computed in size_t) */
+    ptr += (size_t)nx * ny * cz;
+  }
+
+  /* clean up: compressed buffer, bit stream, zfp stream, field, and uncompressed array */
+  free(stream_data(zfp_stream_bit_stream(zfp)));
+  stream_close(zfp_stream_bit_stream(zfp));
+  zfp_stream_close(zfp);
+  zfp_field_free(field);
+  free(array);
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/diffusion.cpp b/examples/diffusion.cpp
index 14fe0998c..a62f191e1 100644
--- a/examples/diffusion.cpp
+++ b/examples/diffusion.cpp
@@ -6,14 +6,42 @@
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
-#include "zfparray2.h"
-#include "zfpcarray2.h"
-#include "array2d.h"
+#include <sstream>
+#include "zfp/array2.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/codec/gencodec.hpp"
+#include "array2d.hpp"
+
+// add half precision if compiler supports it
+#define __STDC_WANT_IEC_60559_TYPES_EXT__
+#include <cfloat>
+#ifdef FLT16_MAX
+  #define WITH_HALF 1
+#else
+  #undef WITH_HALF
+#endif
 
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
+// uncompressed tiled arrays based on zfp generic codec
+namespace tiled {
+#if WITH_HALF
+  typedef zfp::array2< double, zfp::codec::generic2<double, _Float16> > array2h;
+#endif
+  typedef zfp::array2< double, zfp::codec::generic2<double, float> > array2f;
+  typedef zfp::array2< double, zfp::codec::generic2<double, double> > array2d;
+}
+
+// enumeration of uncompressed storage types
+enum storage_type {
+  type_none = 0,
+  type_half = 1,
+  type_float = 2,
+  type_double = 3
+};
+
 // constants used in the solution
 class Constants {
 public:
@@ -54,20 +82,21 @@ laplacian(const array2d& u, size_t x, size_t y, const Constants& c)
   return uxx + uyy;
 }
 
-template <class array2d>
+template <class state, class scratch>
 inline void
-time_step_parallel(array2d& u, const Constants& c);
+time_step_parallel(state& u, scratch& v, const Constants& c);
 
-// advance solution in parallel via thread-safe views
 #ifdef _OPENMP
+// advance solution in parallel via thread-safe views
 template <>
 inline void
-time_step_parallel(zfp::array2d& u, const Constants& c)
+time_step_parallel(zfp::array2d& u, zfp::array2d& du, const Constants& c)
 {
   // flush shared cache to ensure cache consistency across threads
   u.flush_cache();
+  // zero-initialize du
+  du.set(0);
   // compute du/dt in parallel
-  zfp::array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
   #pragma omp parallel
   {
     // create read-only private view of entire array u
@@ -81,15 +110,8 @@ time_step_parallel(zfp::array2d& u, const Constants& c)
       if (1 <= y && y <= c.ny - 2)
         for (size_t i = 0; i < mydu.size_x(); i++) {
           size_t x = mydu.global_x(i);
-          if (1 <= x && x <= c.nx - 2) {
-#if 0
-            double uxx = (myu(x - 1, y) - 2 * myu(x, y) + myu(x + 1, y)) / (c.dx * c.dx);
-            double uyy = (myu(x, y - 1) - 2 * myu(x, y) + myu(x, y + 1)) / (c.dy * c.dy);
-            mydu(i, j) = c.dt * c.k * (uxx + uyy);
-#else
+          if (1 <= x && x <= c.nx - 2)
             mydu(i, j) = c.dt * c.k * laplacian(myu, x, y, c);
-#endif
-          }
         }
     }
     // compress all private cached blocks to shared storage
@@ -100,34 +122,31 @@ time_step_parallel(zfp::array2d& u, const Constants& c)
     u[i] += du[i];
 }
 #else
+// dummy template instantiation when OpenMP support is not available
 template <>
-inline void
-time_step_parallel(zfp::array2d&, const Constants&)
-{
-}
+inline void time_step_parallel(zfp::array2d&, zfp::array2d&, const Constants&) {}
 #endif
 
-// dummy template instantiation; never executed
+// dummy template instantiations; never executed
 template <>
-inline void
-time_step_parallel(zfp::const_array2d&, const Constants&)
-{
-}
-
-// dummy template instantiation; never executed
+inline void time_step_parallel(zfp::const_array2d&, raw::array2d&, const Constants&) {}
 template <>
-inline void
-time_step_parallel(raw::array2d&, const Constants&)
-{
-}
+inline void time_step_parallel(raw::array2d&, raw::array2d&, const Constants&) {}
+template <>
+inline void time_step_parallel(tiled::array2d&, tiled::array2d&, const Constants&) {}
+template <>
+inline void time_step_parallel(tiled::array2f&, tiled::array2f&, const Constants&) {}
+#if WITH_HALF
+template <>
+inline void time_step_parallel(tiled::array2h&, tiled::array2h&, const Constants&) {}
+#endif
 
 // advance solution using integer array indices (generic implementation)
-template <class array2d>
+template <class state, class scratch>
 inline void
-time_step_indexed(array2d& u, const Constants& c)
+time_step_indexed(state& u, scratch& du, const Constants& c)
 {
   // compute du/dt
-  array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
   for (size_t y = 1; y < c.ny - 1; y++)
     for (size_t x = 1; x < c.nx - 1; x++)
       du(x, y) = c.dt * c.k * laplacian(u, x, y, c);
@@ -139,10 +158,9 @@ time_step_indexed(array2d& u, const Constants& c)
 // advance solution using integer array indices (read-only arrays)
 template <>
 inline void
-time_step_indexed(zfp::const_array2d& u, const Constants& c)
+time_step_indexed(zfp::const_array2d& u, raw::array2d& v, const Constants& c)
 {
   // initialize v as uncompressed copy of u
-  raw::array2d v(c.nx, c.ny);
   u.get(&v[0]);
   // take forward Euler step v += (du/dt) dt
   for (size_t y = 1; y < c.ny - 1; y++)
@@ -153,48 +171,46 @@ time_step_indexed(zfp::const_array2d& u, const Constants& c)
 }
 
 // advance solution using array iterators (generic implementation)
-template <class array2d>
+template <class state, class scratch>
 inline void
-time_step_iterated(array2d& u, const Constants& c)
+time_step_iterated(state& u, scratch& du, const Constants& c)
 {
   // compute du/dt
-  array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size());
-  for (typename array2d::iterator p = du.begin(); p != du.end(); p++) {
-    size_t x = p.i();
-    size_t y = p.j();
+  for (typename scratch::iterator q = du.begin(); q != du.end(); q++) {
+    size_t x = q.i();
+    size_t y = q.j();
     if (1 <= x && x <= c.nx - 2 &&
         1 <= y && y <= c.ny - 2)
-      *p = c.dt * c.k * laplacian(u, x, y, c);
+      *q = c.dt * c.k * laplacian(u, x, y, c);
   }
   // take forward Euler step
-  for (typename array2d::iterator p = u.begin(), q = du.begin(); p != u.end(); p++, q++)
-    *p += *q;
+  for (typename state::iterator p = u.begin(); p != u.end(); p++)
+    *p += du(p.i(), p.j());
 }
 
-// dummy specialization; never called
+// advance solution using array iterators (read-only arrays)
 template <>
 inline void
-time_step_iterated(zfp::const_array2d& u, const Constants& c)
+time_step_iterated(zfp::const_array2d& u, raw::array2d& v, const Constants& c)
 {
   // initialize v as uncompressed copy of u
-  raw::array2d v(c.nx, c.ny);
   u.get(&v[0]);
   // take forward Euler step v += (du/dt) dt
-  for (raw::array2d::iterator p = v.begin(); p != v.end(); p++) {
-    size_t x = p.i();
-    size_t y = p.j();
+  for (raw::array2d::iterator q = v.begin(); q != v.end(); q++) {
+    size_t x = q.i();
+    size_t y = q.j();
     if (1 <= x && x <= c.nx - 2 &&
         1 <= y && y <= c.ny - 2)
-      *p += c.dt * c.k * laplacian(u, x, y, c);
+      *q += c.dt * c.k * laplacian(u, x, y, c);
   }
   // update u with uncompressed copy v
   u.set(&v[0]);
 }
 
 // set initial conditions with a point heat source (u is assumed zero-initialized)
-template <class array2d>
+template <class state, class scratch>
 inline void
-initialize(array2d& u, const Constants& c)
+initialize(state& u, scratch&, const Constants& c)
 {
   u(c.x0, c.y0) = 1;
 }
@@ -202,20 +218,19 @@ initialize(array2d& u, const Constants& c)
 // set initial conditions for const_array; requires updating the whole array
 template <>
 inline void
-initialize(zfp::const_array2d& u, const Constants& c)
+initialize(zfp::const_array2d& u, raw::array2d& v, const Constants& c)
 {
-  std::vector<double> data(c.nx * c.ny, 0.0);
-  data[c.x0 + c.nx * c.y0] = 1;
-  u.set(&data[0]);
+  v(c.x0, c.y0) = 1;
+  u.set(&v[0]);
 }
 
 // solve heat equation
-template <class array2d>
+template <class state, class scratch>
 inline double
-solve(array2d& u, const Constants& c, bool iterator, bool parallel)
+solve(state& u, scratch& v, const Constants& c, bool iterator, bool parallel)
 {
   // initialize u with point heat source
-  initialize(u, c);
+  initialize(u, v, c);
 
   // iterate until final time
   double t;
@@ -227,20 +242,20 @@ solve(array2d& u, const Constants& c, bool iterator, bool parallel)
     std::cerr << "rate=" << std::setprecision(3) << std::fixed << rate << " (+" << rest << ")" << std::endl;
     // advance solution one time step
     if (parallel)
-      time_step_parallel(u, c);
+      time_step_parallel(u, v, c);
     else if (iterator)
-      time_step_iterated(u, c);
+      time_step_iterated(u, v, c);
     else
-      time_step_indexed(u, c);
+      time_step_indexed(u, v, c);
   }
 
   return t;
 }
 
 // compute sum of array values
-template <class array2d>
+template <class state>
 inline double
-total(const array2d& u)
+total(const state& u)
 {
   double s = 0;
   const size_t nx = u.size_x();
@@ -252,9 +267,9 @@ total(const array2d& u)
 }
 
 // compute root mean square error with respect to exact solution
-template <class array2d>
+template <class state>
 inline double
-error(const array2d& u, const Constants& c, double t)
+error(const state& u, const Constants& c, double t)
 {
   double e = 0;
   for (size_t y = 1; y < c.ny - 1; y++) {
@@ -269,6 +284,20 @@ error(const array2d& u, const Constants& c, double t)
   return std::sqrt(e / ((c.nx - 2) * (c.ny - 2)));
 }
 
+// execute solver and evaluate error
+template <class state, class scratch>
+inline void
+execute(state& u, scratch& v, size_t nt, bool iterator, bool parallel)
+{
+  Constants c(u.size_x(), u.size_y(), nt);
+  double t = solve(u, v, c, iterator, parallel);
+  double sum = total(u);
+  double err = error(u, c, t);
+  std::cerr.unsetf(std::ios::fixed);
+  std::cerr << "sum=" << std::setprecision(6) << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl;
+}
+
+// print usage information
 inline int
 usage()
 {
@@ -276,7 +305,12 @@ usage()
   std::cerr << "Options:" << std::endl;
   std::cerr << "-a <tolerance> : use compressed arrays with given absolute error tolerance" << std::endl;
   std::cerr << "-b <blocks> : use 'blocks' 4x4 blocks of cache" << std::endl;
-  std::cerr << "-c : use read-only arrays" << std::endl;
+  std::cerr << "-c : use read-only compressed arrays" << std::endl;
+  std::cerr << "-d : use double-precision tiled arrays" << std::endl;
+  std::cerr << "-f : use single-precision tiled arrays" << std::endl;
+#if WITH_HALF
+  std::cerr << "-h : use half-precision tiled arrays" << std::endl;
+#endif
   std::cerr << "-i : traverse arrays using iterators" << std::endl;
 #ifdef _OPENMP
   std::cerr << "-j : use multithreading (only with compressed arrays)" << std::endl;
@@ -291,15 +325,15 @@ usage()
 
 int main(int argc, char* argv[])
 {
-  size_t nx = 100;
-  size_t ny = 100;
+  size_t nx = 128;
+  size_t ny = 128;
   size_t nt = 0;
-  double rate = 64;
   size_t cache_size = 0;
   zfp_config config = zfp_config_none();
   bool iterator = false;
   bool parallel = false;
   bool writable = true;
+  storage_type type = type_none;
 
   // parse command-line options
   for (int i = 1; i < argc; i++)
@@ -310,12 +344,20 @@ int main(int argc, char* argv[])
       config = zfp_config_accuracy(tolerance);
     }
     else if (std::string(argv[i]) == "-b") {
-      if (++i == argc || sscanf(argv[i], "%zu", &cache_size) != 1)
+      if (++i == argc || (std::istringstream(argv[i]) >> cache_size).fail())
         return usage();
       cache_size *= 4 * 4 * sizeof(double);
     }
     else if (std::string(argv[i]) == "-c")
       writable = false;
+    else if (std::string(argv[i]) == "-d")
+      type = type_double;
+    else if (std::string(argv[i]) == "-f")
+      type = type_float;
+#if WITH_HALF
+    else if (std::string(argv[i]) == "-h")
+      type = type_half;
+#endif
     else if (std::string(argv[i]) == "-i")
       iterator = true;
 #ifdef _OPENMP
@@ -323,8 +365,8 @@ int main(int argc, char* argv[])
       parallel = true;
 #endif
     else if (std::string(argv[i]) == "-n") {
-      if (++i == argc || sscanf(argv[i], "%zu", &nx) != 1 ||
-          ++i == argc || sscanf(argv[i], "%zu", &ny) != 1)
+      if (++i == argc || (std::istringstream(argv[i]) >> nx).fail() ||
+          ++i == argc || (std::istringstream(argv[i]) >> ny).fail())
         return usage();
     }
     else if (std::string(argv[i]) == "-p") {
@@ -334,6 +376,7 @@ int main(int argc, char* argv[])
       config = zfp_config_precision(precision);
     }
     else if (std::string(argv[i]) == "-r") {
+      double rate;
       if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
         return usage();
       config = zfp_config_rate(rate, false);
@@ -341,7 +384,7 @@ int main(int argc, char* argv[])
     else if (std::string(argv[i]) == "-R")
       config = zfp_config_reversible();
     else if (std::string(argv[i]) == "-t") {
-      if (++i == argc || sscanf(argv[i], "%zu", &nt) != 1)
+      if (++i == argc || (std::istringstream(argv[i]) >> nt).fail())
         return usage();
     }
     else
@@ -370,38 +413,66 @@ int main(int argc, char* argv[])
     fprintf(stderr, "read-only arrays require compression parameters\n");
     return EXIT_FAILURE;
   }
+  if (compression && type != type_none) {
+    fprintf(stderr, "tiled arrays do not support compression parameters\n");
+    return EXIT_FAILURE;
+  }
 
-  Constants c(nx, ny, nt);
+  // if unspecified, set cache size to two layers of blocks
+  if (!cache_size)
+    cache_size = 2 * 4 * nx * sizeof(double);
 
-  double sum;
-  double err;
+  // solve problem
   if (compression) {
-    // solve problem using compressed arrays
+    // use compressed arrays
     if (writable) {
       // use read-write fixed-rate arrays
-      zfp::array2d u(nx, ny, rate, 0, cache_size);
-      double t = solve(u, c, iterator, parallel);
-      sum = total(u);
-      err = error(u, c, t);
+      zfp::array2d u(nx, ny, config.arg.rate, 0, cache_size);
+      zfp::array2d v(nx, ny, config.arg.rate, 0, cache_size);
+      execute(u, v, nt, iterator, parallel);
     }
     else {
       // use read-only variable-rate arrays
       zfp::const_array2d u(nx, ny, config, 0, cache_size);
-      double t = solve(u, c, iterator, parallel);
-      sum = total(u);
-      err = error(u, c, t);
+      raw::array2d v(nx, ny);
+      execute(u, v, nt, iterator, parallel);
     }
   }
   else {
-    // solve problem using uncompressed arrays
-    raw::array2d u(nx, ny);
-    double t = solve(u, c, iterator, parallel);
-    sum = total(u);
-    err = error(u, c, t);
+    // use uncompressed arrays
+    switch (type) {
+#if WITH_HALF
+      case type_half: {
+          // use zfp generic codec with tiled half-precision storage
+          tiled::array2h u(nx, ny, sizeof(__fp16) * CHAR_BIT, 0, cache_size);
+          tiled::array2h v(nx, ny, sizeof(__fp16) * CHAR_BIT, 0, cache_size);
+          execute(u, v, nt, iterator, parallel);
+        }
+        break;
+#endif
+      case type_float: {
+          // use zfp generic codec with tiled single-precision storage
+          tiled::array2f u(nx, ny, sizeof(float) * CHAR_BIT, 0, cache_size);
+          tiled::array2f v(nx, ny, sizeof(float) * CHAR_BIT, 0, cache_size);
+          execute(u, v, nt, iterator, parallel);
+        }
+        break;
+      case type_double: {
+          // use zfp generic codec with tiled double-precision storage
+          tiled::array2d u(nx, ny, sizeof(double) * CHAR_BIT, 0, cache_size);
+          tiled::array2d v(nx, ny, sizeof(double) * CHAR_BIT, 0, cache_size);
+          execute(u, v, nt, iterator, parallel);
+        }
+        break;
+      default: {
+          // use uncompressed array with row-major double-precision storage
+          raw::array2d u(nx, ny, sizeof(double) * CHAR_BIT);
+          raw::array2d v(nx, ny, sizeof(double) * CHAR_BIT);
+          execute(u, v, nt, iterator, parallel);
+        }
+        break;
+    }
   }
 
-  std::cerr.unsetf(std::ios::fixed);
-  std::cerr << "sum=" << std::setprecision(6) << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl;
-
   return 0;
 }
diff --git a/examples/diffusionC.c b/examples/diffusionC.c
index a5d88a42a..3a2ac6ab4 100644
--- a/examples/diffusionC.c
+++ b/examples/diffusionC.c
@@ -6,8 +6,8 @@ forward Euler finite difference solution to the heat equation on a 2D grid
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
+#include "zfp/array.h"
 
-#include "cfparray.h"
 #define _ (CFP_NAMESPACE.array2d)
 
 #define MAX(x, y) (((nx) > (ny)) ? (nx) : (ny))
@@ -103,9 +103,9 @@ time_step_indexed(double* u, const constants* c)
   size_t i, x, y;
   for (y = 1; y < c->ny - 1; y++)
     for (x = 1; x < c->nx - 1; x++) {
-      double uxx = (u[y*c->nx + (x - 1)] - 2 * u[y*c->nx + x] + u[y*c->nx + (x + 1)]) / (c->dx * c->dx);
-      double uyy = (u[(y - 1)*c->nx + x] - 2 * u[y*c->nx + x] + u[(y + 1)*c->nx + x]) / (c->dy * c->dy);
-      du[y*c->nx + x] = c->dt * c->k * (uxx + uyy);
+      double uxx = (u[(x - 1) + c->nx * y] - 2 * u[x + c->nx * y] + u[(x + 1) + c->nx * y]) / (c->dx * c->dx);
+      double uyy = (u[x + c->nx * (y - 1)] - 2 * u[x + c->nx * y] + u[x + c->nx * (y + 1)]) / (c->dy * c->dy);
+      du[x + c->nx * y] = c->dt * c->k * (uxx + uyy);
     }
   /* take forward Euler step */
   for (i = 0; i < c->nx * c->ny; i++)
@@ -142,7 +142,7 @@ solve(double* u, const constants* c)
   double t;
 
   /* initialize u with point heat source (u is assumed to be zero initialized) */
-  u[c->y0*c->nx + c->x0] = 1;
+  u[c->x0 + c->nx * c->y0] = 1;
 
   /* iterate until final time */
   for (t = 0; t < c->tfinal; t += c->dt) {
@@ -175,7 +175,7 @@ total(const double* u, size_t nx, size_t ny)
   size_t x, y;
   for (y = 1; y < ny - 1; y++)
     for (x = 1; x < nx - 1; x++)
-      s += u[y*nx + x];
+      s += u[x + nx * y];
   return s;
 }
 
@@ -186,9 +186,9 @@ error_compressed(const cfp_array2d u, const constants* c, double t)
   double e = 0;
   size_t x, y;
   for (y = 1; y < c->ny - 1; y++) {
-    double py = c->dy * (y - c->y0);
+    double py = c->dy * ((int)y - (int)c->y0);
     for (x = 1; x < c->nx - 1; x++) {
-      double px = c->dx * (x - c->x0);
+      double px = c->dx * ((int)x - (int)c->x0);
       double f = _.get(u, x, y);
       double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t);
       e += (f - g) * (f - g);
@@ -204,10 +204,10 @@ error(const double* u, const constants* c, double t)
   double e = 0;
   size_t x, y;
   for (y = 1; y < c->ny - 1; y++) {
-    double py = c->dy * (y - c->y0);
+    double py = c->dy * ((int)y - (int)c->y0);
     for (x = 1; x < c->nx - 1; x++) {
-      double px = c->dx * (x - c->x0);
-      double f = u[y*c->nx + x];
+      double px = c->dx * ((int)x - (int)c->x0);
+      double f = u[x + c->nx * y];
       double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t);
       e += (f - g) * (f - g);
     }
@@ -216,26 +216,27 @@ error(const double* u, const constants* c, double t)
 }
 
 static int
-usage()
+usage(void)
 {
   fprintf(stderr, "Usage: diffusionC [options]\n");
   fprintf(stderr, "Options:\n");
+  fprintf(stderr, "-b <blocks> : use 'blocks' 4x4 blocks of cache\n");
+  fprintf(stderr, "-i : traverse arrays using iterators\n");
   fprintf(stderr, "-n <nx> <ny> : number of grid points\n");
+  fprintf(stderr, "-r <rate> : use compressed arrays with given compressed bits/value\n");
   fprintf(stderr, "-t <nt> : number of time steps\n");
-  fprintf(stderr, "-r <rate> : use compressed arrays with 'rate' bits/value\n");
-  fprintf(stderr, "-c <blocks> : use 'blocks' 4x4 blocks of cache\n");
   return EXIT_FAILURE;
 }
 
 int main(int argc, char* argv[])
 {
-  int nx = 100;
-  int ny = 100;
+  int nx = 128;
+  int ny = 128;
   int nt = 0;
+  int cache_size = 0;
   double rate = 64;
-  int iterator = 0;
-  int compression = 0;
-  int cache = 0;
+  zfp_bool iterator = zfp_false;
+  zfp_bool compression = zfp_false;
   constants* c = 0;
   double sum;
   double err;
@@ -246,26 +247,30 @@ int main(int argc, char* argv[])
     if (argv[i][0] != '-' || argv[i][2])
       return usage();
     switch(argv[i][1]) {
+      case 'b':
+        if (++i == argc || sscanf(argv[i], "%d", &cache_size) != 1)
+          return usage();
+        cache_size *= (int)(4 * 4 * sizeof(double));
+        break;
       case 'i':
-        iterator = 1;
+        iterator = zfp_true;
         break;
       case 'n':
         if (++i == argc || sscanf(argv[i], "%d", &nx) != 1 ||
             ++i == argc || sscanf(argv[i], "%d", &ny) != 1)
           return usage();
         break;
-      case 't':
-        if (++i == argc || sscanf(argv[i], "%d", &nt) != 1)
-          return usage();
-        break;
       case 'r':
         if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1)
           return usage();
-        compression = 1;
+        compression = zfp_true;
         break;
-      case 'c':
-        if (++i == argc || sscanf(argv[i], "%d", &cache) != 1)
+      case 't':
+        if (++i == argc || sscanf(argv[i], "%d", &nt) != 1)
           return usage();
+        break;
+      default:
+        return usage();
     }
   }
 
@@ -274,7 +279,7 @@ int main(int argc, char* argv[])
 
   if (compression) {
     /* solve problem using compressed arrays */
-    cfp_array2d u = _.ctor(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double));
+    cfp_array2d u = _.ctor(nx, ny, rate, 0, cache_size);
     double t = solve_compressed(u, c, iterator);
     sum = total_compressed(u);
     err = error_compressed(u, c, t);
diff --git a/examples/inplace.c b/examples/inplace.c
index 3764166b5..67d1b3e4d 100644
--- a/examples/inplace.c
+++ b/examples/inplace.c
@@ -46,7 +46,7 @@ process(double* buffer, uint blocks, double tolerance)
   ptr = buffer;
   for (i = 0; i < blocks; i++) {
     offset[i] = stream_wtell(stream);
-    bits = zfp_encode_block_double_2(zfp, ptr);
+    bits = (uint)zfp_encode_block_double_2(zfp, ptr);
     if (!bits) {
       fprintf(stderr, "compression failed\n");
       return 0;
@@ -97,7 +97,7 @@ int main(int argc, char* argv[])
     case 2:
       if (sscanf(argv[1], "%lf", &tolerance) != 1)
         goto usage;
-      /* FALLTHROUGH */
+      fallthrough_
     case 1:
       break;
     default:
diff --git a/examples/iterator.cpp b/examples/iterator.cpp
index 77bf1c5ff..94f907ded 100644
--- a/examples/iterator.cpp
+++ b/examples/iterator.cpp
@@ -1,9 +1,9 @@
 #include <algorithm>
 #include <cstdlib>
 #include <iostream>
-#include "zfparray1.h"
-#include "zfparray2.h"
-#include "zfparray3.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
 
 void print1(zfp::array1<double>::pointer p, size_t n)
 {
diff --git a/examples/iteratorC.c b/examples/iteratorC.c
index fcaf83456..93ef47259 100644
--- a/examples/iteratorC.c
+++ b/examples/iteratorC.c
@@ -1,6 +1,6 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include "cfparray.h"
+#include "zfp/array.h"
 
 void print1(cfp_ptr1d p, size_t n)
 {
@@ -8,7 +8,7 @@ void print1(cfp_ptr1d p, size_t n)
   const cfp_array1d_api _ = cfp.array1d;
 
   for (i = 0; i < n; i++)
-    printf("%e\n", _.reference.get(_.pointer.ref_at(p, i)));
+    printf("%g\n", _.reference.get(_.pointer.ref_at(p, i)));
 }
 
 void print2(cfp_ptr2d p, size_t n)
@@ -16,7 +16,7 @@ void print2(cfp_ptr2d p, size_t n)
   const cfp_array2d_api _ = cfp.array2d;
 
   while (n--) {
-    printf("%lf\n", _.reference.get(_.pointer.ref(p)));
+    printf("%g\n", _.reference.get(_.pointer.ref(p)));
     p = _.pointer.inc(p);
   }
 }
@@ -27,20 +27,29 @@ void print3(cfp_iter1d begin, cfp_iter1d end)
   cfp_iter1d p;
 
   for (p = begin; !_.iterator.eq(p, end); p = _.iterator.inc(p))
-    printf("%e\n", _.reference.get(_.iterator.ref(p)));
+    printf("%g\n", _.reference.get(_.iterator.ref(p)));
 }
 
-int main()
+int main(void)
 {
   const cfp_array1d_api _1d = cfp.array1d;
   const cfp_array2d_api _2d = cfp.array2d;
   const cfp_array3d_api _3d = cfp.array3d;
+  cfp_array1d v;
+  cfp_iter1d it1;
+  cfp_array2d a;
+  cfp_iter2d it2;
+  cfp_ptr2d pb2;
+  cfp_ptr2d pe2;
+  cfp_array3d b;
+  cfp_iter3d it3;
+  cfp_ptr3d pb3;
+  cfp_ptr3d pe3;
   size_t i, j, k;
 
   /* some fun with 1D arrays */
-  cfp_array1d v = _1d.ctor(10, 64.0, 0, 0);
+  v = _1d.ctor(10, 64.0, 0, 0);
   /* initialize and print array of random values */
-  cfp_iter1d it1;
   for (it1 = _1d.begin(v); !_1d.iterator.eq(it1, _1d.end(v)); it1 = _1d.iterator.inc(it1))
     _1d.reference.set(_1d.iterator.ref(it1), rand());
   printf("random array\n");
@@ -48,14 +57,13 @@ int main()
   printf("\n");
 
   /* some fun with 2D arrays */
-  cfp_array2d a = _2d.ctor(5, 7, 64.0, 0, 0);
+  a = _2d.ctor(5, 7, 64.0, 0, 0);
   /* print array indices visited in block-order traversal*/
   printf("block order (x, y) indices\n");
-  cfp_iter2d it2;
   for (it2 = _2d.begin(a); !_2d.iterator.eq(it2, _2d.end(a)); it2 = _2d.iterator.inc(it2)) {
     i = _2d.iterator.i(it2);
     j = _2d.iterator.j(it2);
-    printf("(%lu, %lu)\n", i, j);
+    printf("(%lu, %lu)\n", (unsigned long)i, (unsigned long)j);
     _2d.reference.set(_2d.iterator.ref(it2), i + 10 * j);
   }
   printf("\n");
@@ -65,26 +73,25 @@ int main()
   print2(_2d.ptr_flat(a, 0), _2d.size(a));
   printf("\n");
   /* pointer arithmetic */
-  cfp_ptr2d pb2 = _2d.reference.ptr(_2d.iterator.ref(_2d.begin(a)));
-  cfp_ptr2d pe2 = _2d.reference.ptr(_2d.iterator.ref(_2d.end(a)));
-  printf("%lu * %lu = %lld\n", _2d.size_x(a), _2d.size_y(a), (long long int)_2d.pointer.distance(pe2, pb2));
+  pb2 = _2d.reference.ptr(_2d.iterator.ref(_2d.begin(a)));
+  pe2 = _2d.reference.ptr(_2d.iterator.ref(_2d.end(a)));
+  printf("%lu * %lu = %ld\n", (unsigned long)_2d.size_x(a), (unsigned long)_2d.size_y(a), (long)_2d.pointer.distance(pb2, pe2));
 
   /* some fun with 3D arrays */
-  cfp_array3d b = _3d.ctor(7, 2, 5, 64.0, 0, 0);
+  b = _3d.ctor(7, 2, 5, 64.0, 0, 0);
   /* print array indices visited in block-order traversal */
   printf("block order (x, y, z) indices\n");
-  cfp_iter3d it3;
   for (it3 = _3d.begin(b); !_3d.iterator.eq(it3, _3d.end(b)); it3 = _3d.iterator.inc(it3)) {
     i = _3d.iterator.i(it3);
     j = _3d.iterator.j(it3);
     k = _3d.iterator.k(it3);
-    printf("(%lu, %lu, %lu)\n", i, j, k);
+    printf("(%lu, %lu, %lu)\n", (unsigned long)i, (unsigned long)j, (unsigned long)k);
   }
   printf("\n");
   /* pointer arithmetic */
-  cfp_ptr3d pb3 = _3d.reference.ptr(_3d.iterator.ref(_3d.begin(b)));
-  cfp_ptr3d pe3 = _3d.reference.ptr(_3d.iterator.ref(_3d.end(b)));
-  printf("%lu * %lu * %lu = %lld\n", _3d.size_x(b), _3d.size_y(b), _3d.size_z(b), (long long int)_3d.pointer.distance(pe3, pb3));
+  pb3 = _3d.reference.ptr(_3d.iterator.ref(_3d.begin(b)));
+  pe3 = _3d.reference.ptr(_3d.iterator.ref(_3d.end(b)));
+  printf("%lu * %lu * %lu = %ld\n", (unsigned long)_3d.size_x(b), (unsigned long)_3d.size_y(b), (unsigned long)_3d.size_z(b), (long)_3d.pointer.distance(pb3, pe3));
 
   return 0;
 }
diff --git a/examples/ppm.c b/examples/ppm.c
index 1324ae0be..4b989a305 100644
--- a/examples/ppm.c
+++ b/examples/ppm.c
@@ -56,7 +56,7 @@ clamp(int32* block, uint n)
 
 /* convert 2D block from RGB to YCoCg color space */
 static void
-rgb2ycocg(int32 ycocg[3][16], const int32 rgb[3][16])
+rgb2ycocg(int32 ycocg[3][16], /*const*/ int32 rgb[3][16])
 {
   uint i;
   for (i = 0; i < 16; i++) {
@@ -80,7 +80,7 @@ rgb2ycocg(int32 ycocg[3][16], const int32 rgb[3][16])
 
 /* convert 2D block from YCoCg to RGB color space */
 static void
-ycocg2rgb(int32 rgb[3][16], const int32 ycocg[3][16])
+ycocg2rgb(int32 rgb[3][16], /*const*/ int32 ycocg[3][16])
 {
   uint i;
   for (i = 0; i < 16; i++) {
diff --git a/examples/simple.c b/examples/simple.c
index bcddf4a74..d22613018 100644
--- a/examples/simple.c
+++ b/examples/simple.c
@@ -86,7 +86,7 @@ int main(int argc, char* argv[])
 
   if (!decompress) {
     /* initialize array to be compressed */
-    int i, j, k;
+    size_t i, j, k;
     for (k = 0; k < nz; k++)
       for (j = 0; j < ny; j++)
         for (i = 0; i < nx; i++) {
diff --git a/examples/speed.c b/examples/speed.c
index e75f42854..ea4de02bf 100644
--- a/examples/speed.c
+++ b/examples/speed.c
@@ -91,7 +91,7 @@ int main(int argc, char* argv[])
   switch (argc) {
     case 3:
       sscanf(argv[2], "%u", &blocks);
-      /* FALLTHROUGH */
+      fallthrough_
     case 2:
       sscanf(argv[1], "%lf", &rate);
       break;
diff --git a/fortran/CMakeLists.txt b/fortran/CMakeLists.txt
index a3e6b9c2c..9c3763489 100644
--- a/fortran/CMakeLists.txt
+++ b/fortran/CMakeLists.txt
@@ -21,7 +21,7 @@ set_property(TARGET zFORp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR})
 set_property(TARGET zFORp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}zFORp)
 
 # install location for module file
-install(FILES ${CMAKE_Fortran_MODULE_DIRECTORY}/zforp.mod
+install(FILES ${CMAKE_Fortran_MODULE_DIRECTORY}/zfp.mod
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 # install location for library
diff --git a/fortran/Makefile b/fortran/Makefile
index 229bf42cb..9e5148681 100644
--- a/fortran/Makefile
+++ b/fortran/Makefile
@@ -1,14 +1,16 @@
 include ../Config
 
+.SUFFIXES: .f90
+
 LIBDIR = ../lib
 MODDIR = ../modules
-TARGETS = $(LIBDIR)/libzFORp.a $(LIBDIR)/libzFORp.so $(MODDIR)/zforp_module.mod
+TARGETS = $(LIBDIR)/libzFORp.a $(LIBDIR)/libzFORp.so $(MODDIR)/zfp.mod
 OBJECTS = zfp.o
-MODULES = zforp_module.mod
+MODULES = zfp.mod
 
-static: $(LIBDIR)/libzFORp.a $(MODDIR)/zforp_module.mod
+static: $(LIBDIR)/libzFORp.a $(MODDIR)/zfp.mod
 
-shared: $(LIBDIR)/libzFORp.so $(MODDIR)/zforp_module.mod
+shared: $(LIBDIR)/libzFORp.so $(MODDIR)/zfp.mod
 
 clean:
 	rm -f $(TARGETS) $(OBJECTS)
@@ -22,9 +24,9 @@ $(LIBDIR)/libzFORp.so: $(OBJECTS)
 	mkdir -p $(LIBDIR)
 	$(FC) $(FFLAGS) -shared $^ -o $@
 
-$(MODDIR)/zforp_module.mod: $(OBJECTS)
+$(MODDIR)/zfp.mod: $(OBJECTS)
 	mkdir -p $(MODDIR)
-	mv zforp_module.mod $(MODDIR)
+	mv $(MODULES) $(MODDIR)
 
-.f.o:
+.f90.o:
 	$(FC) $(FFLAGS) -c $<
diff --git a/fortran/zfp.f90 b/fortran/zfp.f90
index 8042f2159..ae5714685 100644
--- a/fortran/zfp.f90
+++ b/fortran/zfp.f90
@@ -1,4 +1,4 @@
-module zFORp
+module zfp
 
   use, intrinsic :: iso_c_binding, only: c_int, c_int64_t, c_size_t, c_ptrdiff_t, c_double, c_ptr, c_null_ptr, c_loc
   implicit none
@@ -51,25 +51,28 @@ module zFORp
   ! constants are hardcoded
   ! const_xyz holds value, but xyz is the public constant
 
-  integer, parameter :: const_zFORp_version_major = 0
-  integer, parameter :: const_zFORp_version_minor = 5
-  integer, parameter :: const_zFORp_version_patch = 5
+  integer, parameter :: const_zFORp_version_major = 1
+  integer, parameter :: const_zFORp_version_minor = 0
+  integer, parameter :: const_zFORp_version_patch = 1
+  integer, parameter :: const_zFORp_version_tweak = 0
   integer, protected, bind(c, name="zFORp_version_major") :: zFORp_version_major
   integer, protected, bind(c, name="zFORp_version_minor") :: zFORp_version_minor
   integer, protected, bind(c, name="zFORp_version_patch") :: zFORp_version_patch
+  integer, protected, bind(c, name="zFORp_version_tweak") :: zFORp_version_tweak
   data zFORp_version_major/const_zFORp_version_major/, &
        zFORp_version_minor/const_zFORp_version_minor/, &
-       zFORp_version_patch/const_zFORp_version_patch/
+       zFORp_version_patch/const_zFORp_version_patch/, &
+       zFORp_version_tweak/const_zFORp_version_tweak/
 
   integer, parameter :: const_zFORp_codec_version = 5
   integer, protected, bind(c, name="zFORp_codec_version") :: zFORp_codec_version
   data zFORp_codec_version/const_zFORp_codec_version/
 
-  integer, parameter :: const_zFORp_library_version = 85 ! 0x55
+  integer, parameter :: const_zFORp_library_version = 4112 ! 0x1010
   integer, protected, bind(c, name="zFORp_library_version") :: zFORp_library_version
   data zFORp_library_version/const_zFORp_library_version/
 
-  character(len = 36), parameter :: zFORp_version_string = 'zfp version 0.5.5 (May 5, 2019)'
+  character(len = *), parameter :: zFORp_version_string = 'zfp version 1.0.1 (December 15, 2023)'
 
   integer, parameter :: const_zFORp_min_bits = 1
   integer, parameter :: const_zFORp_max_bits = 16658
@@ -399,6 +402,12 @@ function zfp_field_size_bytes(field) result(byte_size) bind(c, name="zfp_field_s
       integer(c_size_t) :: byte_size
     end function
 
+    function zfp_field_blocks(field) result(blocks) bind(c, name="zfp_field_blocks")
+      import
+      type(c_ptr), value :: field
+      integer(c_size_t) :: blocks
+    end function
+
     function zfp_field_stride(field, stride_arr) result(is_strided) bind(c, name="zfp_field_stride")
       import
       type(c_ptr), value :: field, stride_arr
@@ -455,25 +464,25 @@ subroutine zfp_field_set_size_4d(field, nx, ny, nz, nw) bind(c, name="zfp_field_
     subroutine zfp_field_set_stride_1d(field, sx) bind(c, name="zfp_field_set_stride_1d")
       import
       type(c_ptr), value :: field
-      integer(c_ptrdiff_t) :: sx
+      integer(c_ptrdiff_t), value :: sx
     end subroutine
 
     subroutine zfp_field_set_stride_2d(field, sx, sy) bind(c, name="zfp_field_set_stride_2d")
       import
       type(c_ptr), value :: field
-      integer(c_ptrdiff_t) :: sx, sy
+      integer(c_ptrdiff_t), value :: sx, sy
     end subroutine
 
     subroutine zfp_field_set_stride_3d(field, sx, sy, sz) bind(c, name="zfp_field_set_stride_3d")
       import
       type(c_ptr), value :: field
-      integer(c_ptrdiff_t) :: sx, sy, sz
+      integer(c_ptrdiff_t), value :: sx, sy, sz
     end subroutine
 
     subroutine zfp_field_set_stride_4d(field, sx, sy, sz, sw) bind(c, name="zfp_field_set_stride_4d")
       import
       type(c_ptr), value :: field
-      integer(c_ptrdiff_t) :: sx, sy, sz, sw
+      integer(c_ptrdiff_t), value :: sx, sy, sz, sw
     end subroutine
 
     function zfp_field_set_metadata(field, encoded_metadata) result(is_success) bind(c, name="zfp_field_set_metadata")
@@ -542,7 +551,8 @@ function zfp_read_header(stream, field, mask) result(num_bits_read) bind(c, name
   ! C macros -> constants
   public :: zFORp_version_major, &
             zFORp_version_minor, &
-            zFORp_version_patch
+            zFORp_version_patch, &
+            zFORp_version_tweak
 
   public :: zFORp_codec_version, &
             zFORp_library_version, &
@@ -629,6 +639,7 @@ function zfp_read_header(stream, field, mask) result(num_bits_read) bind(c, name
             zFORp_field_dimensionality, &
             zFORp_field_size, &
             zFORp_field_size_bytes, &
+            zFORp_field_blocks, &
             zFORp_field_stride, &
             zFORp_field_is_contiguous, &
             zFORp_field_metadata, &
@@ -984,6 +995,13 @@ function zFORp_field_size_bytes(field) result(byte_size) bind(c, name="zforp_fie
     byte_size = zfp_field_size_bytes(field%object)
   end function zFORp_field_size_bytes
 
+  function zFORp_field_blocks(field) result(blocks) bind(c, name="zforp_field_blocks")
+    implicit none
+    type(zFORp_field), intent(in) :: field
+    integer (kind=8) :: blocks
+    blocks = zfp_field_blocks(field%object)
+  end function zFORp_field_blocks
+
   function zFORp_field_stride(field, stride_arr) result(is_strided) bind(c, name="zforp_field_stride")
     implicit none
     type(zFORp_field), intent(in) :: field
@@ -1113,4 +1131,4 @@ function zFORp_read_header(stream, field, mask) result(num_bits_read) bind(c, na
     num_bits_read = zfp_read_header(stream%object, field%object, int(mask, c_int))
   end function zFORp_read_header
 
-end module zFORp
+end module zfp
diff --git a/include/zfp.h b/include/zfp.h
index 9cd5f5bbd..a56326f13 100644
--- a/include/zfp.h
+++ b/include/zfp.h
@@ -1,5 +1,5 @@
 /*
-** Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC and
+** Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC and
 ** other zfp project contributors. See the top-level LICENSE file for details.
 ** SPDX-License-Identifier: BSD-3-Clause
 */
@@ -7,10 +7,10 @@
 #ifndef ZFP_H
 #define ZFP_H
 
-#include "zfp/types.h"
-#include "zfp/system.h"
+#include "zfp/bitstream.h"
 #include "zfp/version.h"
-#include "bitstream.h"
+#include "zfp/internal/zfp/system.h"
+#include "zfp/internal/zfp/types.h"
 
 /* macros ------------------------------------------------------------------ */
 
@@ -77,14 +77,9 @@ typedef struct {
   uint chunk_size; /* number of blocks per chunk (1D only) */
 } zfp_exec_params_omp;
 
-/* execution parameters */
-typedef union {
-  zfp_exec_params_omp omp; /* OpenMP parameters */
-} zfp_exec_params;
-
 typedef struct {
   zfp_exec_policy policy; /* execution policy (serial, omp, ...) */
-  zfp_exec_params params; /* execution parameters */
+  void* params;           /* execution parameters */
 } zfp_execution;
 
 /* compressed stream; use accessors to get/set members */
@@ -340,7 +335,7 @@ zfp_stream_set_omp_chunk_size(
 
 /* unspecified configuration */
 zfp_config /* compression mode and parameter settings */
-zfp_config_none();
+zfp_config_none(void);
 
 /* fixed-rate configuration */
 zfp_config       /* compression mode and parameter settings */
@@ -363,7 +358,7 @@ zfp_config_accuracy(
 
 /* reversible (lossless) configuration */
 zfp_config /* compression mode and parameter settings */
-zfp_config_reversible();
+zfp_config_reversible(void);
 
 /* expert configuration */
 zfp_config      /* compression mode and parameter settings */
@@ -378,7 +373,7 @@ zfp_config_expert(
 
 /* allocate field struct */
 zfp_field* /* pointer to default initialized field */
-zfp_field_alloc();
+zfp_field_alloc(void);
 
 /* allocate metadata for 1D field f[nx] */
 zfp_field*       /* allocated field metadata */
@@ -469,6 +464,12 @@ zfp_field_size_bytes(
   const zfp_field* field /* field metadata */
 );
 
+/* field size in number of blocks */
+size_t                   /* total number of blocks */
+zfp_field_blocks(
+  const zfp_field* field /* field metadata */
+);
+
 /* field strides per dimension */
 zfp_bool                  /* true if array is not contiguous */
 zfp_field_stride(
@@ -630,11 +631,11 @@ zfp_stream_align(
 /*
 The functions below all compress either a complete contiguous d-dimensional
 block of 4^d scalars or a complete or partial block assembled from a strided
-array.  In the latter case, p points to the first scalar; (nx, ny, nz) specify
-the size of the block, with 1 <= nx, ny, nz <= 4; and (sx, sy, sz) specify the
-strides, i.e. the number of scalars to advance to get to the next scalar along
-each dimension.  The functions return the number of bits of compressed storage
-needed for the compressed block.
+array.  In the latter case, p points to the first scalar; (nx, ny, nz, nw)
+specify the size of the block, with 1 <= nx, ny, nz, nw <= 4; and
+(sx, sy, sz, sw) specify the strides, i.e., the number of scalars to advance
+to get to the next scalar along each dimension.  The functions return the
+number of bits of compressed storage needed for the compressed block.
 */
 
 /* encode 1D contiguous block of 4 values */
@@ -787,6 +788,9 @@ void zfp_demote_int32_to_uint8(uint8* oblock, const int32* iblock, uint dims);
 void zfp_demote_int32_to_int16(int16* oblock, const int32* iblock, uint dims);
 void zfp_demote_int32_to_uint16(uint16* oblock, const int32* iblock, uint dims);
 
+/* maximum number of bits/block of compressed storage */
+size_t zfp_block_maximum_size(zfp_type type, uint dims, zfp_bool reversible);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/array/zfpcpp.h b/include/zfp.hpp
similarity index 98%
rename from array/zfpcpp.h
rename to include/zfp.hpp
index 9fa4fb895..5ec93fd48 100644
--- a/array/zfpcpp.h
+++ b/include/zfp.hpp
@@ -1,5 +1,9 @@
-#ifndef ZFP_CPP_H
-#define ZFP_CPP_H
+#ifndef ZFP_HPP
+#define ZFP_HPP
+
+// Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC and
+// other zfp project contributors. See the top-level LICENSE file for details.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "zfp.h"
 
diff --git a/include/zfp/array.h b/include/zfp/array.h
new file mode 100644
index 000000000..b503abc47
--- /dev/null
+++ b/include/zfp/array.h
@@ -0,0 +1,32 @@
+#ifndef CFP_ARRAY_H
+#define CFP_ARRAY_H
+
+#include <stddef.h>
+#include "zfp/internal/cfp/header.h"
+#include "zfp/internal/cfp/array1f.h"
+#include "zfp/internal/cfp/array1d.h"
+#include "zfp/internal/cfp/array2f.h"
+#include "zfp/internal/cfp/array2d.h"
+#include "zfp/internal/cfp/array3f.h"
+#include "zfp/internal/cfp/array3d.h"
+#include "zfp/internal/cfp/array4f.h"
+#include "zfp/internal/cfp/array4d.h"
+
+typedef struct {
+  cfp_array1f_api array1f;
+  cfp_array1d_api array1d;
+  cfp_array2f_api array2f;
+  cfp_array2d_api array2d;
+  cfp_array3f_api array3f;
+  cfp_array3d_api array3d;
+  cfp_array4f_api array4f;
+  cfp_array4d_api array4d;
+} cfp_api;
+
+#ifndef CFP_NAMESPACE
+  #define CFP_NAMESPACE cfp
+#endif
+
+extern_ const cfp_api CFP_NAMESPACE;
+
+#endif
diff --git a/array/zfparray.h b/include/zfp/array.hpp
similarity index 92%
rename from array/zfparray.h
rename to include/zfp/array.hpp
index 157207b41..4e2c3df52 100644
--- a/array/zfparray.h
+++ b/include/zfp/array.hpp
@@ -1,20 +1,20 @@
-#ifndef ZFP_ARRAY_H
-#define ZFP_ARRAY_H
+#ifndef ZFP_ARRAY_HPP
+#define ZFP_ARRAY_HPP
 
 #include <algorithm>
 #include <climits>
 #include <string>
 #include "zfp.h"
-#include "zfp/exception.h"
+#include "zfp/internal/array/exception.hpp"
 
 namespace zfp {
 
 // abstract base class for compressed array of scalars
 class array {
 public:
-  #include "zfp/header.h"
+  #include "zfp/internal/array/header.hpp"
 
-  // factory function (see zfpfactory.h)
+  // factory function (see factory.hpp)
   static zfp::array* construct(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0);
 
   // public virtual destructor (can delete array through base class pointer)
diff --git a/array/zfparray1.h b/include/zfp/array1.hpp
similarity index 88%
rename from array/zfparray1.h
rename to include/zfp/array1.hpp
index a17269e98..6b89fefa2 100644
--- a/array/zfparray1.h
+++ b/include/zfp/array1.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_ARRAY1_H
-#define ZFP_ARRAY1_H
+#ifndef ZFP_ARRAY1_HPP
+#define ZFP_ARRAY1_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache1.h"
-#include "zfp/store1.h"
-#include "zfp/handle1.h"
-#include "zfp/reference1.h"
-#include "zfp/pointer1.h"
-#include "zfp/iterator1.h"
-#include "zfp/view1.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache1.hpp"
+#include "zfp/internal/array/handle1.hpp"
+#include "zfp/internal/array/iterator1.hpp"
+#include "zfp/internal/array/pointer1.hpp"
+#include "zfp/internal/array/reference1.hpp"
+#include "zfp/internal/array/store1.hpp"
+#include "zfp/internal/array/view1.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class array1 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore1<value_type, codec_type, index_type> store_type;
-  typedef BlockCache1<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore1<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache1<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -79,6 +79,7 @@ class array1 : public array {
 
   // copy constructor--performs a deep copy
   array1(const array1& a) :
+    array(),
     cache(store)
   {
     deep_copy(a);
@@ -183,10 +184,19 @@ class array1 : public array {
   void set(const value_type* p)
   {
     const size_t bx = store.block_size_x();
-    const ptrdiff_t sx = 1;
     size_t block_index = 0;
-    for (size_t i = 0; i < bx; i++, p += 4)
-      cache.put_block(block_index++, p, sx);
+    if (p) {
+      // compress data stored at p
+      const ptrdiff_t sx = 1;
+      for (size_t i = 0; i < bx; i++, p += 4)
+        cache.put_block(block_index++, p, sx);
+    }
+    else {
+      // zero-initialize array
+      const value_type block[4] = {};
+      while (block_index < bx)
+        cache.put_block(block_index++, block, 1);
+    }
   }
 
   // accessors
diff --git a/array/zfparray2.h b/include/zfp/array2.hpp
similarity index 88%
rename from array/zfparray2.h
rename to include/zfp/array2.hpp
index 11c3a2dfa..d669f7c02 100644
--- a/array/zfparray2.h
+++ b/include/zfp/array2.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_ARRAY2_H
-#define ZFP_ARRAY2_H
+#ifndef ZFP_ARRAY2_HPP
+#define ZFP_ARRAY2_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache2.h"
-#include "zfp/store2.h"
-#include "zfp/handle2.h"
-#include "zfp/reference2.h"
-#include "zfp/pointer2.h"
-#include "zfp/iterator2.h"
-#include "zfp/view2.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache2.hpp"
+#include "zfp/internal/array/handle2.hpp"
+#include "zfp/internal/array/iterator2.hpp"
+#include "zfp/internal/array/pointer2.hpp"
+#include "zfp/internal/array/reference2.hpp"
+#include "zfp/internal/array/store2.hpp"
+#include "zfp/internal/array/view2.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class array2 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore2<value_type, codec_type, index_type> store_type;
-  typedef BlockCache2<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore2<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache2<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -84,6 +84,7 @@ class array2 : public array {
 
   // copy constructor--performs a deep copy
   array2(const array2& a) :
+    array(),
     cache(store)
   {
     deep_copy(a);
@@ -195,12 +196,21 @@ class array2 : public array {
   {
     const size_t bx = store.block_size_x();
     const size_t by = store.block_size_y();
-    const ptrdiff_t sx = 1;
-    const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
     size_t block_index = 0;
-    for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
-      for (size_t i = 0; i < bx; i++, p += 4)
-        cache.put_block(block_index++, p, sx, sy);
+    if (p) {
+      // compress data stored at p
+      const ptrdiff_t sx = 1;
+      const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
+      for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
+        for (size_t i = 0; i < bx; i++, p += 4)
+          cache.put_block(block_index++, p, sx, sy);
+    }
+    else {
+      // zero-initialize array
+      const value_type block[4 * 4] = {};
+      while (block_index < bx * by)
+        cache.put_block(block_index++, block, 1, 4);
+    }
   }
 
   // (i, j) accessors
diff --git a/array/zfparray3.h b/include/zfp/array3.hpp
similarity index 88%
rename from array/zfparray3.h
rename to include/zfp/array3.hpp
index fd7657980..7e60fade5 100644
--- a/array/zfparray3.h
+++ b/include/zfp/array3.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_ARRAY3_H
-#define ZFP_ARRAY3_H
+#ifndef ZFP_ARRAY3_HPP
+#define ZFP_ARRAY3_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache3.h"
-#include "zfp/store3.h"
-#include "zfp/handle3.h"
-#include "zfp/reference3.h"
-#include "zfp/pointer3.h"
-#include "zfp/iterator3.h"
-#include "zfp/view3.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache3.hpp"
+#include "zfp/internal/array/handle3.hpp"
+#include "zfp/internal/array/iterator3.hpp"
+#include "zfp/internal/array/pointer3.hpp"
+#include "zfp/internal/array/reference3.hpp"
+#include "zfp/internal/array/store3.hpp"
+#include "zfp/internal/array/view3.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class array3 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore3<value_type, codec_type, index_type> store_type;
-  typedef BlockCache3<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore3<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache3<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -86,6 +86,7 @@ class array3 : public array {
 
   // copy constructor--performs a deep copy
   array3(const array3& a) :
+    array(),
     cache(store)
   {
     deep_copy(a);
@@ -204,14 +205,23 @@ class array3 : public array {
     const size_t bx = store.block_size_x();
     const size_t by = store.block_size_y();
     const size_t bz = store.block_size_z();
-    const ptrdiff_t sx = 1;
-    const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
-    const ptrdiff_t sz = static_cast<ptrdiff_t>(nx * ny);
     size_t block_index = 0;
-    for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by))
-      for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
-        for (size_t i = 0; i < bx; i++, p += 4)
-          cache.put_block(block_index++, p, sx, sy, sz);
+    if (p) {
+      // compress data stored at p
+      const ptrdiff_t sx = 1;
+      const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
+      const ptrdiff_t sz = static_cast<ptrdiff_t>(nx * ny);
+      for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by))
+        for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
+          for (size_t i = 0; i < bx; i++, p += 4)
+            cache.put_block(block_index++, p, sx, sy, sz);
+    }
+    else {
+      // zero-initialize array
+      const value_type block[4 * 4 * 4] = {};
+      while (block_index < bx * by * bz)
+        cache.put_block(block_index++, block, 1, 4, 16);
+    }
   }
 
   // (i, j, k) accessors
diff --git a/array/zfparray4.h b/include/zfp/array4.hpp
similarity index 87%
rename from array/zfparray4.h
rename to include/zfp/array4.hpp
index e4aaeb9b5..19c1d8119 100644
--- a/array/zfparray4.h
+++ b/include/zfp/array4.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_ARRAY4_H
-#define ZFP_ARRAY4_H
+#ifndef ZFP_ARRAY4_HPP
+#define ZFP_ARRAY4_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache4.h"
-#include "zfp/store4.h"
-#include "zfp/handle4.h"
-#include "zfp/reference4.h"
-#include "zfp/pointer4.h"
-#include "zfp/iterator4.h"
-#include "zfp/view4.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache4.hpp"
+#include "zfp/internal/array/handle4.hpp"
+#include "zfp/internal/array/iterator4.hpp"
+#include "zfp/internal/array/pointer4.hpp"
+#include "zfp/internal/array/reference4.hpp"
+#include "zfp/internal/array/store4.hpp"
+#include "zfp/internal/array/view4.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class array4 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore4<value_type, codec_type, index_type> store_type;
-  typedef BlockCache4<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore4<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache4<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -88,6 +88,7 @@ class array4 : public array {
 
   // copy constructor--performs a deep copy
   array4(const array4& a) :
+    array(),
     cache(store)
   {
     deep_copy(a);
@@ -213,16 +214,25 @@ class array4 : public array {
     const size_t by = store.block_size_y();
     const size_t bz = store.block_size_z();
     const size_t bw = store.block_size_w();
-    const ptrdiff_t sx = 1;
-    const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
-    const ptrdiff_t sz = static_cast<ptrdiff_t>(nx * ny);
-    const ptrdiff_t sw = static_cast<ptrdiff_t>(nx * ny * nz);
     size_t block_index = 0;
-    for (size_t l = 0; l < bw; l++, p += 4 * sz * ptrdiff_t(nz - bz))
-      for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by))
-        for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
-          for (size_t i = 0; i < bx; i++, p += 4)
-            cache.put_block(block_index++, p, sx, sy, sz, sw);
+    if (p) {
+      // compress data stored at p
+      const ptrdiff_t sx = 1;
+      const ptrdiff_t sy = static_cast<ptrdiff_t>(nx);
+      const ptrdiff_t sz = static_cast<ptrdiff_t>(nx * ny);
+      const ptrdiff_t sw = static_cast<ptrdiff_t>(nx * ny * nz);
+      for (size_t l = 0; l < bw; l++, p += 4 * sz * ptrdiff_t(nz - bz))
+        for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by))
+          for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx))
+            for (size_t i = 0; i < bx; i++, p += 4)
+              cache.put_block(block_index++, p, sx, sy, sz, sw);
+    }
+    else {
+      // zero-initialize array
+      const value_type block[4 * 4 * 4 * 4] = {};
+      while (block_index < bx * by * bz * bw)
+        cache.put_block(block_index++, block, 1, 4, 16, 64);
+    }
   }
 
   // (i, j, k) accessors
diff --git a/include/bitstream.h b/include/zfp/bitstream.h
similarity index 66%
rename from include/bitstream.h
rename to include/zfp/bitstream.h
index 8ba1685d4..44598227e 100644
--- a/include/bitstream.h
+++ b/include/zfp/bitstream.h
@@ -2,12 +2,21 @@
 #define ZFP_BITSTREAM_H
 
 #include <stddef.h>
-#include "zfp/types.h"
-#include "zfp/system.h"
+#include "zfp/internal/zfp/types.h"
+#include "zfp/internal/zfp/system.h"
 
 /* forward declaration of opaque type */
 typedef struct bitstream bitstream;
 
+/* bit offset into stream where bits are read/written */
+typedef uint64 bitstream_offset;
+
+/* type for counting number of bits in a stream */
+typedef bitstream_offset bitstream_size;
+
+/* type for counting a small number of bits in a stream */
+typedef size_t bitstream_count;
+
 extern_ const size_t stream_word_bits; /* bit stream granularity */
 
 #ifndef inline_
@@ -25,7 +34,7 @@ void stream_close(bitstream* stream);
 bitstream* stream_clone(const bitstream* stream);
 
 /* word size in bits (equal to stream_word_bits) */
-size_t stream_alignment();
+bitstream_count stream_alignment(void);
 
 /* pointer to beginning of stream */
 void* stream_data(const bitstream* stream);
@@ -49,40 +58,40 @@ uint stream_read_bit(bitstream* stream);
 uint stream_write_bit(bitstream* stream, uint bit);
 
 /* read 0 <= n <= 64 bits */
-uint64 stream_read_bits(bitstream* stream, uint n);
+uint64 stream_read_bits(bitstream* stream, bitstream_count n);
 
 /* write 0 <= n <= 64 low bits of value and return remaining bits */
-uint64 stream_write_bits(bitstream* stream, uint64 value, uint n);
+uint64 stream_write_bits(bitstream* stream, uint64 value, bitstream_count n);
 
 /* return bit offset to next bit to be read */
-size_t stream_rtell(const bitstream* stream);
+bitstream_offset stream_rtell(const bitstream* stream);
 
 /* return bit offset to next bit to be written */
-size_t stream_wtell(const bitstream* stream);
+bitstream_offset stream_wtell(const bitstream* stream);
 
 /* rewind stream to beginning */
 void stream_rewind(bitstream* stream);
 
 /* position stream for reading at given bit offset */
-void stream_rseek(bitstream* stream, size_t offset);
+void stream_rseek(bitstream* stream, bitstream_offset offset);
 
 /* position stream for writing at given bit offset */
-void stream_wseek(bitstream* stream, size_t offset);
+void stream_wseek(bitstream* stream, bitstream_offset offset);
 
 /* skip over the next n bits */
-void stream_skip(bitstream* stream, uint n);
+void stream_skip(bitstream* stream, bitstream_size n);
 
 /* append n zero-bits to stream */
-void stream_pad(bitstream* stream, uint n);
+void stream_pad(bitstream* stream, bitstream_size n);
 
 /* align stream on next word boundary */
-size_t stream_align(bitstream* stream);
+bitstream_count stream_align(bitstream* stream);
 
 /* flush out any remaining buffered bits */
-size_t stream_flush(bitstream* stream);
+bitstream_count stream_flush(bitstream* stream);
 
 /* copy n bits from one bit stream to another */
-void stream_copy(bitstream* dst, bitstream* src, size_t n);
+void stream_copy(bitstream* dst, bitstream* src, bitstream_size n);
 
 #ifdef BIT_STREAM_STRIDED
 /* set block size in number of words and spacing in number of blocks */
diff --git a/src/inline/bitstream.c b/include/zfp/bitstream.inl
similarity index 73%
rename from src/inline/bitstream.c
rename to include/zfp/bitstream.inl
index 33fceb817..80294ee9e 100644
--- a/src/inline/bitstream.c
+++ b/include/zfp/bitstream.inl
@@ -22,35 +22,36 @@ The following assumptions and restrictions apply:
    stream for writing.  In read mode, the following functions may be called:
 
      size_t stream_size(stream);
-     size_t stream_rtell(stream);
+     bitstream_offset stream_rtell(stream);
      void stream_rewind(stream);
      void stream_rseek(stream, offset);
-     void stream_skip(stream, uint n);
-     size_t stream_align(stream);
+     void stream_skip(stream, n);
+     bitstream_count stream_align(stream);
      uint stream_read_bit(stream);
      uint64 stream_read_bits(stream, n);
 
    Each of the above read calls has a corresponding write call:
 
      size_t stream_size(stream);
-     size_t stream_wtell(stream);
+     bitstream_offset stream_wtell(stream);
      void stream_rewind(stream);
      void stream_wseek(stream, offset);
      void stream_pad(stream, n);
-     size_t stream_flush(stream);
+     bitstream_count stream_flush(stream);
      uint stream_write_bit(stream, bit);
      uint64 stream_write_bits(stream, value, n);
 
 3. The stream buffer is an unsigned integer of a user-specified type given
    by the BIT_STREAM_WORD_TYPE macro.  Bits are read and written in units of
    this integer word type.  Supported types are 8, 16, 32, or 64 bits wide.
-   The bit width of the buffer is denoted by 'wsize' and can be accessed via
-   the global constant stream_word_bits.  A small wsize allows for fine
-   granularity reads and writes, and may be preferable when working with many
-   small blocks of data that require non-sequential access.  The default
-   maximum size of 64 bits ensures maximum speed.  Note that even when
-   wsize < 64, it is still possible to read and write up to 64 bits at a time
-   using stream_read_bits() and stream_write_bits().
+   The bit width of the buffer is denoted by 'wsize' and can be accessed
+   either via the global constant stream_word_bits or stream_alignment().
+   A small wsize allows for fine granularity reads and writes, and may be
+   preferable when working with many small blocks of data that require
+   non-sequential access.  The default maximum size of 64 bits ensures maximum
+   speed.  Note that even when wsize < 64, it is still possible to read and
+   write up to 64 bits at a time using stream_read_bits() and
+   stream_write_bits().
 
 4. If BIT_STREAM_STRIDED is defined, words read from or written to the stream
    may be accessed noncontiguously by setting a power-of-two block size (which
@@ -58,7 +59,7 @@ The following assumptions and restrictions apply:
    word pointer is always incremented by one word each time a word is accessed.
    Once advanced past a block boundary, the word pointer is also advanced by
    the stride to the next block.  This feature may be used to store blocks of
-   data interleaved, e.g. for progressive coding or for noncontiguous parallel
+   data interleaved, e.g., for progressive coding or for noncontiguous parallel
    access to the bit stream  Note that the block size is measured in words,
    while the stride is measured in multiples of the block size.  Strided access
    can have a significant performance penalty.
@@ -71,7 +72,7 @@ The following assumptions and restrictions apply:
    is essentially equivalent to (but faster than)
 
        for (i = 0; i < n; i++, value >>= 1)
-         stream_write_bit(value & 1);
+         stream_write_bit(stream, value & 1);
 
    when 0 <= n <= 64.  The same holds for read calls, and thus
 
@@ -80,11 +81,15 @@ The following assumptions and restrictions apply:
    is essentially equivalent to
 
        for (i = 0, value = 0; i < n; i++)
-         value += (uint64)stream_read_bit() << i;
+         value += (uint64)stream_read_bit(stream) << i;
 
    Note that it is possible to write fewer bits than the argument 'value'
    holds (possibly even no bits), in which case any unwritten bits are
-   returned.
+   shifted right to the least significant position and returned.  That is,
+   value = stream_write_bits(stream, value, n); is equivalent to
+
+       for (i = 0; i < n; i++)
+         value = stream_write_bits(stream, value, 1);
 
 6. Although the stream_wseek(stream, offset) call allows positioning the
    stream for writing at any bit offset without any data loss (i.e. all
@@ -107,41 +112,43 @@ The following assumptions and restrictions apply:
   #define inline_
 #endif
 
+#include "zfp/bitstream.h"
+
 /* satisfy compiler when args unused */
 #define unused_(x) ((void)(x))
 
 /* bit stream word/buffer type; granularity of stream I/O operations */
 #ifdef BIT_STREAM_WORD_TYPE
   /* may be 8-, 16-, 32-, or 64-bit unsigned integer type */
-  typedef BIT_STREAM_WORD_TYPE word;
+  typedef BIT_STREAM_WORD_TYPE bitstream_word;
 #else
   /* use maximum word size by default for highest speed */
-  typedef uint64 word;
+  typedef uint64 bitstream_word;
 #endif
 
 /* number of bits in a buffered word */
-#define wsize ((uint)(CHAR_BIT * sizeof(word)))
+#define wsize ((bitstream_count)(sizeof(bitstream_word) * CHAR_BIT))
 
 /* bit stream structure (opaque to caller) */
 struct bitstream {
-  uint bits;   /* number of buffered bits (0 <= bits < wsize) */
-  word buffer; /* buffer for incoming/outgoing bits (buffer < 2^bits) */
-  word* ptr;   /* pointer to next word to be read/written */
-  word* begin; /* beginning of stream */
-  word* end;   /* end of stream (currently unused) */
+  bitstream_count bits;  /* number of buffered bits (0 <= bits < wsize) */
+  bitstream_word buffer; /* incoming/outgoing bits (buffer < 2^bits) */
+  bitstream_word* ptr;   /* pointer to next word to be read/written */
+  bitstream_word* begin; /* beginning of stream */
+  bitstream_word* end;   /* end of stream (not enforced) */
 #ifdef BIT_STREAM_STRIDED
-  size_t mask;     /* one less the block size in number of words */
-  ptrdiff_t delta; /* number of words between consecutive blocks */
+  size_t mask;           /* one less the block size in number of words */
+  ptrdiff_t delta;       /* number of words between consecutive blocks */
 #endif
 };
 
 /* private functions ------------------------------------------------------- */
 
 /* read a single word from memory */
-static word
+static bitstream_word
 stream_read_word(bitstream* s)
 {
-  word w = *s->ptr++;
+  bitstream_word w = *s->ptr++;
 #ifdef BIT_STREAM_STRIDED
   if (!((s->ptr - s->begin) & s->mask))
     s->ptr += s->delta;
@@ -151,7 +158,7 @@ stream_read_word(bitstream* s)
 
 /* write a single word to memory */
 static void
-stream_write_word(bitstream* s, word value)
+stream_write_word(bitstream* s, bitstream_word value)
 {
   *s->ptr++ = value;
 #ifdef BIT_STREAM_STRIDED
@@ -162,9 +169,9 @@ stream_write_word(bitstream* s, word value)
 
 /* public functions -------------------------------------------------------- */
 
-/* word size in bits (equals stream_word_bits) */
-inline_ size_t
-stream_alignment()
+/* word size in bits (equals stream_word_bits) */
+inline_ bitstream_count
+stream_alignment(void)
 {
   return wsize;
 }
@@ -180,14 +187,14 @@ stream_data(const bitstream* s)
 inline_ size_t
 stream_size(const bitstream* s)
 {
-  return sizeof(word) * (size_t)(s->ptr - s->begin);
+  return (size_t)(s->ptr - s->begin) * sizeof(bitstream_word);
 }
 
 /* byte capacity of stream */
 inline_ size_t
 stream_capacity(const bitstream* s)
 {
-  return sizeof(word) * (size_t)(s->end - s->begin);
+  return (size_t)(s->end - s->begin) * sizeof(bitstream_word);
 }
 
 /* number of words per block */
@@ -233,7 +240,7 @@ stream_read_bit(bitstream* s)
 inline_ uint
 stream_write_bit(bitstream* s, uint bit)
 {
-  s->buffer += (word)bit << s->bits;
+  s->buffer += (bitstream_word)bit << s->bits;
   if (++s->bits == wsize) {
     stream_write_word(s, s->buffer);
     s->buffer = 0;
@@ -244,7 +251,7 @@ stream_write_bit(bitstream* s, uint bit)
 
 /* read 0 <= n <= 64 bits */
 inline_ uint64
-stream_read_bits(bitstream* s, uint n)
+stream_read_bits(bitstream* s, bitstream_count n)
 {
   uint64 value = s->buffer;
   if (s->bits < n) {
@@ -279,10 +286,10 @@ stream_read_bits(bitstream* s, uint n)
 
 /* write 0 <= n <= 64 low bits of value and return remaining bits */
 inline_ uint64
-stream_write_bits(bitstream* s, uint64 value, uint n)
+stream_write_bits(bitstream* s, uint64 value, bitstream_count n)
 {
   /* append bit string to buffer */
-  s->buffer += (word)(value << s->bits);
+  s->buffer += (bitstream_word)(value << s->bits);
   s->bits += n;
   /* is buffer full? */
   if (s->bits >= wsize) {
@@ -296,27 +303,27 @@ stream_write_bits(bitstream* s, uint64 value, uint n)
       /* assert: 0 <= s->bits <= n */
       stream_write_word(s, s->buffer);
       /* assert: 0 <= n - s->bits < 64 */
-      s->buffer = (word)(value >> (n - s->bits));
+      s->buffer = (bitstream_word)(value >> (n - s->bits));
     } while (sizeof(s->buffer) < sizeof(value) && s->bits >= wsize);
   }
   /* assert: 0 <= s->bits < wsize */
-  s->buffer &= ((word)1 << s->bits) - 1;
+  s->buffer &= ((bitstream_word)1 << s->bits) - 1;
   /* assert: 0 <= n < 64 */
   return value >> n;
 }
 
 /* return bit offset to next bit to be read */
-inline_ size_t
+inline_ bitstream_offset
 stream_rtell(const bitstream* s)
 {
-  return wsize * (size_t)(s->ptr - s->begin) - s->bits;
+  return (bitstream_offset)(s->ptr - s->begin) * wsize - s->bits;
 }
 
 /* return bit offset to next bit to be written */
-inline_ size_t
+inline_ bitstream_offset
 stream_wtell(const bitstream* s)
 {
-  return wsize * (size_t)(s->ptr - s->begin) + s->bits;
+  return (bitstream_offset)(s->ptr - s->begin) * wsize + s->bits;
 }
 
 /* position stream for reading or writing at beginning */
@@ -330,10 +337,10 @@ stream_rewind(bitstream* s)
 
 /* position stream for reading at given bit offset */
 inline_ void
-stream_rseek(bitstream* s, size_t offset)
+stream_rseek(bitstream* s, bitstream_offset offset)
 {
-  uint n = offset % wsize;
-  s->ptr = s->begin + offset / wsize;
+  bitstream_count n = (bitstream_count)(offset % wsize);
+  s->ptr = s->begin + (size_t)(offset / wsize);
   if (n) {
     s->buffer = stream_read_word(s) >> n;
     s->bits = wsize - n;
@@ -346,13 +353,13 @@ stream_rseek(bitstream* s, size_t offset)
 
 /* position stream for writing at given bit offset */
 inline_ void
-stream_wseek(bitstream* s, size_t offset)
+stream_wseek(bitstream* s, bitstream_offset offset)
 {
-  uint n = offset % wsize;
-  s->ptr = s->begin + offset / wsize;
+  bitstream_count n = (bitstream_count)(offset % wsize);
+  s->ptr = s->begin + (size_t)(offset / wsize);
   if (n) {
-    word buffer = *s->ptr;
-    buffer &= ((word)1 << n) - 1;
+    bitstream_word buffer = *s->ptr;
+    buffer &= ((bitstream_word)1 << n) - 1;
     s->buffer = buffer;
     s->bits = n;
   }
@@ -364,36 +371,38 @@ stream_wseek(bitstream* s, size_t offset)
 
 /* skip over the next n bits (n >= 0) */
 inline_ void
-stream_skip(bitstream* s, uint n)
+stream_skip(bitstream* s, bitstream_size n)
 {
   stream_rseek(s, stream_rtell(s) + n);
 }
 
 /* append n zero-bits to stream (n >= 0) */
 inline_ void
-stream_pad(bitstream* s, uint n)
+stream_pad(bitstream* s, bitstream_size n)
 {
-  for (s->bits += n; s->bits >= wsize; s->bits -= wsize) {
+  bitstream_offset bits = s->bits;
+  for (bits += n; bits >= wsize; bits -= wsize) {
     stream_write_word(s, s->buffer);
     s->buffer = 0;
   }
+  s->bits = (bitstream_count)bits;
 }
 
 /* align stream on next word boundary */
-inline_ size_t
+inline_ bitstream_count
 stream_align(bitstream* s)
 {
-  uint bits = s->bits;
+  bitstream_count bits = s->bits;
   if (bits)
     stream_skip(s, bits);
   return bits;
 }
 
 /* write any remaining buffered bits and align stream on next word boundary */
-inline_ size_t
+inline_ bitstream_count
 stream_flush(bitstream* s)
 {
-  uint bits = (wsize - s->bits) % wsize;
+  bitstream_count bits = (wsize - s->bits) % wsize;
   if (bits)
     stream_pad(s, bits);
   return bits;
@@ -401,16 +410,16 @@ stream_flush(bitstream* s)
 
 /* copy n bits from one bit stream to another */
 inline_ void
-stream_copy(bitstream* dst, bitstream* src, size_t n)
+stream_copy(bitstream* dst, bitstream* src, bitstream_size n)
 {
   while (n > wsize) {
-    word w = (word)stream_read_bits(src, wsize);
+    bitstream_word w = (bitstream_word)stream_read_bits(src, wsize);
     stream_write_bits(dst, w, wsize);
     n -= wsize;
   }
   if (n) {
-    word w = (word)stream_read_bits(src, (uint)n);
-    stream_write_bits(dst, w, (uint)n);
+    bitstream_word w = (bitstream_word)stream_read_bits(src, (bitstream_count)n);
+    stream_write_bits(dst, w, (bitstream_count)n);
   }
 }
 
@@ -434,8 +443,8 @@ stream_open(void* buffer, size_t bytes)
 {
   bitstream* s = (bitstream*)malloc(sizeof(bitstream));
   if (s) {
-    s->begin = (word*)buffer;
-    s->end = s->begin + bytes / sizeof(word);
+    s->begin = (bitstream_word*)buffer;
+    s->end = s->begin + bytes / sizeof(bitstream_word);
 #ifdef BIT_STREAM_STRIDED
     stream_set_stride(s, 0, 0);
 #endif
diff --git a/include/zfp/codec/gencodec.hpp b/include/zfp/codec/gencodec.hpp
new file mode 100644
index 000000000..b0eb3230f
--- /dev/null
+++ b/include/zfp/codec/gencodec.hpp
@@ -0,0 +1,421 @@
+#ifndef ZFP_GENERIC_CODEC_HPP
+#define ZFP_GENERIC_CODEC_HPP
+
+// This CODEC allows interfacing with the zfp::array classes via a user-facing
+// scalar type, ExternalType (e.g., double), while storing data in memory using
+// a possibly less precise scalar type, InternalType (e.g., float).  Using
+// zfp's caching mechanism, blocks of data may reside for some time in cache
+// as ExternalType.  This potentially allows a sequence of more precise
+// operations to be performed on the data before it is down-converted to
+// InternalType and stored to memory.  When ExternalType = InternalType, this
+// CODEC allows defining arrays that support the full zfp array API but use
+// uncompressed storage.  To use this CODEC, pass it as the Codec template
+// parameter to a zfp::array class of matching dimensionality.
+
+#include <algorithm>
+#include <climits>
+#include <cstring>
+#include "zfp.h"
+#include "zfp/internal/array/memory.hpp"
+#include "zfp/internal/array/traits.hpp"
+
+namespace zfp {
+namespace codec {
+
+// abstract base class for storing 1D-4D uncompressed blocks of scalars
+template <
+  uint dims,                           // data dimensionality (1-4)
+  typename ExternalType,               // scalar type exposed through array API
+  typename InternalType = ExternalType // scalar type used for storage
+>
+class generic_base {
+protected:
+  // default constructor
+  generic_base() :
+    bytes(0),
+    buffer(0)
+  {}
+
+public:
+  // conservative buffer size for current codec settings
+  size_t buffer_size(const zfp_field* field) const
+  {
+    return zfp_field_blocks(field) * block_size * sizeof(InternalType);
+  }
+
+  // open: attach storage of given byte size (pointer is stored, data is not copied)
+  void open(void* data, size_t size)
+  {
+    bytes = size;
+    buffer = static_cast<InternalType*>(data);
+  }
+
+  // close: detach storage by resetting pointer and size (nothing is freed here)
+  void close()
+  {
+    bytes = 0;
+    buffer = 0;
+  }
+
+  // pointer to beginning of storage buffer (null when no storage is open)
+  void* data() const { return static_cast<void*>(buffer); }
+
+  // compression mode
+  zfp_mode mode() const { return zfp_mode_fixed_rate; }
+
+  // rate in compressed bits/value (equals precision)
+  double rate() const { return static_cast<double>(precision()); }
+
+  // precision in uncompressed bits/value
+  uint precision() const { return internal_size_bits; }
+
+  // accuracy as absolute error tolerance (unsupported)
+  double accuracy() const { return -1; }
+
+  // compression parameters (all compression modes)
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const
+  {
+    if (minbits)
+      *minbits = block_size_bits;
+    if (maxbits)
+      *maxbits = block_size_bits;
+    if (maxprec)
+      *maxprec = precision();
+    if (minexp)
+      *minexp = ZFP_MIN_EXP;
+  }
+
+  // enable reversible (lossless) mode
+  void set_reversible()
+  {
+    throw zfp::exception("zfp generic codec does not support reversible mode");
+  }
+
+  // set rate in bits/value: rate is truncated to uint and passed to set_precision()
+  double set_rate(double rate, bool)
+  {
+    return static_cast<double>(set_precision(static_cast<uint>(rate)));
+  }
+
+  // set precision in uncompressed bits/value (must equal InternalType width)
+  uint set_precision(uint precision)
+  {
+    if (precision != internal_size_bits)
+      throw zfp::exception("zfp generic codec precision mismatch");
+    return precision;
+  }
+
+  // set accuracy as absolute error tolerance
+  double set_accuracy(double)
+  {
+    throw zfp::exception("zfp generic codec does not support fixed-accuracy mode");
+    return -1;
+  }
+
+  // set expert mode parameters
+  bool set_params(uint, uint, uint, int)
+  {
+    throw zfp::exception("zfp generic codec does not support expert mode");
+    return false;
+  }
+
+  // set thread safety mode (not required by this codec)
+  void set_thread_safety(bool) {}
+
+  // byte size of codec data structure components indicated by mask
+  size_t size_bytes(uint mask = ZFP_DATA_ALL) const
+  {
+    size_t size = 0;
+    if (mask & ZFP_DATA_META)
+      size += sizeof(*this);
+    return size;
+  }
+
+  // unit of allocated data in bytes
+  static size_t alignment() { return sizeof(InternalType); }
+
+  static const zfp_type type = zfp::internal::trait<ExternalType>::type; // scalar type
+
+  // zfp::codec::generic_base::header class for array (de)serialization
+  #include "zfp/internal/codec/genheader.hpp"
+
+protected:
+  // pointer to scalar at given bit offset; throws unless offset is a multiple of the InternalType width
+  InternalType* begin(bitstream_offset offset) const
+  {
+    if (offset % internal_size_bits)
+      throw zfp::exception("zfp generic codec bit offset alignment error");
+    return buffer + offset / internal_size_bits;
+  }
+
+  // store full contiguous block at bit offset, casting each value to InternalType; returns block size in bits
+  size_t encode_block(bitstream_offset offset, const ExternalType* block) const
+  {
+    InternalType* ptr = begin(offset);
+    for (size_t n = block_size; n--;)
+      *ptr++ = static_cast<InternalType>(*block++);
+    return block_size_bits;
+  }
+
+  // load full contiguous block from bit offset, casting each value to ExternalType; returns block size in bits
+  size_t decode_block(bitstream_offset offset, ExternalType* block) const
+  {
+    const InternalType* ptr = begin(offset);
+    for (size_t n = block_size; n--;)
+      *block++ = static_cast<ExternalType>(*ptr++);
+    return block_size_bits;
+  }
+
+  // constants associated with template arguments
+  static const size_t internal_size_bits = sizeof(InternalType) * CHAR_BIT;
+  static const size_t block_size = 1u << (2 * dims);
+  static const size_t block_size_bits = block_size * internal_size_bits;
+
+  size_t bytes;         // number of bytes of storage
+  InternalType* buffer; // pointer to storage managed by block store
+};
+
+// 1D codec
+template <typename ExternalType, typename InternalType = ExternalType>
+class generic1 : public generic_base<1, ExternalType, InternalType> {
+public:
+  // encode contiguous 1D block
+  size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 1D block
+  size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1)
+                 : decode_block(offset, block);
+  }
+
+  // encode 1D block from strided storage; the two low bits of a nonzero shape hold 4 - nx (partial block)
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx) const
+  {
+    InternalType* q = begin(offset);
+    size_t nx = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+    }
+    for (size_t x = 0; x < nx; x++, p += sx, q++)
+      *q = static_cast<InternalType>(*p);
+    return block_size_bits;
+  }
+
+  // decode 1D block to strided storage; the two low bits of a nonzero shape hold 4 - nx (partial block)
+  size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx) const
+  {
+    const InternalType* q = begin(offset);
+    size_t nx = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+    }
+    for (size_t x = 0; x < nx; x++, p += sx, q++)
+      *p = static_cast<ExternalType>(*q);
+    return block_size_bits;
+  }
+
+protected:
+  using generic_base<1, ExternalType, InternalType>::begin;
+  using generic_base<1, ExternalType, InternalType>::encode_block;
+  using generic_base<1, ExternalType, InternalType>::decode_block;
+  using generic_base<1, ExternalType, InternalType>::block_size_bits;
+};
+
+// 2D codec
+template <typename ExternalType, typename InternalType = ExternalType>
+class generic2 : public generic_base<2, ExternalType, InternalType> {
+public:
+  // encode contiguous 2D block
+  size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 2D block
+  size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4)
+                 : decode_block(offset, block);
+  }
+
+  // encode 2D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy) const
+  {
+    InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+    }
+    for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+      for (size_t x = 0; x < nx; x++, p += sx, q++)
+        *q = static_cast<InternalType>(*p);
+    return block_size_bits;
+  }
+
+  // decode 2D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy) const
+  {
+    const InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+    }
+    for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+      for (size_t x = 0; x < nx; x++, p += sx, q++)
+        *p = static_cast<ExternalType>(*q);
+    return block_size_bits;
+  }
+
+protected:
+  using generic_base<2, ExternalType, InternalType>::begin;
+  using generic_base<2, ExternalType, InternalType>::encode_block;
+  using generic_base<2, ExternalType, InternalType>::decode_block;
+  using generic_base<2, ExternalType, InternalType>::block_size_bits;
+};
+
+// 3D codec
+template <typename ExternalType, typename InternalType = ExternalType>
+class generic3 : public generic_base<3, ExternalType, InternalType> {
+public:
+  // encode contiguous 3D block
+  size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 3D block
+  size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16)
+                 : decode_block(offset, block);
+  }
+
+  // encode 3D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+  {
+    InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    size_t nz = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+      nz -= shape & 3u; shape >>= 2;
+    }
+    for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny)
+      for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+        for (size_t x = 0; x < nx; x++, p += sx, q++)
+          *q = static_cast<InternalType>(*p);
+    return block_size_bits;
+  }
+
+  // decode 3D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+  {
+    const InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    size_t nz = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+      nz -= shape & 3u; shape >>= 2;
+    }
+    for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny)
+      for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+        for (size_t x = 0; x < nx; x++, p += sx, q++)
+          *p = static_cast<ExternalType>(*q);
+    return block_size_bits;
+  }
+
+protected:
+  using generic_base<3, ExternalType, InternalType>::begin;
+  using generic_base<3, ExternalType, InternalType>::encode_block;
+  using generic_base<3, ExternalType, InternalType>::decode_block;
+  using generic_base<3, ExternalType, InternalType>::block_size_bits;
+};
+
+// 4D codec
+template <typename ExternalType, typename InternalType = ExternalType>
+class generic4 : public generic_base<4, ExternalType, InternalType> {
+public:
+  // encode contiguous 4D block
+  size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16, 64)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 4D block
+  size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16, 64)
+                 : decode_block(offset, block);
+  }
+
+  // encode 4D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+  {
+    InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    size_t nz = 4;
+    size_t nw = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+      nz -= shape & 3u; shape >>= 2;
+      nw -= shape & 3u; shape >>= 2;
+    }
+    for (size_t w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz)
+      for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny)
+        for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+          for (size_t x = 0; x < nx; x++, p += sx, q++)
+            *q = static_cast<InternalType>(*p);
+    return block_size_bits;
+  }
+
+  // decode 4D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+  {
+    const InternalType* q = begin(offset);
+    size_t nx = 4;
+    size_t ny = 4;
+    size_t nz = 4;
+    size_t nw = 4;
+    if (shape) {
+      nx -= shape & 3u; shape >>= 2;
+      ny -= shape & 3u; shape >>= 2;
+      nz -= shape & 3u; shape >>= 2;
+      nw -= shape & 3u; shape >>= 2;
+    }
+    for (size_t w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz)
+      for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny)
+        for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
+          for (size_t x = 0; x < nx; x++, p += sx, q++)
+            *p = static_cast<ExternalType>(*q);
+    return block_size_bits;
+  }
+
+protected:
+  using generic_base<4, ExternalType, InternalType>::begin;
+  using generic_base<4, ExternalType, InternalType>::encode_block;
+  using generic_base<4, ExternalType, InternalType>::decode_block;
+  using generic_base<4, ExternalType, InternalType>::block_size_bits;
+};
+
+} // codec
+} // zfp
+
+#endif
diff --git a/include/zfp/codec/zfpcodec.hpp b/include/zfp/codec/zfpcodec.hpp
new file mode 100644
index 000000000..5a880cdd8
--- /dev/null
+++ b/include/zfp/codec/zfpcodec.hpp
@@ -0,0 +1,551 @@
+#ifndef ZFP_ZFP_CODEC_HPP
+#define ZFP_ZFP_CODEC_HPP
+
+#include <algorithm>
+#include <climits>
+#include <cstring>
+#include "zfp.h"
+#include "zfp.hpp"
+#include "zfp/internal/array/memory.hpp"
+#include "zfp/internal/array/traits.hpp"
+
+namespace zfp {
+namespace codec {
+
+// abstract base class for zfp coding of {float, double} x {1D, 2D, 3D, 4D} data
+template <uint dims, typename Scalar>
+class zfp_base {
+protected:
+  // default constructor
+  zfp_base() :
+    stream(zfp_stream_open(0))
+#ifdef _OPENMP
+    , thread_safety(false)
+#endif
+  {}
+
+  // destructor
+  ~zfp_base()
+  {
+    close();
+    zfp_stream_close(stream);
+  }
+
+public:
+  // assignment operator--performs deep copy
+  zfp_base& operator=(const zfp_base& codec)
+  {
+    if (this != &codec)
+      deep_copy(codec);
+    return *this;
+  }
+
+  // conservative buffer size for current codec settings
+  size_t buffer_size(const zfp_field* field) const
+  {
+    // empty field case
+    if (!field->nx && !field->ny && !field->nz && !field->nw)
+      return 0;
+    // variable-rate case
+    if (zfp_stream_compression_mode(stream) != zfp_mode_fixed_rate)
+      return zfp_stream_maximum_size(stream, field);
+    // fixed-rate case: exclude header
+    size_t blocks = zfp_field_blocks(field);
+    return zfp::internal::round_up(blocks * stream->maxbits, stream_alignment()) / CHAR_BIT;
+  }
+
+  // open bit stream
+  void open(void* data, size_t size)
+  {
+    zfp_stream_set_bit_stream(stream, stream_open(data, size));
+  }
+
+  // close bit stream
+  void close()
+  {
+    stream_close(zfp_stream_bit_stream(stream));
+    zfp_stream_set_bit_stream(stream, 0);
+  }
+
+  // compression mode
+  zfp_mode mode() const { return zfp_stream_compression_mode(stream); }
+
+  // rate in compressed bits/value (fixed-rate mode only)
+  double rate() const { return zfp_stream_rate(stream, dims); }
+
+  // precision in uncompressed bits/value (fixed-precision mode only)
+  uint precision() const { return zfp_stream_precision(stream); }
+
+  // accuracy as absolute error tolerance (fixed-accuracy mode only)
+  double accuracy() const { return zfp_stream_accuracy(stream); }
+
+  // compression parameters (all compression modes)
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { zfp_stream_params(stream, minbits, maxbits, maxprec, minexp); }
+
+  // enable reversible (lossless) mode
+  void set_reversible() { zfp_stream_set_reversible(stream); }
+
+  // set rate in compressed bits/value
+  double set_rate(double rate, bool align) { return zfp_stream_set_rate(stream, rate, type, dims, align); }
+
+  // set precision in uncompressed bits/value
+  uint set_precision(uint precision) { return zfp_stream_set_precision(stream, precision); }
+
+  // set accuracy as absolute error tolerance
+  double set_accuracy(double tolerance) { return zfp_stream_set_accuracy(stream, tolerance); }
+
+  // set expert mode parameters (minexp is the minimum bit plane exponent); true iff zfp reports success
+  bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) { return zfp_stream_set_params(stream, minbits, maxbits, maxprec, minexp) == zfp_true; }
+
+  // set thread safety mode
+#ifdef _OPENMP
+  void set_thread_safety(bool safety) { thread_safety = safety; }
+#else
+  void set_thread_safety(bool) {}
+#endif
+
+  // byte size of codec data structure components indicated by mask
+  size_t size_bytes(uint mask = ZFP_DATA_ALL) const
+  {
+    size_t size = 0;
+    if (mask & ZFP_DATA_META) {
+      size += sizeof(*stream);
+      size += sizeof(*this);
+    }
+    return size;
+  }
+
+  // unit of allocated data in bytes
+  static size_t alignment() { return stream_alignment() / CHAR_BIT; }
+
+  static const zfp_type type = zfp::internal::trait<Scalar>::type; // scalar type
+
+  // zfp::codec::zfp_base::header class for array (de)serialization
+  #include "zfp/internal/codec/zfpheader.hpp"
+
+protected:
+  // deep copy of codec settings into our existing zfp stream; bit streams are never shared
+  void deep_copy(const zfp_base& codec)
+  {
+    close(); // free our bit stream and reuse our zfp stream (reallocating it leaked the old one)
+    *stream = *codec.stream;
+    stream->stream = 0;
+#ifdef _OPENMP
+    thread_safety = codec.thread_safety;
+#endif
+  }
+
+  // make a by-value copy of zfp stream with a cloned bit stream; caller must stream_close() the clone
+  zfp_stream clone_stream() const
+  {
+    zfp_stream zfp = *stream;
+    zfp.stream = stream_clone(zfp.stream);
+    return zfp;
+  }
+
+  // encode full contiguous block
+  size_t encode_block(bitstream_offset offset, const Scalar* block) const
+  {
+    if (thread_safety) {
+      // make a thread-local copy of zfp stream and bit stream
+      zfp_stream zfp = clone_stream();
+      size_t size = encode_block(&zfp, offset, block);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return encode_block(stream, offset, block);
+  }
+
+  // decode full contiguous block
+  size_t decode_block(bitstream_offset offset, Scalar* block) const
+  {
+    if (thread_safety) {
+      // make a thread-local copy of zfp stream and bit stream
+      zfp_stream zfp = clone_stream();
+      size_t size = decode_block(&zfp, offset, block);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return decode_block(stream, offset, block);
+  }
+
+  // encode full contiguous block
+  static size_t encode_block(zfp_stream* zfp, bitstream_offset offset, const Scalar* block)
+  {
+    stream_wseek(zfp->stream, offset);
+    size_t size = zfp::encode_block<Scalar, dims>(zfp, block);
+    stream_flush(zfp->stream);
+    return size;
+  }
+
+  // decode full contiguous block
+  static size_t decode_block(zfp_stream* zfp, bitstream_offset offset, Scalar* block)
+  {
+    stream_rseek(zfp->stream, offset);
+    size_t size = zfp::decode_block<Scalar, dims>(zfp, block);
+    stream_align(zfp->stream);
+    return size;
+  }
+
+  zfp_stream* stream; // compressed zfp stream
+#ifdef _OPENMP
+  bool thread_safety; // thread safety state
+#else
+  static const bool thread_safety = false; // not needed without OpenMP
+#endif
+};
+
+// 1D codec
+template <typename Scalar>
+class zfp1 : public zfp_base<1, Scalar> {
+public:
+  // encode contiguous 1D block
+  size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 1D block
+  size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1)
+                 : decode_block(offset, block);
+  }
+
+  // encode 1D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = encode_block_strided(&zfp, offset, shape, p, sx);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return encode_block_strided(stream, offset, shape, p, sx);
+  }
+
+  // decode 1D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = decode_block_strided(&zfp, offset, shape, p, sx);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return decode_block_strided(stream, offset, shape, p, sx);
+  }
+
+protected:
+  using zfp_base<1, Scalar>::clone_stream;
+  using zfp_base<1, Scalar>::encode_block;
+  using zfp_base<1, Scalar>::decode_block;
+  using zfp_base<1, Scalar>::stream;
+  using zfp_base<1, Scalar>::thread_safety;
+
+  // encode 1D block from strided storage
+  static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx)
+  {
+    size_t size;
+    stream_wseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, sx);
+    }
+    else
+      size = zfp::encode_block_strided<Scalar>(zfp, p, sx);
+    stream_flush(zfp->stream);
+    return size;
+  }
+
+  // decode 1D block to strided storage
+  static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx)
+  {
+    size_t size;
+    stream_rseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, sx);
+    }
+    else
+      size = zfp::decode_block_strided<Scalar>(zfp, p, sx);
+    stream_align(zfp->stream);
+    return size;
+  }
+};
+
+// 2D codec
+template <typename Scalar>
+class zfp2 : public zfp_base<2, Scalar> {
+public:
+  // encode contiguous 2D block
+  size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 2D block
+  size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4)
+                 : decode_block(offset, block);
+  }
+
+  // encode 2D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return encode_block_strided(stream, offset, shape, p, sx, sy);
+  }
+
+  // decode 2D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return decode_block_strided(stream, offset, shape, p, sx, sy);
+  }
+
+protected:
+  using zfp_base<2, Scalar>::clone_stream;
+  using zfp_base<2, Scalar>::encode_block;
+  using zfp_base<2, Scalar>::decode_block;
+  using zfp_base<2, Scalar>::stream;
+  using zfp_base<2, Scalar>::thread_safety;
+
+  // encode 2D block from strided storage
+  static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy)
+  {
+    size_t size;
+    stream_wseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      size = zfp::encode_block_strided<Scalar>(zfp, p, sx, sy);
+    stream_flush(zfp->stream);
+    return size;
+  }
+
+  // decode 2D block to strided storage
+  static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy)
+  {
+    size_t size;
+    stream_rseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, ny, sx, sy);
+    }
+    else
+      size = zfp::decode_block_strided<Scalar>(zfp, p, sx, sy);
+    stream_align(zfp->stream);
+    return size;
+  }
+};
+
+// 3D codec
+template <typename Scalar>
+class zfp3 : public zfp_base<3, Scalar> {
+public:
+  // encode contiguous 3D block
+  size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 3D block
+  size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16)
+                 : decode_block(offset, block);
+  }
+
+  // encode 3D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy, sz);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return encode_block_strided(stream, offset, shape, p, sx, sy, sz);
+  }
+
+  // decode 3D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy, sz);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return decode_block_strided(stream, offset, shape, p, sx, sy, sz);
+  }
+
+protected:
+  using zfp_base<3, Scalar>::clone_stream;
+  using zfp_base<3, Scalar>::encode_block;
+  using zfp_base<3, Scalar>::decode_block;
+  using zfp_base<3, Scalar>::stream;
+  using zfp_base<3, Scalar>::thread_safety;
+
+  // encode 3D block from strided storage
+  static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz)
+  {
+    size_t size;
+    stream_wseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      size = zfp::encode_block_strided<Scalar>(zfp, p, sx, sy, sz);
+    stream_flush(zfp->stream);
+    return size;
+  }
+
+  // decode 3D block to strided storage
+  static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz)
+  {
+    size_t size;
+    stream_rseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, sx, sy, sz);
+    }
+    else
+      size = zfp::decode_block_strided<Scalar>(zfp, p, sx, sy, sz);
+    stream_align(zfp->stream);
+    return size;
+  }
+};
+
+// 4D codec
+template <typename Scalar>
+class zfp4 : public zfp_base<4, Scalar> {
+public:
+  // encode contiguous 4D block
+  size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const
+  {
+    return shape ? encode_block_strided(offset, shape, block, 1, 4, 16, 64)
+                 : encode_block(offset, block);
+  }
+
+  // decode contiguous 4D block
+  size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const
+  {
+    return shape ? decode_block_strided(offset, shape, block, 1, 4, 16, 64)
+                 : decode_block(offset, block);
+  }
+
+  // encode 4D block from strided storage
+  size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy, sz, sw);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return encode_block_strided(stream, offset, shape, p, sx, sy, sz, sw);
+  }
+
+  // decode 4D block to strided storage
+  size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const
+  {
+    if (thread_safety) {
+      // thread-safe implementation
+      zfp_stream zfp = clone_stream();
+      size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy, sz, sw);
+      stream_close(zfp.stream);
+      return size;
+    }
+    else
+      return decode_block_strided(stream, offset, shape, p, sx, sy, sz, sw);
+  }
+
+protected:
+  using zfp_base<4, Scalar>::clone_stream;
+  using zfp_base<4, Scalar>::encode_block;
+  using zfp_base<4, Scalar>::decode_block;
+  using zfp_base<4, Scalar>::stream;
+  using zfp_base<4, Scalar>::thread_safety;
+
+  // encode 4D block from strided storage
+  static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw)
+  {
+    size_t size;
+    stream_wseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      uint nw = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::encode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw);
+    }
+    else
+      size = zfp::encode_block_strided<Scalar>(zfp, p, sx, sy, sz, sw);
+    stream_flush(zfp->stream);
+    return size;
+  }
+
+  // decode 4D block to strided storage
+  static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw)
+  {
+    size_t size;
+    stream_rseek(zfp->stream, offset);
+    if (shape) {
+      uint nx = 4 - (shape & 3u); shape >>= 2;
+      uint ny = 4 - (shape & 3u); shape >>= 2;
+      uint nz = 4 - (shape & 3u); shape >>= 2;
+      uint nw = 4 - (shape & 3u); shape >>= 2;
+      size = zfp::decode_partial_block_strided<Scalar>(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw);
+    }
+    else
+      size = zfp::decode_block_strided<Scalar>(zfp, p, sx, sy, sz, sw);
+    stream_align(zfp->stream);
+    return size;
+  }
+};
+
+} // codec
+} // zfp
+
+#endif
diff --git a/array/zfpcarray1.h b/include/zfp/constarray1.hpp
similarity index 86%
rename from array/zfpcarray1.h
rename to include/zfp/constarray1.hpp
index e5a138de9..f2f501de6 100644
--- a/array/zfpcarray1.h
+++ b/include/zfp/constarray1.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_CARRAY1_H
-#define ZFP_CARRAY1_H
+#ifndef ZFP_CONSTARRAY1_HPP
+#define ZFP_CONSTARRAY1_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache1.h"
-#include "zfp/store1.h"
-#include "zfp/handle1.h"
-#include "zfp/reference1.h"
-#include "zfp/pointer1.h"
-#include "zfp/iterator1.h"
-#include "zfp/view1.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache1.hpp"
+#include "zfp/internal/array/handle1.hpp"
+#include "zfp/internal/array/iterator1.hpp"
+#include "zfp/internal/array/pointer1.hpp"
+#include "zfp/internal/array/reference1.hpp"
+#include "zfp/internal/array/store1.hpp"
+#include "zfp/internal/array/view1.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class const_array1 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore1<value_type, codec_type, index_type> store_type;
-  typedef BlockCache1<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore1<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache1<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -102,6 +102,9 @@ class const_array1 : public array {
   // accuracy as absolute error tolerance (fixed-accuracy mode only)
   double accuracy() const { return store.accuracy(); }
 
+  // query compression parameters (all compression modes); forwards the caller-supplied pointers to store.params()
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); }
+
   // set rate in compressed bits per value
   double set_rate(double rate)
   {
@@ -130,6 +133,13 @@ class const_array1 : public array {
     store.set_reversible();
   }
 
+  // set expert mode compression parameters; clears the cache, then returns the result of store.set_params()
+  bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
+  {
+    cache.clear(); // empty the cache before the store's parameters change
+    return store.set_params(minbits, maxbits, maxprec, minexp);
+  }
+
   // set compression mode and parameters
   void set_config(const zfp_config& config)
   {
diff --git a/array/zfpcarray2.h b/include/zfp/constarray2.hpp
similarity index 88%
rename from array/zfpcarray2.h
rename to include/zfp/constarray2.hpp
index 422f0e82f..e89286291 100644
--- a/array/zfpcarray2.h
+++ b/include/zfp/constarray2.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_CARRAY2_H
-#define ZFP_CARRAY2_H
+#ifndef ZFP_CONSTARRAY2_HPP
+#define ZFP_CONSTARRAY2_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache2.h"
-#include "zfp/store2.h"
-#include "zfp/handle2.h"
-#include "zfp/reference2.h"
-#include "zfp/pointer2.h"
-#include "zfp/iterator2.h"
-#include "zfp/view2.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache2.hpp"
+#include "zfp/internal/array/handle2.hpp"
+#include "zfp/internal/array/iterator2.hpp"
+#include "zfp/internal/array/pointer2.hpp"
+#include "zfp/internal/array/reference2.hpp"
+#include "zfp/internal/array/store2.hpp"
+#include "zfp/internal/array/view2.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class const_array2 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore2<value_type, codec_type, index_type> store_type;
-  typedef BlockCache2<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore2<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache2<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -105,6 +105,9 @@ class const_array2 : public array {
   // accuracy as absolute error tolerance (fixed-accuracy mode only)
   double accuracy() const { return store.accuracy(); }
 
+  // query compression parameters (all compression modes); forwards the caller-supplied pointers to store.params()
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); }
+
   // set rate in compressed bits per value
   double set_rate(double rate)
   {
@@ -133,6 +136,13 @@ class const_array2 : public array {
     store.set_reversible();
   }
 
+  // set expert mode compression parameters; clears the cache, then returns the result of store.set_params()
+  bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
+  { 
+    cache.clear(); // empty the cache before the store's parameters change
+    return store.set_params(minbits, maxbits, maxprec, minexp);
+  }
+
   // set compression mode and parameters
   void set_config(const zfp_config& config)
   {
diff --git a/array/zfpcarray3.h b/include/zfp/constarray3.hpp
similarity index 88%
rename from array/zfpcarray3.h
rename to include/zfp/constarray3.hpp
index e233e9a58..61d65d466 100644
--- a/array/zfpcarray3.h
+++ b/include/zfp/constarray3.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_CARRAY3_H
-#define ZFP_CARRAY3_H
+#ifndef ZFP_CONSTARRAY3_HPP
+#define ZFP_CONSTARRAY3_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache3.h"
-#include "zfp/store3.h"
-#include "zfp/handle3.h"
-#include "zfp/reference3.h"
-#include "zfp/pointer3.h"
-#include "zfp/iterator3.h"
-#include "zfp/view3.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache3.hpp"
+#include "zfp/internal/array/handle3.hpp"
+#include "zfp/internal/array/iterator3.hpp"
+#include "zfp/internal/array/pointer3.hpp"
+#include "zfp/internal/array/reference3.hpp"
+#include "zfp/internal/array/store3.hpp"
+#include "zfp/internal/array/view3.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class const_array3 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore3<value_type, codec_type, index_type> store_type;
-  typedef BlockCache3<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore3<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache3<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -108,6 +108,9 @@ class const_array3 : public array {
   // accuracy as absolute error tolerance (fixed-accuracy mode only)
   double accuracy() const { return store.accuracy(); }
 
+  // query compression parameters (all compression modes); forwards the caller-supplied pointers to store.params()
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); }
+
   // set rate in compressed bits per value
   double set_rate(double rate)
   {
@@ -136,6 +139,13 @@ class const_array3 : public array {
     store.set_reversible();
   }
 
+  // set expert mode compression parameters; clears the cache, then returns the result of store.set_params()
+  bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
+  { 
+    cache.clear(); // empty the cache before the store's parameters change
+    return store.set_params(minbits, maxbits, maxprec, minexp);
+  }
+
   // set compression mode and parameters
   void set_config(const zfp_config& config)
   {
diff --git a/array/zfpcarray4.h b/include/zfp/constarray4.hpp
similarity index 89%
rename from array/zfpcarray4.h
rename to include/zfp/constarray4.hpp
index f305cae62..63680f168 100644
--- a/array/zfpcarray4.h
+++ b/include/zfp/constarray4.hpp
@@ -1,19 +1,19 @@
-#ifndef ZFP_CARRAY4_H
-#define ZFP_CARRAY4_H
+#ifndef ZFP_CONSTARRAY4_HPP
+#define ZFP_CONSTARRAY4_HPP
 
 #include <cstddef>
 #include <cstring>
 #include <iterator>
-#include "zfparray.h"
-#include "zfpcodec.h"
-#include "zfpindex.h"
-#include "zfp/cache4.h"
-#include "zfp/store4.h"
-#include "zfp/handle4.h"
-#include "zfp/reference4.h"
-#include "zfp/pointer4.h"
-#include "zfp/iterator4.h"
-#include "zfp/view4.h"
+#include "zfp/array.hpp"
+#include "zfp/index.hpp"
+#include "zfp/codec/zfpcodec.hpp"
+#include "zfp/internal/array/cache4.hpp"
+#include "zfp/internal/array/handle4.hpp"
+#include "zfp/internal/array/iterator4.hpp"
+#include "zfp/internal/array/pointer4.hpp"
+#include "zfp/internal/array/reference4.hpp"
+#include "zfp/internal/array/store4.hpp"
+#include "zfp/internal/array/view4.hpp"
 
 namespace zfp {
 
@@ -30,8 +30,8 @@ class const_array4 : public array {
   typedef Scalar value_type;
   typedef Codec codec_type;
   typedef Index index_type;
-  typedef BlockStore4<value_type, codec_type, index_type> store_type;
-  typedef BlockCache4<value_type, store_type> cache_type;
+  typedef zfp::internal::BlockStore4<value_type, codec_type, index_type> store_type;
+  typedef zfp::internal::BlockCache4<value_type, store_type> cache_type;
   typedef typename Codec::header header;
 
   // accessor classes
@@ -111,6 +111,9 @@ class const_array4 : public array {
   // accuracy as absolute error tolerance (fixed-accuracy mode only)
   double accuracy() const { return store.accuracy(); }
 
+  // query compression parameters (all compression modes); forwards the caller-supplied pointers to store.params()
+  void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); }
+
   // set rate in compressed bits per value
   double set_rate(double rate)
   {
@@ -139,6 +142,13 @@ class const_array4 : public array {
     store.set_reversible();
   }
 
+  // set expert mode compression parameters; clears the cache, then returns the result of store.set_params()
+  bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp)
+  { 
+    cache.clear(); // empty the cache before the store's parameters change
+    return store.set_params(minbits, maxbits, maxprec, minexp);
+  }
+
   // set compression mode and parameters
   void set_config(const zfp_config& config)
   {
diff --git a/array/zfpfactory.h b/include/zfp/factory.hpp
similarity index 80%
rename from array/zfpfactory.h
rename to include/zfp/factory.hpp
index 972babc88..73091514d 100644
--- a/array/zfpfactory.h
+++ b/include/zfp/factory.hpp
@@ -1,9 +1,9 @@
-#ifndef ZFP_FACTORY_H
-#define ZFP_FACTORY_H
+#ifndef ZFP_FACTORY_HPP
+#define ZFP_FACTORY_HPP
 
-// ensure zfparray.h has already been included
-#ifndef ZFP_ARRAY_H
-  #error "zfparray.h must be included before zfpfactory.h"
+// ensure zfp/array.hpp has already been included
+#ifndef ZFP_ARRAY_HPP
+  #error "zfp/array.hpp must be included before zfp/factory.hpp"
 #endif
 
 zfp::array* zfp::array::construct(const zfp::array::header& header, const void* buffer, size_t buffer_size_bytes)
@@ -22,7 +22,7 @@ zfp::array* zfp::array::construct(const zfp::array::header& header, const void*
   std::string error;
   switch (dims) {
     case 4:
-#ifdef ZFP_ARRAY4_H
+#ifdef ZFP_ARRAY4_HPP
       switch (type) {
         case zfp_type_float:
           arr = new zfp::array4f(nx, ny, nz, nw, rate);
@@ -36,12 +36,12 @@ zfp::array* zfp::array::construct(const zfp::array::header& header, const void*
           break;
       }
 #else
-      error = "zfparray4 not supported; include zfparray4.h before zfpfactory.h";
+      error = "array4 not supported; include zfp/array4.hpp before zfp/factory.hpp";
 #endif
       break;
 
     case 3:
-#ifdef ZFP_ARRAY3_H
+#ifdef ZFP_ARRAY3_HPP
       switch (type) {
         case zfp_type_float:
           arr = new zfp::array3f(nx, ny, nz, rate);
@@ -55,12 +55,12 @@ zfp::array* zfp::array::construct(const zfp::array::header& header, const void*
           break;
       }
 #else
-      error = "zfparray3 not supported; include zfparray3.h before zfpfactory.h";
+      error = "array3 not supported; include zfp/array3.hpp before zfp/factory.hpp";
 #endif
       break;
 
     case 2:
-#ifdef ZFP_ARRAY2_H
+#ifdef ZFP_ARRAY2_HPP
       switch (type) {
         case zfp_type_float:
           arr = new zfp::array2f(nx, ny, rate);
@@ -74,12 +74,12 @@ zfp::array* zfp::array::construct(const zfp::array::header& header, const void*
           break;
       }
 #else
-      error = "zfparray2 not supported; include zfparray2.h before zfpfactory.h";
+      error = "array2 not supported; include zfp/array2.hpp before zfp/factory.hpp";
 #endif
       break;
 
     case 1:
-#ifdef ZFP_ARRAY1_H
+#ifdef ZFP_ARRAY1_HPP
       switch (type) {
         case zfp_type_float:
           arr = new zfp::array1f(nx, rate);
@@ -93,7 +93,7 @@ zfp::array* zfp::array::construct(const zfp::array::header& header, const void*
           break;
       }
 #else
-      error = "zfparray1 not supported; include zfparray1.h before zfpfactory.h";
+      error = "array1 not supported; include zfp/array1.hpp before zfp/factory.hpp";
 #endif
       break;
 
diff --git a/array/zfpindex.h b/include/zfp/index.hpp
similarity index 80%
rename from array/zfpindex.h
rename to include/zfp/index.hpp
index af59be6fe..b84e9b75c 100644
--- a/array/zfpindex.h
+++ b/include/zfp/index.hpp
@@ -1,7 +1,8 @@
-#ifndef ZFP_INDEX_H
-#define ZFP_INDEX_H
+#ifndef ZFP_INDEX_HPP
+#define ZFP_INDEX_HPP
 
 #include <algorithm>
+#include "zfp/internal/array/memory.hpp"
 
 namespace zfp {
 namespace index {
@@ -29,13 +30,13 @@ class implicit {
   }
 
   // range of offsets spanned by indexed data in bits
-  size_t range() const { return block_offset(blocks); }
+  bitstream_size range() const { return block_offset(blocks); }
 
   // bit size of given block
   size_t block_size(size_t /*block_index*/) const { return bits_per_block; }
 
   // bit offset of given block
-  size_t block_offset(size_t block_index) const { return bits_per_block * block_index; }
+  bitstream_offset block_offset(size_t block_index) const { return static_cast<bitstream_offset>(block_index) * bits_per_block; } // widen before multiplying so the product is formed in bitstream_offset rather than a possibly narrower size_t
 
   // reset index
   void clear() {}
@@ -71,7 +72,7 @@ class verbatim {
   }
 
   // destructor
-  ~verbatim() { delete[] data; }
+  ~verbatim() { zfp::internal::deallocate(data); }
 
   // assignment operator--performs a deep copy
   verbatim& operator=(const verbatim& index)
@@ -93,13 +94,13 @@ class verbatim {
   }
 
   // range of offsets spanned by indexed data in bits
-  size_t range() const { return block_offset(blocks); }
+  bitstream_size range() const { return block_offset(blocks); }
 
   // bit size of given block
-  size_t block_size(size_t block_index) const { return block_offset(block_index + 1) - block_offset(block_index); }
+  size_t block_size(size_t block_index) const { return static_cast<size_t>(block_offset(block_index + 1) - block_offset(block_index)); }
 
   // bit offset of given block
-  size_t block_offset(size_t block_index) const { return static_cast<size_t>(data[block_index]); }
+  bitstream_offset block_offset(size_t block_index) const { return static_cast<bitstream_offset>(data[block_index]); }
 
   // reset index
   void clear() { block = 0; }
@@ -108,7 +109,7 @@ class verbatim {
   void resize(size_t blocks)
   {
     this->blocks = blocks;
-    zfp::reallocate(data, capacity() * sizeof(*data));
+    zfp::internal::reallocate(data, capacity() * sizeof(*data));
     *data = 0;
     clear();
   }
@@ -146,7 +147,7 @@ class verbatim {
   // make a deep copy of index
   void deep_copy(const verbatim& index)
   {
-    zfp::clone(data, index.data, index.capacity());
+    zfp::internal::clone(data, index.data, index.capacity());
     blocks = index.blocks;
     block = index.block;
   }
@@ -167,7 +168,7 @@ class hybrid4 {
   }
 
   // destructor
-  ~hybrid4() { delete[] data; }
+  ~hybrid4() { zfp::internal::deallocate(data); }
 
   // assignment operator--performs a deep copy
   hybrid4& operator=(const hybrid4& index)
@@ -189,7 +190,7 @@ class hybrid4 {
   }
 
   // range of offsets spanned by indexed data in bits
-  size_t range() const { return end; }
+  bitstream_size range() const { return end; }
 
   // bit size of given block
   size_t block_size(size_t block_index) const
@@ -197,12 +198,12 @@ class hybrid4 {
     size_t chunk = block_index / 4;
     size_t which = block_index % 4;
     return which == 3u
-             ? block_offset(block_index + 1) - block_offset(block_index)
-             : data[chunk].lo[which + 1] - data[chunk].lo[which];
+             ? static_cast<size_t>(block_offset(block_index + 1) - block_offset(block_index))
+             : static_cast<size_t>(data[chunk].lo[which + 1] - data[chunk].lo[which]);
   }
 
   // bit offset of given block
-  size_t block_offset(size_t block_index) const
+  bitstream_offset block_offset(size_t block_index) const
   {
     // if index is being built, point offset to end
     if (block_index == block)
@@ -210,7 +211,7 @@ class hybrid4 {
     // index has already been built; decode offset
     size_t chunk = block_index / 4;
     size_t which = block_index % 4;
-    return (size_t(data[chunk].hi) << shift) + data[chunk].lo[which];
+    return (bitstream_offset(data[chunk].hi) << shift) + data[chunk].lo[which];
   }
 
   // reset index
@@ -224,7 +225,7 @@ class hybrid4 {
   void resize(size_t blocks)
   {
     this->blocks = blocks;
-    zfp::reallocate(data, capacity() * sizeof(*data));
+    zfp::internal::reallocate(data, capacity() * sizeof(*data));
     clear();
   }
 
@@ -264,12 +265,12 @@ class hybrid4 {
     size_t which = block % 4;
     buffer[which] = size;
     if (which == 3u) {
-      // chunk is complete; encode it (double shift in case ptr is 32 bits)
-      if (((ptr >> 16) >> 16) >> shift)
+      // chunk is complete; encode it
+      if (ptr >> (32 + shift))
         throw zfp::exception("zfp block offset is too large for hybrid4 index");
       // store high bits
       data[chunk].hi = static_cast<uint32>(ptr >> shift);
-      size_t base = size_t(data[chunk].hi) << shift;
+      bitstream_offset base = bitstream_offset(data[chunk].hi) << shift;
       // store low bits
       for (uint k = 0; k < 4; k++) {
         data[chunk].lo[k] = static_cast<uint16>(ptr - base);
@@ -295,7 +296,7 @@ class hybrid4 {
   // make a deep copy of index
   void deep_copy(const hybrid4& index)
   {
-    zfp::clone(data, index.data, index.capacity());
+    zfp::internal::clone(data, index.data, index.capacity());
     blocks = index.blocks;
     block = index.block;
     ptr = index.ptr;
@@ -305,12 +306,12 @@ class hybrid4 {
 
   static const uint shift = 12; // number of bits to shift hi bits
 
-  record* data;     // block offset array
-  size_t blocks;    // number of blocks
-  size_t block;     // current block index
-  size_t end;       // offset to last block
-  size_t ptr;       // offset to current chunk of blocks
-  size_t buffer[4]; // buffer of 4 blocks to be stored together
+  record* data;         // block offset array
+  size_t blocks;        // number of blocks
+  size_t block;         // current block index
+  bitstream_offset end; // offset to last block
+  bitstream_offset ptr; // offset to current chunk of blocks
+  size_t buffer[4];     // bit sizes of 4 blocks to be stored together
 };
 
 // hybrid block index (8 blocks/chunk; 16 bits/block; 86-14dims bit offsets) --
@@ -325,7 +326,7 @@ class hybrid8 {
   }
 
   // destructor
-  ~hybrid8() { delete[] data; }
+  ~hybrid8() { zfp::internal::deallocate(data); }
 
   // assignment operator--performs a deep copy
   hybrid8& operator=(const hybrid8& index)
@@ -347,7 +348,7 @@ class hybrid8 {
   }
 
   // range of offsets spanned by indexed data in bits
-  size_t range() const { return end; }
+  bitstream_size range() const { return end; }
 
   // bit size of given block
   size_t block_size(size_t block_index) const
@@ -355,12 +356,12 @@ class hybrid8 {
     size_t chunk = block_index / 8;
     size_t which = block_index % 8;
     return which == 7u
-             ? block_offset(block_index + 1) - block_offset(block_index)
-             : static_cast<size_t>(size(data[2 * chunk + 0], data[2 * chunk + 1], static_cast<uint>(which)));
+             ? static_cast<size_t>(block_offset(block_index + 1) - block_offset(block_index))
+             : size(data[2 * chunk + 0], data[2 * chunk + 1], static_cast<uint>(which));
   }
 
   // bit offset of given block
-  size_t block_offset(size_t block_index) const
+  bitstream_offset block_offset(size_t block_index) const
   {
     // if index is being built, point offset to end
     if (block_index == block)
@@ -368,7 +369,7 @@ class hybrid8 {
     // index has already been built; decode offset
     size_t chunk = block_index / 8;
     size_t which = block_index % 8;
-    return static_cast<size_t>(offset(data[2 * chunk + 0], data[2 * chunk + 1], static_cast<uint>(which)));
+    return offset(data[2 * chunk + 0], data[2 * chunk + 1], static_cast<uint>(which));
   }
 
   // reset index
@@ -382,7 +383,7 @@ class hybrid8 {
   void resize(size_t blocks)
   {
     this->blocks = blocks;
-    zfp::reallocate(data, capacity() * sizeof(*data));
+    zfp::internal::reallocate(data, capacity() * sizeof(*data));
     clear();
   }
 
@@ -457,7 +458,7 @@ class hybrid8 {
   // make a deep copy of index
   void deep_copy(const hybrid8& index)
   {
-    zfp::clone(data, index.data, index.capacity());
+    zfp::internal::clone(data, index.data, index.capacity());
     blocks = index.blocks;
     block = index.block;
     ptr = index.ptr;
@@ -466,17 +467,17 @@ class hybrid8 {
   }
 
   // kth size in chunk, 0 <= k <= 6
-  static uint64 size(uint64 h, uint64 l, uint k)
+  static size_t size(uint64 h, uint64 l, uint k)
   {
     // extract high and low bits
     h >>= (6 - k) * hbits; h &= (UINT64C(1) << hbits) - 1;
     l >>= (6 - k) * lbits; l &= (UINT64C(1) << lbits) - 1;
     // combine base offset with high and low bits
-    return (h << lbits) + l;
+    return static_cast<size_t>((h << lbits) + l);
   }
 
   // kth offset in chunk, 0 <= k <= 7
-  static uint64 offset(uint64 h, uint64 l, uint k)
+  static bitstream_offset offset(uint64 h, uint64 l, uint k)
   {
     // extract all but lowest (8 * hbits) bits
     uint64 base = h >> (8 * hbits);
@@ -485,7 +486,7 @@ class hybrid8 {
     h = hsum(h >> ((7 - k) * hbits));
     l = lsum(l >> ((7 - k) * lbits));
     // combine base offset with high and low bits
-    return (((base << hbits) + h) << lbits) + l;
+    return static_cast<bitstream_offset>((((base << hbits) + h) << lbits) + l);
   }
 
   // sum of (up to) eight packed 8-bit numbers (efficient version of sum8)
@@ -522,12 +523,12 @@ class hybrid8 {
   static const uint lbits = 8;              // 64 bits partitioned into 8
   static const uint hbits = 2 * (dims - 1); // log2(4^d * maxprec / 2^lbits)
 
-  uint64* data;     // block offset array
-  size_t blocks;    // number of blocks
-  size_t block;     // current block index
-  size_t end;       // offset to last block
-  size_t ptr;       // offset to current set of blocks
-  size_t buffer[8]; // buffer of 8 blocks to be stored together
+  uint64* data;         // block offset array
+  size_t blocks;        // number of blocks
+  size_t block;         // current block index
+  bitstream_offset end; // offset to last block
+  bitstream_offset ptr; // offset to current set of blocks
+  size_t buffer[8];     // sizes of 8 blocks to be stored together
 };
 
 } // index
diff --git a/array/zfp/cache.h b/include/zfp/internal/array/cache.hpp
similarity index 92%
rename from array/zfp/cache.h
rename to include/zfp/internal/array/cache.hpp
index 91746e915..533c37dbb 100644
--- a/array/zfp/cache.h
+++ b/include/zfp/internal/array/cache.hpp
@@ -1,7 +1,7 @@
-#ifndef ZFP_CACHE_H
-#define ZFP_CACHE_H
+#ifndef ZFP_CACHE_HPP
+#define ZFP_CACHE_HPP
 
-#include "memory.h"
+#include "zfp/internal/array/memory.hpp"
 
 #ifdef ZFP_WITH_CACHE_PROFILE
   // maintain stats on hit and miss rates
@@ -9,6 +9,7 @@
 #endif
 
 namespace zfp {
+namespace internal {
 
 // direct-mapped or two-way skew-associative write-back cache
 template <class Line>
@@ -106,8 +107,8 @@ class Cache {
   // destructor
   ~Cache()
   {
-    zfp::deallocate_aligned(tag);
-    zfp::deallocate_aligned(line);
+    zfp::internal::deallocate_aligned(tag);
+    zfp::internal::deallocate_aligned(line);
 #ifdef ZFP_WITH_CACHE_PROFILE
     std::cerr << "cache R1=" << hit[0][0] << " R2=" << hit[1][0] << " RM=" << miss[0] << " RB=" << back[0]
               <<      " W1=" << hit[0][1] << " W2=" << hit[1][1] << " WM=" << miss[1] << " WB=" << back[1] << std::endl;
@@ -141,8 +142,8 @@ class Cache {
   {
     // compute smallest value of mask such that mask + 1 = 2^k >= minsize
     for (mask = minsize ? minsize - 1 : 1; mask & (mask + 1); mask |= mask + 1);
-    zfp::reallocate_aligned(tag, size() * sizeof(Tag), ZFP_MEMORY_ALIGNMENT);
-    zfp::reallocate_aligned(line, size() * sizeof(Line), ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::reallocate_aligned(tag, size() * sizeof(Tag), ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::reallocate_aligned(line, size() * sizeof(Line), ZFP_MEMORY_ALIGNMENT);
     clear();
   }
 
@@ -229,8 +230,8 @@ class Cache {
   void deep_copy(const Cache& c)
   {
     mask = c.mask;
-    zfp::clone_aligned(tag, c.tag, size(), ZFP_MEMORY_ALIGNMENT);
-    zfp::clone_aligned(line, c.line, size(), ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::clone_aligned(tag, c.tag, size(), ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::clone_aligned(line, c.line, size(), ZFP_MEMORY_ALIGNMENT);
 #ifdef ZFP_WITH_CACHE_PROFILE
     hit[0][0] = c.hit[0][0];
     hit[0][1] = c.hit[0][1];
@@ -274,6 +275,7 @@ class Cache {
 #endif
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/cache1.h b/include/zfp/internal/array/cache1.hpp
similarity index 93%
rename from array/zfp/cache1.h
rename to include/zfp/internal/array/cache1.hpp
index d1c680211..24f192e5a 100644
--- a/array/zfp/cache1.h
+++ b/include/zfp/internal/array/cache1.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_CACHE1_H
-#define ZFP_CACHE1_H
+#ifndef ZFP_CACHE1_HPP
+#define ZFP_CACHE1_HPP
 
-#include "cache.h"
+#include "zfp/internal/array/cache.hpp"
 
 namespace zfp {
+namespace internal {
 
 template <typename Scalar, class Store>
 class BlockCache1 {
@@ -40,7 +41,7 @@ class BlockCache1 {
   // flush cache by compressing all modified cached blocks
   void flush() const
   {
-    for (typename zfp::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+    for (typename zfp::internal::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
       if (p->tag.dirty()) {
         size_t block_index = p->tag.index() - 1;
         store.encode(block_index, p->line->data());
@@ -159,7 +160,7 @@ class BlockCache1 {
   {
     CacheLine* p = 0;
     size_t block_index = store.block_index(i);
-    typename zfp::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
+    typename zfp::internal::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
     size_t stored_block_index = tag.index() - 1;
     if (stored_block_index != block_index) {
       // write back occupied cache line if it is dirty
@@ -194,6 +195,7 @@ class BlockCache1 {
   Store& store;                   // store backed by cache
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/cache2.h b/include/zfp/internal/array/cache2.hpp
similarity index 94%
rename from array/zfp/cache2.h
rename to include/zfp/internal/array/cache2.hpp
index 5b63bd9f3..e7aa07d90 100644
--- a/array/zfp/cache2.h
+++ b/include/zfp/internal/array/cache2.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_CACHE2_H
-#define ZFP_CACHE2_H
+#ifndef ZFP_CACHE2_HPP
+#define ZFP_CACHE2_HPP
 
-#include "cache.h"
+#include "zfp/internal/array/cache.hpp"
 
 namespace zfp {
+namespace internal {
 
 template <typename Scalar, class Store>
 class BlockCache2 {
@@ -40,7 +41,7 @@ class BlockCache2 {
   // flush cache by compressing all modified cached blocks
   void flush() const
   {
-    for (typename zfp::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+    for (typename zfp::internal::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
       if (p->tag.dirty()) {
         size_t block_index = p->tag.index() - 1;
         store.encode(block_index, p->line->data());
@@ -165,7 +166,7 @@ class BlockCache2 {
   {
     CacheLine* p = 0;
     size_t block_index = store.block_index(i, j);
-    typename zfp::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
+    typename zfp::internal::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
     size_t stored_block_index = tag.index() - 1;
     if (stored_block_index != block_index) {
       // write back occupied cache line if it is dirty
@@ -200,6 +201,7 @@ class BlockCache2 {
   Store& store;                   // store backed by cache
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/cache3.h b/include/zfp/internal/array/cache3.hpp
similarity index 95%
rename from array/zfp/cache3.h
rename to include/zfp/internal/array/cache3.hpp
index 6e517bf7f..1c4c95544 100644
--- a/array/zfp/cache3.h
+++ b/include/zfp/internal/array/cache3.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_CACHE3_H
-#define ZFP_CACHE3_H
+#ifndef ZFP_CACHE3_HPP
+#define ZFP_CACHE3_HPP
 
-#include "cache.h"
+#include "zfp/internal/array/cache.hpp"
 
 namespace zfp {
+namespace internal {
 
 template <typename Scalar, class Store>
 class BlockCache3 {
@@ -40,7 +41,7 @@ class BlockCache3 {
   // flush cache by compressing all modified cached blocks
   void flush() const
   {
-    for (typename zfp::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+    for (typename zfp::internal::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
       if (p->tag.dirty()) {
         size_t block_index = p->tag.index() - 1;
         store.encode(block_index, p->line->data());
@@ -171,7 +172,7 @@ class BlockCache3 {
   {
     CacheLine* p = 0;
     size_t block_index = store.block_index(i, j, k);
-    typename zfp::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
+    typename zfp::internal::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
     size_t stored_block_index = tag.index() - 1;
     if (stored_block_index != block_index) {
       // write back occupied cache line if it is dirty
@@ -206,6 +207,7 @@ class BlockCache3 {
   Store& store;                   // store backed by cache
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/cache4.h b/include/zfp/internal/array/cache4.hpp
similarity index 95%
rename from array/zfp/cache4.h
rename to include/zfp/internal/array/cache4.hpp
index 4c2c175c0..69182b7ec 100644
--- a/array/zfp/cache4.h
+++ b/include/zfp/internal/array/cache4.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_CACHE4_H
-#define ZFP_CACHE4_H
+#ifndef ZFP_CACHE4_HPP
+#define ZFP_CACHE4_HPP
 
-#include "cache.h"
+#include "zfp/internal/array/cache.hpp"
 
 namespace zfp {
+namespace internal {
 
 template <typename Scalar, class Store>
 class BlockCache4 {
@@ -40,7 +41,7 @@ class BlockCache4 {
   // flush cache by compressing all modified cached blocks
   void flush() const
   {
-    for (typename zfp::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
+    for (typename zfp::internal::Cache<CacheLine>::const_iterator p = cache.first(); p; p++) {
       if (p->tag.dirty()) {
         size_t block_index = p->tag.index() - 1;
         store.encode(block_index, p->line->data());
@@ -177,7 +178,7 @@ class BlockCache4 {
   {
     CacheLine* p = 0;
     size_t block_index = store.block_index(i, j, k, l);
-    typename zfp::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
+    typename zfp::internal::Cache<CacheLine>::Tag tag = cache.access(p, (uint)block_index + 1, write);
     size_t stored_block_index = tag.index() - 1;
     if (stored_block_index != block_index) {
       // write back occupied cache line if it is dirty
@@ -212,6 +213,7 @@ class BlockCache4 {
   Store& store;                   // store backed by cache
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/exception.h b/include/zfp/internal/array/exception.hpp
similarity index 83%
rename from array/zfp/exception.h
rename to include/zfp/internal/array/exception.hpp
index c5c3114ea..747bf6bd6 100644
--- a/array/zfp/exception.h
+++ b/include/zfp/internal/array/exception.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_EXCEPTION_H
-#define ZFP_EXCEPTION_H
+#ifndef ZFP_EXCEPTION_HPP
+#define ZFP_EXCEPTION_HPP
 
 #include <stdexcept>
 #include <string>
diff --git a/array/zfp/handle1.h b/include/zfp/internal/array/handle1.hpp
similarity index 92%
rename from array/zfp/handle1.h
rename to include/zfp/internal/array/handle1.hpp
index 4174fad69..72f5e91b5 100644
--- a/array/zfp/handle1.h
+++ b/include/zfp/internal/array/handle1.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_HANDLE1_H
-#define ZFP_HANDLE1_H
+#ifndef ZFP_HANDLE1_HPP
+#define ZFP_HANDLE1_HPP
 
 namespace zfp {
 namespace internal {
@@ -24,7 +24,7 @@ class const_handle {
   // protected constructor
   explicit const_handle(const container_type* container, size_t x) : container(const_cast<container_type*>(container)), x(x) {}
 
-  // derefence handle
+  // dereference handle
   value_type get() const { return container->get(x); }
 
   container_type* container; // container
diff --git a/array/zfp/handle2.h b/include/zfp/internal/array/handle2.hpp
similarity index 92%
rename from array/zfp/handle2.h
rename to include/zfp/internal/array/handle2.hpp
index 81feead11..17b5043e7 100644
--- a/array/zfp/handle2.h
+++ b/include/zfp/internal/array/handle2.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_HANDLE2_H
-#define ZFP_HANDLE2_H
+#ifndef ZFP_HANDLE2_HPP
+#define ZFP_HANDLE2_HPP
 
 namespace zfp {
 namespace internal {
@@ -24,7 +24,7 @@ class const_handle {
   // protected constructor
   explicit const_handle(const container_type* container, size_t x, size_t y) : container(const_cast<container_type*>(container)), x(x), y(y) {}
 
-  // derefence handle
+  // dereference handle
   value_type get() const { return container->get(x, y); }
 
   container_type* container; // container
diff --git a/array/zfp/handle3.h b/include/zfp/internal/array/handle3.hpp
similarity index 92%
rename from array/zfp/handle3.h
rename to include/zfp/internal/array/handle3.hpp
index 85e09d5cf..139b1d552 100644
--- a/array/zfp/handle3.h
+++ b/include/zfp/internal/array/handle3.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_HANDLE3_H
-#define ZFP_HANDLE3_H
+#ifndef ZFP_HANDLE3_HPP
+#define ZFP_HANDLE3_HPP
 
 namespace zfp {
 namespace internal {
@@ -24,7 +24,7 @@ class const_handle {
   // protected constructor
   explicit const_handle(const container_type* container, size_t x, size_t y, size_t z) : container(const_cast<container_type*>(container)), x(x), y(y), z(z) {}
 
-  // derefence handle
+  // dereference handle
   value_type get() const { return container->get(x, y, z); }
 
   container_type* container; // container
diff --git a/array/zfp/handle4.h b/include/zfp/internal/array/handle4.hpp
similarity index 93%
rename from array/zfp/handle4.h
rename to include/zfp/internal/array/handle4.hpp
index 3616344f6..da9ca3853 100644
--- a/array/zfp/handle4.h
+++ b/include/zfp/internal/array/handle4.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_HANDLE4_H
-#define ZFP_HANDLE4_H
+#ifndef ZFP_HANDLE4_HPP
+#define ZFP_HANDLE4_HPP
 
 namespace zfp {
 namespace internal {
@@ -24,7 +24,7 @@ class const_handle {
   // protected constructor
   explicit const_handle(const container_type* container, size_t x, size_t y, size_t z, size_t w) : container(const_cast<container_type*>(container)), x(x), y(y), z(z), w(w) {}
 
-  // derefence handle
+  // dereference handle
   value_type get() const { return container->get(x, y, z, w); }
 
   container_type* container; // container
diff --git a/array/zfp/header.h b/include/zfp/internal/array/header.hpp
similarity index 100%
rename from array/zfp/header.h
rename to include/zfp/internal/array/header.hpp
diff --git a/array/zfp/iterator1.h b/include/zfp/internal/array/iterator1.hpp
similarity index 97%
rename from array/zfp/iterator1.h
rename to include/zfp/internal/array/iterator1.hpp
index f3eb206d2..73d5197d9 100644
--- a/array/zfp/iterator1.h
+++ b/include/zfp/internal/array/iterator1.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_ITERATOR1_H
-#define ZFP_ITERATOR1_H
+#ifndef ZFP_ITERATOR1_HPP
+#define ZFP_ITERATOR1_HPP
 
 namespace zfp {
 namespace internal {
@@ -58,10 +58,10 @@ class const_iterator : public const_handle<Container> {
 
 protected:
   // sequential offset associated with index x plus delta d
-  difference_type offset(difference_type d = 0) const { return static_cast<difference_type>(x - container->min_x() + d); }
+  difference_type offset(difference_type d = 0) const { return static_cast<difference_type>(x - container->min_x() + size_t(d)); } // d is converted to size_t explicitly; the sum is formed first and then cast to difference_type
 
   // index x associated with sequential offset p
-  void index(size_t& x, difference_type p) const { x = container->min_x() + p; }
+  void index(size_t& x, difference_type p) const { x = container->min_x() + size_t(p); } // inverse of offset(): p is converted to size_t explicitly before being added to min_x()
 
   // advance iterator by d
   void advance(difference_type d) { index(x, offset(d)); }
diff --git a/array/zfp/iterator2.h b/include/zfp/internal/array/iterator2.hpp
similarity index 94%
rename from array/zfp/iterator2.h
rename to include/zfp/internal/array/iterator2.hpp
index bf0f674f7..433d18256 100644
--- a/array/zfp/iterator2.h
+++ b/include/zfp/internal/array/iterator2.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_ITERATOR2_H
-#define ZFP_ITERATOR2_H
+#ifndef ZFP_ITERATOR2_HPP
+#define ZFP_ITERATOR2_HPP
 
 namespace zfp {
 namespace internal {
@@ -95,10 +95,10 @@ class const_iterator : public const_handle<Container> {
     }
     else {
       size_t m = ~size_t(3);
-      size_t by = std::max((ymin + p / nx) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx;
-      size_t bx = std::max((xmin + p / sy) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy;
-      y = by + p / sx; p -= (y - by) * sx;
-      x = bx + p;      p -= (x - bx);
+      size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx))) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx;
+      size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy;
+      y = by + size_t(p / ptrdiff_t(sx)); p -= (y - by) * sx;
+      x = bx + size_t(p);                 p -= (x - bx);
     }
   }
 
diff --git a/array/zfp/iterator3.h b/include/zfp/internal/array/iterator3.hpp
similarity index 92%
rename from array/zfp/iterator3.h
rename to include/zfp/internal/array/iterator3.hpp
index 386ef26cf..aa46b5ffb 100644
--- a/array/zfp/iterator3.h
+++ b/include/zfp/internal/array/iterator3.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_ITERATOR3_H
-#define ZFP_ITERATOR3_H
+#ifndef ZFP_ITERATOR3_HPP
+#define ZFP_ITERATOR3_HPP
 
 namespace zfp {
 namespace internal {
@@ -105,12 +105,12 @@ class const_iterator : public const_handle<Container> {
     }
     else {
       size_t m = ~size_t(3);
-      size_t bz = std::max((zmin + p / (nx * ny)) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny;
-      size_t by = std::max((ymin + p / (nx * sz)) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz;
-      size_t bx = std::max((xmin + p / (sy * sz)) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz;
-      z = bz + p / (sx * sy); p -= (z - bz) * sx * sy;
-      y = by + p / sx;        p -= (y - by) * sx;
-      x = bx + p;             p -= (x - bx);
+      size_t bz = std::max((zmin + size_t(p / ptrdiff_t(nx * ny))) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny;
+      size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx * sz))) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz;
+      size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy * sz))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz;
+      z = bz + size_t(p / ptrdiff_t(sx * sy)); p -= (z - bz) * sx * sy;
+      y = by + size_t(p / ptrdiff_t(sx));      p -= (y - by) * sx;
+      x = bx + size_t(p);                      p -= (x - bx);
     }
   }
 
diff --git a/array/zfp/iterator4.h b/include/zfp/internal/array/iterator4.hpp
similarity index 91%
rename from array/zfp/iterator4.h
rename to include/zfp/internal/array/iterator4.hpp
index d4530e42e..00b941a5b 100644
--- a/array/zfp/iterator4.h
+++ b/include/zfp/internal/array/iterator4.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_ITERATOR4_H
-#define ZFP_ITERATOR4_H
+#ifndef ZFP_ITERATOR4_HPP
+#define ZFP_ITERATOR4_HPP
 
 namespace zfp {
 namespace internal {
@@ -115,14 +115,14 @@ class const_iterator : public const_handle<Container> {
     }
     else {
       size_t m = ~size_t(3);
-      size_t bw = std::max((wmin + p / (nx * ny * nz)) & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p -= (bw - wmin) * nx * ny * nz;
-      size_t bz = std::max((zmin + p / (nx * ny * sw)) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny * sw;
-      size_t by = std::max((ymin + p / (nx * sz * sw)) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz * sw;
-      size_t bx = std::max((xmin + p / (sy * sz * sw)) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz * sw;
-      w = bw + p / (sx * sy * sz); p -= (w - bw) * sx * sy * sz;
-      z = bz + p / (sx * sy);      p -= (z - bz) * sx * sy;
-      y = by + p / sx;             p -= (y - by) * sx;
-      x = bx + p;                  p -= (x - bx);
+      size_t bw = std::max((wmin + size_t(p / ptrdiff_t(nx * ny * nz))) & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p -= (bw - wmin) * nx * ny * nz;
+      size_t bz = std::max((zmin + size_t(p / ptrdiff_t(nx * ny * sw))) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny * sw;
+      size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx * sz * sw))) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz * sw;
+      size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy * sz * sw))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz * sw;
+      w = bw + size_t(p / ptrdiff_t(sx * sy * sz)); p -= (w - bw) * sx * sy * sz;
+      z = bz + size_t(p / ptrdiff_t(sx * sy));      p -= (z - bz) * sx * sy;
+      y = by + size_t(p / ptrdiff_t(sx));           p -= (y - by) * sx;
+      x = bx + size_t(p);                           p -= (x - bx);
     }
   }
 
diff --git a/array/zfp/memory.h b/include/zfp/internal/array/memory.hpp
similarity index 86%
rename from array/zfp/memory.h
rename to include/zfp/internal/array/memory.hpp
index d3c5036af..b6e7b9f6f 100644
--- a/array/zfp/memory.h
+++ b/include/zfp/internal/array/memory.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_MEMORY_H
-#define ZFP_MEMORY_H
+#ifndef ZFP_MEMORY_HPP
+#define ZFP_MEMORY_HPP
 
 // Memory management for POD types only.  Templated functions are provided only
 // to avoid the need for casts to/from void* in pass-by-reference calls.
@@ -26,6 +26,7 @@ extern "C" {
 #define unused_(x) ((void)(x))
 
 namespace zfp {
+namespace internal {
 
 // allocate size bytes
 inline void*
@@ -108,8 +109,8 @@ reallocate(T*& ptr, size_t size, bool preserve = false)
   if (preserve)
     ptr = static_cast<T*>(std::realloc(ptr, size));
   else {
-    zfp::deallocate(ptr);
-    ptr = static_cast<T*>(zfp::allocate(size));
+    zfp::internal::deallocate(ptr);
+    ptr = static_cast<T*>(zfp::internal::allocate(size));
   }
 }
 
@@ -130,15 +131,15 @@ reallocate_aligned(void*& ptr, size_t new_size, size_t alignment, size_t old_siz
 {
   if (old_size) {
     // reallocate while preserving contents
-    void* dst = zfp::allocate_aligned(new_size, alignment);
+    void* dst = zfp::internal::allocate_aligned(new_size, alignment);
     std::memcpy(dst, ptr, std::min(old_size, new_size));
-    zfp::deallocate_aligned(ptr);
+    zfp::internal::deallocate_aligned(ptr);
     ptr = dst;
   }
   else {
     // reallocate without preserving contents
-    zfp::deallocate_aligned(ptr);
-    ptr = zfp::allocate_aligned(new_size, alignment);
+    zfp::internal::deallocate_aligned(ptr);
+    ptr = zfp::internal::allocate_aligned(new_size, alignment);
   }
 }
 
@@ -147,9 +148,9 @@ template <typename T>
 inline void
 clone(T*& dst, const T* src, size_t count)
 {
-  zfp::deallocate(dst);
+  zfp::internal::deallocate(dst);
   if (src) {
-    dst = static_cast<T*>(zfp::allocate(count * sizeof(T)));
+    dst = static_cast<T*>(zfp::internal::allocate(count * sizeof(T)));
     std::copy(src, src + count, dst);
   }
   else
@@ -173,9 +174,9 @@ template <>
 inline void
 clone_aligned(void*& dst, const void* src, size_t size, size_t alignment)
 {
-  zfp::deallocate_aligned(dst);
+  zfp::internal::deallocate_aligned(dst);
   if (src) {
-    dst = zfp::allocate_aligned(size, alignment);
+    dst = zfp::internal::allocate_aligned(size, alignment);
     std::memcpy(dst, src, size);
   }
   else
@@ -191,6 +192,7 @@ round_up(size_t size, size_t unit)
   return size;
 }
 
+}
 }
 
 #undef unused_
diff --git a/array/zfp/pointer1.h b/include/zfp/internal/array/pointer1.hpp
similarity index 98%
rename from array/zfp/pointer1.h
rename to include/zfp/internal/array/pointer1.hpp
index 45fdd90c4..37876c693 100644
--- a/array/zfp/pointer1.h
+++ b/include/zfp/internal/array/pointer1.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_POINTER1_H
-#define ZFP_POINTER1_H
+#ifndef ZFP_POINTER1_HPP
+#define ZFP_POINTER1_HPP
 
 namespace zfp {
 namespace internal {
@@ -50,7 +50,7 @@ class const_pointer : public const_handle<Container> {
 
 protected:
   ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast<ptrdiff_t>(x - container->min_x()) + d; }
-  void index(size_t& x, ptrdiff_t p) const { x = container->min_x() + p; }
+  void index(size_t& x, ptrdiff_t p) const { x = container->min_x() + size_t(p); } // inverse of offset(): p is converted to size_t explicitly before being added to min_x()
   void advance(ptrdiff_t d) { index(x, offset(d)); }
   void increment() { ++x; }
   void decrement() { --x; }
diff --git a/array/zfp/pointer2.h b/include/zfp/internal/array/pointer2.hpp
similarity index 96%
rename from array/zfp/pointer2.h
rename to include/zfp/internal/array/pointer2.hpp
index d025b1464..a074be98d 100644
--- a/array/zfp/pointer2.h
+++ b/include/zfp/internal/array/pointer2.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_POINTER2_H
-#define ZFP_POINTER2_H
+#ifndef ZFP_POINTER2_HPP
+#define ZFP_POINTER2_HPP
 
 namespace zfp {
 namespace internal {
@@ -52,8 +52,8 @@ class const_pointer : public const_handle<Container> {
   ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast<ptrdiff_t>(x - container->min_x() + container->size_x() * (y - container->min_y())) + d; }
   void index(size_t& x, size_t& y, ptrdiff_t p) const
   {
-    x = container->min_x() + p % container->size_x(); p /= container->size_x();
-    y = container->min_y() + p;
+    x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= ptrdiff_t(container->size_x()); // inverse of offset(): split p into remainder (x) and quotient; divide signed like the % so a negative p is not converted to a huge unsigned value
+    y = container->min_y() + size_t(p); // p now holds the quotient, i.e. the offset along y
   }
   void advance(ptrdiff_t d) { index(x, y, offset(d)); }
   void increment()
diff --git a/array/zfp/pointer3.h b/include/zfp/internal/array/pointer3.hpp
similarity index 95%
rename from array/zfp/pointer3.h
rename to include/zfp/internal/array/pointer3.hpp
index 00e3ae8e5..8f8dee618 100644
--- a/array/zfp/pointer3.h
+++ b/include/zfp/internal/array/pointer3.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_POINTER3_H
-#define ZFP_POINTER3_H
+#ifndef ZFP_POINTER3_HPP
+#define ZFP_POINTER3_HPP
 
 namespace zfp {
 namespace internal {
@@ -52,9 +52,9 @@ class const_pointer : public const_handle<Container> {
   ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast<ptrdiff_t>(x - container->min_x() + container->size_x() * (y - container->min_y() + container->size_y() * (z - container->min_z()))) + d; }
   void index(size_t& x, size_t& y, size_t& z, ptrdiff_t p) const
   {
-    x = container->min_x() + p % container->size_x(); p /= container->size_x();
-    y = container->min_y() + p % container->size_y(); p /= container->size_y();
-    z = container->min_z() + p;
+    x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= ptrdiff_t(container->size_x()); // inverse of offset(): peel off x, then y, then z; divide signed like the % so a negative p is not converted to a huge unsigned value
+    y = container->min_y() + size_t(p % ptrdiff_t(container->size_y())); p /= ptrdiff_t(container->size_y()); // same signed remainder/quotient split for y
+    z = container->min_z() + size_t(p); // p now holds the remaining quotient, i.e. the offset along z
   }
   void advance(ptrdiff_t d) { index(x, y, z, offset(d)); }
   void increment()
diff --git a/array/zfp/pointer4.h b/include/zfp/internal/array/pointer4.hpp
similarity index 93%
rename from array/zfp/pointer4.h
rename to include/zfp/internal/array/pointer4.hpp
index 5bbc2ba35..8adb97f33 100644
--- a/array/zfp/pointer4.h
+++ b/include/zfp/internal/array/pointer4.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_POINTER4_H
-#define ZFP_POINTER4_H
+#ifndef ZFP_POINTER4_HPP
+#define ZFP_POINTER4_HPP
 
 namespace zfp {
 namespace internal {
@@ -52,10 +52,10 @@ class const_pointer : public const_handle<Container> {
   ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast<ptrdiff_t>(x - container->min_x() + container->size_x() * (y - container->min_y() + container->size_y() * (z - container->min_z() + container->size_z() * (w - container->min_w())))) + d; }
   void index(size_t& x, size_t& y, size_t& z, size_t & w, ptrdiff_t p) const
   {
-    x = container->min_x() + p % container->size_x(); p /= container->size_x();
-    y = container->min_y() + p % container->size_y(); p /= container->size_y();
-    z = container->min_z() + p % container->size_z(); p /= container->size_z();
-    w = container->min_w() + p;
+    x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= ptrdiff_t(container->size_x()); // inverse of offset(): peel off x, y, z, then w; divide signed like the % so a negative p is not converted to a huge unsigned value
+    y = container->min_y() + size_t(p % ptrdiff_t(container->size_y())); p /= ptrdiff_t(container->size_y()); // same signed remainder/quotient split for y
+    z = container->min_z() + size_t(p % ptrdiff_t(container->size_z())); p /= ptrdiff_t(container->size_z()); // same signed remainder/quotient split for z
+    w = container->min_w() + size_t(p); // p now holds the remaining quotient, i.e. the offset along w
   }
   void advance(ptrdiff_t d) { index(x, y, z, w, offset(d)); }
   void increment()
diff --git a/array/zfp/reference1.h b/include/zfp/internal/array/reference1.hpp
similarity index 97%
rename from array/zfp/reference1.h
rename to include/zfp/internal/array/reference1.hpp
index e4e32e0bb..e41cc8b5a 100644
--- a/array/zfp/reference1.h
+++ b/include/zfp/internal/array/reference1.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_REFERENCE1_H
-#define ZFP_REFERENCE1_H
+#ifndef ZFP_REFERENCE1_HPP
+#define ZFP_REFERENCE1_HPP
 
 namespace zfp {
 namespace internal {
diff --git a/array/zfp/reference2.h b/include/zfp/internal/array/reference2.hpp
similarity index 97%
rename from array/zfp/reference2.h
rename to include/zfp/internal/array/reference2.hpp
index 489581b06..b16484fbe 100644
--- a/array/zfp/reference2.h
+++ b/include/zfp/internal/array/reference2.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_REFERENCE2_H
-#define ZFP_REFERENCE2_H
+#ifndef ZFP_REFERENCE2_HPP
+#define ZFP_REFERENCE2_HPP
 
 namespace zfp {
 namespace internal {
diff --git a/array/zfp/reference3.h b/include/zfp/internal/array/reference3.hpp
similarity index 97%
rename from array/zfp/reference3.h
rename to include/zfp/internal/array/reference3.hpp
index de58710ff..ecb52d30d 100644
--- a/array/zfp/reference3.h
+++ b/include/zfp/internal/array/reference3.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_REFERENCE3_H
-#define ZFP_REFERENCE3_H
+#ifndef ZFP_REFERENCE3_HPP
+#define ZFP_REFERENCE3_HPP
 
 namespace zfp {
 namespace internal {
diff --git a/array/zfp/reference4.h b/include/zfp/internal/array/reference4.hpp
similarity index 98%
rename from array/zfp/reference4.h
rename to include/zfp/internal/array/reference4.hpp
index 231fff0a2..1d0c3ca3c 100644
--- a/array/zfp/reference4.h
+++ b/include/zfp/internal/array/reference4.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_REFERENCE4_H
-#define ZFP_REFERENCE4_H
+#ifndef ZFP_REFERENCE4_HPP
+#define ZFP_REFERENCE4_HPP
 
 namespace zfp {
 namespace internal {
diff --git a/array/zfp/store.h b/include/zfp/internal/array/store.hpp
similarity index 78%
rename from array/zfp/store.h
rename to include/zfp/internal/array/store.hpp
index a06428de1..f649972f5 100644
--- a/array/zfp/store.h
+++ b/include/zfp/internal/array/store.hpp
@@ -1,11 +1,12 @@
-#ifndef ZFP_STORE_H
-#define ZFP_STORE_H
+#ifndef ZFP_STORE_HPP
+#define ZFP_STORE_HPP
 
 #include <climits>
 #include <cmath>
-#include "zfp/memory.h"
+#include "zfp/internal/array/memory.hpp"
 
 namespace zfp {
+namespace internal {
 
 // base class for block store
 template <class Codec, class Index>
@@ -113,11 +114,37 @@ class BlockStore {
   // shrink buffer to match size of compressed data
   void compact()
   {
-    size_t size = zfp::round_up(index.range(), codec.alignment() * CHAR_BIT) / CHAR_BIT;
+    size_t size = zfp::internal::round_up(index.range(), codec.alignment() * CHAR_BIT) / CHAR_BIT; // round index.range() up to a multiple of the codec alignment (scaled by CHAR_BIT), then divide by CHAR_BIT; presumably range() is in bits and size in bytes
     if (bytes > size) {
-      zfp::reallocate_aligned(data, size, ZFP_MEMORY_ALIGNMENT, bytes);
+      codec.close(); // close first: the reallocation below replaces the buffer that data points to
+      zfp::internal::reallocate_aligned(data, size, ZFP_MEMORY_ALIGNMENT, bytes); // a nonzero old size (bytes) makes reallocate_aligned copy the contents into the new buffer
       bytes = size;
+      codec.open(data, bytes); // reopen the codec on the new buffer with its new size
+    }
+  }
+
+  // increment private view reference count (for thread safety); the body is compiled only when _OPENMP is defined, otherwise this is a no-op
+  void reference()
+  {
+#ifdef _OPENMP
+    #pragma omp critical(references)
+    {
+      references++; // updated inside the same named critical section that unreference() uses
+      codec.set_thread_safety(references > 1); // passes true only while more than one private view is counted
     }
+#endif
+  }
+
+  // decrement private view reference count (for thread safety); the body is compiled only when _OPENMP is defined, otherwise this is a no-op
+  void unreference()
+  {
+#ifdef _OPENMP
+    #pragma omp critical(references)
+    {
+      references--; // NOTE(review): references is a size_t with no guard against dropping below zero; assumes a matching reference() call came first
+      codec.set_thread_safety(references > 1); // passes true only while more than one private view is still counted
+    }
+#endif
   }
 
   // byte size of store data structure components indicated by mask
@@ -144,6 +171,7 @@ class BlockStore {
   BlockStore() :
     data(0),
     bytes(0),
+    references(0),
     index(0)
   {}
 
@@ -170,8 +198,9 @@ class BlockStore {
   void deep_copy(const BlockStore& s)
   {
     free();
-    zfp::clone_aligned(data, s.data, s.bytes, ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::clone_aligned(data, s.data, s.bytes, ZFP_MEMORY_ALIGNMENT);
     bytes = s.bytes;
+    references = s.references;
     index = s.index;
     codec = s.codec;
     codec.open(data, bytes);
@@ -182,7 +211,7 @@ class BlockStore {
   {
     free();
     bytes = buffer_size();
-    zfp::reallocate_aligned(data, bytes, ZFP_MEMORY_ALIGNMENT);
+    zfp::internal::reallocate_aligned(data, bytes, ZFP_MEMORY_ALIGNMENT);
     if (clear)
       std::fill(static_cast<uchar*>(data), static_cast<uchar*>(data) + bytes, uchar(0));
     codec.open(data, bytes);
@@ -192,7 +221,7 @@ class BlockStore {
   void free()
   {
     if (data) {
-      zfp::deallocate_aligned(data);
+      zfp::internal::deallocate_aligned(data);
       data = 0;
       bytes = 0;
       codec.close();
@@ -200,7 +229,7 @@ class BlockStore {
   }
 
   // bit offset to block store
-  size_t offset(size_t block_index) const { return index.block_offset(block_index); }
+  bitstream_offset offset(size_t block_index) const { return index.block_offset(block_index); } // forwards to the block index; the result is typed bitstream_offset
 
   // shape 0 <= m <= 3 of block containing index i, 0 <= i <= n - 1
   static uint shape_code(size_t i, size_t n)
@@ -213,12 +242,14 @@ class BlockStore {
     return static_cast<uint>(m);
   }
 
-  void* data;   // pointer to compressed blocks
-  size_t bytes; // compressed data size
-  Index index;  // block index (size and offset)
-  Codec codec;  // compression codec
+  void* data;        // pointer to compressed blocks
+  size_t bytes;      // compressed data size
+  size_t references; // private view references to array (for thread safety)
+  Index index;       // block index (size and offset)
+  Codec codec;       // compression codec
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/store1.h b/include/zfp/internal/array/store1.hpp
similarity index 96%
rename from array/zfp/store1.h
rename to include/zfp/internal/array/store1.hpp
index ed6311b20..aeb05fa8e 100644
--- a/array/zfp/store1.h
+++ b/include/zfp/internal/array/store1.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_STORE1_H
-#define ZFP_STORE1_H
+#ifndef ZFP_STORE1_HPP
+#define ZFP_STORE1_HPP
 
-#include "zfp/store.h"
+#include "zfp/internal/array/store.hpp"
 
 namespace zfp {
+namespace internal {
 
 // compressed block store for 1D array
 template <typename Scalar, class Codec, class Index>
@@ -133,6 +134,7 @@ class BlockStore1 : public BlockStore<Codec, Index> {
   size_t bx; // array dimensions in number of blocks
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/store2.h b/include/zfp/internal/array/store2.hpp
similarity index 96%
rename from array/zfp/store2.h
rename to include/zfp/internal/array/store2.hpp
index 1a4a03e51..466067ac4 100644
--- a/array/zfp/store2.h
+++ b/include/zfp/internal/array/store2.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_STORE2_H
-#define ZFP_STORE2_H
+#ifndef ZFP_STORE2_HPP
+#define ZFP_STORE2_HPP
 
-#include "zfp/store.h"
+#include "zfp/internal/array/store.hpp"
 
 namespace zfp {
+namespace internal {
 
 // compressed block store for 2D array
 template <typename Scalar, class Codec, class Index>
@@ -140,6 +141,7 @@ class BlockStore2 : public BlockStore<Codec, Index> {
   size_t bx, by; // array dimensions in number of blocks
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/store3.h b/include/zfp/internal/array/store3.hpp
similarity index 96%
rename from array/zfp/store3.h
rename to include/zfp/internal/array/store3.hpp
index ac9ac2c73..cb2afb734 100644
--- a/array/zfp/store3.h
+++ b/include/zfp/internal/array/store3.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_STORE3_H
-#define ZFP_STORE3_H
+#ifndef ZFP_STORE3_HPP
+#define ZFP_STORE3_HPP
 
-#include "zfp/store.h"
+#include "zfp/internal/array/store.hpp"
 
 namespace zfp {
+namespace internal {
 
 // compressed block store for 3D array
 template <typename Scalar, class Codec, class Index>
@@ -147,6 +148,7 @@ class BlockStore3 : public BlockStore<Codec, Index> {
   size_t bx, by, bz; // array dimensions in number of blocks
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/store4.h b/include/zfp/internal/array/store4.hpp
similarity index 97%
rename from array/zfp/store4.h
rename to include/zfp/internal/array/store4.hpp
index 77c3f5c5f..dbea0c986 100644
--- a/array/zfp/store4.h
+++ b/include/zfp/internal/array/store4.hpp
@@ -1,9 +1,10 @@
-#ifndef ZFP_STORE4_H
-#define ZFP_STORE4_H
+#ifndef ZFP_STORE4_HPP
+#define ZFP_STORE4_HPP
 
-#include "zfp/store.h"
+#include "zfp/internal/array/store.hpp"
 
 namespace zfp {
+namespace internal {
 
 // compressed block store for 4D array
 template <typename Scalar, class Codec, class Index>
@@ -154,6 +155,7 @@ class BlockStore4 : public BlockStore<Codec, Index> {
   size_t bx, by, bz, bw; // array dimensions in number of blocks
 };
 
-}
+} // internal
+} // zfp
 
 #endif
diff --git a/array/zfp/traits.h b/include/zfp/internal/array/traits.hpp
similarity index 88%
rename from array/zfp/traits.h
rename to include/zfp/internal/array/traits.hpp
index 78f967807..7ec4a02b1 100644
--- a/array/zfp/traits.h
+++ b/include/zfp/internal/array/traits.hpp
@@ -1,7 +1,8 @@
-#ifndef ZFP_TRAITS_H
-#define ZFP_TRAITS_H
+#ifndef ZFP_TRAITS_HPP
+#define ZFP_TRAITS_HPP
 
 namespace zfp {
+namespace internal {
 
 // useful type traits
 template <typename Scalar>
@@ -23,6 +24,7 @@ struct trait<double> {
   static const size_t precision = CHAR_BIT * sizeof(double);
 };
 
+}
 }
 
 #endif
diff --git a/array/zfp/view1.h b/include/zfp/internal/array/view1.hpp
similarity index 98%
rename from array/zfp/view1.h
rename to include/zfp/internal/array/view1.hpp
index 635f30077..adfe868b4 100644
--- a/array/zfp/view1.h
+++ b/include/zfp/internal/array/view1.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_VIEW1_H
-#define ZFP_VIEW1_H
+#ifndef ZFP_VIEW1_HPP
+#define ZFP_VIEW1_HPP
 
 // 1D array views
 
@@ -165,11 +165,21 @@ class private_const_view : public preview<Container> {
   private_const_view(container_type* array, size_t cache_size = 0) :
     preview<Container>(array),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
   private_const_view(container_type* array, size_t x, size_t nx, size_t cache_size = 0) :
     preview<Container>(array, x, nx),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
+
+  // destructor: undoes the reference() call made by either constructor above
+  ~private_const_view()
+  {
+    array->store.unreference(); // NOTE(review): no copy constructor is visible in this hunk; an implicitly copied view would run this without a matching reference() - confirm views are not copied
+  }
 
   // dimensions of (sub)array
   size_t size_x() const { return nx; }
diff --git a/array/zfp/view2.h b/include/zfp/internal/array/view2.hpp
similarity index 98%
rename from array/zfp/view2.h
rename to include/zfp/internal/array/view2.hpp
index 5ffddf62c..8e12336fa 100644
--- a/array/zfp/view2.h
+++ b/include/zfp/internal/array/view2.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_VIEW2_H
-#define ZFP_VIEW2_H
+#ifndef ZFP_VIEW2_HPP
+#define ZFP_VIEW2_HPP
 
 // 2D array views
 
@@ -348,11 +348,21 @@ class private_const_view : public preview<Container> {
   private_const_view(container_type* array, size_t cache_size = 0) :
     preview<Container>(array),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
   private_const_view(container_type* array, size_t x, size_t y, size_t nx, size_t ny, size_t cache_size = 0) :
     preview<Container>(array, x, y, nx, ny),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
+
+  // destructor: undoes the reference() call made by either constructor above
+  ~private_const_view()
+  {
+    array->store.unreference(); // NOTE(review): no copy constructor is visible in this hunk; an implicitly copied view would run this without a matching reference() - confirm views are not copied
+  }
 
   // dimensions of (sub)array
   size_t size_x() const { return nx; }
diff --git a/array/zfp/view3.h b/include/zfp/internal/array/view3.hpp
similarity index 99%
rename from array/zfp/view3.h
rename to include/zfp/internal/array/view3.hpp
index c41808a7e..24ceb8f41 100644
--- a/array/zfp/view3.h
+++ b/include/zfp/internal/array/view3.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_VIEW3_H
-#define ZFP_VIEW3_H
+#ifndef ZFP_VIEW3_HPP
+#define ZFP_VIEW3_HPP
 
 // 3D array views
 
@@ -423,11 +423,21 @@ class private_const_view : public preview<Container> {
   private_const_view(container_type* array, size_t cache_size = 0) :
     preview<Container>(array),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
   private_const_view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz, size_t cache_size = 0) :
     preview<Container>(array, x, y, z, nx, ny, nz),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
+
+  // destructor: undoes the reference() call made by either constructor above
+  ~private_const_view()
+  {
+    array->store.unreference(); // NOTE(review): no copy constructor is visible in this hunk; an implicitly copied view would run this without a matching reference() - confirm views are not copied
+  }
 
   // dimensions of (sub)array
   size_t size_x() const { return nx; }
diff --git a/array/zfp/view4.h b/include/zfp/internal/array/view4.hpp
similarity index 99%
rename from array/zfp/view4.h
rename to include/zfp/internal/array/view4.hpp
index 808e1e605..5888a305c 100644
--- a/array/zfp/view4.h
+++ b/include/zfp/internal/array/view4.hpp
@@ -1,5 +1,5 @@
-#ifndef ZFP_VIEW4_H
-#define ZFP_VIEW4_H
+#ifndef ZFP_VIEW4_HPP
+#define ZFP_VIEW4_HPP
 
 // 4D array views
 
@@ -503,11 +503,21 @@ class private_const_view : public preview<Container> {
   private_const_view(container_type* array, size_t cache_size = 0) :
     preview<Container>(array),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
   private_const_view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw, size_t cache_size = 0) :
     preview<Container>(array, x, y, z, w, nx, ny, nz, nw),
     cache(array->store, cache_size ? cache_size : array->cache.size())
-  {}
+  {
+    array->store.reference(); // count this private view in the array's store
+  }
+
+  // destructor: undoes the reference() call made by either constructor above
+  ~private_const_view()
+  {
+    array->store.unreference(); // NOTE(review): no copy constructor is visible in this hunk; an implicitly copied view would run this without a matching reference() - confirm views are not copied
+  }
 
   // dimensions of (sub)array
   size_t size_x() const { return nx; }
diff --git a/cfp/include/cfparray1d.h b/include/zfp/internal/cfp/array1d.h
similarity index 97%
rename from cfp/include/cfparray1d.h
rename to include/zfp/internal/cfp/array1d.h
index 193f6d09b..fb20d3a77 100644
--- a/cfp/include/cfparray1d.h
+++ b/include/zfp/internal/cfp/array1d.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_1D
-#define CFP_ARRAY_1D
+#ifndef CFP_ARRAY_1D_H
+#define CFP_ARRAY_1D_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -96,7 +96,7 @@ typedef struct {
 } cfp_header1d_api;
 
 typedef struct {
-  cfp_array1d (*ctor_default)();
+  cfp_array1d (*ctor_default)(void);
   cfp_array1d (*ctor)(size_t n, double rate, const double* p, size_t cache_size);
   cfp_array1d (*ctor_copy)(const cfp_array1d src);
   cfp_array1d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -110,6 +110,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array1d self, size_t bytes);
   void (*clear_cache)(const cfp_array1d self);
   void (*flush_cache)(const cfp_array1d self);
+  size_t (*size_bytes)(const cfp_array1d self, uint mask);
   size_t (*compressed_size)(const cfp_array1d self);
   void* (*compressed_data)(const cfp_array1d self);
   size_t (*size)(const cfp_array1d self);
diff --git a/cfp/include/cfparray1f.h b/include/zfp/internal/cfp/array1f.h
similarity index 97%
rename from cfp/include/cfparray1f.h
rename to include/zfp/internal/cfp/array1f.h
index ac48d65e9..6ca593d09 100644
--- a/cfp/include/cfparray1f.h
+++ b/include/zfp/internal/cfp/array1f.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_1F
-#define CFP_ARRAY_1F
+#ifndef CFP_ARRAY_1F_H
+#define CFP_ARRAY_1F_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -96,7 +96,7 @@ typedef struct {
 } cfp_header1f_api;
 
 typedef struct {
-  cfp_array1f (*ctor_default)();
+  cfp_array1f (*ctor_default)(void);
   cfp_array1f (*ctor)(size_t n, double rate, const float* p, size_t cache_size);
   cfp_array1f (*ctor_copy)(const cfp_array1f src);
   cfp_array1f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -110,6 +110,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array1f self, size_t bytes);
   void (*clear_cache)(const cfp_array1f self);
   void (*flush_cache)(const cfp_array1f self);
+  size_t (*size_bytes)(const cfp_array1f self, uint mask);
   size_t (*compressed_size)(const cfp_array1f self);
   void* (*compressed_data)(const cfp_array1f self);
   size_t (*size)(const cfp_array1f self);
diff --git a/cfp/include/cfparray2d.h b/include/zfp/internal/cfp/array2d.h
similarity index 97%
rename from cfp/include/cfparray2d.h
rename to include/zfp/internal/cfp/array2d.h
index 715d6a973..b0e078af3 100644
--- a/cfp/include/cfparray2d.h
+++ b/include/zfp/internal/cfp/array2d.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_2D
-#define CFP_ARRAY_2D
+#ifndef CFP_ARRAY_2D_H
+#define CFP_ARRAY_2D_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -97,7 +97,7 @@ typedef struct {
 } cfp_header2d_api;
 
 typedef struct {
-  cfp_array2d (*ctor_default)();
+  cfp_array2d (*ctor_default)(void);
   cfp_array2d (*ctor)(size_t nx, size_t ny, double rate, const double* p, size_t cache_size);
   cfp_array2d (*ctor_copy)(const cfp_array2d src);
   cfp_array2d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -111,6 +111,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array2d self, size_t bytes);
   void (*clear_cache)(const cfp_array2d self);
   void (*flush_cache)(const cfp_array2d self);
+  size_t (*size_bytes)(const cfp_array2d self, uint mask);
   size_t (*compressed_size)(const cfp_array2d self);
   void* (*compressed_data)(const cfp_array2d self);
   size_t (*size)(const cfp_array2d self);
diff --git a/cfp/include/cfparray2f.h b/include/zfp/internal/cfp/array2f.h
similarity index 97%
rename from cfp/include/cfparray2f.h
rename to include/zfp/internal/cfp/array2f.h
index 1d7a23ce6..0137b6094 100644
--- a/cfp/include/cfparray2f.h
+++ b/include/zfp/internal/cfp/array2f.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_2F
-#define CFP_ARRAY_2F
+#ifndef CFP_ARRAY_2F_H
+#define CFP_ARRAY_2F_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -97,7 +97,7 @@ typedef struct {
 } cfp_header2f_api;
 
 typedef struct {
-  cfp_array2f (*ctor_default)();
+  cfp_array2f (*ctor_default)(void);
   cfp_array2f (*ctor)(size_t nx, size_t ny, double rate, const float* p, size_t cache_size);
   cfp_array2f (*ctor_copy)(const cfp_array2f src);
   cfp_array2f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -111,6 +111,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array2f self, size_t bytes);
   void (*clear_cache)(const cfp_array2f self);
   void (*flush_cache)(const cfp_array2f self);
+  size_t (*size_bytes)(const cfp_array2f self, uint mask);
   size_t (*compressed_size)(const cfp_array2f self);
   void* (*compressed_data)(const cfp_array2f self);
   size_t (*size)(const cfp_array2f self);
diff --git a/cfp/include/cfparray3d.h b/include/zfp/internal/cfp/array3d.h
similarity index 97%
rename from cfp/include/cfparray3d.h
rename to include/zfp/internal/cfp/array3d.h
index fa2b6f260..9c4a654a1 100644
--- a/cfp/include/cfparray3d.h
+++ b/include/zfp/internal/cfp/array3d.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_3D
-#define CFP_ARRAY_3D
+#ifndef CFP_ARRAY_3D_H
+#define CFP_ARRAY_3D_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -98,7 +98,7 @@ typedef struct {
 } cfp_header3d_api;
 
 typedef struct {
-  cfp_array3d (*ctor_default)();
+  cfp_array3d (*ctor_default)(void);
   cfp_array3d (*ctor)(size_t nx, size_t ny, size_t nz, double rate, const double* p, size_t cache_size);
   cfp_array3d (*ctor_copy)(const cfp_array3d src);
   cfp_array3d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -112,6 +112,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array3d self, size_t bytes);
   void (*clear_cache)(const cfp_array3d self);
   void (*flush_cache)(const cfp_array3d self);
+  size_t (*size_bytes)(const cfp_array3d self, uint mask);
   size_t (*compressed_size)(const cfp_array3d self);
   void* (*compressed_data)(const cfp_array3d self);
   size_t (*size)(const cfp_array3d self);
diff --git a/cfp/include/cfparray3f.h b/include/zfp/internal/cfp/array3f.h
similarity index 97%
rename from cfp/include/cfparray3f.h
rename to include/zfp/internal/cfp/array3f.h
index 779ea0c3f..e0f3aba5a 100644
--- a/cfp/include/cfparray3f.h
+++ b/include/zfp/internal/cfp/array3f.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_3F
-#define CFP_ARRAY_3F
+#ifndef CFP_ARRAY_3F_H
+#define CFP_ARRAY_3F_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -98,7 +98,7 @@ typedef struct {
 } cfp_header3f_api;
 
 typedef struct {
-  cfp_array3f (*ctor_default)();
+  cfp_array3f (*ctor_default)(void);
   cfp_array3f (*ctor)(size_t nx, size_t ny, size_t nz, double rate, const float* p, size_t cache_size);
   cfp_array3f (*ctor_copy)(const cfp_array3f src);
   cfp_array3f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -112,6 +112,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array3f self, size_t bytes);
   void (*clear_cache)(const cfp_array3f self);
   void (*flush_cache)(const cfp_array3f self);
+  size_t (*size_bytes)(const cfp_array3f self, uint mask);
   size_t (*compressed_size)(const cfp_array3f self);
   void* (*compressed_data)(const cfp_array3f self);
   size_t (*size)(const cfp_array3f self);
diff --git a/cfp/include/cfparray4d.h b/include/zfp/internal/cfp/array4d.h
similarity index 97%
rename from cfp/include/cfparray4d.h
rename to include/zfp/internal/cfp/array4d.h
index 098447a7f..44d1ecf08 100644
--- a/cfp/include/cfparray4d.h
+++ b/include/zfp/internal/cfp/array4d.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_4D
-#define CFP_ARRAY_4D
+#ifndef CFP_ARRAY_4D_H
+#define CFP_ARRAY_4D_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -99,7 +99,7 @@ typedef struct {
 } cfp_header4d_api;
 
 typedef struct {
-  cfp_array4d (*ctor_default)();
+  cfp_array4d (*ctor_default)(void);
   cfp_array4d (*ctor)(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const double* p, size_t cache_size);
   cfp_array4d (*ctor_copy)(const cfp_array4d src);
   cfp_array4d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -113,6 +113,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array4d self, size_t bytes);
   void (*clear_cache)(const cfp_array4d self);
   void (*flush_cache)(const cfp_array4d self);
+  size_t (*size_bytes)(const cfp_array4d self, uint mask);
   size_t (*compressed_size)(const cfp_array4d self);
   void* (*compressed_data)(const cfp_array4d self);
   size_t (*size)(const cfp_array4d self);
diff --git a/cfp/include/cfparray4f.h b/include/zfp/internal/cfp/array4f.h
similarity index 97%
rename from cfp/include/cfparray4f.h
rename to include/zfp/internal/cfp/array4f.h
index efc6cf621..b336dffe1 100644
--- a/cfp/include/cfparray4f.h
+++ b/include/zfp/internal/cfp/array4f.h
@@ -1,5 +1,5 @@
-#ifndef CFP_ARRAY_4F
-#define CFP_ARRAY_4F
+#ifndef CFP_ARRAY_4F_H
+#define CFP_ARRAY_4F_H
 
 #include <stddef.h>
 #include "zfp.h"
@@ -99,7 +99,7 @@ typedef struct {
 } cfp_header4f_api;
 
 typedef struct {
-  cfp_array4f (*ctor_default)();
+  cfp_array4f (*ctor_default)(void);
   cfp_array4f (*ctor)(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const float* p, size_t cache_size);
   cfp_array4f (*ctor_copy)(const cfp_array4f src);
   cfp_array4f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes);
@@ -113,6 +113,7 @@ typedef struct {
   void (*set_cache_size)(cfp_array4f self, size_t bytes);
   void (*clear_cache)(const cfp_array4f self);
   void (*flush_cache)(const cfp_array4f self);
+  size_t (*size_bytes)(const cfp_array4f self, uint mask);
   size_t (*compressed_size)(const cfp_array4f self);
   void* (*compressed_data)(const cfp_array4f self);
   size_t (*size)(const cfp_array4f self);
diff --git a/cfp/include/cfpheader.h b/include/zfp/internal/cfp/header.h
similarity index 57%
rename from cfp/include/cfpheader.h
rename to include/zfp/internal/cfp/header.h
index 7be745bd5..01d78ba54 100644
--- a/cfp/include/cfpheader.h
+++ b/include/zfp/internal/cfp/header.h
@@ -1,5 +1,5 @@
-#ifndef CFP_HEADER
-#define CFP_HEADER
+#ifndef CFP_HEADER_H
+#define CFP_HEADER_H
 
 typedef struct {
   void* object;
diff --git a/include/zfp/internal/codec/genheader.hpp b/include/zfp/internal/codec/genheader.hpp
new file mode 100644
index 000000000..8beec0880
--- /dev/null
+++ b/include/zfp/internal/codec/genheader.hpp
@@ -0,0 +1,76 @@
+// zfp::codec::generic_base::header: fixed-size header stored as eight 64-bit words
+class header : public zfp::array::header {
+public:
+  // serialization: construct header from array and fill in all eight words
+  header(const zfp::array& a) :
+    zfp::array::header(a),
+    bit_rate(static_cast<size_t>(a.rate())) // rate is kept as an integer (size_t)
+  {
+    buffer[0] = magic; // word 0: magic word, verified on deserialization
+    buffer[1] = 0; // TODO: codec identifier (dimensionality, internal type)
+    buffer[2] = static_cast<uint64>(bit_rate); // word 2: rate in bits per value
+    buffer[3] = static_cast<uint64>(type); // word 3: type (read back as a zfp_type)
+    buffer[4] = static_cast<uint64>(nx); // words 4-7: nx, ny, nz, nw
+    buffer[5] = static_cast<uint64>(ny);
+    buffer[6] = static_cast<uint64>(nz);
+    buffer[7] = static_cast<uint64>(nw);
+  }
+
+  // deserialization: construct header from memory buffer; bytes == 0 skips the size check
+  header(const void* data, size_t bytes = 0) :
+    bit_rate(0) // placeholder; assigned from word 2 once the buffer is parsed
+  {
+    // a nonzero size must equal the fixed header size (byte_size)
+    if (bytes && bytes != byte_size)
+      throw zfp::exception("zfp generic header length does not match expectations");
+    else {
+      // copy and parse header (always reads byte_size bytes from data)
+      std::memcpy(buffer, data, byte_size);
+      if (buffer[0] != magic) // reject buffers that do not begin with the magic word
+        throw zfp::exception("zfp generic header is corrupt");
+      bit_rate = static_cast<size_t>(buffer[2]); // word 1 (codec identifier) is not read
+      type = static_cast<zfp_type>(buffer[3]);
+      nx = static_cast<size_t>(buffer[4]);
+      ny = static_cast<size_t>(buffer[5]);
+      nz = static_cast<size_t>(buffer[6]);
+      nw = static_cast<size_t>(buffer[7]);
+    }
+  }
+
+  virtual ~header() {}
+
+  // rate in bits per value (the stored integer rate converted to double)
+  double rate() const { return static_cast<double>(bit_rate); }
+
+  // header data: pointer to the eight-word buffer
+  const void* data() const { return buffer; }
+
+  // header byte size; mask selects which parts to count (ZFP_DATA_META and/or ZFP_DATA_HEADER)
+  size_t size_bytes(uint mask = ZFP_DATA_HEADER) const
+  {
+    size_t size = 0;
+    if (mask & ZFP_DATA_META)
+      size += sizeof(*this) - byte_size; // object storage excluding the word buffer
+    if (mask & ZFP_DATA_HEADER)
+      size += byte_size; // the word buffer itself
+    return size;
+  }
+
+protected:
+  // magic word stored in buffer[0]
+  static const uint64 magic = UINT64C(0x000000008570667a);
+
+  // header size measured in 64-bit words, bytes, and bits
+  static const size_t word_size = 8;
+  static const size_t byte_size = word_size * sizeof(uint64);
+  static const size_t bit_size = byte_size * CHAR_BIT;
+
+  using zfp::array::header::type; // members of zfp::array::header used unqualified above
+  using zfp::array::header::nx;
+  using zfp::array::header::ny;
+  using zfp::array::header::nz;
+  using zfp::array::header::nw;
+
+  size_t bit_rate;          // array rate in bits per value
+  uint64 buffer[word_size]; // header data
+};
diff --git a/array/zfp/zfpheader.h b/include/zfp/internal/codec/zfpheader.hpp
similarity index 100%
rename from array/zfp/zfpheader.h
rename to include/zfp/internal/codec/zfpheader.hpp
diff --git a/src/inline/inline.h b/include/zfp/internal/zfp/inline.h
similarity index 77%
rename from src/inline/inline.h
rename to include/zfp/internal/zfp/inline.h
index e9ade3f11..bb10673bb 100644
--- a/src/inline/inline.h
+++ b/include/zfp/internal/zfp/inline.h
@@ -1,5 +1,5 @@
-#ifndef INLINE_H
-#define INLINE_H
+#ifndef ZFP_INLINE_H
+#define ZFP_INLINE_H
 
 #ifndef inline_
   #if __STDC_VERSION__ >= 199901L
diff --git a/include/zfp/macros.h b/include/zfp/internal/zfp/macros.h
similarity index 100%
rename from include/zfp/macros.h
rename to include/zfp/internal/zfp/macros.h
diff --git a/include/zfp/system.h b/include/zfp/internal/zfp/system.h
similarity index 72%
rename from include/zfp/system.h
rename to include/zfp/internal/zfp/system.h
index 23c493600..9aa4d3c37 100644
--- a/include/zfp/system.h
+++ b/include/zfp/internal/zfp/system.h
@@ -1,6 +1,7 @@
 #ifndef ZFP_SYSTEM_H
 #define ZFP_SYSTEM_H
 
+/* restrict keyword */
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
   /* C99: use restrict */
   #define restrict_ restrict
@@ -9,6 +10,20 @@
   #define restrict_
 #endif
 
+/* fallthrough in switch statements; attribute expansions below already end in ';' */
+#define fallthrough_ /* FALLTHROUGH */
+#if defined(__cplusplus) && __cplusplus >= 201703L
+  /* C++17: use [[fallthrough]] (replaces the empty default defined above) */
+  #undef fallthrough_
+  #define fallthrough_ [[fallthrough]];
+#elif defined(__has_attribute)
+  #if __has_attribute(fallthrough)
+    /* GNUC: use __attribute__((fallthrough)) when __has_attribute reports support */
+    #undef fallthrough_
+    #define fallthrough_ __attribute__((fallthrough));
+  #endif
+#endif
+
 /* macros for exporting and importing symbols */
 #if defined(_MSC_VER) && defined(ZFP_SHARED_LIBS)
   /* export (import) symbols when ZFP_SOURCE is (is not) defined */
diff --git a/include/zfp/types.h b/include/zfp/internal/zfp/types.h
similarity index 98%
rename from include/zfp/types.h
rename to include/zfp/internal/zfp/types.h
index b209f378e..5c8823673 100644
--- a/include/zfp/types.h
+++ b/include/zfp/internal/zfp/types.h
@@ -66,7 +66,7 @@ typedef unsigned long ulong;
   typedef unsigned int uint32;
 
   /* determine 64-bit data model */
-  #if defined(_WIN32) || defined(_WIN64)
+  #if defined(_WIN32)
     /* assume ILP32 or LLP64 (MSVC, MinGW) */
     #define ZFP_LLP64 1
   #else
diff --git a/include/zfp/version.h b/include/zfp/version.h
index 87a1d2210..272802d3f 100644
--- a/include/zfp/version.h
+++ b/include/zfp/version.h
@@ -1,29 +1,52 @@
 #ifndef ZFP_VERSION_H
 #define ZFP_VERSION_H
 
+/* library version information */
+#define ZFP_VERSION_MAJOR 1   /* library major version number */
+#define ZFP_VERSION_MINOR 0   /* library minor version number */
+#define ZFP_VERSION_PATCH 1   /* library patch version number */
+#define ZFP_VERSION_TWEAK 0   /* library tweak version number */
+
+/* defined for work in progress (indicates unofficial release) */
+#define ZFP_VERSION_DEVELOP 1
+
+/* codec version number (see also zfp_codec_version) */
+#define ZFP_CODEC 5
+
 /* stringification */
 #define _zfp_str_(x) # x
 #define _zfp_str(x) _zfp_str_(x)
 
-/* library version information */
-#define ZFP_VERSION_MAJOR 0 /* library major version number */
-#define ZFP_VERSION_MINOR 5 /* library minor version number */
-#define ZFP_VERSION_PATCH 5 /* library patch version number */
-#define ZFP_VERSION_RELEASE ZFP_VERSION_PATCH
+/* macro for generating an integer version identifier; components are shifted into successive 4-bit fields, so a value above 15 overlaps the next field */
+#define ZFP_MAKE_VERSION(major, minor, patch, tweak) \
+  (((major) << 12) + \
+   ((minor) << 8) + \
+   ((patch) << 4) + \
+   ((tweak) << 0))
 
-/* codec version number (see also zfp_codec_version) */
-#define ZFP_CODEC 5
+/* macros for generating a version string */
+#define ZFP_MAKE_VERSION_STRING(major, minor, patch) \
+  _zfp_str(major) "." \
+  _zfp_str(minor) "." \
+  _zfp_str(patch)
+
+#define ZFP_MAKE_FULLVERSION_STRING(major, minor, patch, tweak) \
+  _zfp_str(major) "." \
+  _zfp_str(minor) "." \
+  _zfp_str(patch) "." \
+  _zfp_str(tweak)
 
 /* library version number (see also zfp_library_version) */
 #define ZFP_VERSION \
-  ((ZFP_VERSION_MAJOR << 8) + \
-   (ZFP_VERSION_MINOR << 4) + \
-   (ZFP_VERSION_PATCH << 0))
+  ZFP_MAKE_VERSION(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH, ZFP_VERSION_TWEAK)
 
 /* library version string (see also zfp_version_string) */
-#define ZFP_VERSION_STRING \
-  _zfp_str(ZFP_VERSION_MAJOR) "." \
-  _zfp_str(ZFP_VERSION_MINOR) "." \
-  _zfp_str(ZFP_VERSION_PATCH)
+#if ZFP_VERSION_TWEAK == 0
+  #define ZFP_VERSION_STRING \
+    ZFP_MAKE_VERSION_STRING(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH)
+#else
+  #define ZFP_VERSION_STRING \
+    ZFP_MAKE_FULLVERSION_STRING(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH, ZFP_VERSION_TWEAK)
+#endif
 
 #endif
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..f43c81a75
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,8 @@
+[build-system]
+requires = [
+    "setuptools",
+    "wheel",
+    "cython",
+    "oldest-supported-numpy; python_version<'3.9'",
+    'numpy; python_version>="3.9"',
+]
\ No newline at end of file
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 76f931730..9410ddd7f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,8 @@
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.27.0)
+    cmake_policy(SET CMP0148 OLD)
+endif ()
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/scikit-build-cmake)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/eyescale-cmake)
 include(UseCython)
 include(FindPythonExtensions)
 include(FindNumPy)
@@ -7,20 +10,21 @@ include(FindNumPy)
 find_package(PythonInterp REQUIRED)
 find_package(PythonLibs REQUIRED)
 find_package(PythonExtensions REQUIRED)
-find_package(Cython REQUIRED)
+find_package(Cython 0.28 REQUIRED) # >= v0.28 required for const memoryview support
 find_package(NumPy REQUIRED)
 
 include_directories(${ZFP_SOURCE_DIR}/include)
-include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
+include_directories(${NumPy_INCLUDE_DIR})
 
-add_cython_target(zfpy zfpy.pyx C)
+add_cython_target(zfpy zfpy.pyx C PY3)
 add_library(zfpy MODULE ${zfpy})
 target_link_libraries(zfpy zfp)
 python_extension_module(zfpy)
 
-# Build to the currrent binary dir to avoid conflicts with other libraries named zfp
+# Build to the current binary dir to avoid conflicts with other libraries named zfp
 set(PYLIB_BUILD_DIR "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Directory where zfp python library will be built")
 set_target_properties(zfpy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PYLIB_BUILD_DIR})
+
 # Install to the typical python module directory
 set(python_install_lib_dir "lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/")
 install(TARGETS zfpy LIBRARY DESTINATION ${python_install_lib_dir})
diff --git a/python/eyescale-cmake/FindNumPy.cmake b/python/eyescale-cmake/FindNumPy.cmake
deleted file mode 100644
index 8aba4e696..000000000
--- a/python/eyescale-cmake/FindNumPy.cmake
+++ /dev/null
@@ -1,41 +0,0 @@
-# Find the Python NumPy package
-# PYTHON_NUMPY_INCLUDE_DIR
-# PYTHON_NUMPY_FOUND
-# will be set by this script
-
-# cmake_minimum_required(VERSION 2.6)
-
-if(NOT PYTHON_EXECUTABLE)
-  if(NumPy_FIND_QUIETLY)
-    find_package(PythonInterp QUIET)
-  else()
-    find_package(PythonInterp)
-    set(__numpy_out 1)
-  endif()
-endif()
-
-if (PYTHON_EXECUTABLE)
-  # Find out the include path
-  execute_process(
-    COMMAND "${PYTHON_EXECUTABLE}" -c
-            "from __future__ import print_function\ntry: import numpy; print(numpy.get_include(), end='')\nexcept:pass\n"
-            OUTPUT_VARIABLE __numpy_path)
-  # And the version
-  execute_process(
-    COMMAND "${PYTHON_EXECUTABLE}" -c
-            "from __future__ import print_function\ntry: import numpy; print(numpy.__version__, end='')\nexcept:pass\n"
-    OUTPUT_VARIABLE __numpy_version)
-elseif(__numpy_out)
-  message(STATUS "Python executable not found.")
-endif(PYTHON_EXECUTABLE)
-
-find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h
-  HINTS "${__numpy_path}" "${PYTHON_INCLUDE_PATH}" NO_DEFAULT_PATH)
-
-if(PYTHON_NUMPY_INCLUDE_DIR)
-  set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found")
-endif(PYTHON_NUMPY_INCLUDE_DIR)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NumPy REQUIRED_VARS PYTHON_NUMPY_INCLUDE_DIR
-                                        VERSION_VAR __numpy_version)
diff --git a/python/eyescale-cmake/LICENSE.txt b/python/eyescale-cmake/LICENSE.txt
deleted file mode 100644
index 307d54e59..000000000
--- a/python/eyescale-cmake/LICENSE.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Unless otherwise noted in the file, all files in this directory are
-licensed under the BSD license, reproduced below.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-- Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-- Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-- Neither the name of Eyescale Software GmbH nor the names of its
-  contributors may be used to endorse or promote products derived from this
-  software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
diff --git a/python/requirements.txt b/python/requirements.txt
index 7f3612988..849962b23 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,3 +1,2 @@
 cython>=0.22
 numpy>=1.8.0
-
diff --git a/python/scikit-build-cmake/FindCython.cmake b/python/scikit-build-cmake/FindCython.cmake
index 3d58c4f00..c8de13112 100644
--- a/python/scikit-build-cmake/FindCython.cmake
+++ b/python/scikit-build-cmake/FindCython.cmake
@@ -13,7 +13,7 @@
 #  ``CYTHON_FOUND``
 #    true if the program was found
 #
-# For more information on the Cython project, see http://cython.org/.
+# For more information on the Cython project, see https://cython.org/.
 #
 # *Cython is a language that makes writing C extensions for the Python language
 # as easy as Python itself.*
@@ -36,9 +36,15 @@
 
 # Use the Cython executable that lives next to the Python executable
 # if it is a local installation.
-find_package(PythonInterp)
-if(PYTHONINTERP_FOUND)
+if(Python_EXECUTABLE)
+  get_filename_component(_python_path ${Python_EXECUTABLE} PATH)
+elseif(Python3_EXECUTABLE)
+  get_filename_component(_python_path ${Python3_EXECUTABLE} PATH)
+elseif(DEFINED PYTHON_EXECUTABLE)
   get_filename_component(_python_path ${PYTHON_EXECUTABLE} PATH)
+endif()
+
+if(DEFINED _python_path)
   find_program(CYTHON_EXECUTABLE
                NAMES cython cython.bat cython3
                HINTS ${_python_path}
@@ -56,7 +62,8 @@ if(CYTHON_EXECUTABLE)
                   OUTPUT_VARIABLE CYTHON_version_output
                   ERROR_VARIABLE CYTHON_version_error
                   RESULT_VARIABLE CYTHON_version_result
-                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+                  OUTPUT_STRIP_TRAILING_WHITESPACE
+                  ERROR_STRIP_TRAILING_WHITESPACE)
 
   if(NOT ${CYTHON_version_result} EQUAL 0)
     set(_error_msg "Command \"${CYTHON_version_command}\" failed with")
@@ -65,6 +72,10 @@ if(CYTHON_EXECUTABLE)
   else()
     if("${CYTHON_version_output}" MATCHES "^[Cc]ython version ([^,]+)")
       set(CYTHON_VERSION "${CMAKE_MATCH_1}")
+    else()
+      if("${CYTHON_version_error}" MATCHES "^[Cc]ython version ([^,]+)")
+        set(CYTHON_VERSION "${CMAKE_MATCH_1}")
+      endif()
     endif()
   endif()
 endif()
diff --git a/python/scikit-build-cmake/FindNumPy.cmake b/python/scikit-build-cmake/FindNumPy.cmake
new file mode 100644
index 000000000..275ae1bee
--- /dev/null
+++ b/python/scikit-build-cmake/FindNumPy.cmake
@@ -0,0 +1,104 @@
+#.rst:
+#
+# Find the include directory for ``numpy/arrayobject.h`` as well as other NumPy tools like ``conv-template`` and
+# ``from-template``.
+#
+# This module sets the following variables:
+#
+# ``NumPy_FOUND``
+#   True if NumPy was found.
+# ``NumPy_INCLUDE_DIRS``
+#   The include directories needed to use NumPy.
+# ``NumPy_VERSION``
+#   The version of NumPy found.
+# ``NumPy_CONV_TEMPLATE_EXECUTABLE``
+#   Path to conv-template executable.
+# ``NumPy_FROM_TEMPLATE_EXECUTABLE``
+#   Path to from-template executable.
+#
+# The module will also explicitly define one cache variable:
+#
+# ``NumPy_INCLUDE_DIR``
+#
+# .. note::
+#
+#     To support NumPy < v0.15.0 where ``from-template`` and ``conv-template`` are not declared as entry points,
+#     the module emulates the behavior of standalone executables by setting the corresponding variables with the
+#     path to the Python interpreter and the path to the associated script. For example:
+#     ::
+#
+#         set(NumPy_CONV_TEMPLATE_EXECUTABLE /path/to/python /path/to/site-packages/numpy/distutils/conv_template.py CACHE STRING "Command executing conv-template program" FORCE)
+#
+#         set(NumPy_FROM_TEMPLATE_EXECUTABLE /path/to/python /path/to/site-packages/numpy/distutils/from_template.py CACHE STRING "Command executing from-template program" FORCE)
+#
+
+if(NOT NumPy_FOUND)  # probe only when NumPy has not been located yet
+  set(_find_extra_args)  # mirrors the REQUIRED/QUIET options of find_package(NumPy); not consumed in this module
+  if(NumPy_FIND_REQUIRED)
+    list(APPEND _find_extra_args REQUIRED)
+  endif()
+  if(NumPy_FIND_QUIETLY)  # find_package() sets <PackageName>_FIND_QUIETLY (there is no _FIND_QUIET)
+    list(APPEND _find_extra_args QUIET)
+  endif()
+
+  find_program(NumPy_CONV_TEMPLATE_EXECUTABLE NAMES conv-template)  # standalone executables, if available
+  find_program(NumPy_FROM_TEMPLATE_EXECUTABLE NAMES from-template)
+
+  if(PYTHON_EXECUTABLE)  # ask the interpreter itself for the include dir and version
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}"
+      -c "import numpy; print(numpy.get_include())"
+      OUTPUT_VARIABLE _numpy_include_dir
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET
+      )
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}"
+      -c "import numpy; print(numpy.__version__)"
+      OUTPUT_VARIABLE NumPy_VERSION
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET
+      )
+
+    # XXX This is required to support NumPy < v0.15.0. See note in module documentation above.
+    if(NOT NumPy_CONV_TEMPLATE_EXECUTABLE)  # fall back to "<python> <script>" as the command
+      execute_process(COMMAND "${PYTHON_EXECUTABLE}"
+        -c "from numpy.distutils import conv_template; print(conv_template.__file__)"
+        OUTPUT_VARIABLE _numpy_conv_template_file
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+        )
+      set(NumPy_CONV_TEMPLATE_EXECUTABLE "${PYTHON_EXECUTABLE}" "${_numpy_conv_template_file}" CACHE STRING "Command executing conv-template program" FORCE)
+    endif()
+
+    # XXX This is required to support NumPy < v0.15.0. See note in module documentation above.
+    if(NOT NumPy_FROM_TEMPLATE_EXECUTABLE)  # fall back to "<python> <script>" as the command
+      execute_process(COMMAND "${PYTHON_EXECUTABLE}"
+        -c "from numpy.distutils import from_template; print(from_template.__file__)"
+        OUTPUT_VARIABLE _numpy_from_template_file
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+        )
+      set(NumPy_FROM_TEMPLATE_EXECUTABLE "${PYTHON_EXECUTABLE}" "${_numpy_from_template_file}" CACHE STRING "Command executing from-template program" FORCE)
+    endif()
+  endif()
+endif()
+
+find_path(NumPy_INCLUDE_DIR
+  numpy/arrayobject.h
+  PATHS "${_numpy_include_dir}" "${PYTHON_INCLUDE_DIR}"
+  PATH_SUFFIXES numpy/core/include
+  )
+
+set(NumPy_INCLUDE_DIRS ${NumPy_INCLUDE_DIR})
+
+# handle the QUIETLY and REQUIRED arguments and set NumPy_FOUND to TRUE if
+# all listed variables are TRUE
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NumPy
+                                  REQUIRED_VARS
+                                    NumPy_INCLUDE_DIR
+                                    NumPy_CONV_TEMPLATE_EXECUTABLE
+                                    NumPy_FROM_TEMPLATE_EXECUTABLE
+                                  VERSION_VAR NumPy_VERSION
+                                  )
+
+mark_as_advanced(NumPy_INCLUDE_DIR)
diff --git a/python/scikit-build-cmake/FindPythonExtensions.cmake b/python/scikit-build-cmake/FindPythonExtensions.cmake
index 9a3d76a0c..59b30c2a2 100644
--- a/python/scikit-build-cmake/FindPythonExtensions.cmake
+++ b/python/scikit-build-cmake/FindPythonExtensions.cmake
@@ -104,9 +104,10 @@
 #                         [HEADER_OUTPUT_VAR <HeaderOutputVar>]
 #                         [INCLUDE_DIR_OUTPUT_VAR <IncludeDirOutputVar>])
 #
+# without the extension is used as the logical name.  If only ``<Name>`` is
+#
 # If only ``<Name>`` is provided, and it ends in the ".h" extension, then it
 # is assumed to be the ``<HeaderFilename>``.  The filename of the header file
-# without the extension is used as the logical name.  If only ``<Name>`` is
 # provided, and it does not end in the ".h" extension, then the
 # ``<HeaderFilename>`` is assumed to ``<Name>.h``.
 #
@@ -167,8 +168,6 @@
 #
 # .. code-block:: cmake
 #
-#    find_package(PythonInterp)
-#    find_package(PythonLibs)
 #    find_package(PythonExtensions)
 #    find_package(Cython)
 #    find_package(Boost COMPONENTS python)
@@ -200,7 +199,7 @@
 #                            FORWARD_DECL_MODULES_VAR fdecl_module_list)
 #
 #    # module2 -- dynamically linked
-#    include_directories({Boost_INCLUDE_DIRS})
+#    include_directories(${Boost_INCLUDE_DIRS})
 #    add_library(module2 SHARED boost_module2.cxx)
 #    target_link_libraries(module2 ${Boost_LIBRARIES})
 #    python_extension_module(module2
@@ -209,7 +208,7 @@
 #
 #    # module3 -- loaded at runtime
 #    add_cython_target(module3a.pyx)
-#    add_library(module1 MODULE ${module3a} module3b.cxx)
+#    add_library(module3 MODULE ${module3a} module3b.cxx)
 #    target_link_libraries(module3 ${Boost_LIBRARIES})
 #    python_extension_module(module3
 #                            LINKED_MODULES_VAR linked_module_list
@@ -244,7 +243,14 @@
 #=============================================================================
 
 find_package(PythonInterp REQUIRED)
-find_package(PythonLibs)
+if(SKBUILD AND NOT PYTHON_LIBRARY)
+  set(PYTHON_LIBRARY "no-library-required")
+  find_package(PythonLibs)
+  unset(PYTHON_LIBRARY)
+  unset(PYTHON_LIBRARIES)
+else()
+  find_package(PythonLibs)
+endif()
 include(targetLinkLibrariesWithDynamicLookup)
 
 set(_command "
@@ -254,7 +260,6 @@ import os
 import os.path
 import site
 import sys
-import sysconfig
 
 result = None
 rel_result = None
@@ -288,7 +293,7 @@ sys.stdout.write(\";\".join((
     sys.prefix,
     result,
     rel_result,
-    sysconfig.get_config_var('SO')
+    distutils.sysconfig.get_config_var('EXT_SUFFIX')
 )))
 ")
 
@@ -332,16 +337,33 @@ function(_set_python_extension_symbol_visibility _target)
     set_target_properties(${_target} PROPERTIES LINK_FLAGS
         "/EXPORT:${_modinit_prefix}${_target}"
     )
-  elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+  elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    # Option to not run version script. See https://github.com/scikit-build/scikit-build/issues/668
+    if(NOT DEFINED SKBUILD_GNU_SKIP_LOCAL_SYMBOL_EXPORT_OVERRIDE)
+       set(SKBUILD_GNU_SKIP_LOCAL_SYMBOL_EXPORT_OVERRIDE FALSE)
+    endif()
     set(_script_path
       ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}-version-script.map
     )
-    file(WRITE ${_script_path}
-               "{global: ${_modinit_prefix}${_target}; local: *; };"
-    )
-    set_property(TARGET ${_target} APPEND_STRING PROPERTY LINK_FLAGS
-        " -Wl,--version-script=${_script_path}"
-    )
+    # Export all symbols. See https://github.com/scikit-build/scikit-build/issues/668
+    if(SKBUILD_GNU_SKIP_LOCAL_SYMBOL_EXPORT_OVERRIDE)
+      file(WRITE ${_script_path}
+                 "{global: ${_modinit_prefix}${_target};};"
+      )
+    else()
+      file(WRITE ${_script_path}
+                 "{global: ${_modinit_prefix}${_target}; local: *;};"
+      )
+    endif()
+    if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
+      set_property(TARGET ${_target} APPEND_STRING PROPERTY LINK_FLAGS
+        " -Wl,--version-script=\"${_script_path}\""
+      )
+    else()
+      set_property(TARGET ${_target} APPEND_STRING PROPERTY LINK_FLAGS
+        " -Wl,-M \"${_script_path}\""
+      )
+    endif()
   endif()
 endfunction()
 
@@ -423,14 +445,14 @@ function(python_extension_module _target)
     target_link_libraries_with_dynamic_lookup(${_target} ${PYTHON_LIBRARIES})
 
     if(_is_module_lib)
-      #_set_python_extension_symbol_visibility(${_altname})
+      _set_python_extension_symbol_visibility(${_target})
     endif()
   endif()
 endfunction()
 
 function(python_standalone_executable _target)
   include_directories(${PYTHON_INCLUDE_DIRS})
-  target_link_libraries(${_target} ${PYTHON_LIBRARIES})
+  target_link_libraries(${_target} ${SKBUILD_LINK_LIBRARIES_KEYWORD} ${PYTHON_LIBRARIES})
 endfunction()
 
 function(python_modules_header _name)
@@ -571,3 +593,5 @@ function(python_modules_header _name)
   endif()
   set(${_include_dirs_var} ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
 endfunction()
+
+include(UsePythonExtensions)
diff --git a/python/scikit-build-cmake/LICENSE b/python/scikit-build-cmake/LICENSE
index 73a9db0f2..3a85dcffc 100644
--- a/python/scikit-build-cmake/LICENSE
+++ b/python/scikit-build-cmake/LICENSE
@@ -1,6 +1,3 @@
-Unless otherwise noted in the file, all files in this directory are
-licensed under the MIT license, reproduced below.
-
 The MIT License (MIT)
 
 Copyright (c) 2014 Mike Sarahan
diff --git a/python/scikit-build-cmake/UseCython.cmake b/python/scikit-build-cmake/UseCython.cmake
index 9a5966488..4e0fa7907 100644
--- a/python/scikit-build-cmake/UseCython.cmake
+++ b/python/scikit-build-cmake/UseCython.cmake
@@ -43,7 +43,7 @@
 # ``PY2 | PY3``
 #   Force compilation using either Python-2 or Python-3 syntax and code
 #   semantics.  By default, Python-2 syntax and semantics are used if the major
-#   version of Python found is 2.  Otherwise, Python-3 syntax and sematics are
+#   version of Python found is 2.  Otherwise, Python-3 syntax and semantics are
 #   used.
 #
 # ``OUTPUT_VAR <OutputVar>``
@@ -56,13 +56,13 @@
 # ``<OutputVar>``
 #   The path of the generated source file.
 #
-# Cache variables that effect the behavior include:
+# Cache variables that affect the behavior include:
 #
 # ``CYTHON_ANNOTATE``
-#   whether to create an annotated .html file when compiling
+#   Whether to create an annotated .html file when compiling.
 #
 # ``CYTHON_FLAGS``
-#   additional flags to pass to the Cython compiler
+#   Additional flags to pass to the Cython compiler.
 #
 # Example usage
 # ^^^^^^^^^^^^^
@@ -101,9 +101,6 @@ set(CYTHON_ANNOTATE OFF
 set(CYTHON_FLAGS "" CACHE STRING
     "Extra flags to the cython compiler.")
 mark_as_advanced(CYTHON_ANNOTATE CYTHON_FLAGS)
-string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}")
-
-find_package(PythonLibs REQUIRED)
 
 set(CYTHON_CXX_EXTENSION "cxx")
 set(CYTHON_C_EXTENSION "c")
@@ -138,10 +135,12 @@ function(add_cython_target _name)
 
   set(_embed_main FALSE)
 
-  if("${PYTHONLIBS_VERSION_STRING}" MATCHES "^2.")
-    set(_input_syntax "PY2")
+  if("C" IN_LIST languages)
+    set(_output_syntax "C")
+  elseif("CXX" IN_LIST languages)
+    set(_output_syntax "CXX")
   else()
-    set(_input_syntax "PY3")
+    message(FATAL_ERROR "Either C or CXX must be enabled to use Cython")
   endif()
 
   if(_args_EMBED_MAIN)
@@ -156,6 +155,10 @@ function(add_cython_target _name)
     set(_output_syntax "CXX")
   endif()
 
+  # Doesn't select an input syntax - Cython
+  # defaults to 2 for Cython 0.x and 3 for Cython 3
+  set(_input_syntax "default")
+
   if(_args_PY2)
     set(_input_syntax "PY2")
   endif()
@@ -201,15 +204,15 @@ function(add_cython_target _name)
   set(c_header_dependencies "")
 
   # Get the include directories.
-  get_source_file_property(pyx_location ${_source_file} LOCATION)
-  get_filename_component(pyx_path ${pyx_location} PATH)
   get_directory_property(cmake_include_directories
-                         DIRECTORY ${pyx_path}
+                         DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
                          INCLUDE_DIRECTORIES)
   list(APPEND cython_include_directories ${cmake_include_directories})
 
   # Determine dependencies.
   # Add the pxd file with the same basename as the given pyx file.
+  get_source_file_property(pyx_location ${_source_file} LOCATION)
+  get_filename_component(pyx_path ${pyx_location} PATH)
   get_filename_component(pyx_file_basename ${_source_file} NAME_WE)
   unset(corresponding_pxd_file CACHE)
   find_file(corresponding_pxd_file ${pyx_file_basename}.pxd
@@ -323,21 +326,11 @@ function(add_cython_target _name)
     set(annotate_arg "--annotate")
   endif()
 
-  set(no_docstrings_arg "")
-  set(embed_signature_arg "")
-  if(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-    set(no_docstrings_arg "--no-docstrings")
-  else()
-    set(embed_signature_arg "-Xembedsignature=True")
-  endif()
-
   set(cython_debug_arg "")
-  set(embed_pos_arg "")
   set(line_directives_arg "")
   if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR
      CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
     set(cython_debug_arg "--gdb")
-    set(embed_pos_arg "--embed-positions")
     set(line_directives_arg "--line-directives")
   endif()
 
@@ -352,12 +345,13 @@ function(add_cython_target _name)
   list(REMOVE_DUPLICATES pxd_dependencies)
   list(REMOVE_DUPLICATES c_header_dependencies)
 
+  string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}")
+
   # Add the command to run the compiler.
   add_custom_command(OUTPUT ${generated_file}
                      COMMAND ${CYTHON_EXECUTABLE}
                      ARGS ${cxx_arg} ${include_directory_arg} ${py_version_arg}
-                          ${embed_arg} ${annotate_arg} ${no_docstrings_arg}
-                          ${cython_debug_arg} ${embed_pos_arg} ${embed_signature_arg}
+                          ${embed_arg} ${annotate_arg} ${cython_debug_arg}
                           ${line_directives_arg} ${CYTHON_FLAGS_LIST} ${pyx_location}
                           --output-file ${generated_file}
                      DEPENDS ${_source_file}
diff --git a/python/scikit-build-cmake/UsePythonExtensions.cmake b/python/scikit-build-cmake/UsePythonExtensions.cmake
new file mode 100644
index 000000000..c411e20c4
--- /dev/null
+++ b/python/scikit-build-cmake/UsePythonExtensions.cmake
@@ -0,0 +1,320 @@
+#.rst:
+#
+# The following functions are defined:
+#
+# .. cmake:command:: add_python_library
+#
+# Add a library that contains a mix of C, C++, Fortran, Cython, F2PY, Template,
+# and Tempita sources. The required targets are automatically generated to
+# "lower" source files from their high-level representation to a file that the
+# compiler can accept.
+#
+#
+#   add_python_library(<Name>
+#                      SOURCES [source1 [source2 ...]]
+#                      [INCLUDE_DIRECTORIES [dir1 [dir2 ...]]
+#                      [LINK_LIBRARIES [lib1 [lib2 ...]]
+#                      [DEPENDS [source1 [source2 ...]]])
+#
+#
+# Example usage
+# ^^^^^^^^^^^^^
+#
+# .. code-block:: cmake
+#
+#   find_package(PythonExtensions)
+#
+#   file(GLOB arpack_sources ARPACK/SRC/*.f ARPACK/UTIL/*.f)
+#
+#    add_python_library(arpack_scipy
+#      SOURCES ${arpack_sources}
+#              ${g77_wrapper_sources}
+#      INCLUDE_DIRECTORIES ARPACK/SRC
+#    )
+#
+# .. cmake:command:: add_python_extension
+#
+# Add an extension that contains a mix of C, C++, Fortran, Cython, F2PY, Template,
+# and Tempita sources. The required targets are automatically generated to
+# "lower" source files from their high-level representation to a file that the
+# compiler can accept.
+#
+#
+#   add_python_extension(<Name>
+#                        SOURCES [source1 [source2 ...]]
+#                        [INCLUDE_DIRECTORIES [dir1 [dir2 ...]]
+#                        [LINK_LIBRARIES [lib1 [lib2 ...]]
+#                        [DEPENDS [source1 [source2 ...]]])
+#
+#
+# Example usage
+# ^^^^^^^^^^^^^
+#
+# .. code-block:: cmake
+#
+#   find_package(PythonExtensions)
+#
+#   file(GLOB arpack_sources ARPACK/SRC/*.f ARPACK/UTIL/*.f)
+#
+#    add_python_extension(arpack_scipy
+#      SOURCES ${arpack_sources}
+#              ${g77_wrapper_sources}
+#      INCLUDE_DIRECTORIES ARPACK/SRC
+#    )
+#
+#
+#=============================================================================
+# Copyright 2011 Kitware, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+macro(_remove_whitespace _output)
+  string(REGEX REPLACE "[ \r\n\t]+" " " ${_output} "${${_output}}")
+  string(STRIP "${${_output}}" ${_output})
+endmacro()
+
+function(add_python_library _name)
+  set(options STATIC SHARED MODULE)
+  set(multiValueArgs SOURCES INCLUDE_DIRECTORIES LINK_LIBRARIES COMPILE_DEFINITIONS DEPENDS)
+  cmake_parse_arguments(_args "${options}" "" "${multiValueArgs}" ${ARGN} )
+
+  # Validate arguments to allow simpler debugging
+  if(NOT _args_SOURCES)
+    message(
+      FATAL_ERROR
+      "You have called add_python_library for library ${_name} without "
+      "any source files. This typically indicates a problem with "
+      "your CMakeLists.txt file"
+    )
+  endif()
+
+  # Initialize the list of sources
+  set(_sources ${_args_SOURCES})
+
+  # Generate targets for all *.src files
+  set(_processed )
+  foreach(_source IN LISTS _sources)
+    if(${_source} MATCHES ".pyf.src$" OR ${_source} MATCHES "\\.f\\.src$")
+      if(NOT NumPy_FOUND)
+        message(
+          FATAL_ERROR
+          "NumPy is required to process *.src Template files"
+        )
+      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" _source_we ${_source})
+      add_custom_command(
+        OUTPUT ${_source_we}
+        COMMAND ${NumPy_FROM_TEMPLATE_EXECUTABLE}
+                ${CMAKE_CURRENT_SOURCE_DIR}/${_source}
+                ${CMAKE_CURRENT_BINARY_DIR}/${_source_we}
+        DEPENDS ${_source} ${_args_DEPENDS}
+        COMMENT "Generating ${_source_we} from template ${_source}"
+      )
+      list(APPEND _processed ${_source_we})
+    elseif(${_source} MATCHES "\\.c\\.src$")
+      if(NOT NumPy_FOUND)
+        message(
+          FATAL_ERROR
+          "NumPy is required to process *.src Template files"
+        )
+      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" _source_we ${_source})
+      add_custom_command(
+        OUTPUT ${_source_we}
+        COMMAND ${NumPy_CONV_TEMPLATE_EXECUTABLE}
+                ${CMAKE_CURRENT_SOURCE_DIR}/${_source}
+                ${CMAKE_CURRENT_BINARY_DIR}/${_source_we}
+        DEPENDS ${_source} ${_args_DEPENDS}
+        COMMENT "Generating ${_source_we} from template ${_source}"
+      )
+      list(APPEND _processed ${_source_we})
+    elseif(${_source} MATCHES "\\.pyx\\.in$")
+      if(NOT Cython_FOUND)
+        message(
+          FATAL_ERROR
+          "Cython is required to process *.in Tempita files"
+        )
+      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" _source_we ${_source})
+      configure_file(
+          ${CMAKE_CURRENT_SOURCE_DIR}/${_source}
+          ${CMAKE_CURRENT_BINARY_DIR}/${_source}
+          COPYONLY
+      )
+      set(_tempita_command
+          "
+            import os;
+            import sys;
+            from Cython.Tempita import Template;
+            cwd = os.getcwd();
+            open(os.path.join(cwd, '${_source_we}'), 'w+')
+            .write(
+                Template.from_filename(os.path.join(cwd, '${_source}'),
+                encoding=sys.getdefaultencoding()).substitute()
+            )
+          "
+      )
+      _remove_whitespace(_tempita_command)
+      add_custom_command(
+        OUTPUT ${_source_we}
+        COMMAND ${PYTHON_EXECUTABLE} -c "${_tempita_command}"
+        DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${_source}"
+                ${_args_DEPENDS}
+      )
+      list(APPEND _processed ${_source_we})
+    else()
+      list(APPEND _processed  ${_source})
+    endif()
+  endforeach()
+  set(_sources ${_processed})
+
+  # If we're building a Python extension and we're given only Fortran sources,
+  # we can conclude that we need to generate a Fortran interface file.
+  list(FILTER _processed EXCLUDE REGEX "(\\.f|\\.f90)$")
+  if(NOT _processed AND _args_MODULE)
+    if(NOT NumPy_FOUND)
+        message(
+          FATAL_ERROR
+          "NumPy is required to process *.pyf F2PY files"
+        )
+    endif()
+    set(_sources_abs )
+    foreach(_source IN LISTS _sources)
+      if(NOT IS_ABSOLUTE ${_source})
+        set(_source ${CMAKE_CURRENT_SOURCE_DIR}/${_source})
+      endif()
+      list(APPEND _sources_abs ${_source})
+    endforeach()
+    add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_name}.pyf
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+        COMMAND ${F2PY_EXECUTABLE}
+        ARGS -h ${_name}.pyf -m ${_name} --overwrite-signature
+             ${_sources_abs}
+        DEPENDS ${_sources} ${_args_DEPENDS}
+        COMMENT "Generating ${_name} Fortran interface file"
+    )
+    list(APPEND _sources ${_name}.pyf)
+  endif()
+
+  # Are there F2PY targets?
+  set(_has_f2py_targets OFF)
+  set(_has_cython_targets OFF)
+
+  # Generate targets for all *.pyx and *.pyf files
+  set(_processed )
+  foreach(_source IN LISTS _sources)
+    if(${_source} MATCHES \\.pyx$)
+      if(NOT Cython_FOUND)
+        message(
+          FATAL_ERROR
+          "Cython is required to process *.pyx Cython files"
+        )
+      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" _pyx_target_name ${_source})
+      set(_has_cython_targets ON)
+      add_cython_target(${_pyx_target_name}
+          ${_source}
+          OUTPUT_VAR _pyx_target_output
+          DEPENDS ${_args_DEPENDS}
+      )
+      list(APPEND _processed ${_pyx_target_output})
+    elseif(${_source} MATCHES \\.pyf$)
+      if(NOT NumPy_FOUND)
+          message(
+            FATAL_ERROR
+            "NumPy is required to process *.pyf F2PY files"
+          )
+      endif()
+      string(REGEX REPLACE "\\.[^.]*$" "" _pyf_target_name ${_source})
+      set(_has_f2py_targets ON)
+      add_f2py_target(${_pyf_target_name}
+          ${_source}
+          OUTPUT_VAR _pyf_target_output
+          DEPENDS ${_args_DEPENDS}
+      )
+      list(APPEND _processed  ${_pyf_target_output})
+    else()
+      list(APPEND _processed ${_source})
+    endif()
+  endforeach()
+  set(_sources ${_processed})
+
+  if(_args_SHARED)
+    add_library(${_name} SHARED ${_sources})
+  elseif(_args_MODULE)
+    add_library(${_name} MODULE ${_sources})
+  else()
+    # Assume static
+    add_library(${_name} STATIC ${_sources})
+  endif()
+
+  target_include_directories(${_name} PRIVATE ${_args_INCLUDE_DIRECTORIES})
+  target_link_libraries(${_name} ${SKBUILD_LINK_LIBRARIES_KEYWORD} ${_args_LINK_LIBRARIES})
+
+  if(_has_f2py_targets)
+    target_include_directories(${_name} PRIVATE ${F2PY_INCLUDE_DIRS})
+    target_link_libraries(${_name} ${SKBUILD_LINK_LIBRARIES_KEYWORD} ${F2PY_LIBRARIES})
+  endif()
+
+  if(_args_COMPILE_DEFINITIONS)
+    target_compile_definitions(${_name} PRIVATE ${_args_COMPILE_DEFINITIONS})
+  endif()
+
+  if(_args_DEPENDS)
+    add_custom_target(
+      "${_name}_depends"
+      DEPENDS ${_args_DEPENDS}
+    )
+    add_dependencies(${_name} "${_name}_depends")
+  endif()
+endfunction()
+
+function(add_python_extension _name)
+  # FIXME: make sure that extensions with the same name can happen
+  # in multiple directories
+
+  set(multiValueArgs SOURCES INCLUDE_DIRECTORIES LINK_LIBRARIES COMPILE_DEFINITIONS DEPENDS)
+  cmake_parse_arguments(_args "" "" "${multiValueArgs}" ${ARGN} )
+
+  # Validate arguments to allow simpler debugging
+  if(NOT _args_SOURCES)
+    message(
+      FATAL_ERROR
+      "You have called add_python_extension for library ${_name} without "
+      "any source files. This typically indicates a problem with "
+      "your CMakeLists.txt file"
+    )
+  endif()
+
+  add_python_library(${_name} MODULE
+    SOURCES ${_args_SOURCES}
+    INCLUDE_DIRECTORIES ${_args_INCLUDE_DIRECTORIES}
+    LINK_LIBRARIES ${_args_LINK_LIBRARIES}
+    COMPILE_DEFINITIONS ${_args_COMPILE_DEFINITIONS}
+    DEPENDS ${_args_DEPENDS}
+  )
+  python_extension_module(${_name})
+
+  file(RELATIVE_PATH _relative "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+  if(_relative STREQUAL "")
+    set(_relative ".")
+  endif()
+
+  install(
+    TARGETS ${_name}
+    LIBRARY DESTINATION "${_relative}"
+    RUNTIME DESTINATION "${_relative}"
+  )
+endfunction()
diff --git a/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake b/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake
index 020fc404a..a583f42cd 100644
--- a/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake
+++ b/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake
@@ -198,6 +198,28 @@ function(_test_weak_link_project
   set(osx_dynamic_lookup           "-undefined dynamic_lookup")
   set(no_flag                                               "")
 
+  if(CMAKE_CROSSCOMPILING)
+    set(link_flag_spec "no_flag")
+    set(link_flag "${${link_flag_spec}}")
+    set(test_skipping_reason "")
+    set(test_pass FALSE)
+
+    if(APPLE AND NOT CMAKE_CROSSCOMPILING_EMULATOR)
+      set(link_flag_spec "osx_dynamic_lookup")
+      set(link_flag "${${link_flag_spec}}")
+      set(test_skipping_reason " (Cross compiling without emulator on macOS)")
+      set(test_pass TRUE)
+    endif()
+
+    if(test_pass)
+      set(test_description "Weak Link ${target_type} -> ${lib_type} (${link_flag_spec})")
+      message(STATUS "Performing Test ${test_description} - Assuming Success${test_skipping_reason}")
+      set(${can_weak_link_var} ${test_pass} PARENT_SCOPE)
+      set(${project_name} ${link_flag} PARENT_SCOPE)
+      return()
+    endif()
+  endif()
+
   foreach(link_flag_spec gnu_ld_ignore osx_dynamic_lookup no_flag)
     set(link_flag "${${link_flag_spec}}")
 
@@ -248,7 +270,7 @@ function(_test_weak_link_project
 
     if(link_mod_lib)
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
-        target_link_libraries(counter number)
+        target_link_libraries(counter ${SKBUILD_LINK_LIBRARIES_KEYWORD} number)
       ")
     elseif(NOT link_flag STREQUAL "")
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
@@ -262,21 +284,21 @@ function(_test_weak_link_project
 
     if(link_exe_lib)
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
-        target_link_libraries(main number)
+        target_link_libraries(main ${SKBUILD_LINK_LIBRARIES_KEYWORD} number)
       ")
     elseif(NOT link_flag STREQUAL "")
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
-        target_link_libraries(main \"${link_flag}\")
+        target_link_libraries(main ${SKBUILD_LINK_LIBRARIES_KEYWORD} \"${link_flag}\")
       ")
     endif()
 
     if(link_exe_mod)
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
-        target_link_libraries(main counter)
+        target_link_libraries(main ${SKBUILD_LINK_LIBRARIES_KEYWORD} counter)
       ")
     else()
       file(APPEND "${test_project_src_dir}/CMakeLists.txt" "
-        target_link_libraries(main \"${CMAKE_DL_LIBS}\")
+        target_link_libraries(main ${SKBUILD_LINK_LIBRARIES_KEYWORD} \"${CMAKE_DL_LIBS}\")
       ")
     endif()
 
@@ -362,7 +384,7 @@ function(_test_weak_link_project
       file(APPEND "${test_project_src_dir}/main.c" "
         goto done;
         error:
-          fprintf(stderr, \"Error occured:\\n    %s\\n\", dlerror());
+          fprintf(stderr, \"Error occurred:\\n    %s\\n\", dlerror());
           result = 1;
 
         done:
@@ -492,21 +514,15 @@ function(_check_dynamic_lookup
   endif()
 
   if(NOT DEFINED ${cache_var})
-    set(skip_test FALSE)
 
-   if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR)
+    if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR)
       set(skip_test TRUE)
     endif()
 
-    if(skip_test)
-      set(has_dynamic_lookup FALSE)
-      set(link_flags)
-    else()
-      _test_weak_link_project(${target_type}
-                              ${lib_type}
-                              has_dynamic_lookup
-                              link_flags)
-    endif()
+    _test_weak_link_project(${target_type}
+                            ${lib_type}
+                            has_dynamic_lookup
+                            link_flags)
 
     set(caveat " (when linking ${target_type} against ${lib_type})")
 
@@ -576,6 +592,6 @@ function(target_link_libraries_with_dynamic_lookup target)
 
   set(links "${link_items}" "${link_libs}")
   if(links)
-    target_link_libraries(${target} "${links}")
+    target_link_libraries(${target} ${SKBUILD_LINK_LIBRARIES_KEYWORD} "${links}")
   endif()
 endfunction()
diff --git a/python/zfpy.pxd b/python/zfpy.pxd
index 60922c6b8..87aea2cf3 100644
--- a/python/zfpy.pxd
+++ b/python/zfpy.pxd
@@ -1,13 +1,17 @@
+# cython: language_level=3
+
 import cython
 cimport libc.stdint as stdint
+from libc.stddef cimport ptrdiff_t
 
-cdef extern from "bitstream.h":
+cdef extern from "zfp/bitstream.h":
     cdef struct bitstream:
         pass
     bitstream* stream_open(void* data, size_t)
     void stream_close(bitstream* stream)
 
 cdef extern from "zfp.h":
+    cython.char * ZFP_VERSION_STRING
     # enums
     ctypedef enum zfp_type:
         zfp_type_none   = 0,
@@ -53,6 +57,10 @@ cdef extern from "zfp.h":
     cython.uint zfp_stream_set_precision(zfp_stream* stream, cython.uint precision)
     double zfp_stream_set_accuracy(zfp_stream* stream, double tolerance)
     zfp_mode zfp_stream_set_mode(zfp_stream* stream, stdint.uint64_t mode)
+    zfp_mode zfp_stream_compression_mode(zfp_stream* stream)
+    double zfp_stream_accuracy(zfp_stream* stream)
+    double zfp_stream_rate(zfp_stream* stream, cython.uint dims)
+    cython.uint zfp_stream_precision(const zfp_stream* stream)
     zfp_field* zfp_field_alloc()
     zfp_field* zfp_field_1d(void* pointer, zfp_type, size_t nx)
     zfp_field* zfp_field_2d(void* pointer, zfp_type, size_t nx, size_t ny)
@@ -70,5 +78,5 @@ cdef extern from "zfp.h":
     size_t zfp_decompress(zfp_stream* stream, zfp_field* field) nogil
     size_t zfp_write_header(zfp_stream* stream, const zfp_field* field, cython.uint mask)
     size_t zfp_read_header(zfp_stream* stream, zfp_field* field, cython.uint mask)
-
+    void zfp_stream_params(zfp_stream* stream, cython.uint* minbits, cython.uint* maxbits, cython.uint* maxprec, int* minexp);
 cdef gen_padded_int_list(orig_array, pad=*, length=*)
diff --git a/python/zfpy.pyx b/python/zfpy.pyx
index 12b25cb72..89b455cfc 100644
--- a/python/zfpy.pyx
+++ b/python/zfpy.pyx
@@ -1,3 +1,5 @@
+# cython: language_level=3
+
 import sys
 import operator
 import functools
@@ -35,6 +37,8 @@ mode_fixed_rate = zfp_mode_fixed_rate
 mode_fixed_precision = zfp_mode_fixed_precision
 mode_fixed_accuracy = zfp_mode_fixed_accuracy
 
+__version__ = str(ZFP_VERSION_STRING, encoding='utf-8')
+
 
 cpdef dtype_to_ztype(dtype):
     if dtype == np.int32:
@@ -74,7 +78,21 @@ cpdef ztype_to_dtype(zfp_type ztype):
     except KeyError:
         raise ValueError("Unsupported zfp_type {}".format(ztype))
 
-cdef zfp_field* _init_field(np.ndarray arr):
+zfp_mode_map = {
+    zfp_mode_null: "null",
+    zfp_mode_expert: "expert",
+    zfp_mode_reversible: "reversible",
+    zfp_mode_fixed_accuracy: "tolerance",
+    zfp_mode_fixed_precision: "precision",
+    zfp_mode_fixed_rate: "rate",
+}
+cpdef zmode_to_str(zfp_mode zmode):
+    try:
+        return zfp_mode_map[zmode]
+    except KeyError:
+        raise ValueError("Unsupported zfp_mode {}".format(zmode))
+
+cdef zfp_field* _init_field(np.ndarray arr) except NULL:
     shape = arr.shape
     cdef int ndim = arr.ndim
     cdef zfp_type ztype = dtype_to_ztype(arr.dtype)
@@ -351,3 +369,59 @@ cpdef np.ndarray decompress_numpy(
         stream_close(bstream)
 
     return output
+
+cpdef dict header(const uint8_t[::1] compressed_data):
+    """Return stream header information in a python dict."""
+    if compressed_data is None:
+        raise TypeError("compressed_data cannot be None")
+
+    cdef const void* comp_data_pointer = <const void *>&compressed_data[0]
+    cdef zfp_field* field = zfp_field_alloc()
+    cdef bitstream* bstream = stream_open(
+        <void *>comp_data_pointer,
+        len(compressed_data)
+    )
+    cdef zfp_stream* stream = zfp_stream_open(bstream)
+    cdef zfp_mode mode
+
+    cdef unsigned int minbits = 0
+    cdef unsigned int maxbits = 0
+    cdef unsigned int maxprec = 0
+    cdef int minexp = 0
+
+    try:
+        if zfp_read_header(stream, field, HEADER_FULL) == 0:
+            raise ValueError("Failed to read required zfp header")
+
+        mode = zfp_stream_compression_mode(stream)
+
+        ndim = 0
+        for dim in [field.nx, field.ny, field.nz, field.nw]:
+            ndim += int(dim > 0)  # dimensionality = number of non-zero extents
+
+        zfp_stream_params(stream, &minbits, &maxbits, &maxprec, &minexp)  # fills the four expert parameters
+
+        return {
+            "nx": int(field.nx),
+            "ny": int(field.ny),
+            "nz": int(field.nz),
+            "nw": int(field.nw),
+            "type": ztype_to_dtype(field._type),
+            "mode": zmode_to_str(mode),
+            "config": {
+                "mode": int(mode),
+                "tolerance": float(zfp_stream_accuracy(stream)),
+                "rate": float(zfp_stream_rate(stream, ndim)),
+                "precision": int(zfp_stream_precision(stream)),
+                "expert": {
+                    "minbits": int(minbits),
+                    "maxbits": int(maxbits),  # was int(minbits): report the real maxbits
+                    "maxprec": int(maxprec),
+                    "minexp": int(minexp),
+                },
+            },
+        }
+    finally:  # release the field, zfp stream and bitstream on every exit path
+        zfp_field_free(field)
+        zfp_stream_close(stream)
+        stream_close(bstream)
diff --git a/setup.py b/setup.py
index b03ff4d66..0661368dc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,49 @@
 from setuptools import setup, Extension
-import numpy as np
+
+class NumpyImport:
+  def __repr__(self):
+    import numpy as np
+
+    return np.get_include()
+
+  __fspath__ = __repr__
 
 setup(
     name="zfpy",
-    version="0.5.5",
-    author="Peter Lindstrom",
+    setup_requires=["numpy", "cython"],
+    version="1.0.1",
+    author="Peter Lindstrom, Danielle Asher",
     author_email="zfp@llnl.gov",
-    url="https://computing.llnl.gov/projects/floating-point-compression",
+    url="https://zfp.llnl.gov",
+    license="License :: OSI Approved :: BSD License",
     description="zfp compression in Python",
     long_description="zfp is a compressed format for representing multidimensional floating-point and integer arrays. zfp provides compressed-array classes that support high throughput read and write random access to individual array elements. zfp also supports serial and parallel compression of whole arrays using both lossless and lossy compression with error tolerances. zfp is primarily written in C and C++ but also includes Python and Fortran bindings.",
-    ext_modules=[Extension("zfpy", ["build/python/zfpy.c"],
-                           include_dirs=["include", np.get_include()],
-                           libraries=["zfp"], library_dirs=["build/lib64", "build/lib/Release"])]
+    ext_modules=[
+        Extension(
+            "zfpy",
+            sources=["python/zfpy.pyx"],
+            define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')],
+            include_dirs=["include", str(NumpyImport())],
+            libraries=["zfp"],
+            library_dirs=["build/lib64", "build/lib/Release"],
+            language_level=3,
+            language="c",
+        ),
+    ],
+    classifiers=[
+        "Intended Audience :: Developers",
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Topic :: Scientific/Engineering :: Image Processing",
+        "Topic :: System :: Archiving :: Compression",
+        "Operating System :: POSIX",
+        "Operating System :: MacOS",
+        "Operating System :: Microsoft :: Windows :: Windows 10",
+    ],
 )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c0f63965c..fd5702e58 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -54,8 +54,7 @@ target_include_directories(zfp
   PUBLIC
     $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-  INTERFACE
-    $<BUILD_INTERFACE:${ZFP_SOURCE_DIR}/array>)
+)
 
 set_property(TARGET zfp PROPERTY VERSION ${ZFP_VERSION})
 set_property(TARGET zfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR})
diff --git a/src/bitstream.c b/src/bitstream.c
index 3a7476673..29a4543a9 100644
--- a/src/bitstream.c
+++ b/src/bitstream.c
@@ -1,4 +1,4 @@
-#include "bitstream.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.h"
+#include "zfp/bitstream.inl"
 
 const size_t stream_word_bits = wsize;
diff --git a/src/cuda_zfp/cuZFP.cu b/src/cuda_zfp/cuZFP.cu
index 657976e3d..e1de467ff 100644
--- a/src/cuda_zfp/cuZFP.cu
+++ b/src/cuda_zfp/cuZFP.cu
@@ -23,7 +23,7 @@
   #define inline_ inline
 #endif
 
-#include "../inline/bitstream.c"
+#include "zfp/bitstream.inl"
 namespace internal 
 { 
   
@@ -198,7 +198,7 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o
 Word *setup_device_stream_compress(zfp_stream *stream,const zfp_field *field)
 {
   bool stream_device = cuZFP::is_gpu_ptr(stream->stream->begin);
-  assert(sizeof(word) == sizeof(Word)); // "CUDA version currently only supports 64bit words");
+  assert(sizeof(bitstream_word) == sizeof(Word)); // "CUDA version currently only supports 64bit words");
 
   if(stream_device)
   {
@@ -214,7 +214,7 @@ Word *setup_device_stream_compress(zfp_stream *stream,const zfp_field *field)
 Word *setup_device_stream_decompress(zfp_stream *stream,const zfp_field *field)
 {
   bool stream_device = cuZFP::is_gpu_ptr(stream->stream->begin);
-  assert(sizeof(word) == sizeof(Word)); // "CUDA version currently only supports 64bit words");
+  assert(sizeof(bitstream_word) == sizeof(Word)); // CUDA version currently only supports 64-bit words
 
   if(stream_device)
   {
@@ -483,7 +483,7 @@ cuda_decompress(zfp_stream *stream, zfp_field *field)
   internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type);
   internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type);
   
-  // this is how zfp determins if this was a success
+  // this is how zfp determines if this was a success
   size_t words_read = decoded_bytes / sizeof(Word);
   stream->stream->bits = wsize;
   // set stream pointer to end of stream
diff --git a/src/cuda_zfp/decode.cuh b/src/cuda_zfp/decode.cuh
index 07e9866ea..636de7d4e 100644
--- a/src/cuda_zfp/decode.cuh
+++ b/src/cuda_zfp/decode.cuh
@@ -112,7 +112,7 @@ public:
       next_read = n_bits - first_read; 
     }
    
-    // this is basically a no-op when first read constained 
+    // this is basically a no-op when first read contained 
     // all the bits. TODO: if we have aligned reads, this could 
     // be a conditional without divergence
     mask = ((Word)1<<((next_read)))-1;
diff --git a/src/cuda_zfp/decode1.cuh b/src/cuda_zfp/decode1.cuh
index b1f474d19..6d357f631 100644
--- a/src/cuda_zfp/decode1.cuh
+++ b/src/cuda_zfp/decode1.cuh
@@ -127,9 +127,9 @@ size_t decode1launch(uint dim,
   cudaEventSynchronize(stop);
 	cudaStreamSynchronize(0);
 
-  float miliseconds = 0;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float rate = (float(dim) * sizeof(Scalar) ) / seconds;
   rate /= 1024.f;
   rate /= 1024.f;
diff --git a/src/cuda_zfp/decode2.cuh b/src/cuda_zfp/decode2.cuh
index 3378f5a56..fa60a82f7 100644
--- a/src/cuda_zfp/decode2.cuh
+++ b/src/cuda_zfp/decode2.cuh
@@ -150,9 +150,9 @@ size_t decode2launch(uint2 dims,
   cudaEventSynchronize(stop);
 	cudaStreamSynchronize(0);
 
-  float miliseconds = 0;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float rate = (float(dims.x * dims.y) * sizeof(Scalar) ) / seconds;
   rate /= 1024.f;
   rate /= 1024.f;
diff --git a/src/cuda_zfp/decode3.cuh b/src/cuda_zfp/decode3.cuh
index 7092f9a3f..9f2a98a89 100644
--- a/src/cuda_zfp/decode3.cuh
+++ b/src/cuda_zfp/decode3.cuh
@@ -163,9 +163,9 @@ size_t decode3launch(uint3 dims,
   cudaEventSynchronize(stop);
 	cudaStreamSynchronize(0);
 
-  float miliseconds = 0;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds;
   rate /= 1024.f;
   rate /= 1024.f;
diff --git a/src/cuda_zfp/encode.cuh b/src/cuda_zfp/encode.cuh
index 995c9c321..cf8c8c9e5 100644
--- a/src/cuda_zfp/encode.cuh
+++ b/src/cuda_zfp/encode.cuh
@@ -22,16 +22,16 @@ void pad_block(Scalar *p, uint n, uint s)
   {
     case 0:
       p[0 * s] = 0;
-      /* FALLTHROUGH */
+      fallthrough_
     case 1:
       p[1 * s] = p[0 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     case 2:
       p[2 * s] = p[1 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     case 3:
       p[3 * s] = p[0 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     default:
       break;
   }
diff --git a/src/cuda_zfp/encode1.cuh b/src/cuda_zfp/encode1.cuh
index 8ef37b14f..98ce5a753 100644
--- a/src/cuda_zfp/encode1.cuh
+++ b/src/cuda_zfp/encode1.cuh
@@ -145,9 +145,9 @@ size_t encode1launch(uint dim,
   cudaEventSynchronize(stop);
   cudaStreamSynchronize(0);
 
-  float miliseconds = 0.f;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0.f;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float gb = (float(dim) * float(sizeof(Scalar))) / (1024.f * 1024.f * 1024.f);
   float rate = gb / seconds;
   printf("Encode elapsed time: %.5f (s)\n", seconds);
diff --git a/src/cuda_zfp/encode2.cuh b/src/cuda_zfp/encode2.cuh
index 105750e43..0d577d51e 100644
--- a/src/cuda_zfp/encode2.cuh
+++ b/src/cuda_zfp/encode2.cuh
@@ -163,9 +163,9 @@ size_t encode2launch(uint2 dims,
   cudaEventSynchronize(stop);
   cudaStreamSynchronize(0);
 
-  float miliseconds = 0.f;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0.f;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float mb = (float(dims.x * dims.y) * sizeof(Scalar)) / (1024.f * 1024.f *1024.f);
   float rate = mb / seconds;
   printf("Encode elapsed time: %.5f (s)\n", seconds);
diff --git a/src/cuda_zfp/encode3.cuh b/src/cuda_zfp/encode3.cuh
index 78d05bd77..1edee9e99 100644
--- a/src/cuda_zfp/encode3.cuh
+++ b/src/cuda_zfp/encode3.cuh
@@ -171,9 +171,9 @@ size_t encode3launch(uint3 dims,
   cudaEventSynchronize(stop);
   cudaStreamSynchronize(0);
 
-  float miliseconds = 0;
-  cudaEventElapsedTime(&miliseconds, start, stop);
-  float seconds = miliseconds / 1000.f;
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float seconds = milliseconds / 1000.f;
   float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds;
   rate /= 1024.f;
   rate /= 1024.f;
diff --git a/src/cuda_zfp/shared.h b/src/cuda_zfp/shared.h
index 27df25beb..547e85f5e 100644
--- a/src/cuda_zfp/shared.h
+++ b/src/cuda_zfp/shared.h
@@ -223,29 +223,40 @@ __device__
 static void
 inv_lift(Int* p)
 {
-	Int x, y, z, w;
-	x = *p; p += s;
-	y = *p; p += s;
-	z = *p; p += s;
-	w = *p; p += s;
-
-	/*
-	** non-orthogonal transform
-	**       ( 4  6 -4 -1) (x)
-	** 1/4 * ( 4  2  4  5) (y)
-	**       ( 4 -2  4 -5) (z)
-	**       ( 4 -6 -4  1) (w)
-	*/
-	y += w >> 1; w -= y >> 1;
-	y += w; w <<= 1; w -= y;
-	z += x; x <<= 1; x -= z;
-	y += z; z <<= 1; z -= y;
-	w += x; x <<= 1; x -= w;
-
-	p -= s; *p = w;
-	p -= s; *p = z;
-	p -= s; *p = y;
-	p -= s; *p = x;
+  Int x, y, z, w;
+  x = *p; p += s;
+  y = *p; p += s;
+  z = *p; p += s;
+  w = *p; p += s;
+
+  /*
+  ** non-orthogonal transform
+  **
+  **       ( 4  6 -4 -1) (x)
+  ** 1/4 * ( 4  2  4  5) (y)
+  **       ( 4 -2  4 -5) (z)
+  **       ( 4 -6 -4  1) (w)
+  **
+  ** original lifted version, which invokes UB due to signed left shift and
+  ** integer overflow:
+  **
+  ** y += w >> 1; w -= y >> 1;
+  ** y += w; w <<= 1; w -= y;
+  ** z += x; x <<= 1; x -= z;
+  ** y += z; z <<= 1; z -= y;
+  ** w += x; x <<= 1; x -= w;
+  */
+
+  y += w >> 1; w -= y >> 1;
+  y += w; w -= y - w;
+  z += x; x -= z - x;
+  y += z; z -= y - z;
+  w += x; x -= w - x;
+
+  p -= s; *p = w;
+  p -= s; *p = z;
+  p -= s; *p = y;
+  p -= s; *p = x;
 }
 
 
diff --git a/src/decode1d.c b/src/decode1d.c
index b5bbd6693..b95995fa7 100644
--- a/src/decode1d.c
+++ b/src/decode1d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec1.c"
diff --git a/src/decode1f.c b/src/decode1f.c
index 83085a861..f08119f7f 100644
--- a/src/decode1f.c
+++ b/src/decode1f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec1.c"
diff --git a/src/decode1i.c b/src/decode1i.c
index b0e61ab79..b148641eb 100644
--- a/src/decode1i.c
+++ b/src/decode1i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec1.c"
 #include "template/decode.c"
diff --git a/src/decode1l.c b/src/decode1l.c
index a063a1fbd..d79e8e46b 100644
--- a/src/decode1l.c
+++ b/src/decode1l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec1.c"
 #include "template/decode.c"
diff --git a/src/decode2d.c b/src/decode2d.c
index ced3d5d76..d7f3a77c6 100644
--- a/src/decode2d.c
+++ b/src/decode2d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec2.c"
diff --git a/src/decode2f.c b/src/decode2f.c
index f6d823a46..5d44e0725 100644
--- a/src/decode2f.c
+++ b/src/decode2f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec2.c"
diff --git a/src/decode2i.c b/src/decode2i.c
index e4b5235f2..579eaa82f 100644
--- a/src/decode2i.c
+++ b/src/decode2i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec2.c"
 #include "template/decode.c"
diff --git a/src/decode2l.c b/src/decode2l.c
index 80031eaff..b4d871f5c 100644
--- a/src/decode2l.c
+++ b/src/decode2l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec2.c"
 #include "template/decode.c"
diff --git a/src/decode3d.c b/src/decode3d.c
index 823c64fdf..e9291aa48 100644
--- a/src/decode3d.c
+++ b/src/decode3d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec3.c"
diff --git a/src/decode3f.c b/src/decode3f.c
index 2e2724ff9..cc517b138 100644
--- a/src/decode3f.c
+++ b/src/decode3f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec3.c"
diff --git a/src/decode3i.c b/src/decode3i.c
index 6888f245b..0eb05deaf 100644
--- a/src/decode3i.c
+++ b/src/decode3i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec3.c"
 #include "template/decode.c"
diff --git a/src/decode3l.c b/src/decode3l.c
index 225b6bc75..d895d0e7a 100644
--- a/src/decode3l.c
+++ b/src/decode3l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec3.c"
 #include "template/decode.c"
diff --git a/src/decode4d.c b/src/decode4d.c
index 8a047b629..38861b5d2 100644
--- a/src/decode4d.c
+++ b/src/decode4d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec4.c"
diff --git a/src/decode4f.c b/src/decode4f.c
index 6ec6bf06c..7ef87f10d 100644
--- a/src/decode4f.c
+++ b/src/decode4f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec4.c"
diff --git a/src/decode4i.c b/src/decode4i.c
index 309eee238..ade99493f 100644
--- a/src/decode4i.c
+++ b/src/decode4i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec4.c"
 #include "template/decode.c"
diff --git a/src/decode4l.c b/src/decode4l.c
index c07c85cce..bbbdefbbd 100644
--- a/src/decode4l.c
+++ b/src/decode4l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec4.c"
 #include "template/decode.c"
diff --git a/src/encode1d.c b/src/encode1d.c
index 7f8f3ca1a..43f5101c1 100644
--- a/src/encode1d.c
+++ b/src/encode1d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec1.c"
diff --git a/src/encode1f.c b/src/encode1f.c
index 5fe4812cc..ae509d534 100644
--- a/src/encode1f.c
+++ b/src/encode1f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec1.c"
diff --git a/src/encode1i.c b/src/encode1i.c
index f3069de93..ea3593cdd 100644
--- a/src/encode1i.c
+++ b/src/encode1i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec1.c"
 #include "template/encode.c"
diff --git a/src/encode1l.c b/src/encode1l.c
index 62a1814af..e9415e209 100644
--- a/src/encode1l.c
+++ b/src/encode1l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block1.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec1.c"
 #include "template/encode.c"
diff --git a/src/encode2d.c b/src/encode2d.c
index 4dba8923c..8f4458923 100644
--- a/src/encode2d.c
+++ b/src/encode2d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec2.c"
diff --git a/src/encode2f.c b/src/encode2f.c
index d667ff7ae..814a18a2d 100644
--- a/src/encode2f.c
+++ b/src/encode2f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec2.c"
diff --git a/src/encode2i.c b/src/encode2i.c
index 264c44907..8417031f0 100644
--- a/src/encode2i.c
+++ b/src/encode2i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec2.c"
 #include "template/encode.c"
diff --git a/src/encode2l.c b/src/encode2l.c
index fdea8a018..87f5a2f34 100644
--- a/src/encode2l.c
+++ b/src/encode2l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block2.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec2.c"
 #include "template/encode.c"
diff --git a/src/encode3d.c b/src/encode3d.c
index 548542138..55f55d624 100644
--- a/src/encode3d.c
+++ b/src/encode3d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec3.c"
diff --git a/src/encode3f.c b/src/encode3f.c
index 2e9db3f27..de3bbaf40 100644
--- a/src/encode3f.c
+++ b/src/encode3f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec3.c"
diff --git a/src/encode3i.c b/src/encode3i.c
index 6dcfbe53e..257a1ecd2 100644
--- a/src/encode3i.c
+++ b/src/encode3i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec3.c"
 #include "template/encode.c"
diff --git a/src/encode3l.c b/src/encode3l.c
index 26c9077e9..c62696997 100644
--- a/src/encode3l.c
+++ b/src/encode3l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block3.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec3.c"
 #include "template/encode.c"
diff --git a/src/encode4d.c b/src/encode4d.c
index c96aae061..346f17471 100644
--- a/src/encode4d.c
+++ b/src/encode4d.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsd.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec4.c"
diff --git a/src/encode4f.c b/src/encode4f.c
index 36cc28318..b855262b0 100644
--- a/src/encode4f.c
+++ b/src/encode4f.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsf.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codecf.c"
 #include "template/codec4.c"
diff --git a/src/encode4i.c b/src/encode4i.c
index 86fc5ace2..5bed6cdf9 100644
--- a/src/encode4i.c
+++ b/src/encode4i.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsi.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec4.c"
 #include "template/encode.c"
diff --git a/src/encode4l.c b/src/encode4l.c
index e8a382817..fd84e5a1c 100644
--- a/src/encode4l.c
+++ b/src/encode4l.c
@@ -1,11 +1,11 @@
-#include "inline/inline.h"
+#include "zfp/internal/zfp/inline.h"
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "block4.h"
 #include "traitsl.h"
 #include "template/template.h"
 #include "template/codec.h"
-#include "inline/bitstream.c"
+#include "zfp/bitstream.inl"
 #include "template/codec.c"
 #include "template/codec4.c"
 #include "template/encode.c"
diff --git a/src/share/omp.c b/src/share/omp.c
index e0300cadc..02507e564 100644
--- a/src/share/omp.c
+++ b/src/share/omp.c
@@ -1,12 +1,13 @@
 #ifdef _OPENMP
 #include <limits.h>
 #include <omp.h>
+#include "zfp.h"
 
 /* number of omp threads to use */
 static uint
 thread_count_omp(const zfp_stream* stream)
 {
-  uint count = stream->exec.params.omp.threads;
+  uint count = zfp_stream_omp_threads(stream);
   /* if no thread count is specified, use default number of threads */
   if (!count)
     count = omp_get_max_threads();
@@ -17,7 +18,7 @@ thread_count_omp(const zfp_stream* stream)
 static size_t
 chunk_count_omp(const zfp_stream* stream, size_t blocks, uint threads)
 {
-  size_t chunk_size = stream->exec.params.omp.chunk_size;
+  size_t chunk_size = (size_t)zfp_stream_omp_chunk_size(stream);
   /* if no chunk size is specified, assign one chunk per thread */
   size_t chunks = chunk_size ? (blocks + chunk_size - 1) / chunk_size : threads;
   /* each chunk must contain at least one block */
diff --git a/src/share/parallel.c b/src/share/parallel.c
index 0cbdc02c3..2f407be50 100644
--- a/src/share/parallel.c
+++ b/src/share/parallel.c
@@ -49,7 +49,7 @@ compress_init_par(zfp_stream* stream, const zfp_field* field, size_t chunks, siz
          (stream_wtell(stream->stream) % stream_word_bits != 0);
 
   /* set up buffer for each thread to compress to */
-  bs = (bitstream**)malloc(chunks * sizeof(bitstream*));
+  bs = malloc(chunks * sizeof(bitstream*));
   if (!bs)
     return NULL;
   for (chunk = 0; chunk < chunks; chunk++) {
@@ -79,12 +79,12 @@ compress_finish_par(zfp_stream* stream, bitstream** src, size_t chunks)
 {
   bitstream* dst = zfp_stream_bit_stream(stream);
   zfp_bool copy = (stream_data(dst) != stream_data(*src));
-  size_t offset = stream_wtell(dst);
+  bitstream_offset offset = stream_wtell(dst);
   size_t chunk;
 
   /* flush each stream and concatenate if necessary */
   for (chunk = 0; chunk < chunks; chunk++) {
-    size_t bits = stream_wtell(src[chunk]);
+    bitstream_size bits = stream_wtell(src[chunk]);
     offset += bits;
     stream_flush(src[chunk]);
     /* concatenate streams if they are not already contiguous */
diff --git a/src/template/compress.c b/src/template/compress.c
index 74983c56b..dca6a34a1 100644
--- a/src/template/compress.c
+++ b/src/template/compress.c
@@ -2,7 +2,7 @@
 static void
 _t2(compress, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
 {
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   size_t mx = nx & ~3u;
   size_t x;
@@ -37,7 +37,7 @@ _t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
 static void
 _t2(compress_strided, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
 {
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   ptrdiff_t sx = field->sx ? field->sx : 1;
@@ -59,7 +59,7 @@ _t2(compress_strided, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
 static void
 _t2(compress_strided, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
 {
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   size_t nz = field->nz;
diff --git a/src/template/decode.c b/src/template/decode.c
index 990f83860..aff87c4d2 100644
--- a/src/template/decode.c
+++ b/src/template/decode.c
@@ -16,16 +16,27 @@ _t1(inv_lift, Int)(Int* p, ptrdiff_t s)
 
   /*
   ** non-orthogonal transform
+  **
   **       ( 4  6 -4 -1) (x)
   ** 1/4 * ( 4  2  4  5) (y)
   **       ( 4 -2  4 -5) (z)
   **       ( 4 -6 -4  1) (w)
+  **
+  ** original lifted version, which invokes UB due to signed left shift and
+  ** integer overflow:
+  **
+  ** y += w >> 1; w -= y >> 1;
+  ** y += w; w <<= 1; w -= y;
+  ** z += x; x <<= 1; x -= z;
+  ** y += z; z <<= 1; z -= y;
+  ** w += x; x <<= 1; x -= w;
   */
+
   y += w >> 1; w -= y >> 1;
-  y += w; w <<= 1; w -= y;
-  z += x; x <<= 1; x -= z;
-  y += z; z <<= 1; z -= y;
-  w += x; x <<= 1; x -= w;
+  y += w; w -= y - w;
+  z += x; x -= z - x;
+  y += z; z -= y - z;
+  w += x; x -= w - x;
 
   p -= s; *p = w;
   p -= s; *p = z;
@@ -178,7 +189,7 @@ _t1(decode_few_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, UInt*
 {
   /* make a copy of bit stream to avoid aliasing */
   bitstream s = *stream;
-  size_t offset = stream_rtell(&s);
+  bitstream_offset offset = stream_rtell(&s);
   uint intprec = (uint)(CHAR_BIT * sizeof(UInt));
   uint kmin = intprec > maxprec ? intprec - maxprec : 0;
   uint i, k, n;
@@ -215,7 +226,7 @@ _t1(decode_many_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, UInt
 {
   /* make a copy of bit stream to avoid aliasing */
   bitstream s = *stream;
-  size_t offset = stream_rtell(&s);
+  bitstream_offset offset = stream_rtell(&s);
   uint intprec = (uint)(CHAR_BIT * sizeof(UInt));
   uint kmin = intprec > maxprec ? intprec - maxprec : 0;
   uint i, k, n;
@@ -268,9 +279,9 @@ _t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec,
 
 /* decode block of integers */
 static uint
-_t2(decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock)
+_t2(decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock)
 {
-  int bits;
+  uint bits;
   cache_align_(UInt ublock[BLOCK_SIZE]);
   /* decode integer coefficients */
   bits = _t1(decode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE);
diff --git a/src/template/decodef.c b/src/template/decodef.c
index 3191c9469..e8fa40c81 100644
--- a/src/template/decodef.c
+++ b/src/template/decodef.c
@@ -10,13 +10,14 @@ _t2(decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock)
   /* test if block has nonzero values */
   if (stream_read_bit(zfp->stream)) {
     cache_align_(Int iblock[BLOCK_SIZE]);
-    int emax, maxprec;
+    uint maxprec;
+    int emax;
     /* decode common exponent */
     bits += EBITS;
     emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS;
     maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS);
     /* decode integer block */
-    bits += _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, maxprec, iblock);
+    bits += _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, maxprec, iblock);
     /* perform inverse block-floating-point transform */
     _t1(inv_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax);
   }
diff --git a/src/template/decodei.c b/src/template/decodei.c
index 8a35a8d8d..3cea9651f 100644
--- a/src/template/decodei.c
+++ b/src/template/decodei.c
@@ -1,4 +1,4 @@
-static uint _t2(rev_decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, Int* iblock);
+static uint _t2(rev_decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, Int* iblock);
 
 /* public functions -------------------------------------------------------- */
 
diff --git a/src/template/decompress.c b/src/template/decompress.c
index 726107737..19f02abdc 100644
--- a/src/template/decompress.c
+++ b/src/template/decompress.c
@@ -2,7 +2,7 @@
 static void
 _t2(decompress, Scalar, 1)(zfp_stream* stream, zfp_field* field)
 {
-  Scalar* data = (Scalar*)field->data;
+  Scalar* data = field->data;
   size_t nx = field->nx;
   size_t mx = nx & ~3u;
   size_t x;
@@ -37,7 +37,7 @@ _t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field)
 static void
 _t2(decompress_strided, Scalar, 2)(zfp_stream* stream, zfp_field* field)
 {
-  Scalar* data = (Scalar*)field->data;
+  Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   ptrdiff_t sx = field->sx ? field->sx : 1;
@@ -59,7 +59,7 @@ _t2(decompress_strided, Scalar, 2)(zfp_stream* stream, zfp_field* field)
 static void
 _t2(decompress_strided, Scalar, 3)(zfp_stream* stream, zfp_field* field)
 {
-  Scalar* data = (Scalar*)field->data;
+  Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   size_t nz = field->nz;
diff --git a/src/template/encode.c b/src/template/encode.c
index e2f519b7e..027bd44f4 100644
--- a/src/template/encode.c
+++ b/src/template/encode.c
@@ -11,16 +11,16 @@ _t1(pad_block, Scalar)(Scalar* p, size_t n, ptrdiff_t s)
   switch (n) {
     case 0:
       p[0 * s] = 0;
-      /* FALLTHROUGH */
+      fallthrough_
     case 1:
       p[1 * s] = p[0 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     case 2:
       p[2 * s] = p[1 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     case 3:
       p[3 * s] = p[0 * s];
-      /* FALLTHROUGH */
+      fallthrough_
     default:
       break;
   }
@@ -181,7 +181,7 @@ _t1(encode_few_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, const
 {
   /* make a copy of bit stream to avoid aliasing */
   bitstream s = *stream;
-  size_t offset = stream_wtell(&s);
+  bitstream_offset offset = stream_wtell(&s);
   uint intprec = (uint)(CHAR_BIT * sizeof(UInt));
   uint kmin = intprec > maxprec ? intprec - maxprec : 0;
   uint i, k, n;
@@ -210,7 +210,7 @@ _t1(encode_many_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, cons
 {
   /* make a copy of bit stream to avoid aliasing */
   bitstream s = *stream;
-  size_t offset = stream_wtell(&s);
+  bitstream_offset offset = stream_wtell(&s);
   uint intprec = (uint)(CHAR_BIT * sizeof(UInt));
   uint kmin = intprec > maxprec ? intprec - maxprec : 0;
   uint i, k, n, c;
@@ -240,7 +240,7 @@ _t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec,
 {
   /* use fastest available encoder implementation */
   if (with_maxbits(maxbits, maxprec, size)) {
-    /* rate contrained path: encode partial bit planes */
+    /* rate constrained path: encode partial bit planes */
     if (size <= 64)
       return _t1(encode_few_ints, UInt)(stream, maxbits, maxprec, data, size); /* 1D, 2D, 3D blocks */
     else
@@ -257,9 +257,9 @@ _t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec,
 
 /* encode block of integers */
 static uint
-_t2(encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock)
+_t2(encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock)
 {
-  int bits;
+  uint bits;
   cache_align_(UInt ublock[BLOCK_SIZE]);
   /* perform decorrelating transform */
   _t2(fwd_xform, Int, DIMS)(iblock);
diff --git a/src/template/encodef.c b/src/template/encodef.c
index 86da10c75..10e504385 100644
--- a/src/template/encodef.c
+++ b/src/template/encodef.c
@@ -65,8 +65,8 @@ _t2(encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock)
   uint bits = 1;
   /* compute maximum exponent */
   int emax = _t1(exponent_block, Scalar)(fblock, BLOCK_SIZE);
-  int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS);
-  uint e = maxprec ? emax + EBIAS : 0;
+  uint maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS);
+  uint e = maxprec ? (uint)(emax + EBIAS) : 0;
   /* encode block only if biased exponent is nonzero */
   if (e) {
     cache_align_(Int iblock[BLOCK_SIZE]);
@@ -76,7 +76,7 @@ _t2(encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock)
     /* perform forward block-floating-point transform */
     _t1(fwd_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax);
     /* encode integer block */
-    bits += _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, maxprec, iblock);
+    bits += _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, maxprec, iblock);
   }
   else {
     /* write single zero-bit to indicate that all values are zero */
diff --git a/src/template/encodei.c b/src/template/encodei.c
index 46b6e459b..2aa4e7e3d 100644
--- a/src/template/encodei.c
+++ b/src/template/encodei.c
@@ -1,4 +1,4 @@
-static uint _t2(rev_encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock);
+static uint _t2(rev_encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock);
 
 /* public functions -------------------------------------------------------- */
 
diff --git a/src/template/ompcompress.c b/src/template/ompcompress.c
index 4e4365c72..ca446dbda 100644
--- a/src/template/ompcompress.c
+++ b/src/template/ompcompress.c
@@ -5,7 +5,7 @@ static void
 _t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
 {
   /* array metadata */
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
 
   /* number of omp threads, blocks, and chunks */
@@ -52,7 +52,7 @@ static void
 _t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
 {
   /* array metadata */
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   ptrdiff_t sx = field->sx ? field->sx : 1;
 
@@ -100,7 +100,7 @@ static void
 _t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
 {
   /* array metadata */
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   ptrdiff_t sx = field->sx ? field->sx : 1;
@@ -155,7 +155,7 @@ static void
 _t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
 {
   /* array metadata */
-  const Scalar* data = (const Scalar*)field->data;
+  const Scalar* data = field->data;
   size_t nx = field->nx;
   size_t ny = field->ny;
   size_t nz = field->nz;
diff --git a/src/template/revdecode.c b/src/template/revdecode.c
index 07205f58f..655e1ae5c 100644
--- a/src/template/revdecode.c
+++ b/src/template/revdecode.c
@@ -4,9 +4,9 @@ static void _t2(rev_inv_xform, Int, DIMS)(Int* p);
 
 /* reversible inverse lifting transform of 4-vector */
 static void
-_t1(rev_inv_lift, Int)(Int* p, uint s)
+_t1(rev_inv_lift, Int)(Int* p, ptrdiff_t s)
 {
-  Int x, y, z, w;
+  UInt x, y, z, w;
   x = *p; p += s;
   y = *p; p += s;
   z = *p; p += s;
@@ -14,28 +14,31 @@ _t1(rev_inv_lift, Int)(Int* p, uint s)
 
   /*
   ** high-order Lorenzo transform (P4 Pascal matrix)
+  **
   ** ( 1  0  0  0) (x)
   ** ( 1  1  0  0) (y)
   ** ( 1  2  1  0) (z)
   ** ( 1  3  3  1) (w)
+  **
+  ** unsigned arithmetic is used to avoid integer overflow
   */
   w += z;
   z += y; w += z;
   y += x; z += y; w += z;
 
-  p -= s; *p = w;
-  p -= s; *p = z;
-  p -= s; *p = y;
-  p -= s; *p = x;
+  p -= s; *p = (Int)w;
+  p -= s; *p = (Int)z;
+  p -= s; *p = (Int)y;
+  p -= s; *p = (Int)x;
 }
 
 /* decode block of integers using reversible algorithm */
 static uint
-_t2(rev_decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, Int* iblock)
+_t2(rev_decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, Int* iblock)
 {
   /* decode number of significant bits */
-  int bits = PBITS;
-  int prec = (int)stream_read_bits(stream, PBITS) + 1;
+  uint bits = PBITS;
+  uint prec = (uint)stream_read_bits(stream, PBITS) + 1;
   cache_align_(UInt ublock[BLOCK_SIZE]);
   /* decode integer coefficients */
   bits += _t1(decode_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE);
diff --git a/src/template/revdecodef.c b/src/template/revdecodef.c
index 6777ce018..5fafcefcf 100644
--- a/src/template/revdecodef.c
+++ b/src/template/revdecodef.c
@@ -30,7 +30,7 @@ _t2(rev_decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock)
     bits++;
     if (stream_read_bit(zfp->stream)) {
       /* decode integer block */
-      bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, iblock);
+      bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, iblock);
       /* reinterpret integers as floating values */
       _t1(rev_inv_reinterpret, Scalar)(iblock, fblock, BLOCK_SIZE);
     }
@@ -40,7 +40,7 @@ _t2(rev_decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock)
       bits += EBITS;
       emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS;
       /* decode integer block */
-      bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, iblock);
+      bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, iblock);
       /* perform inverse block-floating-point transform */
       _t1(rev_inv_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax);
     }
diff --git a/src/template/revencode.c b/src/template/revencode.c
index 3473cdd54..9c4938d74 100644
--- a/src/template/revencode.c
+++ b/src/template/revencode.c
@@ -4,9 +4,9 @@ static void _t2(rev_fwd_xform, Int, DIMS)(Int* p);
 
 /* reversible forward lifting transform of 4-vector */
 static void
-_t1(rev_fwd_lift, Int)(Int* p, uint s)
+_t1(rev_fwd_lift, Int)(Int* p, ptrdiff_t s)
 {
-  Int x, y, z, w;
+  UInt x, y, z, w;
   x = *p; p += s;
   y = *p; p += s;
   z = *p; p += s;
@@ -14,19 +14,22 @@ _t1(rev_fwd_lift, Int)(Int* p, uint s)
 
   /*
   ** high-order Lorenzo transform
+  **
   ** ( 1  0  0  0) (x)
   ** (-1  1  0  0) (y)
   ** ( 1 -2  1  0) (z)
   ** (-1  3 -3  1) (w)
+  **
+  ** unsigned arithmetic is used to avoid integer overflow
   */
   w -= z; z -= y; y -= x;
   w -= z; z -= y;
   w -= z;
 
-  p -= s; *p = w;
-  p -= s; *p = z;
-  p -= s; *p = y;
-  p -= s; *p = x;
+  p -= s; *p = (Int)w;
+  p -= s; *p = (Int)z;
+  p -= s; *p = (Int)y;
+  p -= s; *p = (Int)x;
 }
 
 /* return precision required to encode block reversibly */
@@ -51,10 +54,10 @@ _t1(rev_precision, UInt)(const UInt* block, uint n)
 
 /* encode block of integers using reversible algorithm */
 static uint
-_t2(rev_encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock)
+_t2(rev_encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock)
 {
-  int bits = PBITS;
-  int prec;
+  uint bits = PBITS;
+  uint prec;
   cache_align_(UInt ublock[BLOCK_SIZE]);
   /* perform decorrelating transform */
   _t2(rev_fwd_xform, Int, DIMS)(iblock);
diff --git a/src/template/revencodef.c b/src/template/revencodef.c
index 789f9b938..ee270aa77 100644
--- a/src/template/revencodef.c
+++ b/src/template/revencodef.c
@@ -53,7 +53,7 @@ _t2(rev_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock)
   /* test if block-floating-point transform is reversible */
   if (_t1(rev_fwd_reversible, Scalar)(iblock, fblock, BLOCK_SIZE, emax)) {
     /* transform is reversible; test if block has any non-zeros */
-    uint e = emax + EBIAS;
+    uint e = (uint)(emax + EBIAS);
     if (e) {
       /* encode common exponent */
       bits += 2;
@@ -75,6 +75,6 @@ _t2(rev_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock)
     stream_write_bits(zfp->stream, 3, 2);
   }
   /* losslessly encode integers */
-  bits += _t2(rev_encode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, zfp->maxprec, iblock);
+  bits += _t2(rev_encode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, zfp->maxprec, iblock);
   return bits;
 }
diff --git a/src/template/template.h b/src/template/template.h
index fd5becf7e..e26ddbe36 100644
--- a/src/template/template.h
+++ b/src/template/template.h
@@ -2,6 +2,7 @@
 #define TEMPLATE_H
 
 /* concatenation */
+#define _body(x)       x ## _
 #define _cat2(x, y)    x ## _ ## y
 #define _cat3(x, y, z) x ## _ ## y ## _ ## z
 
@@ -9,4 +10,7 @@
 #define _t1(function, arg)        _cat2(function, arg)
 #define _t2(function, type, dims) _cat3(function, type, dims)
 
+/* 1-argument template definition: emits function_type followed by args and then the body produced by macro function_(type), which must be defined */
+#define _tdef1(function, type, args) _cat2(function, type)args _body(function)(type)
+
 #endif
diff --git a/src/template/utils.c b/src/template/utils.c
new file mode 100644
index 000000000..67e468e65
--- /dev/null
+++ b/src/template/utils.c
@@ -0,0 +1,29 @@
+#ifndef ZFP_UTILS_H
+#define ZFP_UTILS_H
+
+/* function body returning size / unit rounded up to the next integer; unit must be nonzero and size + unit - 1 must not overflow type */
+#define count_up_(type) /* (type size, type unit) */\
+{\
+  return (size + unit - 1) / unit;\
+}
+
+/* function body returning the smallest multiple of unit greater than or equal to size; unit must be nonzero and size + unit - 1 must not overflow type */
+#define round_up_(type) /* (type size, type unit) */\
+{\
+  size += unit - 1;\
+  size -= size % unit;\
+  return size;\
+}
+
+/* template instantiations via _tdef1 (each expands to a full function definition); commented-out variants are disabled, presumably because they are currently unused */
+/* static uint _tdef1(count_up, uint, (uint size, uint unit)) */
+/* static size_t _tdef1(count_up, size_t, (size_t size, size_t unit)) */
+/* static uint64 _tdef1(count_up, uint64, (uint64 size, uint64 unit)) */
+/* static bitstream_size _tdef1(count_up, bitstream_size, (bitstream_size size, bitstream_size unit)) */
+
+static uint _tdef1(round_up, uint, (uint size, uint unit))
+/* static size_t _tdef1(round_up, size_t, (size_t size, size_t unit)) */
+/* static uint64 _tdef1(round_up, uint64, (uint64 size, uint64 unit)) */
+static bitstream_size _tdef1(round_up, bitstream_size, (bitstream_size size, bitstream_size unit))
+
+#endif
diff --git a/src/zfp.c b/src/zfp.c
index 179193dd8..f24ea7dbe 100644
--- a/src/zfp.c
+++ b/src/zfp.c
@@ -3,7 +3,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "zfp/version.h"
 #include "template/template.h"
 
@@ -11,7 +11,7 @@
 
 const uint zfp_codec_version = ZFP_CODEC;
 const uint zfp_library_version = ZFP_VERSION;
-const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (May 5, 2019)";
+const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (December 15, 2023)";
 
 /* private functions ------------------------------------------------------- */
 
@@ -35,7 +35,7 @@ field_index_span(const zfp_field* field, ptrdiff_t* min, ptrdiff_t* max)
     *min = imin;
   if (max)
     *max = imax;
-  return imax - imin + 1;
+  return (size_t)(imax - imin + 1);
 }
 
 static zfp_bool
@@ -49,6 +49,10 @@ is_reversible(const zfp_stream* zfp)
 #include "share/parallel.c"
 #include "share/omp.c"
 
+/* template instantiation of utility functions ------------------------------*/
+
+#include "template/utils.c"
+
 /* template instantiation of integer and float compressor -------------------*/
 
 #define Scalar int32
@@ -102,12 +106,40 @@ zfp_type_size(zfp_type type)
   }
 }
 
+size_t
+zfp_block_maximum_size(zfp_type type, uint dims, zfp_bool reversible)
+{
+  static const size_t size_table[2][4][4] = { /* indexed [reversible][type - 1][dims - 1]; values presumably in bits -- TODO confirm units */
+    /* non-reversible mode; columns are dims = 1, 2, 3, 4 */
+    {
+      { 131,  527, 2111,  8447 }, /* int32 */
+      { 259, 1039, 4159, 16639 }, /* int64 */
+      { 140,  536, 2120,  8456 }, /* float */
+      { 271, 1051, 4171, 16651 }, /* double */
+    },
+    /* reversible mode; columns are dims = 1, 2, 3, 4 */
+    {
+      { 136,  532, 2116,  8452 }, /* int32 */
+      { 265, 1045, 4165, 16645 }, /* int64 */
+      { 146,  542, 2126,  8462 }, /* float */
+      { 278, 1058, 4178, 16658 }, /* double */
+    },
+  };
+
+  /* return 0 for a scalar type outside int32..double or for dims outside 1..4 */
+  if (!(zfp_type_int32 <= type && type <= zfp_type_double) ||
+      !(1 <= dims && dims <= 4))
+    return 0;
+
+  return size_table[reversible ? 1 : 0][type - 1][dims - 1]; /* assumes zfp_type_int32 == 1 so type - 1 is a valid row -- TODO confirm */
+}
+
 /* public functions: fields ------------------------------------------------ */
 
 zfp_field*
-zfp_field_alloc()
+zfp_field_alloc(void)
 {
-  zfp_field* field = (zfp_field*)malloc(sizeof(zfp_field));
+  zfp_field* field = malloc(sizeof(zfp_field));
   if (field) {
     field->type = zfp_type_none;
     field->nx = field->ny = field->nz = field->nw = 0;
@@ -189,7 +221,7 @@ zfp_field_begin(const zfp_field* field)
   if (field->data) {
     ptrdiff_t min;
     field_index_span(field, &min, NULL);
-    return (void*)((uchar*)field->data + min * (ptrdiff_t)zfp_type_size(field->type));
+    return (uchar*)field->data + min * (ptrdiff_t)zfp_type_size(field->type);
   }
   else
     return NULL;
@@ -220,13 +252,13 @@ zfp_field_size(const zfp_field* field, size_t* size)
     switch (zfp_field_dimensionality(field)) {
       case 4:
         size[3] = field->nw;
-        /* FALLTHROUGH */
+        fallthrough_
       case 3:
         size[2] = field->nz;
-        /* FALLTHROUGH */
+        fallthrough_
       case 2:
         size[1] = field->ny;
-        /* FALLTHROUGH */
+        fallthrough_
       case 1:
         size[0] = field->nx;
         break;
@@ -240,6 +272,22 @@ zfp_field_size_bytes(const zfp_field* field)
   return field_index_span(field, NULL, NULL) * zfp_type_size(field->type);
 }
 
+size_t
+zfp_field_blocks(const zfp_field* field)
+{
+  size_t bx = (field->nx + 3) / 4; /* per-dimension block counts: extent / 4 rounded up */
+  size_t by = (field->ny + 3) / 4;
+  size_t bz = (field->nz + 3) / 4;
+  size_t bw = (field->nw + 3) / 4;
+  switch (zfp_field_dimensionality(field)) { /* multiply only the dimensions the field uses */
+    case 1: return bx;
+    case 2: return bx * by;
+    case 3: return bx * by * bz;
+    case 4: return bx * by * bz * bw;
+    default: return 0; /* any other dimensionality yields no blocks */
+  }
+}
+
 zfp_bool
 zfp_field_stride(const zfp_field* field, ptrdiff_t* stride)
 {
@@ -247,13 +295,13 @@ zfp_field_stride(const zfp_field* field, ptrdiff_t* stride)
     switch (zfp_field_dimensionality(field)) {
       case 4:
         stride[3] = field->sw ? field->sw : (ptrdiff_t)(field->nx * field->ny * field->nz);
-        /* FALLTHROUGH */
+        fallthrough_
       case 3:
         stride[2] = field->sz ? field->sz : (ptrdiff_t)(field->nx * field->ny);
-        /* FALLTHROUGH */
+        fallthrough_
       case 2:
         stride[1] = field->sy ? field->sy : (ptrdiff_t)field->nx;
-        /* FALLTHROUGH */
+        fallthrough_
       case 1:
         stride[0] = field->sx ? field->sx : 1;
         break;
@@ -449,7 +497,7 @@ zfp_field_set_metadata(zfp_field* field, uint64 meta)
 /* public functions: compression mode and parameter settings --------------- */
 
 zfp_config
-zfp_config_none()
+zfp_config_none(void)
 {
   zfp_config config;
   config.mode = zfp_mode_null;
@@ -491,7 +539,7 @@ zfp_config_accuracy(
 }
 
 zfp_config
-zfp_config_reversible()
+zfp_config_reversible(void)
 {
   zfp_config config;
   config.mode = zfp_mode_reversible;
@@ -520,7 +568,7 @@ zfp_config_expert(
 zfp_stream*
 zfp_stream_open(bitstream* stream)
 {
-  zfp_stream* zfp = (zfp_stream*)malloc(sizeof(zfp_stream));
+  zfp_stream* zfp = malloc(sizeof(zfp_stream));
   if (zfp) {
     zfp->stream = stream;
     zfp->minbits = ZFP_MIN_BITS;
@@ -528,6 +576,7 @@ zfp_stream_open(bitstream* stream)
     zfp->maxprec = ZFP_MAX_PREC;
     zfp->minexp = ZFP_MIN_EXP;
     zfp->exec.policy = zfp_exec_serial;
+    zfp->exec.params = NULL;
   }
   return zfp;
 }
@@ -535,6 +584,8 @@ zfp_stream_open(bitstream* stream)
 void
 zfp_stream_close(zfp_stream* zfp)
 {
+  if (zfp) /* tolerate NULL, which the previous free(zfp)-only body accepted */
+    free(zfp->exec.params); /* free(NULL) is a no-op, so no separate check */
   free(zfp);
 }
 
@@ -644,7 +695,7 @@ zfp_stream_mode(const zfp_stream* zfp)
         /* minexp is [ZFP_MIN_EXP=-1074, 843] */
         /* returns [2177, ZFP_MODE_SHORT_MAX=4094] */
         /* +1 because skipped 2176 */
-        return (zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1);
+        return (uint64)(zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1);
       else
         break;
 
@@ -660,7 +711,7 @@ zfp_stream_mode(const zfp_stream* zfp)
   minbits = MAX(1, MIN(zfp->minbits, 0x8000u)) - 1;
   maxbits = MAX(1, MIN(zfp->maxbits, 0x8000u)) - 1;
   maxprec = MAX(1, MIN(zfp->maxprec, 0x0080u)) - 1;
-  minexp = MAX(0, MIN(zfp->minexp + 16495, 0x7fff));
+  minexp = (uint)MAX(0, MIN(zfp->minexp + 16495, 0x7fff));
   mode <<= 15; mode += minexp;
   mode <<=  7; mode += maxprec;
   mode <<= 15; mode += maxbits;
@@ -694,13 +745,10 @@ zfp_stream_maximum_size(const zfp_stream* zfp, const zfp_field* field)
 {
   zfp_bool reversible = is_reversible(zfp);
   uint dims = zfp_field_dimensionality(field);
-  size_t mx = (MAX(field->nx, 1u) + 3) / 4;
-  size_t my = (MAX(field->ny, 1u) + 3) / 4;
-  size_t mz = (MAX(field->nz, 1u) + 3) / 4;
-  size_t mw = (MAX(field->nw, 1u) + 3) / 4;
-  size_t blocks = mx * my * mz * mw;
+  size_t blocks = zfp_field_blocks(field);
   uint values = 1u << (2 * dims);
   uint maxbits = 0;
+  bitstream_size maxsize;
 
   if (!dims)
     return 0;
@@ -723,7 +771,17 @@ zfp_stream_maximum_size(const zfp_stream* zfp, const zfp_field* field)
   maxbits += values - 1 + values * MIN(zfp->maxprec, zfp_field_precision(field));
   maxbits = MIN(maxbits, zfp->maxbits);
   maxbits = MAX(maxbits, zfp->minbits);
-  return ((ZFP_HEADER_MAX_BITS + blocks * maxbits + stream_word_bits - 1) & ~(stream_word_bits - 1)) / CHAR_BIT;
+  
+  /* compute number of bytes in multiples of words */
+  maxsize = ZFP_HEADER_MAX_BITS + (bitstream_size)blocks * maxbits;
+  maxsize = _t1(round_up, bitstream_size)(maxsize, stream_word_bits);
+  maxsize /= CHAR_BIT;
+
+  /* ensure maxsize fits in size_t to avoid silent truncation */
+  if ((size_t)maxsize != maxsize)
+    return 0;
+
+  return (size_t)maxsize;
 }
 
 void
@@ -744,8 +802,9 @@ zfp_stream_set_reversible(zfp_stream* zfp)
 double
 zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, zfp_bool align)
 {
-  uint n = 1u << (2 * dims);
+  const uint n = 1u << (2 * dims);
   uint bits = (uint)floor(n * rate + 0.5);
+
   switch (type) {
     case zfp_type_float:
       bits = MAX(bits, 1 + 8u);
@@ -756,15 +815,16 @@ zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, zfp_
     default:
       break;
   }
-  if (align) {
-    /* for write random access, round up to next multiple of stream word size */
-    bits += (uint)stream_word_bits - 1;
-    bits &= ~(stream_word_bits - 1);
-  }
+
+  /* for write random access, round up to next multiple of stream word size */
+  if (align)
+    bits = _t1(round_up, uint)(bits, (uint)stream_word_bits);
+
   zfp->minbits = bits;
   zfp->maxbits = bits;
   zfp->maxprec = ZFP_MAX_PREC;
   zfp->minexp = ZFP_MIN_EXP;
+
   return (double)bits / n;
 }
 
@@ -886,13 +946,17 @@ zfp_stream_execution(const zfp_stream* zfp)
 uint
 zfp_stream_omp_threads(const zfp_stream* zfp)
 {
-  return zfp->exec.params.omp.threads;
+  if (zfp->exec.policy == zfp_exec_omp && zfp->exec.params) /* params exist only under the OpenMP policy */
+    return ((zfp_exec_params_omp*)zfp->exec.params)->threads;
+  return 0u; /* no OpenMP parameters available */
 }
 
 uint
 zfp_stream_omp_chunk_size(const zfp_stream* zfp)
 {
-  return zfp->exec.params.omp.chunk_size;
+  if (zfp->exec.policy == zfp_exec_omp && zfp->exec.params) /* params exist only under the OpenMP policy */
+    return ((zfp_exec_params_omp*)zfp->exec.params)->chunk_size;
+  return 0u; /* no OpenMP parameters available */
 }
 
 zfp_bool
@@ -900,16 +964,29 @@ zfp_stream_set_execution(zfp_stream* zfp, zfp_exec_policy policy)
 {
   switch (policy) {
     case zfp_exec_serial:
+      if (zfp->exec.policy != policy && zfp->exec.params != NULL) {
+        free(zfp->exec.params);
+        zfp->exec.params = NULL;
+      }
       break;
 #ifdef ZFP_WITH_CUDA
     case zfp_exec_cuda:
+      if (zfp->exec.policy != policy && zfp->exec.params != NULL) {
+        free(zfp->exec.params);
+        zfp->exec.params = NULL;
+      }
       break;
 #endif
     case zfp_exec_omp:
 #ifdef _OPENMP
       if (zfp->exec.policy != policy) {
-        zfp->exec.params.omp.threads = 0;
-        zfp->exec.params.omp.chunk_size = 0;
+        zfp_exec_params_omp* params = malloc(sizeof(zfp_exec_params_omp));
+        if (!params)
+          return zfp_false; /* out of memory: report failure, leaving existing params intact */
+        params->threads = 0;
+        params->chunk_size = 0;
+        free(zfp->exec.params); /* release previous policy's params; free(NULL) is a no-op */
+        zfp->exec.params = params;
       }
       break;
 #else
@@ -927,7 +1004,7 @@ zfp_stream_set_omp_threads(zfp_stream* zfp, uint threads)
 {
   if (!zfp_stream_set_execution(zfp, zfp_exec_omp))
     return zfp_false;
-  zfp->exec.params.omp.threads = threads;
+  ((zfp_exec_params_omp*)zfp->exec.params)->threads = threads;
   return zfp_true;
 }
 
@@ -936,7 +1013,7 @@ zfp_stream_set_omp_chunk_size(zfp_stream* zfp, uint chunk_size)
 {
   if (!zfp_stream_set_execution(zfp, zfp_exec_omp))
     return zfp_false;
-  zfp->exec.params.omp.chunk_size = chunk_size;
+  ((zfp_exec_params_omp*)zfp->exec.params)->chunk_size = chunk_size;
   return zfp_true;
 }
 
@@ -1060,7 +1137,7 @@ zfp_compress(zfp_stream* zfp, const zfp_field* field)
 #endif
   };
   uint exec = zfp->exec.policy;
-  uint strided = zfp_field_stride(field, NULL);
+  uint strided = (uint)zfp_field_stride(field, NULL);
   uint dims = zfp_field_dimensionality(field);
   uint type = field->type;
   void (*compress)(zfp_stream*, const zfp_field*);
@@ -1120,7 +1197,7 @@ zfp_decompress(zfp_stream* zfp, zfp_field* field)
 #endif
   };
   uint exec = zfp->exec.policy;
-  uint strided = zfp_field_stride(field, NULL);
+  uint strided = (uint)zfp_field_stride(field, NULL);
   uint dims = zfp_field_dimensionality(field);
   uint type = field->type;
   void (*decompress)(zfp_stream*, zfp_field*);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index fc1f72e0e..7fb874701 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,151 +1,158 @@
-set(CMAKE_CXX_STANDARD 11)
-
-# CMAKE_SH-NOTFOUND needed for mingw builds
-if(MINGW)
-  list(APPEND CMOCKA_ARGS "-DCMAKE_SH=CMAKE_SH-NOTFOUND")
-  list(APPEND GTEST_ARGS "-DCMAKE_SH=CMAKE_SH-NOTFOUND")
-endif()
-
-# clone cmocka 1.1.0 into /build
-list(APPEND CMOCKA_ARGS "-DWITH_STATIC_LIB=ON;-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER};-DUNIT_TESTING=OFF")
-
-include(ExternalProject)
-ExternalProject_Add(
-  cmocka_cloned
-  GIT_REPOSITORY    https://git.cryptomilk.org/projects/cmocka.git
-  GIT_TAG           cmocka-1.1.0
-  SOURCE_DIR        "${CMAKE_BINARY_DIR}/cmocka-src"
-  BINARY_DIR        "${CMAKE_BINARY_DIR}/cmocka-build"
-  CMAKE_ARGS        ${CMOCKA_ARGS}
-  BUILD_COMMAND     ${CMAKE_COMMAND} --build . --config ${CMAKE_BUILD_TYPE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} cmocka_static
-  INSTALL_COMMAND   ""
-  STEP_TARGETS build
-  EXCLUDE_FROM_ALL TRUE
-)
-ExternalProject_Get_Property(cmocka_cloned source_dir binary_dir)
-
-# name static library cmocka, wire up against cmocka_cloned
-add_library(cmocka STATIC IMPORTED GLOBAL)
-
-# choose proper library path & extension
-if(MSVC)
-  set(IMPORTED_LOCATION_PATH "${binary_dir}/src/${CMAKE_BUILD_TYPE}/cmocka.lib")
-else()
-  set(IMPORTED_LOCATION_PATH "${binary_dir}/src/libcmocka.a")
-endif()
-set_property(TARGET cmocka
-  PROPERTY
-  IMPORTED_LOCATION "${IMPORTED_LOCATION_PATH}"
-)
-
-add_dependencies(cmocka cmocka_cloned)
-include_directories(${source_dir}/include)
-
-# include home dir so #include statements are clear in test files
-include_directories(${ZFP_SOURCE_DIR} ${ZFP_SOURCE_DIR}/include)
-# access to constants/ and utils/
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-# suppress warnings for all targets
-if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
-  add_compile_options(-Wno-unused-function)
-endif()
-# -Wno-variadic-macros was not working for gcc...revisit
-if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
-  add_compile_options(-Wno-gnu-zero-variadic-macro-arguments)
-endif()
-# suppress googletest warning "conversion from 'float' to 'testing::internal::BiggestInt', possible loss of data"
-if(MSVC)
-  add_compile_options(/wd4244)
-endif()
-
-
-add_subdirectory(utils)
-add_subdirectory(src)
-
-if(BUILD_CFP)
-  add_subdirectory(cfp)
-endif()
-
-if(BUILD_ZFORP)
-  add_subdirectory(fortran)
-endif()
-
-# needed to compile gtest on MSVC
-if(MSVC)
-  list(APPEND GTEST_ARGS "/D:_SILENCE_TR1_DEPRECATION_NAMESPACE_WARNING=1")
+if(BUILD_TESTING OR BUILD_TESTING_FULL)
+  # testzfp
+  add_executable(testzfp testzfp.cpp)
+  target_link_libraries(testzfp zfp)
+  target_compile_definitions(testzfp PRIVATE ${zfp_compressed_array_defs})
+  add_test(NAME testzfp COMMAND testzfp)
+  
+  # testviews
+  add_executable(testviews testviews.cpp)
+  if(ZFP_WITH_OPENMP)
+    target_link_libraries(testviews zfp OpenMP::OpenMP_C)
+  else()
+    target_link_libraries(testviews zfp)
+  endif()
+  target_compile_definitions(testviews PRIVATE ${zfp_compressed_array_defs})
+  add_test(NAME testviews COMMAND testviews)
 endif()
 
-# TODO: spend time getting googletest to compile on MinGW
-# checksums are generated through C tests, no need to compile C++ tests
-if((NOT MINGW) AND (NOT DEFINED ZFP_OMP_TESTS_ONLY) AND (NOT PRINT_CHECKSUMS))
-  # clone googletest into build/
-  configure_file(CMakeLists.txt.in ${ZFP_BINARY_DIR}/tests/googletest-download/CMakeLists.txt)
-  execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${GTEST_ARGS} .
-    RESULT_VARIABLE result
-    WORKING_DIRECTORY ${ZFP_BINARY_DIR}/tests/googletest-download
-  )
-
-  if(result)
-    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+if(BUILD_TESTING_FULL)
+  # C++17 needed as of GoogleTest 1.17.0
+  set(CMAKE_CXX_STANDARD 17)
+  
+  # CMAKE_SH-NOTFOUND needed for mingw builds
+  if(MINGW)
+    list(APPEND CMOCKA_ARGS "-DCMAKE_SH=CMAKE_SH-NOTFOUND")
+    list(APPEND GTEST_ARGS "-DCMAKE_SH=CMAKE_SH-NOTFOUND")
   endif()
-  # build gtest
-  execute_process(COMMAND ${CMAKE_COMMAND} --build .
-    RESULT_VARIABLE result
-    WORKING_DIRECTORY ${ZFP_BINARY_DIR}/tests/googletest-download
+  
+  # clone cmocka into /build (WITH_STATIC_LIB replaced by BUILD_SHARED_LIBS in cmocka 1.1.6)
+  list(APPEND CMOCKA_ARGS "-DBUILD_SHARED_LIBS=OFF;-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER};-DUNIT_TESTING=OFF")
+  
+  include(ExternalProject)
+  ExternalProject_Add(
+    cmocka_cloned
+    GIT_REPOSITORY    https://gitlab.com/cmocka/cmocka.git
+    GIT_TAG           cmocka-1.1.8
+    SOURCE_DIR        "${CMAKE_BINARY_DIR}/cmocka-src"
+    BINARY_DIR        "${CMAKE_BINARY_DIR}/cmocka-build"
+    CMAKE_ARGS        "${CMOCKA_ARGS}"
+    INSTALL_COMMAND   ""
+    STEP_TARGETS      build
+    EXCLUDE_FROM_ALL  TRUE
   )
-  if(result)
-    message(FATAL_ERROR "Build step for googletest failed: ${result}")
+  ExternalProject_Get_Property(cmocka_cloned source_dir binary_dir)
+  
+  # name static library cmocka, wire up against cmocka_cloned
+  add_library(cmocka STATIC IMPORTED GLOBAL)
+  
+  # choose proper library path & extension
+  if(MSVC)
+    set(IMPORTED_LOCATION_PATH "${binary_dir}/src/${CMAKE_BUILD_TYPE}/cmocka.lib")
+  else()
+    set(IMPORTED_LOCATION_PATH "${binary_dir}/src/libcmocka.a")
   endif()
-
-  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-  add_subdirectory(${ZFP_BINARY_DIR}/tests/googletest-src
-    ${ZFP_BINARY_DIR}/tests/googletest-build
+  set_property(TARGET cmocka
+    PROPERTY
+    IMPORTED_LOCATION "${IMPORTED_LOCATION_PATH}"
   )
-
-  if(CMAKE_VERSION VERSION_LESS 2.8.11)
-    include_directories("${gtest_SOURCE_DIR}/include")
+  
+  add_dependencies(cmocka cmocka_cloned)
+  include_directories(${source_dir}/include)
+  
+  # include home dir so #include statements are clear in test files
+  include_directories(${ZFP_SOURCE_DIR} ${ZFP_SOURCE_DIR}/include)
+  # access to constants/ and utils/
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+  
+  # suppress warnings for all targets
+  if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    add_compile_options(-Wno-unused-function)
   endif()
-
-  # needed to compile zfp tests with gtest on MSVC
+  # -Wno-variadic-macros was not working for gcc...revisit
+  if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    add_compile_options(-Wno-gnu-zero-variadic-macro-arguments)
+  endif()
+  # suppress googletest warning "conversion from 'float' to 'testing::internal::BiggestInt', possible loss of data"
   if(MSVC)
-    target_compile_definitions(gtest_main INTERFACE GTEST_LINKED_AS_SHARED_LIBRARY=1)
+    add_compile_options(/wd4244)
   endif()
 
-  add_subdirectory(array)
-endif()
-
-
-# testzfp
-add_executable(testzfp testzfp.cpp)
-target_link_libraries(testzfp zfp)
-target_compile_definitions(testzfp PRIVATE ${zfp_compressed_array_defs})
-
-option(ZFP_BUILD_TESTING_SMALL "Enable small-sized array testing" ON)
-if(ZFP_BUILD_TESTING_SMALL)
-  foreach(D IN ITEMS 1 2 3 4)
-    foreach(P IN ITEMS 32 64)
-      add_test(NAME small-arrays-${D}d-fp${P} COMMAND testzfp small ${D}d fp${P})
+  add_subdirectory(utils)
+  add_subdirectory(src)
+  
+  if(BUILD_CFP)
+    add_subdirectory(cfp)
+  endif()
+  
+  if(BUILD_ZFORP)
+    add_subdirectory(fortran)
+  endif()
+  
+  # needed to compile gtest on MSVC
+  if(MSVC)
+    list(APPEND GTEST_ARGS "/D:_SILENCE_TR1_DEPRECATION_NAMESPACE_WARNING=1")
+  endif()
+  
+  # TODO: spend time getting googletest to compile on MinGW
+  # checksums are generated through C tests, no need to compile C++ tests
+  if((NOT MINGW) AND (NOT DEFINED ZFP_OMP_TESTS_ONLY) AND (NOT PRINT_CHECKSUMS))
+    # clone googletest into build/
+    configure_file(CMakeLists.txt.in ${ZFP_BINARY_DIR}/tests/googletest-download/CMakeLists.txt)
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${GTEST_ARGS} .
+      RESULT_VARIABLE result
+      WORKING_DIRECTORY ${ZFP_BINARY_DIR}/tests/googletest-download
+    )
+  
+    if(result)
+      message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+    endif()
+    # build gtest
+    execute_process(COMMAND ${CMAKE_COMMAND} --build .
+      RESULT_VARIABLE result
+      WORKING_DIRECTORY ${ZFP_BINARY_DIR}/tests/googletest-download
+    )
+    if(result)
+      message(FATAL_ERROR "Build step for googletest failed: ${result}")
+    endif()
+  
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  
+    add_subdirectory(${ZFP_BINARY_DIR}/tests/googletest-src
+      ${ZFP_BINARY_DIR}/tests/googletest-build
+    )
+  
+    if(CMAKE_VERSION VERSION_LESS 2.8.11)
+      include_directories("${gtest_SOURCE_DIR}/include")
+    endif()
+  
+    # needed to compile zfp tests with gtest on MSVC
+    if(MSVC)
+      target_compile_definitions(gtest_main INTERFACE GTEST_LINKED_AS_SHARED_LIBRARY=1)
+    endif()
+  
+    add_subdirectory(array)
+  endif()
+  
+  option(ZFP_BUILD_TESTING_SMALL "Enable small-sized array testing" ON)
+  if(ZFP_BUILD_TESTING_SMALL)
+    foreach(D IN ITEMS 1 2 3 4)
+      foreach(P IN ITEMS 32 64)
+        add_test(NAME small-arrays-${D}d-fp${P} COMMAND testzfp small ${D}d fp${P})
+      endforeach()
     endforeach()
-  endforeach()
-endif()
-
-option(ZFP_BUILD_TESTING_LARGE "Enable large-sized array testing" OFF)
-if(ZFP_BUILD_TESTING_LARGE)
-  foreach(D IN ITEMS 1 2 3 4)
-    foreach(P IN ITEMS 32 64)
-      add_test(NAME large-arrays-${D}d-fp${P} COMMAND testzfp large ${D}d fp${P})
+  endif()
+  
+  option(ZFP_BUILD_TESTING_LARGE "Enable large-sized array testing" OFF)
+  if(ZFP_BUILD_TESTING_LARGE)
+    foreach(D IN ITEMS 1 2 3 4)
+      foreach(P IN ITEMS 32 64)
+        add_test(NAME large-arrays-${D}d-fp${P} COMMAND testzfp large ${D}d fp${P})
+      endforeach()
     endforeach()
-  endforeach()
-endif()
-
-# testviews
-add_executable(testviews testviews.cpp)
-target_link_libraries(testviews zfp)
-target_compile_definitions(testviews PRIVATE ${zfp_compressed_array_defs})
-add_test(NAME testviews COMMAND testviews)
-
-if(BUILD_ZFPY)
-  add_subdirectory(python)
+  endif()
+  
+  if(BUILD_ZFPY)
+    add_subdirectory(python)
+  endif()
 endif()
diff --git a/tests/CMakeLists.txt.in b/tests/CMakeLists.txt.in
index efe6c3f6d..d476982a6 100644
--- a/tests/CMakeLists.txt.in
+++ b/tests/CMakeLists.txt.in
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.7)
+cmake_minimum_required(VERSION 3.9)
 
 project(googletest-download NONE)
 
@@ -6,7 +6,7 @@ include(ExternalProject)
 ExternalProject_Add(
   googletest
   GIT_REPOSITORY    https://github.com/google/googletest.git
-  GIT_TAG           703bd9caab50b139428cea1aaff9974ebee5742e
+  GIT_TAG           release-1.12.1  # pin a release; tracking main makes test builds unreproducible
   SOURCE_DIR        "${ZFP_BINARY_DIR}/tests/googletest-src"
   BINARY_DIR        "${ZFP_BINARY_DIR}/tests/googletest-build"
   CONFIGURE_COMMAND   ""
diff --git a/tests/Makefile b/tests/Makefile
index ef5cfe363..94339dc53 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -2,8 +2,8 @@ include ../Config
 
 BINDIR = ../bin
 TARGETS = $(BINDIR)/testzfp $(BINDIR)/testviews
-INCS = -I../include -I../array
-LIBS = -L../lib -lzfp
+INCS = -I../include
+LIBS = -L../lib -lzfp $(LDFLAGS)
 
 all: $(TARGETS)
 
diff --git a/tests/array/array/testArray1d.cpp b/tests/array/array/testArray1d.cpp
index e54e0929a..fc192f037 100644
--- a/tests/array/array/testArray1d.cpp
+++ b/tests/array/array/testArray1d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray3.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray2.h"
+#include "zfp/array1.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1dIters.cpp b/tests/array/array/testArray1dIters.cpp
index 49c74c8f3..18276a153 100644
--- a/tests/array/array/testArray1dIters.cpp
+++ b/tests/array/array/testArray1dIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array1dTest
diff --git a/tests/array/array/testArray1dPtrs.cpp b/tests/array/array/testArray1dPtrs.cpp
index 8da9d65ee..be7dada8f 100644
--- a/tests/array/array/testArray1dPtrs.cpp
+++ b/tests/array/array/testArray1dPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array1dTest
diff --git a/tests/array/array/testArray1dRefs.cpp b/tests/array/array/testArray1dRefs.cpp
index 41a13e8bf..6206dbb59 100644
--- a/tests/array/array/testArray1dRefs.cpp
+++ b/tests/array/array/testArray1dRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1dViewIters.cpp b/tests/array/array/testArray1dViewIters.cpp
index 63aa718b1..b00a9bced 100644
--- a/tests/array/array/testArray1dViewIters.cpp
+++ b/tests/array/array/testArray1dViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1dViewPtrs.cpp b/tests/array/array/testArray1dViewPtrs.cpp
index e1d48c60a..df2b35ca7 100644
--- a/tests/array/array/testArray1dViewPtrs.cpp
+++ b/tests/array/array/testArray1dViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1dViews.cpp b/tests/array/array/testArray1dViews.cpp
index 61e671e05..3741a1214 100644
--- a/tests/array/array/testArray1dViews.cpp
+++ b/tests/array/array/testArray1dViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1f.cpp b/tests/array/array/testArray1f.cpp
index f08728280..9176090c1 100644
--- a/tests/array/array/testArray1f.cpp
+++ b/tests/array/array/testArray1f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray3.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray2.h"
+#include "zfp/array1.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1fIters.cpp b/tests/array/array/testArray1fIters.cpp
index fca8e5ec0..7a8653bc4 100644
--- a/tests/array/array/testArray1fIters.cpp
+++ b/tests/array/array/testArray1fIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array1fTest
diff --git a/tests/array/array/testArray1fPtrs.cpp b/tests/array/array/testArray1fPtrs.cpp
index a34b02a3e..cea338496 100644
--- a/tests/array/array/testArray1fPtrs.cpp
+++ b/tests/array/array/testArray1fPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array1fTest
diff --git a/tests/array/array/testArray1fRefs.cpp b/tests/array/array/testArray1fRefs.cpp
index 517b3718f..26f2f6ed3 100644
--- a/tests/array/array/testArray1fRefs.cpp
+++ b/tests/array/array/testArray1fRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1fViewIters.cpp b/tests/array/array/testArray1fViewIters.cpp
index 364396fda..6a7aee3a7 100644
--- a/tests/array/array/testArray1fViewIters.cpp
+++ b/tests/array/array/testArray1fViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1fViewPtrs.cpp b/tests/array/array/testArray1fViewPtrs.cpp
index 8febee34b..60638ce7f 100644
--- a/tests/array/array/testArray1fViewPtrs.cpp
+++ b/tests/array/array/testArray1fViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray1fViews.cpp b/tests/array/array/testArray1fViews.cpp
index 142c72b72..2bad06c95 100644
--- a/tests/array/array/testArray1fViews.cpp
+++ b/tests/array/array/testArray1fViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2d.cpp b/tests/array/array/testArray2d.cpp
index c1d4840fd..2c1fa1c09 100644
--- a/tests/array/array/testArray2d.cpp
+++ b/tests/array/array/testArray2d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray2.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray3.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2dIters.cpp b/tests/array/array/testArray2dIters.cpp
index 8f04384bc..66bb1c0ed 100644
--- a/tests/array/array/testArray2dIters.cpp
+++ b/tests/array/array/testArray2dIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array2dTest
diff --git a/tests/array/array/testArray2dPtrs.cpp b/tests/array/array/testArray2dPtrs.cpp
index 90cf3eb94..0f0216784 100644
--- a/tests/array/array/testArray2dPtrs.cpp
+++ b/tests/array/array/testArray2dPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array2dTest
diff --git a/tests/array/array/testArray2dRefs.cpp b/tests/array/array/testArray2dRefs.cpp
index bcfaf2d1e..ff85149c8 100644
--- a/tests/array/array/testArray2dRefs.cpp
+++ b/tests/array/array/testArray2dRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2dViewIters.cpp b/tests/array/array/testArray2dViewIters.cpp
index 27ba4d6b7..9e1ca9eec 100644
--- a/tests/array/array/testArray2dViewIters.cpp
+++ b/tests/array/array/testArray2dViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2dViewPtrs.cpp b/tests/array/array/testArray2dViewPtrs.cpp
index 3e5829f7c..008e9eb3b 100644
--- a/tests/array/array/testArray2dViewPtrs.cpp
+++ b/tests/array/array/testArray2dViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2dViews.cpp b/tests/array/array/testArray2dViews.cpp
index e1d40e082..ced06cf70 100644
--- a/tests/array/array/testArray2dViews.cpp
+++ b/tests/array/array/testArray2dViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2f.cpp b/tests/array/array/testArray2f.cpp
index d3f759526..6acb95c6c 100644
--- a/tests/array/array/testArray2f.cpp
+++ b/tests/array/array/testArray2f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray2.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray3.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2fIters.cpp b/tests/array/array/testArray2fIters.cpp
index 72d82c4e7..dc5b42f3a 100644
--- a/tests/array/array/testArray2fIters.cpp
+++ b/tests/array/array/testArray2fIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array2fTest
diff --git a/tests/array/array/testArray2fPtrs.cpp b/tests/array/array/testArray2fPtrs.cpp
index 591bcb958..bdf956cdc 100644
--- a/tests/array/array/testArray2fPtrs.cpp
+++ b/tests/array/array/testArray2fPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array2fTest
diff --git a/tests/array/array/testArray2fRefs.cpp b/tests/array/array/testArray2fRefs.cpp
index 10fb38f4b..b3844abca 100644
--- a/tests/array/array/testArray2fRefs.cpp
+++ b/tests/array/array/testArray2fRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2fViewIters.cpp b/tests/array/array/testArray2fViewIters.cpp
index d082173a1..83e64552f 100644
--- a/tests/array/array/testArray2fViewIters.cpp
+++ b/tests/array/array/testArray2fViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2fViewPtrs.cpp b/tests/array/array/testArray2fViewPtrs.cpp
index 61dfee132..b0e9f2d34 100644
--- a/tests/array/array/testArray2fViewPtrs.cpp
+++ b/tests/array/array/testArray2fViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray2fViews.cpp b/tests/array/array/testArray2fViews.cpp
index e43eb69c9..891eb1f8d 100644
--- a/tests/array/array/testArray2fViews.cpp
+++ b/tests/array/array/testArray2fViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray2.h"
+#include "zfp/array2.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3d.cpp b/tests/array/array/testArray3d.cpp
index db9ddb3aa..0287f4ad5 100644
--- a/tests/array/array/testArray3d.cpp
+++ b/tests/array/array/testArray3d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray2.h"
-#include "array/zfparray3.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray4.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3dIters.cpp b/tests/array/array/testArray3dIters.cpp
index b363483de..3c096c082 100644
--- a/tests/array/array/testArray3dIters.cpp
+++ b/tests/array/array/testArray3dIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array3dTest
diff --git a/tests/array/array/testArray3dPtrs.cpp b/tests/array/array/testArray3dPtrs.cpp
index 0d7050024..ebf9ac6ac 100644
--- a/tests/array/array/testArray3dPtrs.cpp
+++ b/tests/array/array/testArray3dPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array3dTest
diff --git a/tests/array/array/testArray3dRefs.cpp b/tests/array/array/testArray3dRefs.cpp
index d4a43582b..9e47931b4 100644
--- a/tests/array/array/testArray3dRefs.cpp
+++ b/tests/array/array/testArray3dRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3dViewIters.cpp b/tests/array/array/testArray3dViewIters.cpp
index 40f5d6c5e..fcdd65a97 100644
--- a/tests/array/array/testArray3dViewIters.cpp
+++ b/tests/array/array/testArray3dViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3dViewPtrs.cpp b/tests/array/array/testArray3dViewPtrs.cpp
index b4e37c4f0..5eafb7690 100644
--- a/tests/array/array/testArray3dViewPtrs.cpp
+++ b/tests/array/array/testArray3dViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3dViews.cpp b/tests/array/array/testArray3dViews.cpp
index cda31375a..117a49c42 100644
--- a/tests/array/array/testArray3dViews.cpp
+++ b/tests/array/array/testArray3dViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3f.cpp b/tests/array/array/testArray3f.cpp
index ae8112ad8..22bb3d45c 100644
--- a/tests/array/array/testArray3f.cpp
+++ b/tests/array/array/testArray3f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray1.h"
-#include "array/zfparray2.h"
-#include "array/zfparray3.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray4.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3fIters.cpp b/tests/array/array/testArray3fIters.cpp
index 676e1f7fe..4eaf6376e 100644
--- a/tests/array/array/testArray3fIters.cpp
+++ b/tests/array/array/testArray3fIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array3fTest
diff --git a/tests/array/array/testArray3fPtrs.cpp b/tests/array/array/testArray3fPtrs.cpp
index d6e3698b5..d1a7801c4 100644
--- a/tests/array/array/testArray3fPtrs.cpp
+++ b/tests/array/array/testArray3fPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array3fTest
diff --git a/tests/array/array/testArray3fRefs.cpp b/tests/array/array/testArray3fRefs.cpp
index f6afbfed0..1f27c1780 100644
--- a/tests/array/array/testArray3fRefs.cpp
+++ b/tests/array/array/testArray3fRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3fViewIters.cpp b/tests/array/array/testArray3fViewIters.cpp
index 03a1d881a..74a56fb3d 100644
--- a/tests/array/array/testArray3fViewIters.cpp
+++ b/tests/array/array/testArray3fViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3fViewPtrs.cpp b/tests/array/array/testArray3fViewPtrs.cpp
index e5641050d..fa14e6812 100644
--- a/tests/array/array/testArray3fViewPtrs.cpp
+++ b/tests/array/array/testArray3fViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray3fViews.cpp b/tests/array/array/testArray3fViews.cpp
index 533c24aba..7ddd08745 100644
--- a/tests/array/array/testArray3fViews.cpp
+++ b/tests/array/array/testArray3fViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4d.cpp b/tests/array/array/testArray4d.cpp
index 38a22df56..5fb688f1d 100644
--- a/tests/array/array/testArray4d.cpp
+++ b/tests/array/array/testArray4d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray2.h"
-#include "array/zfparray3.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray1.h"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4dIters.cpp b/tests/array/array/testArray4dIters.cpp
index 12ff9a458..9be8d7c8d 100644
--- a/tests/array/array/testArray4dIters.cpp
+++ b/tests/array/array/testArray4dIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array4dTest
diff --git a/tests/array/array/testArray4dPtrs.cpp b/tests/array/array/testArray4dPtrs.cpp
index 1c43f54ab..5eae996a9 100644
--- a/tests/array/array/testArray4dPtrs.cpp
+++ b/tests/array/array/testArray4dPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array4dTest
diff --git a/tests/array/array/testArray4dRefs.cpp b/tests/array/array/testArray4dRefs.cpp
index ceeca2ef0..4560c83b4 100644
--- a/tests/array/array/testArray4dRefs.cpp
+++ b/tests/array/array/testArray4dRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4dViewIters.cpp b/tests/array/array/testArray4dViewIters.cpp
index bc85b3757..d0be737cd 100644
--- a/tests/array/array/testArray4dViewIters.cpp
+++ b/tests/array/array/testArray4dViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4dViewPtrs.cpp b/tests/array/array/testArray4dViewPtrs.cpp
index 6e537b588..0eee3c464 100644
--- a/tests/array/array/testArray4dViewPtrs.cpp
+++ b/tests/array/array/testArray4dViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4dViews.cpp b/tests/array/array/testArray4dViews.cpp
index 427499c18..d53c3cd1e 100644
--- a/tests/array/array/testArray4dViews.cpp
+++ b/tests/array/array/testArray4dViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4f.cpp b/tests/array/array/testArray4f.cpp
index d908487dd..dbd8c0707 100644
--- a/tests/array/array/testArray4f.cpp
+++ b/tests/array/array/testArray4f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfparray2.h"
-#include "array/zfparray3.h"
-#include "array/zfparray4.h"
-#include "array/zfpfactory.h"
-#include "array/zfparray1.h"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
+#include "zfp/factory.hpp"
+#include "zfp/array1.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4fIters.cpp b/tests/array/array/testArray4fIters.cpp
index 2a03e9785..ddcdb42bd 100644
--- a/tests/array/array/testArray4fIters.cpp
+++ b/tests/array/array/testArray4fIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array4fTest
diff --git a/tests/array/array/testArray4fPtrs.cpp b/tests/array/array/testArray4fPtrs.cpp
index 7b6fb8352..7a726b841 100644
--- a/tests/array/array/testArray4fPtrs.cpp
+++ b/tests/array/array/testArray4fPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 #define ARRAY_DIMS_SCALAR_TEST Array4fTest
diff --git a/tests/array/array/testArray4fRefs.cpp b/tests/array/array/testArray4fRefs.cpp
index aec1c8930..c20c305bc 100644
--- a/tests/array/array/testArray4fRefs.cpp
+++ b/tests/array/array/testArray4fRefs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4fViewIters.cpp b/tests/array/array/testArray4fViewIters.cpp
index 3cca047a1..28f6fb7ea 100644
--- a/tests/array/array/testArray4fViewIters.cpp
+++ b/tests/array/array/testArray4fViewIters.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4fViewPtrs.cpp b/tests/array/array/testArray4fViewPtrs.cpp
index b454ca770..b4a475a94 100644
--- a/tests/array/array/testArray4fViewPtrs.cpp
+++ b/tests/array/array/testArray4fViewPtrs.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArray4fViews.cpp b/tests/array/array/testArray4fViews.cpp
index 6c648f303..c0e44137a 100644
--- a/tests/array/array/testArray4fViews.cpp
+++ b/tests/array/array/testArray4fViews.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray4.h"
+#include "zfp/array4.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/array/testArrayBase.cpp b/tests/array/array/testArrayBase.cpp
index 30edc07be..d0ca5afee 100644
--- a/tests/array/array/testArrayBase.cpp
+++ b/tests/array/array/testArrayBase.cpp
@@ -221,7 +221,7 @@ TEST_F(TEST_FIXTURE, given_zfpHeaderForCertainDimensionalityButHeaderMissing_whe
 
   } catch (zfp::exception const & e) {
     std::stringstream ss;
-    ss << "zfparray" << missingDim << " not supported; include zfparray" << missingDim << ".h before zfpfactory.h";
+    ss << "array" << missingDim << " not supported; include zfp/array" << missingDim << ".hpp before zfp/factory.hpp";
     EXPECT_EQ(e.what(), ss.str());
 
   } catch (std::exception const & e) {
diff --git a/tests/array/array/testConstruct.cpp b/tests/array/array/testConstruct.cpp
index 336a24336..acf153f78 100644
--- a/tests/array/array/testConstruct.cpp
+++ b/tests/array/array/testConstruct.cpp
@@ -1,6 +1,6 @@
-#include "array/zfparray2.h"
-#include "array/zfparray3.h"
-#include "array/zfpfactory.h"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 #include "gtest/gtest.h"
@@ -70,7 +70,7 @@ TEST_F(TEST_FIXTURE, given_onlyInclude2D3D_and_zfpHeaderFor1D_when_construct_exp
     zfp::array* arr = zfp::array::construct(h);
     FailWhenNoExceptionThrown();
   } catch (zfp::exception const & e) {
-    EXPECT_EQ(e.what(), std::string("zfparray1 not supported; include zfparray1.h before zfpfactory.h"));
+    EXPECT_EQ(e.what(), std::string("array1 not supported; include zfp/array1.hpp before zfp/factory.hpp"));
   } catch (std::exception const & e) {
     FailAndPrintException(e);
   }
diff --git a/tests/array/constArray/testConstArray1d.cpp b/tests/array/constArray/testConstArray1d.cpp
index 2b3bd5f42..d1e8edd22 100644
--- a/tests/array/constArray/testConstArray1d.cpp
+++ b/tests/array/constArray/testConstArray1d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray1f.cpp b/tests/array/constArray/testConstArray1f.cpp
index ec2676170..321d5d20b 100644
--- a/tests/array/constArray/testConstArray1f.cpp
+++ b/tests/array/constArray/testConstArray1f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray2d.cpp b/tests/array/constArray/testConstArray2d.cpp
index 02bee1e86..e92360f18 100644
--- a/tests/array/constArray/testConstArray2d.cpp
+++ b/tests/array/constArray/testConstArray2d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray2f.cpp b/tests/array/constArray/testConstArray2f.cpp
index f68ee0f4c..bde637098 100644
--- a/tests/array/constArray/testConstArray2f.cpp
+++ b/tests/array/constArray/testConstArray2f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray3d.cpp b/tests/array/constArray/testConstArray3d.cpp
index 2f4d062c9..59c091eef 100644
--- a/tests/array/constArray/testConstArray3d.cpp
+++ b/tests/array/constArray/testConstArray3d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray3f.cpp b/tests/array/constArray/testConstArray3f.cpp
index e4f795841..44358955c 100644
--- a/tests/array/constArray/testConstArray3f.cpp
+++ b/tests/array/constArray/testConstArray3f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray4d.cpp b/tests/array/constArray/testConstArray4d.cpp
index c238b7afc..598417e96 100644
--- a/tests/array/constArray/testConstArray4d.cpp
+++ b/tests/array/constArray/testConstArray4d.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/constArray/testConstArray4f.cpp b/tests/array/constArray/testConstArray4f.cpp
index 00561e8bd..cf5f08d59 100644
--- a/tests/array/constArray/testConstArray4f.cpp
+++ b/tests/array/constArray/testConstArray4f.cpp
@@ -1,8 +1,8 @@
-#include "array/zfpcarray1.h"
-#include "array/zfpcarray2.h"
-#include "array/zfpcarray3.h"
-#include "array/zfpcarray4.h"
-#include "array/zfpfactory.h"
+#include "zfp/constarray1.hpp"
+#include "zfp/constarray2.hpp"
+#include "zfp/constarray3.hpp"
+#include "zfp/constarray4.hpp"
+#include "zfp/factory.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode1d.cpp b/tests/array/decode/testTemplatedDecode1d.cpp
index 05d034acb..ca70fd89c 100644
--- a/tests/array/decode/testTemplatedDecode1d.cpp
+++ b/tests/array/decode/testTemplatedDecode1d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode1f.cpp b/tests/array/decode/testTemplatedDecode1f.cpp
index 03f2c6c34..914b3c95c 100644
--- a/tests/array/decode/testTemplatedDecode1f.cpp
+++ b/tests/array/decode/testTemplatedDecode1f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode2d.cpp b/tests/array/decode/testTemplatedDecode2d.cpp
index 87d816504..5915f10fc 100644
--- a/tests/array/decode/testTemplatedDecode2d.cpp
+++ b/tests/array/decode/testTemplatedDecode2d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode2f.cpp b/tests/array/decode/testTemplatedDecode2f.cpp
index 61fe0b4af..526b2bd04 100644
--- a/tests/array/decode/testTemplatedDecode2f.cpp
+++ b/tests/array/decode/testTemplatedDecode2f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode3d.cpp b/tests/array/decode/testTemplatedDecode3d.cpp
index 881231907..4c567e90e 100644
--- a/tests/array/decode/testTemplatedDecode3d.cpp
+++ b/tests/array/decode/testTemplatedDecode3d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode3f.cpp b/tests/array/decode/testTemplatedDecode3f.cpp
index daa336f46..4fc28e944 100644
--- a/tests/array/decode/testTemplatedDecode3f.cpp
+++ b/tests/array/decode/testTemplatedDecode3f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode4d.cpp b/tests/array/decode/testTemplatedDecode4d.cpp
index 1fa9d34f7..c159c49c8 100644
--- a/tests/array/decode/testTemplatedDecode4d.cpp
+++ b/tests/array/decode/testTemplatedDecode4d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecode4f.cpp b/tests/array/decode/testTemplatedDecode4f.cpp
index 135ef4908..b94b32a01 100644
--- a/tests/array/decode/testTemplatedDecode4f.cpp
+++ b/tests/array/decode/testTemplatedDecode4f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/decode/testTemplatedDecodeBase.cpp b/tests/array/decode/testTemplatedDecodeBase.cpp
index 21c995d86..471c75d35 100644
--- a/tests/array/decode/testTemplatedDecodeBase.cpp
+++ b/tests/array/decode/testTemplatedDecodeBase.cpp
@@ -35,283 +35,255 @@ void populateArray(SCALAR** dataArr)
 
 void populateStridedArray(SCALAR** dataArr, SCALAR dummyVal)
 {
-    size_t i, j, k, l, countX, countY, countZ, countW;
-
-
-    switch(DIMS) {
-        case 1:
-            countX = BLOCK_SIDE_LEN * SX;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX);
-            ASSERT_TRUE(*dataArr != nullptr);
+#if DIMS == 1
+    size_t countX = BLOCK_SIDE_LEN * SX;
+    *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX);
+    ASSERT_TRUE(*dataArr != nullptr);
 
-            for (i = 0; i < countX; i++) {
-                if (i % SX) {
-                    (*dataArr)[i] = dummyVal;
-                } else {
+    for (size_t i = 0; i < countX; i++) {
+        if (i % SX) {
+            (*dataArr)[i] = dummyVal;
+        } else {
 #ifdef FL_PT_DATA
-	    (*dataArr)[i] = nextSignedRandFlPt();
+	        (*dataArr)[i] = nextSignedRandFlPt();
 #else
-	    (*dataArr)[i] = nextSignedRandInt();
+	        (*dataArr)[i] = nextSignedRandInt();
 #endif
-                }
-            }
-            break;
+        }
+    }
 
-        case 2:
-            countX = BLOCK_SIDE_LEN * SX;
-            countY = SY / SX;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY);
-            ASSERT_TRUE(*dataArr != nullptr);
+#elif DIMS == 2
+    size_t countX = BLOCK_SIDE_LEN * SX;
+    size_t countY = SY / SX;
+    *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY);
+    ASSERT_TRUE(*dataArr != nullptr);
 
-            for (j = 0; j < countY; j++) {
-                for (i = 0; i < countX; i++) {
-                    size_t index = countX*j + i;
-                    if (i % (countX/BLOCK_SIDE_LEN)
-                            || j % (countY/BLOCK_SIDE_LEN)) {
-                        (*dataArr)[index] = dummyVal;
-                    } else {
+    for (size_t j = 0; j < countY; j++) {
+        for (size_t i = 0; i < countX; i++) {
+            size_t index = countX*j + i;
+            if (i % (countX/BLOCK_SIDE_LEN)
+                    || j % (countY/BLOCK_SIDE_LEN)) {
+                (*dataArr)[index] = dummyVal;
+            } else {
 #ifdef FL_PT_DATA
-	        (*dataArr)[index] = nextSignedRandFlPt();
+	            (*dataArr)[index] = nextSignedRandFlPt();
 #else
-	        (*dataArr)[index] = nextSignedRandInt();
+	            (*dataArr)[index] = nextSignedRandInt();
 #endif
-                    }
-                }
             }
-            break;
-
-        case 3:
-            countX = BLOCK_SIDE_LEN * SX;
-            countY = SY / SX;
-            countZ = SZ / SY;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ);
-            ASSERT_TRUE(*dataArr != nullptr);
-
-            for (k = 0; k < countZ; k++) {
-                for (j = 0; j < countY; j++) {
-                    for (i = 0; i < countX; i++) {
-                        size_t index = countX*countY*k + countX*j + i;
-                        if (i % (countX/BLOCK_SIDE_LEN)
-                                || j % (countY/BLOCK_SIDE_LEN)
-                                || k % (countZ/BLOCK_SIDE_LEN)) {
-                            (*dataArr)[index] = dummyVal;
-                        } else {
+        }
+    }
+
+#elif DIMS == 3
+    size_t countX = BLOCK_SIDE_LEN * SX;
+    size_t countY = SY / SX;
+    size_t countZ = SZ / SY;
+    *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ);
+    ASSERT_TRUE(*dataArr != nullptr);
+
+    for (size_t k = 0; k < countZ; k++) {
+        for (size_t j = 0; j < countY; j++) {
+            for (size_t i = 0; i < countX; i++) {
+                size_t index = countX*countY*k + countX*j + i;
+                if (i % (countX/BLOCK_SIDE_LEN)
+                        || j % (countY/BLOCK_SIDE_LEN)
+                        || k % (countZ/BLOCK_SIDE_LEN)) {
+                    (*dataArr)[index] = dummyVal;
+                } else {
 #ifdef FL_PT_DATA
-                            (*dataArr)[index] = nextSignedRandFlPt();
+                    (*dataArr)[index] = nextSignedRandFlPt();
 #else
-                            (*dataArr)[index] = nextSignedRandInt();
+                    (*dataArr)[index] = nextSignedRandInt();
 #endif
-                        }
-                    }
                 }
             }
-            break;
-
-        case 4:
-            countX = BLOCK_SIDE_LEN * SX;
-            countY = SY / SX;
-            countZ = SZ / SY;
-            countW = SW / SZ;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ * countW);
-            ASSERT_TRUE(*dataArr != nullptr);
-
-            for (l = 0; l < countW; l++) {
-                for (k = 0; k < countZ; k++) {
-                    for (j = 0; j < countY; j++) {
-                        for (i = 0; i < countX; i++) {
-                            size_t index = countX*countY*countZ*l + countX*countY*k + countX*j + i;
-                            if (i % (countX/BLOCK_SIDE_LEN)
-                                    || j % (countY/BLOCK_SIDE_LEN)
-                                    || k % (countZ/BLOCK_SIDE_LEN)
-                                    || l % (countW/BLOCK_SIDE_LEN)) {
-                                (*dataArr)[index] = dummyVal;
-                            } else {
+        }
+    }
+
+#elif DIMS == 4
+    size_t countX = BLOCK_SIDE_LEN * SX;
+    size_t countY = SY / SX;
+    size_t countZ = SZ / SY;
+    size_t countW = SW / SZ;
+    *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ * countW);
+    ASSERT_TRUE(*dataArr != nullptr);
+
+    for (size_t l = 0; l < countW; l++) {
+        for (size_t k = 0; k < countZ; k++) {
+            for (size_t j = 0; j < countY; j++) {
+                for (size_t i = 0; i < countX; i++) {
+                    size_t index = countX*countY*countZ*l + countX*countY*k + countX*j + i;
+                    if (i % (countX/BLOCK_SIDE_LEN)
+                            || j % (countY/BLOCK_SIDE_LEN)
+                            || k % (countZ/BLOCK_SIDE_LEN)
+                            || l % (countW/BLOCK_SIDE_LEN)) {
+                        (*dataArr)[index] = dummyVal;
+                    } else {
 #ifdef FL_PT_DATA
-                                (*dataArr)[index] = nextSignedRandFlPt();
+                        (*dataArr)[index] = nextSignedRandFlPt();
 #else
-                                (*dataArr)[index] = nextSignedRandInt();
+                        (*dataArr)[index] = nextSignedRandInt();
 #endif
-                            }
-                        }
                     }
                 }
             }
-            break;
+        }
     }
+#endif
 }
 
 void assertStridedBlockEntriesEqual(SCALAR* data1, SCALAR* data2)
 {
-  size_t i, j, k, l, countX, countY, countZ, countW;
-  switch (DIMS) {
-    case 1:
-        countX = BLOCK_SIDE_LEN * SX;
-
-        for (size_t i = 0; i < countX; i++) {
-          if (!(i % (countX/BLOCK_SIDE_LEN))) {
-            ASSERT_SCALAR_EQ(data1[i], data2[i]) << 
-                        "index " << i << " mismatch: " << data1[i] << " != " << data2[i];
-          }
-        }
-
-        break;
-
-    case 2:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-
-        for (size_t j = 0; j < countY; j++) {
-          for (size_t i = 0; i < countX; i++) {
-            if (!(i % (countX/BLOCK_SIDE_LEN))
-                && !(j % (countY/BLOCK_SIDE_LEN))) {
-              ASSERT_SCALAR_EQ(data1[countX*j + i], data2[countX*j + i]) << 
-                          "index " << (countX*j + i) << " mismatch: " << data1[countX*j + i] << " != " << data2[countX*j + i];
-            }
-          }
-        }
+#if DIMS == 1
+  size_t countX = BLOCK_SIDE_LEN * SX;
 
-        break;
+  for (size_t i = 0; i < countX; i++) {
+    if (!(i % (countX/BLOCK_SIDE_LEN))) {
+      ASSERT_SCALAR_EQ(data1[i], data2[i]) << 
+                  "index " << i << " mismatch: " << data1[i] << " != " << data2[i];
+    }
+  }
 
-    case 3:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-        countZ = SZ / SY;
+#elif DIMS == 2
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+
+  for (size_t j = 0; j < countY; j++) {
+    for (size_t i = 0; i < countX; i++) {
+      if (!(i % (countX/BLOCK_SIDE_LEN))
+          && !(j % (countY/BLOCK_SIDE_LEN))) {
+        ASSERT_SCALAR_EQ(data1[countX*j + i], data2[countX*j + i]) << 
+                    "index " << (countX*j + i) << " mismatch: " << data1[countX*j + i] << " != " << data2[countX*j + i];
+      }
+    }
+  }
 
-        for (size_t k = 0; k < countZ; k++) {
-          for (size_t j = 0; j < countY; j++) {
-            for (size_t i = 0; i < countX; i++) {
-              if (!(i % (countX/BLOCK_SIDE_LEN))
-                  && !(j % (countY/BLOCK_SIDE_LEN))
-                  && !(k % (countZ/BLOCK_SIDE_LEN))) {
-                  ASSERT_SCALAR_EQ(data1[countX*countY*k + countX*j + i], data2[countX*countY*k + countX*j + i]) << 
-                              "index " << (countX*countY*k + countX*j + i) << " mismatch: " << 
-                              data1[countX*countY*k + countX*j + i] << " != " <<
-                              data2[countX*countY*k + countX*j + i];
-              }
-            }
-          }
+#elif DIMS == 3
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+  size_t countZ = SZ / SY;
+
+  for (size_t k = 0; k < countZ; k++) {
+    for (size_t j = 0; j < countY; j++) {
+      for (size_t i = 0; i < countX; i++) {
+        if (!(i % (countX/BLOCK_SIDE_LEN))
+            && !(j % (countY/BLOCK_SIDE_LEN))
+            && !(k % (countZ/BLOCK_SIDE_LEN))) {
+            ASSERT_SCALAR_EQ(data1[countX*countY*k + countX*j + i], data2[countX*countY*k + countX*j + i]) << 
+                        "index " << (countX*countY*k + countX*j + i) << " mismatch: " << 
+                        data1[countX*countY*k + countX*j + i] << " != " <<
+                        data2[countX*countY*k + countX*j + i];
         }
+      }
+    }
+  }
 
-        break;
-
-    case 4:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-        countZ = SZ / SY;
-        countW = SW / SZ;
-
-        for (size_t l = 0; l < countW; l++) {
-          for (size_t k = 0; k < countZ; k++) {
-            for (size_t j = 0; j < countY; j++) {
-              for (size_t i = 0; i < countX; i++) {
-                if (!(i % (countX/BLOCK_SIDE_LEN))
-                    && !(j % (countY/BLOCK_SIDE_LEN))
-                    && !(k % (countZ/BLOCK_SIDE_LEN))
-                    && !(l % (countW/BLOCK_SIDE_LEN))) {
-                      ASSERT_SCALAR_EQ(data1[countX*countY*countZ*l + countX*countY*k + countX*j + i], data2[countX*countY*countZ*l + countX*countY*k + countX*j + i]) << 
-                                  "index " << (countX*countY*countZ*l + countX*countY*k + countX*j + i) << " mismatch: " << 
-                                  data1[countX*countY*countZ*l + countX*countY*k + countX*j + i] << " != " <<
-                                  data2[countX*countY*countZ*l + countX*countY*k + countX*j + i];
-                }
-              }
-            }
+#elif DIMS == 4
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+  size_t countZ = SZ / SY;
+  size_t countW = SW / SZ;
+
+  for (size_t l = 0; l < countW; l++) {
+    for (size_t k = 0; k < countZ; k++) {
+      for (size_t j = 0; j < countY; j++) {
+        for (size_t i = 0; i < countX; i++) {
+          if (!(i % (countX/BLOCK_SIDE_LEN))
+              && !(j % (countY/BLOCK_SIDE_LEN))
+              && !(k % (countZ/BLOCK_SIDE_LEN))
+              && !(l % (countW/BLOCK_SIDE_LEN))) {
+                ASSERT_SCALAR_EQ(data1[countX*countY*countZ*l + countX*countY*k + countX*j + i], data2[countX*countY*countZ*l + countX*countY*k + countX*j + i]) << 
+                            "index " << (countX*countY*countZ*l + countX*countY*k + countX*j + i) << " mismatch: " << 
+                            data1[countX*countY*countZ*l + countX*countY*k + countX*j + i] << " != " <<
+                            data2[countX*countY*countZ*l + countX*countY*k + countX*j + i];
           }
         }
-
-        break;
+      }
+    }
   }
+#endif
 }
 
 void assertPartialBlockEntriesEqual(SCALAR* data1, SCALAR* data2)
 {
-  size_t i, j, k, l, countX, countY, countZ, countW;
-  switch (DIMS) {
-    case 1:
-        countX = BLOCK_SIDE_LEN * SX;
-
-        for (size_t i = 0; i < countX; i++) {
-          if (i/(countX/BLOCK_SIDE_LEN) < PX
-              && !(i % (countX/BLOCK_SIDE_LEN))) {
-            ASSERT_SCALAR_EQ(data1[i], data2[i]) << 
-                        "index " << i << " mismatch: " << data1[i] << " != " << data2[i];
-          }
-        }
-
-        break;
-
-    case 2:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-
-        for (size_t j = 0; j < countY; j++) {
-          for (size_t i = 0; i < countX; i++) {
-            if (i/(countX/BLOCK_SIDE_LEN) < PX
-                && j/(countY/BLOCK_SIDE_LEN) < PY
-                && !(i % (countX/BLOCK_SIDE_LEN))
-                && !(j % (countY/BLOCK_SIDE_LEN))) {
-              ASSERT_SCALAR_EQ(data1[countX*j + i], data2[countX*j + i]) << 
-                          "index " << (countX*j + i) << " mismatch: " << data1[countX*j + i] << " != " << data2[countX*j + i];
-            }
-          }
-        }
+#if DIMS == 1
+  size_t countX = BLOCK_SIDE_LEN * SX;
 
-        break;
+  for (size_t i = 0; i < countX; i++) {
+    if (i/(countX/BLOCK_SIDE_LEN) < PX
+        && !(i % (countX/BLOCK_SIDE_LEN))) {
+      ASSERT_SCALAR_EQ(data1[i], data2[i]) << 
+                  "index " << i << " mismatch: " << data1[i] << " != " << data2[i];
+    }
+  }
 
-    case 3:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-        countZ = SZ / SY;
+#elif DIMS == 2
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+
+  for (size_t j = 0; j < countY; j++) {
+    for (size_t i = 0; i < countX; i++) {
+      if (i/(countX/BLOCK_SIDE_LEN) < PX
+          && j/(countY/BLOCK_SIDE_LEN) < PY
+          && !(i % (countX/BLOCK_SIDE_LEN))
+          && !(j % (countY/BLOCK_SIDE_LEN))) {
+        ASSERT_SCALAR_EQ(data1[countX*j + i], data2[countX*j + i]) << 
+                    "index " << (countX*j + i) << " mismatch: " << data1[countX*j + i] << " != " << data2[countX*j + i];
+      }
+    }
+  }
 
-        for (size_t k = 0; k < countZ; k++) {
-          for (size_t j = 0; j < countY; j++) {
-            for (size_t i = 0; i < countX; i++) {
-              if (i/(countX/BLOCK_SIDE_LEN) < PX
-                  && j/(countY/BLOCK_SIDE_LEN) < PY
-                  && k/(countZ/BLOCK_SIDE_LEN) < PZ
-                  && !(i % (countX/BLOCK_SIDE_LEN))
-                  && !(j % (countY/BLOCK_SIDE_LEN))
-                  && !(k % (countZ/BLOCK_SIDE_LEN))) {
-                  ASSERT_SCALAR_EQ(data1[countX*countY*k + countX*j + i], data2[countX*countY*k + countX*j + i]) << 
-                              "index " << (countX*countY*k + countX*j + i) << " mismatch: " << 
-                              data1[countX*countY*k + countX*j + i] << " != " <<
-                              data2[countX*countY*k + countX*j + i];
-              }
-            }
-          }
+#elif DIMS == 3
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+  size_t countZ = SZ / SY;
+
+  for (size_t k = 0; k < countZ; k++) {
+    for (size_t j = 0; j < countY; j++) {
+      for (size_t i = 0; i < countX; i++) {
+        if (i/(countX/BLOCK_SIDE_LEN) < PX
+            && j/(countY/BLOCK_SIDE_LEN) < PY
+            && k/(countZ/BLOCK_SIDE_LEN) < PZ
+            && !(i % (countX/BLOCK_SIDE_LEN))
+            && !(j % (countY/BLOCK_SIDE_LEN))
+            && !(k % (countZ/BLOCK_SIDE_LEN))) {
+            ASSERT_SCALAR_EQ(data1[countX*countY*k + countX*j + i], data2[countX*countY*k + countX*j + i]) << 
+                        "index " << (countX*countY*k + countX*j + i) << " mismatch: " << 
+                        data1[countX*countY*k + countX*j + i] << " != " <<
+                        data2[countX*countY*k + countX*j + i];
         }
+      }
+    }
+  }
 
-        break;
-
-    case 4:
-        countX = BLOCK_SIDE_LEN * SX;
-        countY = SY / SX;
-        countZ = SZ / SY;
-        countW = SW / SZ;
-
-        for (size_t l = 0; l < countW; l++) {
-          for (size_t k = 0; k < countZ; k++) {
-            for (size_t j = 0; j < countY; j++) {
-              for (size_t i = 0; i < countX; i++) {
-                if (i/(countX/BLOCK_SIDE_LEN) < PX
-                    && j/(countY/BLOCK_SIDE_LEN) < PY
-                    && k/(countZ/BLOCK_SIDE_LEN) < PZ
-                    && l/(countW/BLOCK_SIDE_LEN) < PW
-                    && !(i % (countX/BLOCK_SIDE_LEN))
-                    && !(j % (countY/BLOCK_SIDE_LEN))
-                    && !(k % (countZ/BLOCK_SIDE_LEN))
-                    && !(l % (countW/BLOCK_SIDE_LEN))) {
-                      ASSERT_SCALAR_EQ(data1[countX*countY*countZ*l + countX*countY*k + countX*j + i], data2[countX*countY*countZ*l + countX*countY*k + countX*j + i]) << 
-                                  "index " << (countX*countY*countZ*l + countX*countY*k + countX*j + i) << " mismatch: " << 
-                                  data1[countX*countY*countZ*l + countX*countY*k + countX*j + i] << " != " <<
-                                  data2[countX*countY*countZ*l + countX*countY*k + countX*j + i];
-                }
-              }
-            }
+#elif DIMS == 4
+  size_t countX = BLOCK_SIDE_LEN * SX;
+  size_t countY = SY / SX;
+  size_t countZ = SZ / SY;
+  size_t countW = SW / SZ;
+
+  for (size_t l = 0; l < countW; l++) {
+    for (size_t k = 0; k < countZ; k++) {
+      for (size_t j = 0; j < countY; j++) {
+        for (size_t i = 0; i < countX; i++) {
+          if (i/(countX/BLOCK_SIDE_LEN) < PX
+              && j/(countY/BLOCK_SIDE_LEN) < PY
+              && k/(countZ/BLOCK_SIDE_LEN) < PZ
+              && l/(countW/BLOCK_SIDE_LEN) < PW
+              && !(i % (countX/BLOCK_SIDE_LEN))
+              && !(j % (countY/BLOCK_SIDE_LEN))
+              && !(k % (countZ/BLOCK_SIDE_LEN))
+              && !(l % (countW/BLOCK_SIDE_LEN))) {
+                ASSERT_SCALAR_EQ(data1[countX*countY*countZ*l + countX*countY*k + countX*j + i], data2[countX*countY*countZ*l + countX*countY*k + countX*j + i]) << 
+                            "index " << (countX*countY*countZ*l + countX*countY*k + countX*j + i) << " mismatch: " << 
+                            data1[countX*countY*countZ*l + countX*countY*k + countX*j + i] << " != " <<
+                            data2[countX*countY*countZ*l + countX*countY*k + countX*j + i];
           }
         }
-
-        break;
+      }
+    }
   }
+#endif
 }
 
 void setupStream(zfp_field** field, zfp_stream** stream, bool isStrided = false)
@@ -400,9 +372,15 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlock_resultsMatchNonTemplated)
 TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemplated)
 {
     size_t countX = 4 * SX;
+#if DIMS > 1
     size_t countY = SY / SX;
+#endif
+#if DIMS > 2
     size_t countZ = SZ / SY;
+#endif
+#if DIMS == 4
     size_t countW = SW / SZ;
+#endif
 
     SCALAR* dataArr;
     populateStridedArray(&dataArr, DUMMY_VAL);
@@ -442,7 +420,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemp
 
     size_t sz = ZFP_DECODE_BLOCK_STRIDED_FUNC(stream, data1, SX);
     size_t tsz = decode_block_strided<SCALAR>(tstream, data2, SX);
-    size_t count = countX;
 #elif DIMS == 2
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY);
     ASSERT_TRUE(data1 != nullptr);
@@ -452,7 +429,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemp
 
     size_t sz = ZFP_DECODE_BLOCK_STRIDED_FUNC(stream, data1, SX, SY);
     size_t tsz = decode_block_strided<SCALAR>(tstream, data2, SX, SY);
-    size_t count = countX * countY;
 #elif DIMS == 3
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ);
     ASSERT_TRUE(data1 != nullptr);
@@ -462,7 +438,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemp
 
     size_t sz = ZFP_DECODE_BLOCK_STRIDED_FUNC(stream, data1, SX, SY, SZ);
     size_t tsz = decode_block_strided<SCALAR>(tstream, data2, SX, SY, SZ);
-    size_t count = countX * countY * countZ;
 #elif DIMS == 4
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ * countW);
     ASSERT_TRUE(data1 != nullptr);
@@ -472,7 +447,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemp
 
     size_t sz = ZFP_DECODE_BLOCK_STRIDED_FUNC(stream, data1, SX, SY, SZ, SW);
     size_t tsz = decode_block_strided<SCALAR>(tstream, data2, SX, SY, SZ, SW);
-    size_t count = countX * countY * countZ * countW;
 #endif
 
     ASSERT_TRUE(sz == tsz);
@@ -492,9 +466,15 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodeBlockStrided_resultsMatchNonTemp
 TEST(TemplatedDecodeTests, given_TemplatedDecodePartialBlockStrided_resultsMatchNonTemplated)
 {
     size_t countX = 4 * SX;
+#if DIMS > 1
     size_t countY = SY / SX;
+#endif
+#if DIMS > 2
     size_t countZ = SZ / SY;
+#endif
+#if DIMS == 4
     size_t countW = SW / SZ;
+#endif
 
     SCALAR* dataArr;
     populateStridedArray(&dataArr, DUMMY_VAL);
@@ -534,7 +514,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodePartialBlockStrided_resultsMatch
 
     size_t d_sz = ZFP_DECODE_PARTIAL_BLOCK_STRIDED_FUNC(stream, data1, PX, SX);
     size_t d_tsz = decode_partial_block_strided<SCALAR>(tstream, data2, PX, SX);
-    size_t count = countX;
 #elif DIMS == 2
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY);
     ASSERT_TRUE(data1 != nullptr);
@@ -544,7 +523,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodePartialBlockStrided_resultsMatch
 
     size_t d_sz = ZFP_DECODE_PARTIAL_BLOCK_STRIDED_FUNC(stream, data1, PX, PY, SX, SY);
     size_t d_tsz = decode_partial_block_strided<SCALAR>(tstream, data2, PX, PY, SX, SY);
-    size_t count = countX * countY;
 #elif DIMS == 3
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ);
     ASSERT_TRUE(data1 != nullptr);
@@ -554,7 +532,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodePartialBlockStrided_resultsMatch
 
     size_t d_sz = ZFP_DECODE_PARTIAL_BLOCK_STRIDED_FUNC(stream, data1, PX, PY, PZ, SX, SY, SZ);
     size_t d_tsz = decode_partial_block_strided<SCALAR>(tstream, data2, PX, PY, PZ, SX, SY, SZ);
-    size_t count = countX * countY * countZ;
 #elif DIMS == 4
     SCALAR *data1 = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ * countW);
     ASSERT_TRUE(data1 != nullptr);
@@ -564,7 +541,6 @@ TEST(TemplatedDecodeTests, given_TemplatedDecodePartialBlockStrided_resultsMatch
 
     size_t d_sz = ZFP_DECODE_PARTIAL_BLOCK_STRIDED_FUNC(stream, data1, PX, PY, PZ, PW, SX, SY, SZ, SW);
     size_t d_tsz = decode_partial_block_strided<SCALAR>(tstream, data2, PX, PY, PZ, PW, SX, SY, SZ, SW);
-    size_t count = countX * countY * countZ * countW;
 #endif
 
     ASSERT_TRUE(d_sz == d_tsz);
diff --git a/tests/array/encode/testTemplatedEncode1d.cpp b/tests/array/encode/testTemplatedEncode1d.cpp
index f1ef6b261..e8e1a040d 100644
--- a/tests/array/encode/testTemplatedEncode1d.cpp
+++ b/tests/array/encode/testTemplatedEncode1d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode1f.cpp b/tests/array/encode/testTemplatedEncode1f.cpp
index 0127f6a66..f30dcdba4 100644
--- a/tests/array/encode/testTemplatedEncode1f.cpp
+++ b/tests/array/encode/testTemplatedEncode1f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode2d.cpp b/tests/array/encode/testTemplatedEncode2d.cpp
index 3412abb5c..c853ea3d5 100644
--- a/tests/array/encode/testTemplatedEncode2d.cpp
+++ b/tests/array/encode/testTemplatedEncode2d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode2f.cpp b/tests/array/encode/testTemplatedEncode2f.cpp
index 35a11dab5..6e4ea5ba2 100644
--- a/tests/array/encode/testTemplatedEncode2f.cpp
+++ b/tests/array/encode/testTemplatedEncode2f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode3d.cpp b/tests/array/encode/testTemplatedEncode3d.cpp
index bced5bc6f..6260e659a 100644
--- a/tests/array/encode/testTemplatedEncode3d.cpp
+++ b/tests/array/encode/testTemplatedEncode3d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode3f.cpp b/tests/array/encode/testTemplatedEncode3f.cpp
index a26fc3f9c..af2079fa3 100644
--- a/tests/array/encode/testTemplatedEncode3f.cpp
+++ b/tests/array/encode/testTemplatedEncode3f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode4d.cpp b/tests/array/encode/testTemplatedEncode4d.cpp
index b1283e3b0..c71a27e43 100644
--- a/tests/array/encode/testTemplatedEncode4d.cpp
+++ b/tests/array/encode/testTemplatedEncode4d.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncode4f.cpp b/tests/array/encode/testTemplatedEncode4f.cpp
index c63dcefd5..91202a741 100644
--- a/tests/array/encode/testTemplatedEncode4f.cpp
+++ b/tests/array/encode/testTemplatedEncode4f.cpp
@@ -1,4 +1,4 @@
-#include "array/zfpcpp.h"
+#include "zfp.hpp"
 using namespace zfp;
 
 extern "C" {
diff --git a/tests/array/encode/testTemplatedEncodeBase.cpp b/tests/array/encode/testTemplatedEncodeBase.cpp
index e4841d6c2..06538cb7e 100644
--- a/tests/array/encode/testTemplatedEncodeBase.cpp
+++ b/tests/array/encode/testTemplatedEncodeBase.cpp
@@ -39,7 +39,7 @@ void populateStridedArray(SCALAR** dataArr, SCALAR dummyVal)
     switch(DIMS) {
         case 1:
             countX = BLOCK_SIDE_LEN * SX;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX);
+            *dataArr = new SCALAR[countX];
             ASSERT_TRUE(*dataArr != nullptr);
 
             for (i = 0; i < countX; i++) {
@@ -58,7 +58,7 @@ void populateStridedArray(SCALAR** dataArr, SCALAR dummyVal)
         case 2:
             countX = BLOCK_SIDE_LEN * SX;
             countY = SY / SX;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY);
+            *dataArr = new SCALAR[countX * countY];
             ASSERT_TRUE(*dataArr != nullptr);
 
             for (j = 0; j < countY; j++) {
@@ -82,7 +82,7 @@ void populateStridedArray(SCALAR** dataArr, SCALAR dummyVal)
             countX = BLOCK_SIDE_LEN * SX;
             countY = SY / SX;
             countZ = SZ / SY;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ);
+            *dataArr = new SCALAR[countX * countY * countZ];
             ASSERT_TRUE(*dataArr != nullptr);
 
             for (k = 0; k < countZ; k++) {
@@ -110,7 +110,7 @@ void populateStridedArray(SCALAR** dataArr, SCALAR dummyVal)
             countY = SY / SX;
             countZ = SZ / SY;
             countW = SW / SZ;
-            *dataArr = (SCALAR*)malloc(sizeof(SCALAR) * countX * countY * countZ * countW);
+            *dataArr = new SCALAR[countX * countY * countZ * countW];
             ASSERT_TRUE(*dataArr != nullptr);
 
             for (l = 0; l < countW; l++) {
@@ -192,7 +192,7 @@ bool streamsEqual(zfp_stream** stream1, zfp_stream** stream2)
     char* data2 = (char*)stream_data(s2);
     zfp_stream_flush(*stream2);
 
-    for (int i = 0; i < sz1; i++)
+    for (size_t i = 0; i < sz1; i++)
         if (data1[i] != data2[i])
             return false;
     return true;
diff --git a/tests/array/utils/gtestBaseFixture.h b/tests/array/utils/gtestBaseFixture.h
index 82b6be9a3..d63e09de6 100644
--- a/tests/array/utils/gtestBaseFixture.h
+++ b/tests/array/utils/gtestBaseFixture.h
@@ -65,12 +65,14 @@ class CArrayNdTestFixture : public ::testing::TestWithParam<testConfig> {
         config = zfp_config_reversible();
         break;
       }
+#if 0
       case zfp_mode_expert:
       {
         //TODO: do we need this one?
         //config = zfp_config_expert(uint minbits, uint maxbits, uint maxprec, int minexp);
         //break;
       }
+#endif
       default:
       {
         config = zfp_config_none();
diff --git a/tests/array/utils/predicates.h b/tests/array/utils/predicates.h
index 9ba726157..347143d96 100644
--- a/tests/array/utils/predicates.h
+++ b/tests/array/utils/predicates.h
@@ -1,4 +1,4 @@
-#include "array/zfparray1.h"
+#include "zfp/array1.hpp"
 
 #include "gtest/gtest.h"
 
diff --git a/tests/array/zfp/testAlignedMemory.cpp b/tests/array/zfp/testAlignedMemory.cpp
index 54871a5d9..07d34c1e4 100644
--- a/tests/array/zfp/testAlignedMemory.cpp
+++ b/tests/array/zfp/testAlignedMemory.cpp
@@ -1,4 +1,4 @@
-#include "array/zfparray3.h"
+#include "zfp/array3.hpp"
 using namespace zfp;
 
 #include "gtest/gtest.h"
@@ -6,7 +6,7 @@ using namespace zfp;
 #include "../utils/gtestSingleFixture.h"
 #include "../utils/predicates.h"
 
-#include <stdint.h>
+#include <cstdint>
 
 TestEnv* const testEnv = new TestEnv;
 
diff --git a/tests/cfp/testCfpArray_source.c b/tests/cfp/testCfpArray_source.c
index 718812530..6859350ce 100644
--- a/tests/cfp/testCfpArray_source.c
+++ b/tests/cfp/testCfpArray_source.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
@@ -8,7 +9,7 @@
 #include <math.h>
 #include <string.h>
 
-#include "cfparray.h"
+#include "zfp/array.h"
 #include "zfp.h"
 
 #include "utils/genSmoothRandNums.h"
diff --git a/tests/cfp/testCfpNamespace.c b/tests/cfp/testCfpNamespace.c
index 478c27bca..fe638bbf1 100644
--- a/tests/cfp/testCfpNamespace.c
+++ b/tests/cfp/testCfpNamespace.c
@@ -1,9 +1,10 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
-#include "cfparray.h"
+#include "zfp/array.h"
 
 /* only run this test when compiling with CFP_NAMESPACE=cfp2 */
 
diff --git a/tests/fortran/testFortran.f b/tests/fortran/testFortran.f
index 04b8bb83e..3e1771fca 100644
--- a/tests/fortran/testFortran.f
+++ b/tests/fortran/testFortran.f
@@ -1,5 +1,5 @@
 program main
-  use zFORp
+  use zfp
   use iso_c_binding
 
   ! loop counters
diff --git a/tests/gitlab/corona-jobs.yml b/tests/gitlab/corona-jobs.yml
index 06acdc85f..fe75a6e54 100644
--- a/tests/gitlab/corona-jobs.yml
+++ b/tests/gitlab/corona-jobs.yml
@@ -4,6 +4,7 @@
 
 rocm-3.10.0_build:
     variables:
+        ci_cmake: "cmake/3.21.1"
         ci_cmp_mod: "rocm/3.10.0"
         ci_cmp_path: "/opt/rocm-3.10.0/hip"
     extends: [.hip, .corona_build_gpu]
diff --git a/tests/gitlab/dane-jobs.yml b/tests/gitlab/dane-jobs.yml
new file mode 100644
index 000000000..77b05bac8
--- /dev/null
+++ b/tests/gitlab/dane-jobs.yml
@@ -0,0 +1,64 @@
+###########
+# CXX CPU #
+###########
+
+cpp_gnu-10.3.1_build:
+    variables:
+        ci_cmake: "cmake/3.23.1"
+        ci_cxx_cmp: "g++"
+        ci_c_cmp: "gcc"
+        ci_cmp_mod: "gcc/10.3.1"
+    extends: [.cpp, .dane_build_cpu]
+    needs: []
+
+cpp_gnu-10.3.1_test:
+    extends: [.dane_test_cpu]
+    needs: [cpp_gnu-10.3.1_build]
+
+
+cpp_clang-14.0.6_build:
+    variables:
+        ci_cmake: "cmake/3.23.1"
+        ci_cxx_cmp: "clang++"
+        ci_c_cmp: "clang"
+        ci_cmp_mod: "clang/14.0.6"
+    extends: [.cpp, .dane_build_cpu]
+    needs: []
+
+cpp_clang-14.0.6_test:
+    extends: [.dane_test_cpu]
+    needs: [cpp_clang-14.0.6_build]
+
+
+cpp_intel-2022.1.0_build:
+    variables:
+        ci_cmake: "cmake/3.23.1"
+        ci_cxx_cmp: "icpc"
+        ci_c_cmp: "icc"
+        ci_cmp_mod: "intel/2022.1.0"
+    extends: [.cpp, .dane_build_cpu]
+    needs: []
+
+cpp_intel-2022.1.0_test:
+    extends: [.dane_test_cpu]
+    needs: [cpp_intel-2022.1.0_build]
+
+
+
+#########
+# C CPU #
+#########
+
+c_gnu-10.3.1_build:
+    variables:
+        ci_cmake: "cmake/3.23.1"
+        ci_c_cmp: "gcc"
+        ci_cmp_mod: "gcc/10.3.1"
+    extends: [.c, .dane_build_cpu]
+    needs: []
+
+c_gnu-10.3.1_test:
+    variables:
+        ci_test_regex: "Cfp"  # overrides the "." default set by .dane_test_cpu
+    extends: [.dane_test_cpu]
+    needs: [c_gnu-10.3.1_build]
diff --git a/tests/gitlab/dane-templates.yml b/tests/gitlab/dane-templates.yml
new file mode 100644
index 000000000..147beb33b
--- /dev/null
+++ b/tests/gitlab/dane-templates.yml
@@ -0,0 +1,12 @@
+.dane_job:
+    tags:
+        - batch
+        - dane
+
+.dane_build_cpu:
+    extends: [.build_cpu, .dane_job]
+
+.dane_test_cpu:
+    variables:
+        ci_test_regex: "."  # default pattern; individual test jobs may override it
+    extends: [.test_cpu, .dane_job]
diff --git a/tests/gitlab/gitlab-ci.yml b/tests/gitlab/gitlab-ci.yml
index 7c48b8ead..3a7b92ac4 100644
--- a/tests/gitlab/gitlab-ci.yml
+++ b/tests/gitlab/gitlab-ci.yml
@@ -4,7 +4,8 @@
 
 variables:
     GIT_SUBMODULE_STRATEGY: recursive
-    LLNL_SLURM_SCHEDULER_PARAMETERS: "--nodes=1 -A asccasc"
+    LLNL_SLURM_SCHEDULER_PARAMETERS: "--nodes=1 -t 00:20:00"
+    LLNL_SERVICE_USER: zfp
 
 stages:
     - build
@@ -26,7 +27,17 @@ stages:
 
 .build_cpu:
     before_script:
+        - |-
+            if [ "$ci_c_cmp" != "gcc" ]; then  # non-gcc compilers borrow a gcc install
+                module --latest load gcc
+                if (( $(gcc -dumpversion | sed 's/\..*//') < 5 )); then  # major version only
+                    echo "unable to find new enough gcc to support ${ci_c_cmp} build"
+                    exit 1
+                fi
+                export GXX_PATH="$(dirname "$(which gcc)")/../"  # gcc install prefix, used by the cmake flags in script
+            fi
         - module reset
+        - module load $ci_cmake
         - module load $ci_cmp_mod
         - |-
             if [ "$ci_lang" == "cpp" ]; then
@@ -49,19 +60,27 @@ stages:
     script:
         - mkdir build
         - cd build
-        - cmake -DBUILD_TESTING=ON -DBUILD_UTILITIES=OFF -DZFP_WITH_CUDA=OFF ${ci_cmake_flags} ..
-        - make -j
+        - |-
+            export ci_cmake_cmp_flags=""
+            if [ "$ci_c_cmp" == "icc" ]; then
+                export ci_cmake_cmp_flags="-DCMAKE_CXX_FLAGS=-gcc-name=${GXX_PATH}/bin/gcc -DCMAKE_C_FLAGS=-gcc-name=${GXX_PATH}/bin/gcc"
+            elif [ "$ci_c_cmp" == "clang" ]; then
+                export ci_cmake_cmp_flags="-DCMAKE_CXX_FLAGS=--gcc-toolchain=${GXX_PATH} -DCMAKE_C_FLAGS=--gcc-toolchain=${GXX_PATH}"
+            fi
+        - cmake -DBUILD_TESTING_FULL=ON -DBUILD_UTILITIES=OFF -DZFP_WITH_CUDA=OFF ${ci_cmake_flags} ${ci_cmake_cmp_flags} ..
+        - cmake --build .
     extends: [.build]
 
 .build_gpu:
     before_script:
         - module reset
-        - module load opt
+        - module load $ci_cmake
         - module load $ci_cmp_mod
+        - module load $ci_gcc_mod
     script:
         - mkdir build
         - cd build
-        - cmake -DBUILD_TESTING=ON -DZFP_WITH_OPENMP=OFF -DBUILD_UTILITIES=OFF ${ci_cmake_flags} ..
+        - cmake -DBUILD_TESTING_FULL=ON -DZFP_WITH_OPENMP=OFF -DBUILD_UTILITIES=OFF ${ci_cmake_flags} ..
         - make -j
     extends: [.build]
 
@@ -116,9 +135,17 @@ stages:
 ############
 
 include:
-    - local: tests/gitlab/surface-templates.yml
-    - local: tests/gitlab/surface-jobs.yml
-    - local: tests/gitlab/pascal-templates.yml
-    - local: tests/gitlab/pascal-jobs.yml
+    - project: 'lc-templates/id_tokens'
+      file: 'id_tokens.yml'
+#    - local: tests/gitlab/pascal-templates.yml
+#    - local: tests/gitlab/pascal-jobs.yml
+#    - local: tests/gitlab/lassen-templates.yml
+#    - local: tests/gitlab/lassen-jobs.yml
+    - local: tests/gitlab/matrix-templates.yml
+    - local: tests/gitlab/matrix-jobs.yml
+    - local: tests/gitlab/dane-templates.yml
+    - local: tests/gitlab/dane-jobs.yml
+#    - local: tests/gitlab/quartz-templates.yml
+#    - local: tests/gitlab/quartz-jobs.yml
 #    - local: tests/gitlab/corona-templates.yml
 #    - local: tests/gitlab/corona-jobs.yml
diff --git a/tests/gitlab/lassen-jobs.yml b/tests/gitlab/lassen-jobs.yml
new file mode 100644
index 000000000..71951ea02
--- /dev/null
+++ b/tests/gitlab/lassen-jobs.yml
@@ -0,0 +1,17 @@
+############
+# CUDA GPU #
+############
+
+cuda-11.6.1_build:
+    variables:
+        ci_cmake: "cmake/3.14.5"
+        ci_cmp_mod: "cuda/11.6.1"
+        ci_gcc_mod: "gcc/8.3.1"
+    extends: [.cuda, .lassen_build_gpu]
+    needs: []
+
+cuda-11.6.1_test:
+    variables:
+        ci_test_regex: "Cuda"  # overrides the "." default set by .lassen_test_gpu
+    extends: [.lassen_test_gpu]
+    needs: [cuda-11.6.1_build]
diff --git a/tests/gitlab/lassen-templates.yml b/tests/gitlab/lassen-templates.yml
new file mode 100644
index 000000000..c636ba41e
--- /dev/null
+++ b/tests/gitlab/lassen-templates.yml
@@ -0,0 +1,12 @@
+.lassen_job:
+    tags:
+        - batch
+        - lassen
+
+.lassen_build_gpu:
+    extends: [.build_gpu, .lassen_job]
+
+.lassen_test_gpu:
+    variables:
+        ci_test_regex: "."  # default pattern; individual test jobs may override it
+    extends: [.test_gpu, .lassen_job]
diff --git a/tests/gitlab/matrix-jobs.yml b/tests/gitlab/matrix-jobs.yml
new file mode 100644
index 000000000..e5edf17f4
--- /dev/null
+++ b/tests/gitlab/matrix-jobs.yml
@@ -0,0 +1,17 @@
+############
+# CUDA GPU #
+############
+
+cuda-11.8.0_build:
+    variables:
+        ci_cmake: "cmake/3.23.1"
+        ci_cmp_mod: "cuda/11.8.0"
+        ci_gcc_mod: "gcc/10.3.1"
+    extends: [.cuda, .matrix_build_gpu]
+    needs: []
+
+cuda-11.8.0_test:
+    variables:
+        ci_test_regex: "Cuda"  # overrides the "." default set by .matrix_test_gpu
+    extends: [.matrix_test_gpu]
+    needs: [cuda-11.8.0_build]
diff --git a/tests/gitlab/matrix-templates.yml b/tests/gitlab/matrix-templates.yml
new file mode 100644
index 000000000..95180b879
--- /dev/null
+++ b/tests/gitlab/matrix-templates.yml
@@ -0,0 +1,12 @@
+.matrix_job:
+    tags:
+        - batch
+        - matrix
+
+.matrix_build_gpu:
+    extends: [.build_gpu, .matrix_job]
+
+.matrix_test_gpu:
+    variables:
+        ci_test_regex: "."  # default pattern; individual test jobs may override it
+    extends: [.test_gpu, .matrix_job]
diff --git a/tests/gitlab/pascal-jobs.yml b/tests/gitlab/pascal-jobs.yml
index 899dba02b..7d73f17a7 100644
--- a/tests/gitlab/pascal-jobs.yml
+++ b/tests/gitlab/pascal-jobs.yml
@@ -2,14 +2,16 @@
 # CUDA GPU #
 ############
 
-cuda-10.1.168_build:
+cuda-11.8.0_build:
     variables:
-        ci_cmp_mod: "cuda/10.1.168"
+        ci_cmake: "cmake/3.14.5"
+        ci_cmp_mod: "cuda/11.8.0"
+        ci_gcc_mod: "gcc/10.3.1"
     extends: [.cuda, .pascal_build_gpu]
     needs: []
 
-cuda-10.1.168_test:
+cuda-11.8.0_test:
     variables:
        ci_test_regex: "Cuda"
     extends: [.pascal_test_gpu]
-    needs: [cuda-10.1.168_build]
+    needs: [cuda-11.8.0_build]
diff --git a/tests/gitlab/quartz-jobs.yml b/tests/gitlab/quartz-jobs.yml
new file mode 100644
index 000000000..672c68a59
--- /dev/null
+++ b/tests/gitlab/quartz-jobs.yml
@@ -0,0 +1,64 @@
+###########
+# CXX CPU #
+###########
+
+cpp_gnu-10.3.1_build:
+    variables:
+        ci_cmake: "cmake/3.14.5"
+        ci_cxx_cmp: "g++"
+        ci_c_cmp: "gcc"
+        ci_cmp_mod: "gcc/10.3.1"
+    extends: [.cpp, .quartz_build_cpu]
+    needs: []
+
+cpp_gnu-10.3.1_test:
+    extends: [.quartz_test_cpu]
+    needs: [cpp_gnu-10.3.1_build]
+
+
+cpp_clang-14.0.6_build:
+    variables:
+        ci_cmake: "cmake/3.14.5"
+        ci_cxx_cmp: "clang++"
+        ci_c_cmp: "clang"
+        ci_cmp_mod: "clang/14.0.6"
+    extends: [.cpp, .quartz_build_cpu]
+    needs: []
+
+cpp_clang-14.0.6_test:
+    extends: [.quartz_test_cpu]
+    needs: [cpp_clang-14.0.6_build]
+
+
+cpp_intel-2022.1.0_build:
+    variables:
+        ci_cmake: "cmake/3.14.5"
+        ci_cxx_cmp: "icpc"
+        ci_c_cmp: "icc"
+        ci_cmp_mod: "intel/2022.1.0"
+    extends: [.cpp, .quartz_build_cpu]
+    needs: []
+
+cpp_intel-2022.1.0_test:
+    extends: [.quartz_test_cpu]
+    needs: [cpp_intel-2022.1.0_build]
+
+
+
+#########
+# C CPU #
+#########
+
+c_gnu-10.3.1_build:
+    variables:
+        ci_cmake: "cmake/3.14.5"
+        ci_c_cmp: "gcc"
+        ci_cmp_mod: "gcc/10.3.1"
+    extends: [.c, .quartz_build_cpu]
+    needs: []
+
+c_gnu-10.3.1_test:
+    variables:
+        ci_test_regex: "Cfp"  # overrides the "." default set by .quartz_test_cpu
+    extends: [.quartz_test_cpu]
+    needs: [c_gnu-10.3.1_build]
diff --git a/tests/gitlab/quartz-templates.yml b/tests/gitlab/quartz-templates.yml
new file mode 100644
index 000000000..d4d18533b
--- /dev/null
+++ b/tests/gitlab/quartz-templates.yml
@@ -0,0 +1,12 @@
+.quartz_job:
+    tags:
+        - batch
+        - quartz
+
+.quartz_build_cpu:
+    extends: [.build_cpu, .quartz_job]
+
+.quartz_test_cpu:
+    variables:
+        ci_test_regex: "."  # default pattern; individual test jobs may override it
+    extends: [.test_cpu, .quartz_job]
diff --git a/tests/gitlab/surface-jobs.yml b/tests/gitlab/surface-jobs.yml
deleted file mode 100644
index 5325c135a..000000000
--- a/tests/gitlab/surface-jobs.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-###########
-# CXX CPU #
-###########
-
-cpp_gnu-7.3.0_build:
-    variables:
-        ci_cxx_cmp: "g++"
-        ci_c_cmp: "gcc"
-        ci_cmp_mod: "gcc/7.3.0"
-    extends: [.cpp, .surface_build_cpu]
-    needs: []
-
-cpp_gnu-7.3.0_test:
-    extends: [.surface_test_cpu]
-    needs: [cpp_gnu-7.3.0_build]
-
-
-cpp_clang-10.0.0_build:
-    variables:
-        ci_cxx_cmp: "clang++"
-        ci_c_cmp: "clang"
-        ci_cmp_mod: "clang/10.0.0"
-    extends: [.cpp, .surface_build_cpu]
-    needs: []
-
-cpp_clang-10.0.0_test:
-    extends: [.surface_test_cpu]
-    needs: [cpp_clang-10.0.0_build]
-
-
-cpp_intel-19.0.4_build:
-    variables:
-        ci_cxx_cmp: "icpc"
-        ci_c_cmp: "icc"
-        ci_cmp_mod: "intel/19.0.4"
-    extends: [.cpp, .surface_build_cpu]
-    needs: []
-
-cpp_intel-19.0.4_test:
-    extends: [.surface_test_cpu]
-    needs: [cpp_intel-19.0.4_build]
-
-
-cpp_pgi-21.1_build:
-    variables:
-        ci_cxx_cmp: "pgc++"
-        ci_c_cmp: "pgcc"
-        ci_cmp_mod: "pgi/21.1"
-    extends: [.cpp, .surface_build_cpu]
-    needs: []
-
-cpp_pgi-21.1_test:
-    extends: [.surface_test_cpu]
-    needs: [cpp_pgi-21.1_build]
-
-
-#########
-# C CPU #
-#########
-
-c_gnu-7.3.0_build:
-    variables:
-        ci_c_cmp: "gcc"
-        ci_cmp_mod: "gcc/7.3.0"
-    extends: [.c, .surface_build_cpu]
-    needs: []
-
-c_gnu-7.3.0_test:
-    variables:
-       ci_test_regex: "Cfp"
-    extends: [.surface_test_cpu]
-    needs: [c_gnu-7.3.0_build]
diff --git a/tests/gitlab/surface-templates.yml b/tests/gitlab/surface-templates.yml
deleted file mode 100644
index ee838f3fc..000000000
--- a/tests/gitlab/surface-templates.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-.surface_job:
-    tags:
-        - batch
-        - surface
-
-.surface_build_cpu:
-    extends: [.build_cpu, .surface_job]
-
-.surface_test_cpu:
-    variables:
-       ci_test_regex: "."
-    extends: [.test_cpu, .surface_job]
-
-.surface_build_gpu:
-    extends: [.build_gpu, .surface_job]
-
-.surface_test_gpu:
-    variables:
-       ci_test_regex: "."
-    extends: [.test_gpu, .surface_job]
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index 89fce2d1b..589ac8afa 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -1,5 +1,8 @@
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.27.0)
+    cmake_policy(SET CMP0148 OLD)
+endif ()
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_SOURCE_DIR}/python/scikit-build-cmake)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_SOURCE_DIR}/python/eyescale-cmake)
 
 find_package(PythonInterp REQUIRED)
 find_package(PythonLibs REQUIRED)
@@ -9,12 +12,12 @@ find_package(NumPy REQUIRED)
 
 include_directories(${ZFP_SOURCE_DIR}/include)
 include_directories(${ZFP_SOURCE_DIR}/python)
-include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
+include_directories(${NumPy_INCLUDE_DIR})
 
 include_directories(${ZFP_SOURCE_DIR}/tests/python)
 include_directories(${ZFP_SOURCE_DIR}/tests/utils)
 include_directories(${ZFP_SOURCE_DIR})
-add_cython_target(test_utils test_utils.pyx C)
+add_cython_target(test_utils test_utils.pyx C PY3)
 add_library(test_utils MODULE ${test_utils})
 target_link_libraries(test_utils zfp genSmoothRandNumsLib stridedOperationsLib zfpCompressionParamsLib zfpChecksumsLib zfpHashLib)
 python_extension_module(test_utils)
diff --git a/tests/python/test_numpy.py b/tests/python/test_numpy.py
index 1e52abcd5..189aa430b 100644
--- a/tests/python/test_numpy.py
+++ b/tests/python/test_numpy.py
@@ -10,6 +10,11 @@
 except ImportError:
     version_parse = None
 
+def test_zfpy_version():
+    # Check the type first: bytes.split('.') would raise TypeError and mask this failure
+    assert isinstance(zfpy.__version__, str)
+    # Just ensure that the version contains 3 components separated by dots
+    assert len(zfpy.__version__.split('.')) == 3
 
 class TestNumpy(unittest.TestCase):
     def lossless_round_trip(self, orig_array):
@@ -34,7 +39,9 @@ def test_different_dtypes(self):
         for dtype in [np.float32, np.float64]:
             elements = np.random.random_sample(num_elements)
             elements = elements.astype(dtype, casting="same_kind")
-            array = np.reshape(elements, newshape=shape)
+            # pass shape positionally: the keyword was renamed from
+            # newshape to shape in NumPy 2.1; positional works on both
+            array = np.reshape(elements, shape)
             self.lossless_round_trip(array)
 
         if (version_parse is not None and
@@ -48,6 +55,46 @@ def test_different_dtypes(self):
             self.lossless_round_trip(array)
 
     def test_advanced_decompression_checksum(self):
+        ndims = 2
+        ztype = zfpy.type_float
+        random_array = test_utils.getRandNumpyArray(ndims, ztype)
+        mode = zfpy.mode_fixed_accuracy
+        compress_param_num = 1
+        compression_kwargs = {
+            "tolerance": test_utils.computeParameterValue(
+                mode,
+                compress_param_num
+            ),
+        }
+        compressed_array = zfpy.compress_numpy(
+            random_array,
+            write_header=False,
+            **compression_kwargs
+        )
+
+        # Decompression using the "advanced" interface which enforces no header,
+        # and the user must provide all the metadata
+        decompressed_array = np.empty_like(random_array)
+        zfpy._decompress(
+            compressed_array,
+            ztype,
+            random_array.shape,
+            out=decompressed_array,
+            **compression_kwargs
+        )
+        decompressed_array_dims = decompressed_array.shape + tuple(0 for i in range(4 - decompressed_array.ndim))
+        decompressed_checksum = test_utils.getChecksumDecompArray(
+            decompressed_array_dims,
+            ztype,
+            mode,
+            compress_param_num
+        )
+        actual_checksum = test_utils.hashNumpyArray(
+            decompressed_array
+        )
+        self.assertEqual(decompressed_checksum, actual_checksum)
+
+    def test_memview_advanced_decompression_checksum(self):
         ndims = 2
         ztype = zfpy.type_float
         random_array = test_utils.getRandNumpyArray(ndims, ztype)
diff --git a/tests/python/test_utils.pyx b/tests/python/test_utils.pyx
index 2915b4a9b..e792e61e2 100644
--- a/tests/python/test_utils.pyx
+++ b/tests/python/test_utils.pyx
@@ -2,6 +2,7 @@
 import cython
 from libc.stdlib cimport malloc, free
 cimport libc.stdint as stdint
+from libc.stddef cimport ptrdiff_t
 from cython cimport view
 from itertools import islice, repeat, chain
 
@@ -110,29 +111,6 @@ cdef extern from "zfpChecksums.h":
                               zfp_type type,
                               uint64_t key1,
                               uint64_t key2)
-    uint64_t getChecksumOriginalDataBlock(int dims,
-                                          zfpy.zfp_type type)
-    uint64_t getChecksumEncodedBlock(int dims,
-                                     zfpy.zfp_type type)
-    uint64_t getChecksumEncodedPartialBlock(int dims,
-                                            zfpy.zfp_type type)
-    uint64_t getChecksumDecodedBlock(int dims,
-                                     zfpy.zfp_type type)
-    uint64_t getChecksumDecodedPartialBlock(int dims,
-                                            zfpy.zfp_type type)
-    uint64_t getChecksumOriginalDataArray(int ndims,
-                                          size_t[4] dims,
-                                          zfpy.zfp_type type)
-    uint64_t getChecksumCompressedBitstream(int ndims,
-                                            size_t[4] dims,
-                                            zfpy.zfp_type type,
-                                            zfpy.zfp_mode mode,
-                                            int compressParamNum)
-    uint64_t getChecksumDecompressedArray(int ndims,
-                                          size_t[4] dims,
-                                          zfpy.zfp_type ztype,
-                                          zfpy.zfp_mode mode,
-                                          int compressParamNum)
 
 cdef extern from "zfpHash.h":
     uint64_t hashBitstream(uint64_t* ptrStart,
diff --git a/tests/src/decode/zfpDecodeBlockBase.c b/tests/src/decode/zfpDecodeBlockBase.c
index caeb90a36..7691d7aaf 100644
--- a/tests/src/decode/zfpDecodeBlockBase.c
+++ b/tests/src/decode/zfpDecodeBlockBase.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/decode/zfpDecodeBlockStridedBase.c b/tests/src/decode/zfpDecodeBlockStridedBase.c
index cf921225b..0a1151c8a 100644
--- a/tests/src/decode/zfpDecodeBlockStridedBase.c
+++ b/tests/src/decode/zfpDecodeBlockStridedBase.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/encode/zfpEncodeBlockBase.c b/tests/src/encode/zfpEncodeBlockBase.c
index 23917c622..5d72a6fc3 100644
--- a/tests/src/encode/zfpEncodeBlockBase.c
+++ b/tests/src/encode/zfpEncodeBlockBase.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/encode/zfpEncodeBlockStridedBase.c b/tests/src/encode/zfpEncodeBlockStridedBase.c
index 10294ac21..2e332fc1d 100644
--- a/tests/src/encode/zfpEncodeBlockStridedBase.c
+++ b/tests/src/encode/zfpEncodeBlockStridedBase.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/endtoend/CMakeLists.txt b/tests/src/endtoend/CMakeLists.txt
index 3d386c249..5c9aca20f 100644
--- a/tests/src/endtoend/CMakeLists.txt
+++ b/tests/src/endtoend/CMakeLists.txt
@@ -20,12 +20,11 @@ function(zfp_add_test dims type bits)
   if(ZFP_WITH_OPENMP)
     set(omp_test_name testZfpOmp${dims}d${type})
     add_executable(${omp_test_name} ${omp_test_name}.c)
-    target_compile_options(${omp_test_name} PRIVATE ${OpenMP_C_FLAGS})
     target_compile_definitions(${omp_test_name} PRIVATE ${zfp_private_defs})
     target_link_libraries(${omp_test_name}
       cmocka zfp zfpHashLib genSmoothRandNumsLib stridedOperationsLib
       zfpChecksumsLib zfpTimerLib zfpCompressionParamsLib
-      ${OpenMP_C_LIBRARIES})
+      OpenMP::OpenMP_C)
     if(HAVE_LIBM_MATH)
       target_link_libraries(${omp_test_name} m)
     endif()
@@ -34,7 +33,7 @@ function(zfp_add_test dims type bits)
   endif()
 
   if(NOT DEFINED ZFP_OMP_TESTS_ONLY)
-    if(ZFP_WITH_CUDA AND (${dims} LESS 4))
+    if(ZFP_WITH_CUDA)
       add_definitions(-DZFP_WITH_CUDA)
 
       set(cuda_test_name testZfpCuda${dims}d${type})
diff --git a/tests/src/endtoend/cudaExecBase.c b/tests/src/endtoend/cudaExecBase.c
index 510ac199a..6e6750324 100644
--- a/tests/src/endtoend/cudaExecBase.c
+++ b/tests/src/endtoend/cudaExecBase.c
@@ -42,9 +42,9 @@ runZfpCompressDecompressIsNoop(void **state)
   bitstream* s = zfp_stream_bit_stream(stream);
 
   // grab bitstream member vars
-  uint bits = s->bits;
-  word buffer = s->buffer;
-  word* ptr = s->ptr;
+  bitstream_count bits = s->bits;
+  bitstream_word buffer = s->buffer;
+  bitstream_word* ptr = s->ptr;
   size_t streamSize = stream_size(s);
 
   // perform compression, expect bitstream not to advance
@@ -154,6 +154,14 @@ _catFunc3(given_, DESCRIPTOR, InterleavedArray_when_ZfpCompressFixedRate_expect_
   runCompressDecompressNoopTest(state, zfp_mode_fixed_rate);
 }
 
+#if DIMS == 4
+static void
+_catFunc3(given_Cuda_, DIM_INT_STR, Array_when_ZfpCompressDecompress_expect_BitstreamUntouchedAndReturnsZero)(void **state)
+{
+  runCompressDecompressNoopTest(state, zfp_mode_fixed_rate);
+}
+#endif
+
 /* setup functions */
 
 static int
diff --git a/tests/src/endtoend/ompExecBase.c b/tests/src/endtoend/ompExecBase.c
index 986286630..926f61a27 100644
--- a/tests/src/endtoend/ompExecBase.c
+++ b/tests/src/endtoend/ompExecBase.c
@@ -20,10 +20,13 @@ computeTotalBlocks(zfp_field* field)
   switch (zfp_field_dimensionality(field)) {
     case 4:
       bw = (field->nw + 3) / 4;
+      fallthrough_
     case 3:
       bz = (field->nz + 3) / 4;
+      fallthrough_
     case 2:
       by = (field->ny + 3) / 4;
+      fallthrough_
     case 1:
       bx = (field->nx + 3) / 4;
       return bx * by * bz * bw;
diff --git a/tests/src/endtoend/serialExecBase.c b/tests/src/endtoend/serialExecBase.c
index 95dfab394..28d7f861b 100644
--- a/tests/src/endtoend/serialExecBase.c
+++ b/tests/src/endtoend/serialExecBase.c
@@ -94,7 +94,7 @@ isCompressedBitrateComparableToChosenRate(struct setupVars* bundle)
   zfp_field* field = bundle->field;
   zfp_stream* stream = bundle->stream;
 
-  // integer arithemetic allows exact comparison
+  // integer arithmetic allows exact comparison
   size_t compressedBytes = zfp_compress(stream, field);
   if (compressedBytes == 0) {
     printf("Compression failed\n");
@@ -112,12 +112,16 @@ isCompressedBitrateComparableToChosenRate(struct setupVars* bundle)
   switch (DIMS) {
     case 4:
       paddedArrayLen *= paddedNw;
+      fallthrough_
     case 3:
       paddedArrayLen *= paddedNz;
+      fallthrough_
     case 2:
       paddedArrayLen *= paddedNy;
+      fallthrough_
     case 1:
       paddedArrayLen *= paddedNx;
+      break;
   }
 
   // expect bitrate to scale wrt padded array length
diff --git a/tests/src/endtoend/testZfpCuda4dDouble.c b/tests/src/endtoend/testZfpCuda4dDouble.c
new file mode 100644
index 000000000..afd05944f
--- /dev/null
+++ b/tests/src/endtoend/testZfpCuda4dDouble.c
@@ -0,0 +1,13 @@
+#include "src/encode4d.c"
+
+#include "constants/4dDouble.h"
+#include "cudaExecBase.c"
+
+int main()
+{
+  const struct CMUnitTest tests[] = {
+    #include "testcases/cuda.c"
+  };
+
+  return cmocka_run_group_tests(tests, setupRandomData, teardownRandomData);
+}
diff --git a/tests/src/endtoend/testZfpCuda4dFloat.c b/tests/src/endtoend/testZfpCuda4dFloat.c
new file mode 100644
index 000000000..d0ce3ba89
--- /dev/null
+++ b/tests/src/endtoend/testZfpCuda4dFloat.c
@@ -0,0 +1,13 @@
+#include "src/encode4f.c"
+
+#include "constants/4dFloat.h"
+#include "cudaExecBase.c"
+
+int main()
+{
+  const struct CMUnitTest tests[] = {
+    #include "testcases/cuda.c"
+  };
+
+  return cmocka_run_group_tests(tests, setupRandomData, teardownRandomData);
+}
diff --git a/tests/src/endtoend/testZfpCuda4dInt32.c b/tests/src/endtoend/testZfpCuda4dInt32.c
new file mode 100644
index 000000000..bad538fb6
--- /dev/null
+++ b/tests/src/endtoend/testZfpCuda4dInt32.c
@@ -0,0 +1,13 @@
+#include "src/encode4i.c"
+
+#include "constants/4dInt32.h"
+#include "cudaExecBase.c"
+
+int main()
+{
+  const struct CMUnitTest tests[] = {
+    #include "testcases/cuda.c"
+  };
+
+  return cmocka_run_group_tests(tests, setupRandomData, teardownRandomData);
+}
diff --git a/tests/src/endtoend/testZfpCuda4dInt64.c b/tests/src/endtoend/testZfpCuda4dInt64.c
new file mode 100644
index 000000000..eb7c24d21
--- /dev/null
+++ b/tests/src/endtoend/testZfpCuda4dInt64.c
@@ -0,0 +1,13 @@
+#include "src/encode4l.c"
+
+#include "constants/4dInt64.h"
+#include "cudaExecBase.c"
+
+int main()
+{
+  const struct CMUnitTest tests[] = {
+    #include "testcases/cuda.c"
+  };
+
+  return cmocka_run_group_tests(tests, setupRandomData, teardownRandomData);
+}
diff --git a/tests/src/endtoend/testcases/cuda.c b/tests/src/endtoend/testcases/cuda.c
index a2e784eb5..0af0341f0 100644
--- a/tests/src/endtoend/testcases/cuda.c
+++ b/tests/src/endtoend/testcases/cuda.c
@@ -1,5 +1,6 @@
 // requires #include "utils/testMacros.h", do outside of main()
 
+#if DIMS < 4
 _cmocka_unit_test(when_seededRandomSmoothDataGenerated_expect_ChecksumMatches),
 
 /* strided */
@@ -17,3 +18,7 @@ _cmocka_unit_test_setup_teardown(_catFunc3(given_Cuda_, DIM_INT_STR, Array_when_
 
 /* non fixed-rate modes unsupported */
 _cmocka_unit_test_setup_teardown(_catFunc3(given_Cuda_, DIM_INT_STR, Array_when_ZfpCompressDecompressNonFixedRate_expect_BitstreamUntouchedAndReturnsZero), setupDefaultStride, teardown),
+#else
+/* 4d compression unsupported */
+_cmocka_unit_test_setup_teardown(_catFunc3(given_Cuda_, DIM_INT_STR, Array_when_ZfpCompressDecompress_expect_BitstreamUntouchedAndReturnsZero), setupDefaultStride, teardown),
+#endif
diff --git a/tests/src/endtoend/zfpEndtoendBase.c b/tests/src/endtoend/zfpEndtoendBase.c
index 5069824f9..76ddf3c0c 100644
--- a/tests/src/endtoend/zfpEndtoendBase.c
+++ b/tests/src/endtoend/zfpEndtoendBase.c
@@ -1,5 +1,6 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/execPolicy/CMakeLists.txt b/tests/src/execPolicy/CMakeLists.txt
index f37e16acc..5e49a46c3 100644
--- a/tests/src/execPolicy/CMakeLists.txt
+++ b/tests/src/execPolicy/CMakeLists.txt
@@ -2,16 +2,13 @@ add_executable(testOmp testOmp.c)
 target_link_libraries(testOmp cmocka zfp)
 add_test(NAME testOmp COMMAND testOmp)
 if(ZFP_WITH_OPENMP)
-  target_compile_options(testOmp PRIVATE ${OpenMP_C_FLAGS})
-  target_link_libraries(testOmp ${OpenMP_C_LIBRARIES})
+  target_link_libraries(testOmp OpenMP::OpenMP_C)
   set_property(TEST testOmp PROPERTY RUN_SERIAL TRUE)
 endif()
 
 if(ZFP_WITH_OPENMP)
   add_executable(testOmpInternal testOmpInternal.c)
-  target_compile_options(testOmpInternal PRIVATE ${OpenMP_C_FLAGS})
-  target_link_libraries(testOmpInternal
-    cmocka zfp ${OpenMP_C_FLAGS} ${OpenMP_C_LIBRARIES})
+  target_link_libraries(testOmpInternal cmocka zfp OpenMP::OpenMP_C)
   add_test(NAME testOmpInternal COMMAND testOmpInternal)
 endif()
 
diff --git a/tests/src/execPolicy/testCuda.c b/tests/src/execPolicy/testCuda.c
index 83d2d1fc4..99640109c 100644
--- a/tests/src/execPolicy/testCuda.c
+++ b/tests/src/execPolicy/testCuda.c
@@ -2,6 +2,7 @@
 
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/execPolicy/testOmp.c b/tests/src/execPolicy/testOmp.c
index 29ab5d57b..4fffffbd1 100644
--- a/tests/src/execPolicy/testOmp.c
+++ b/tests/src/execPolicy/testOmp.c
@@ -2,6 +2,7 @@
 
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/execPolicy/testOmpInternal.c b/tests/src/execPolicy/testOmpInternal.c
index b8eac13ef..1436a6eca 100644
--- a/tests/src/execPolicy/testOmpInternal.c
+++ b/tests/src/execPolicy/testOmpInternal.c
@@ -1,9 +1,10 @@
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 #include "src/share/omp.c"
 
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
diff --git a/tests/src/inline/testBitstream.c b/tests/src/inline/testBitstream.c
index 96ac9c551..d6e559159 100644
--- a/tests/src/inline/testBitstream.c
+++ b/tests/src/inline/testBitstream.c
@@ -1,14 +1,16 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
-#include "include/bitstream.h"
-#include "src/inline/bitstream.c"
+#include "zfp/internal/zfp/inline.h"
+#include "zfp/bitstream.h"
+#include "zfp/bitstream.inl"
 
 #define STREAM_WORD_CAPACITY 3
 
-#define WORD_MASK ((word)(-1))
+#define WORD_MASK ((bitstream_word)(-1))
 #define WORD1 WORD_MASK
 #define WORD2 (0x5555555555555555 & WORD_MASK)
 
@@ -23,10 +25,10 @@ setup(void **state)
   struct setupVars *s = malloc(sizeof(struct setupVars));
   assert_non_null(s);
 
-  s->buffer = calloc(STREAM_WORD_CAPACITY, sizeof(word));
+  s->buffer = calloc(STREAM_WORD_CAPACITY, sizeof(bitstream_word));
   assert_non_null(s->buffer);
 
-  s->b = stream_open(s->buffer, STREAM_WORD_CAPACITY * sizeof(word));
+  s->b = stream_open(s->buffer, STREAM_WORD_CAPACITY * sizeof(bitstream_word));
   assert_non_null(s->b);
 
   *state = s;
@@ -53,11 +55,11 @@ when_StreamCopy_expect_BitsCopiedToDestBitstream(void **state)
   const uint COPY_BITS = wsize + 4;
 
   const uint NUM_WORD2_BITS_WRITTEN_TO_WORD = DST_OFFSET + (wsize - SRC_OFFSET);
-  const word EXPECTED_WRITTEN_WORD = ((WORD1 >> SRC_OFFSET) << DST_OFFSET)
-                                     + (WORD2 << NUM_WORD2_BITS_WRITTEN_TO_WORD);
+  const bitstream_word EXPECTED_WRITTEN_WORD = ((WORD1 >> SRC_OFFSET) << DST_OFFSET)
+                                               + (WORD2 << NUM_WORD2_BITS_WRITTEN_TO_WORD);
   const uint EXPECTED_BITS = (DST_OFFSET + COPY_BITS) % wsize;
-  const word EXPECTED_BUFFER = (WORD2 >> (NUM_WORD2_BITS_WRITTEN_TO_WORD))
-                               & ((1u << EXPECTED_BITS) - 1);
+  const bitstream_word EXPECTED_BUFFER = (WORD2 >> (NUM_WORD2_BITS_WRITTEN_TO_WORD))
+                                         & ((1u << EXPECTED_BITS) - 1);
 
   bitstream* src = ((struct setupVars *)*state)->b;
   stream_write_word(src, WORD1);
@@ -65,8 +67,8 @@ when_StreamCopy_expect_BitsCopiedToDestBitstream(void **state)
   stream_flush(src);
   stream_rseek(src, SRC_OFFSET);
 
-  void* buffer = calloc(STREAM_WORD_CAPACITY, sizeof(word));
-  bitstream* dst = stream_open(buffer, STREAM_WORD_CAPACITY * sizeof(word));
+  void* buffer = calloc(STREAM_WORD_CAPACITY, sizeof(bitstream_word));
+  bitstream* dst = stream_open(buffer, STREAM_WORD_CAPACITY * sizeof(bitstream_word));
   stream_wseek(dst, DST_OFFSET);
 
   stream_copy(dst, src, COPY_BITS);
@@ -91,7 +93,7 @@ when_Flush_expect_PaddedWordWrittenToStream(void **state)
 
   stream_rewind(s);
   stream_write_bits(s, WORD2, PREV_BUFFER_BIT_COUNT);
-  word *prevPtr = s->ptr;
+  bitstream_word *prevPtr = s->ptr;
 
   size_t padCount = stream_flush(s);
 
@@ -105,9 +107,9 @@ static void
 given_EmptyBuffer_when_Flush_expect_NOP(void **state)
 {
   bitstream* s = ((struct setupVars *)*state)->b;
-  word *prevPtr = s->ptr;
-  uint prevBits = s->bits;
-  word prevBuffer = s->buffer;
+  bitstream_word *prevPtr = s->ptr;
+  bitstream_count prevBits = s->bits;
+  bitstream_word prevBuffer = s->buffer;
 
   size_t padCount = stream_flush(s);
 
@@ -128,7 +130,7 @@ when_Align_expect_BufferEmptyBitsZero(void **state)
 
   stream_rewind(s);
   stream_read_bits(s, READ_BIT_COUNT);
-  word *prevPtr = s->ptr;
+  bitstream_word *prevPtr = s->ptr;
 
   stream_align(s);
 
@@ -144,7 +146,7 @@ when_SkipPastBufferEnd_expect_NewMaskedWordInBuffer(void **state)
   const uint SKIP_COUNT = wsize + 5;
   const uint TOTAL_OFFSET = READ_BIT_COUNT + SKIP_COUNT;
   const uint EXPECTED_BITS = wsize - (TOTAL_OFFSET % wsize);
-  const word EXPECTED_BUFFER = WORD2 >> (TOTAL_OFFSET % wsize);
+  const bitstream_word EXPECTED_BUFFER = WORD2 >> (TOTAL_OFFSET % wsize);
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, wsize);
@@ -167,14 +169,14 @@ when_SkipWithinBuffer_expect_MaskedBuffer(void **state)
   const uint SKIP_COUNT = 5;
   const uint TOTAL_OFFSET = READ_BIT_COUNT + SKIP_COUNT;
   const uint EXPECTED_BITS = wsize - (TOTAL_OFFSET % wsize);
-  const word EXPECTED_BUFFER = WORD1 >> (TOTAL_OFFSET % wsize);
+  const bitstream_word EXPECTED_BUFFER = WORD1 >> (TOTAL_OFFSET % wsize);
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, wsize);
 
   stream_rewind(s);
   stream_read_bits(s, READ_BIT_COUNT);
-  word *prevPtr = s->ptr;
+  bitstream_word *prevPtr = s->ptr;
 
   stream_skip(s, SKIP_COUNT);
 
@@ -193,9 +195,9 @@ when_SkipZeroBits_expect_NOP(void **state)
   stream_rewind(s);
   stream_read_bits(s, 2);
 
-  word* prevPtr = s->ptr;
-  word prevBits = s->bits;
-  word prevBuffer = s->buffer;
+  bitstream_word* prevPtr = s->ptr;
+  bitstream_count prevBits = s->bits; // s->bits is a bit count; same type as the other tests in this file use
+  bitstream_word prevBuffer = s->buffer;
 
   stream_skip(s, 0);
 
@@ -209,7 +211,7 @@ when_RseekToNonMultipleOfWsize_expect_MaskedWordLoadedToBuffer(void **state)
 {
   const uint BIT_OFFSET = wsize + 5;
   const uint EXPECTED_BITS = wsize - (BIT_OFFSET % wsize);
-  const word EXPECTED_BUFFER = WORD2 >> (BIT_OFFSET % wsize);
+  const bitstream_word EXPECTED_BUFFER = WORD2 >> (BIT_OFFSET % wsize);
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, wsize);
@@ -240,7 +242,7 @@ static void
 when_WseekToNonMultipleOfWsize_expect_MaskedWordLoadedToBuffer(void **state)
 {
   const uint BIT_OFFSET = wsize + 5;
-  const word MASK = 0x1f;
+  const bitstream_word MASK = 0x1f;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, wsize);
@@ -305,8 +307,8 @@ when_ReadBitsSpreadsAcrossTwoWords_expect_BitsCombinedFromBothWords(void **state
   const uint NUM_OVERFLOWED_BITS = READ_BIT_COUNT - PARTIAL_WORD_BIT_COUNT;
   const uint EXPECTED_BUFFER_BIT_COUNT = wsize - NUM_OVERFLOWED_BITS;
 
-  const word PARTIAL_WORD1 = WORD1 & 0xffff;
-  const word PARTIAL_WORD2 = WORD2 & 0x1fffffffffff << PARTIAL_WORD_BIT_COUNT;
+  const bitstream_word PARTIAL_WORD1 = WORD1 & 0xffff;
+  const bitstream_word PARTIAL_WORD2 = WORD2 & 0x1fffffffffff << PARTIAL_WORD_BIT_COUNT;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, PARTIAL_WORD1, wsize);
@@ -347,7 +349,7 @@ static void
 when_ReadBits_expect_BitsReadInOrderLSB(void **state)
 {
   const uint BITS_TO_READ = 2;
-  const word MASK = 0x3;
+  const bitstream_word MASK = 0x3;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   s->buffer = WORD2;
@@ -383,9 +385,9 @@ when_WriteBitsOverflowsBuffer_expect_OverflowWrittenToNewBuffer(void **state)
   const uint NUM_BITS_TO_WRITE = wsize - 1;
   const uint OVERFLOW_BIT_COUNT = NUM_BITS_TO_WRITE - (wsize - EXISTING_BIT_COUNT);
   // 0x1101 0101 0101 ... 0101 allows stream_write_bit() to return non-zero
-  const word WORD_TO_WRITE = WORD2 + 0x8000000000000000;
-  const word OVERFLOWED_BITS = WORD_TO_WRITE >> (wsize - EXISTING_BIT_COUNT);
-  const word EXPECTED_BUFFER_RESULT = OVERFLOWED_BITS & 0xf;
+  const bitstream_word WORD_TO_WRITE = WORD2 + 0x8000000000000000;
+  const bitstream_word OVERFLOWED_BITS = WORD_TO_WRITE >> (wsize - EXISTING_BIT_COUNT);
+  const bitstream_word EXPECTED_BUFFER_RESULT = OVERFLOWED_BITS & 0xf;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, EXISTING_BIT_COUNT);
@@ -402,14 +404,14 @@ when_WriteBitsFillsBufferExactly_expect_WordWrittenToStream(void **state)
 {
   const uint EXISTING_BIT_COUNT = 5;
   const uint NUM_BITS_TO_WRITE = wsize - EXISTING_BIT_COUNT;
-  const word COMPLETING_WORD = WORD2 & 0x07ffffffffffffff;
+  const bitstream_word COMPLETING_WORD = WORD2 & 0x07ffffffffffffff;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bits(s, WORD1, EXISTING_BIT_COUNT);
   uint64 remainingBits = stream_write_bits(s, COMPLETING_WORD, NUM_BITS_TO_WRITE);
 
   stream_rewind(s);
-  word readWord = stream_read_word(s);
+  bitstream_word readWord = stream_read_word(s);
 
   assert_int_equal(readWord, 0x1f + 0xaaaaaaaaaaaaaaa0);
   assert_int_equal(remainingBits, 0);
@@ -464,8 +466,8 @@ given_BitstreamWithBitInBuffer_when_ReadBit_expect_OneBitReadFromLSB(void **stat
   bitstream* s = ((struct setupVars *)*state)->b;
   stream_write_bit(s, 1);
 
-  uint prevBits = s->bits;
-  word prevBuffer = s->buffer;
+  bitstream_count prevBits = s->bits;
+  bitstream_word prevBuffer = s->buffer;
 
   assert_int_equal(stream_read_bit(s), 1);
   assert_int_equal(s->bits, prevBits - 1);
@@ -482,8 +484,8 @@ given_BitstreamBufferOneBitFromFull_when_WriteBit_expect_BitWrittenToBufferWritt
 
   stream_write_bit(s, 1);
 
-  assert_int_equal(stream_size(s), sizeof(word));
-  assert_int_equal(*s->begin, (word)1 << PLACE);
+  assert_int_equal(stream_size(s), sizeof(bitstream_word));
+  assert_int_equal(*s->begin, (bitstream_word)1 << PLACE);
   assert_int_equal(s->buffer, 0);
 }
 
@@ -498,7 +500,7 @@ when_WriteBit_expect_BitWrittenToBufferFromLSB(void **state)
   stream_write_bit(s, 1);
 
   assert_int_equal(s->bits, PLACE + 1);
-  assert_int_equal(s->buffer, (word)1 << PLACE);
+  assert_int_equal(s->buffer, (bitstream_word)1 << PLACE);
 }
 
 static void
@@ -506,7 +508,7 @@ given_StartedBuffer_when_StreamPadOverflowsBuffer_expect_ProperWordsWritten(void
 {
   const uint NUM_WORDS = 2;
   const uint EXISTING_BIT_COUNT = 12;
-  const word EXISTING_BUFFER = 0xfff;
+  const bitstream_word EXISTING_BUFFER = 0xfff;
   const uint PAD_AMOUNT = NUM_WORDS * wsize - EXISTING_BIT_COUNT;
 
   bitstream* s = ((struct setupVars *)*state)->b;
@@ -520,7 +522,7 @@ given_StartedBuffer_when_StreamPadOverflowsBuffer_expect_ProperWordsWritten(void
 
   stream_pad(s, PAD_AMOUNT);
 
-  assert_int_equal(stream_size(s), prevStreamSize + NUM_WORDS * sizeof(word));
+  assert_int_equal(stream_size(s), prevStreamSize + NUM_WORDS * sizeof(bitstream_word));
   stream_rewind(s);
   assert_int_equal(stream_read_word(s), EXISTING_BUFFER);
   assert_int_equal(stream_read_word(s), 0);
@@ -530,7 +532,7 @@ static void
 given_StartedBuffer_when_StreamPad_expect_PaddedWordWritten(void **state)
 {
   const uint EXISTING_BIT_COUNT = 12;
-  const word EXISTING_BUFFER = 0xfff;
+  const bitstream_word EXISTING_BUFFER = 0xfff;
 
   bitstream* s = ((struct setupVars *)*state)->b;
   s->buffer = EXISTING_BUFFER;
@@ -539,7 +541,7 @@ given_StartedBuffer_when_StreamPad_expect_PaddedWordWritten(void **state)
 
   stream_pad(s, wsize - EXISTING_BIT_COUNT);
 
-  assert_int_equal(stream_size(s), prevStreamSize + sizeof(word));
+  assert_int_equal(stream_size(s), prevStreamSize + sizeof(bitstream_word));
   stream_rewind(s);
   assert_int_equal(stream_read_word(s), EXISTING_BUFFER);
 }
@@ -586,7 +588,7 @@ when_WriteTwoWords_expect_WordsWrittenToStreamConsecutively(void **state)
   stream_write_word(s, WORD1);
   stream_write_word(s, WORD2);
 
-  assert_int_equal(stream_size(s), sizeof(word) * 2);
+  assert_int_equal(stream_size(s), sizeof(bitstream_word) * 2);
   assert_int_equal(*s->begin, WORD1);
   assert_int_equal(*(s->begin + 1), WORD2);
 }
@@ -599,7 +601,7 @@ given_RewoundBitstream_when_WriteWord_expect_WordWrittenAtStreamBegin(void **sta
 
   stream_write_word(s, WORD1);
 
-  assert_int_equal(stream_size(s), prevStreamSize + sizeof(word));
+  assert_int_equal(stream_size(s), prevStreamSize + sizeof(bitstream_word));
   assert_int_equal(*s->begin, WORD1);
 }
 
@@ -608,12 +610,12 @@ when_BitstreamOpened_expect_ProperLengthAndBoundaries(void **state)
 {
   const int NUM_WORDS = 4;
 
-  size_t bufferLenBytes = sizeof(word) * NUM_WORDS;
+  size_t bufferLenBytes = sizeof(bitstream_word) * NUM_WORDS;
   void* buffer = malloc(bufferLenBytes);
   bitstream* s = stream_open(buffer, bufferLenBytes);
 
   void* streamBegin = stream_data(s);
-  void* computedStreamEnd = (word*)streamBegin + NUM_WORDS;
+  void* computedStreamEnd = (bitstream_word*)streamBegin + NUM_WORDS;
 
   assert_ptr_equal(streamBegin, buffer);
   assert_ptr_equal(s->end, computedStreamEnd);
diff --git a/tests/src/inline/testBitstreamSmallWsize.c b/tests/src/inline/testBitstreamSmallWsize.c
index 3f0c966dd..e84c72af1 100644
--- a/tests/src/inline/testBitstreamSmallWsize.c
+++ b/tests/src/inline/testBitstreamSmallWsize.c
@@ -1,12 +1,14 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
 #define BIT_STREAM_WORD_TYPE uint16
 
-#include "include/bitstream.h"
-#include "src/inline/bitstream.c"
+#include "zfp/internal/zfp/inline.h"
+#include "zfp/bitstream.h"
+#include "zfp/bitstream.inl"
 
 #define STREAM_WORD_CAPACITY 4
 
@@ -21,10 +23,10 @@ setup(void **state)
   struct setupVars *s = malloc(sizeof(struct setupVars));
   assert_non_null(s);
 
-  s->buffer = calloc(STREAM_WORD_CAPACITY, sizeof(word));
+  s->buffer = calloc(STREAM_WORD_CAPACITY, sizeof(bitstream_word));
   assert_non_null(s->buffer);
 
-  s->b = stream_open(s->buffer, STREAM_WORD_CAPACITY * sizeof(word));
+  s->b = stream_open(s->buffer, STREAM_WORD_CAPACITY * sizeof(bitstream_word));
   assert_non_null(s->b);
 
   *state = s;
@@ -72,7 +74,7 @@ when_ReadBitsSpreadsAcrossMultipleWords_expect_BitsCombinedFromMultipleWords(voi
     + (WRITE_BITS2 << PARTIAL_WORD_BIT_COUNT)
     + (WRITE_BITS3 << (wsize + PARTIAL_WORD_BIT_COUNT))
     + ((WRITE_BITS4 & 0xff) << (2*wsize + PARTIAL_WORD_BIT_COUNT)));
-  assert_int_equal(s->buffer, (word) (WRITE_BITS4 >> (NUM_OVERFLOWED_BITS % wsize)));
+  assert_int_equal(s->buffer, (bitstream_word) (WRITE_BITS4 >> (NUM_OVERFLOWED_BITS % wsize)));
 }
 
 // overflow refers to what will land in the buffer
diff --git a/tests/src/inline/testBitstreamStrided.c b/tests/src/inline/testBitstreamStrided.c
index f7349d67d..7208684d1 100644
--- a/tests/src/inline/testBitstreamStrided.c
+++ b/tests/src/inline/testBitstreamStrided.c
@@ -1,12 +1,14 @@
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
 #define BIT_STREAM_STRIDED
 
-#include "include/bitstream.h"
-#include "src/inline/bitstream.c"
+#include "zfp/internal/zfp/inline.h"
+#include "zfp/bitstream.h"
+#include "zfp/bitstream.inl"
 
 // 4 words per block
 #define BLOCK_SIZE 4
@@ -26,10 +28,10 @@ setup(void **state)
   struct setupVars *s = malloc(sizeof(struct setupVars));
   assert_non_null(s);
 
-  s->buffer = calloc(STREAM_STRIDED_LEN, sizeof(word));
+  s->buffer = calloc(STREAM_STRIDED_LEN, sizeof(bitstream_word));
   assert_non_null(s->buffer);
 
-  s->b = stream_open(s->buffer, STREAM_STRIDED_LEN * sizeof(word));
+  s->b = stream_open(s->buffer, STREAM_STRIDED_LEN * sizeof(bitstream_word));
   assert_non_null(s->b);
 
   assert_true(stream_set_stride(s->b, BLOCK_SIZE, DELTA));
@@ -54,7 +56,7 @@ static void
 given_Strided_when_ReadWordCompletesBlock_expect_PtrAdvancedByStrideLen(void **state)
 {
   bitstream* s = ((struct setupVars *)*state)->b;
-  word* prevPtr = s->ptr;
+  bitstream_word* prevPtr = s->ptr;
 
   int i;
   for (i = 0; i < BLOCK_SIZE - 1; i++) {
@@ -71,7 +73,7 @@ static void
 given_Strided_when_WriteWordCompletesBlock_expect_PtrAdvancedByStrideLen(void **state)
 {
   bitstream* s = ((struct setupVars *)*state)->b;
-  word* prevPtr = s->ptr;
+  bitstream_word* prevPtr = s->ptr;
 
   int i;
   for (i = 0; i < BLOCK_SIZE - 1; i++) {
diff --git a/tests/src/misc/CMakeLists.txt b/tests/src/misc/CMakeLists.txt
index 593a17ad8..ce76cf0d8 100644
--- a/tests/src/misc/CMakeLists.txt
+++ b/tests/src/misc/CMakeLists.txt
@@ -6,6 +6,42 @@ add_executable(testZfpStream testZfpStream.c)
 target_link_libraries(testZfpStream cmocka zfp)
 add_test(NAME testZfpStream COMMAND testZfpStream)
 
+add_executable(testZfpPromote testZfpPromote.c)
+target_link_libraries(testZfpPromote cmocka zfp)
+add_test(NAME testZfpPromote COMMAND testZfpPromote)
+# zfp_field tests: one executable per dimensionality (1-4) and scalar type (f = float, d = double)
+add_executable(testZfpField1f testZfpField1f.c)
+target_link_libraries(testZfpField1f cmocka zfp)
+add_test(NAME testZfpField1f COMMAND testZfpField1f)
+
+add_executable(testZfpField2f testZfpField2f.c)
+target_link_libraries(testZfpField2f cmocka zfp)
+add_test(NAME testZfpField2f COMMAND testZfpField2f)
+
+add_executable(testZfpField3f testZfpField3f.c)
+target_link_libraries(testZfpField3f cmocka zfp)
+add_test(NAME testZfpField3f COMMAND testZfpField3f)
+
+add_executable(testZfpField4f testZfpField4f.c)
+target_link_libraries(testZfpField4f cmocka zfp)
+add_test(NAME testZfpField4f COMMAND testZfpField4f)
+
+add_executable(testZfpField1d testZfpField1d.c)
+target_link_libraries(testZfpField1d cmocka zfp)
+add_test(NAME testZfpField1d COMMAND testZfpField1d)
+
+add_executable(testZfpField2d testZfpField2d.c)
+target_link_libraries(testZfpField2d cmocka zfp)
+add_test(NAME testZfpField2d COMMAND testZfpField2d)
+
+add_executable(testZfpField3d testZfpField3d.c)
+target_link_libraries(testZfpField3d cmocka zfp)
+add_test(NAME testZfpField3d COMMAND testZfpField3d)
+
+add_executable(testZfpField4d testZfpField4d.c)
+target_link_libraries(testZfpField4d cmocka zfp)
+add_test(NAME testZfpField4d COMMAND testZfpField4d)
+
 if(HAVE_LIBM_MATH)
   target_link_libraries(testZfpHeader m)
   target_link_libraries(testZfpStream m)
diff --git a/tests/src/misc/testZfpField1d.c b/tests/src/misc/testZfpField1d.c
new file mode 100644
index 000000000..7350c5a84
--- /dev/null
+++ b/tests/src/misc/testZfpField1d.c
@@ -0,0 +1,25 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 1D field of doubles */
+#define DIMS 1
+#define ZFP_TYPE zfp_type_double
+#define SCALAR double
+/* NOTE(review): NX/SX look like the x extent and x stride -- confirm in zfpFieldBase.c */
+#define NX 20
+#define SX 2
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef SX
diff --git a/tests/src/misc/testZfpField1f.c b/tests/src/misc/testZfpField1f.c
new file mode 100644
index 000000000..c56430a11
--- /dev/null
+++ b/tests/src/misc/testZfpField1f.c
@@ -0,0 +1,25 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 1D field of floats */
+#define DIMS 1
+#define ZFP_TYPE zfp_type_float
+#define SCALAR float
+/* NOTE(review): NX/SX look like the x extent and x stride -- confirm in zfpFieldBase.c */
+#define NX 20
+#define SX 2
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef SX
diff --git a/tests/src/misc/testZfpField2d.c b/tests/src/misc/testZfpField2d.c
new file mode 100644
index 000000000..ae6313159
--- /dev/null
+++ b/tests/src/misc/testZfpField2d.c
@@ -0,0 +1,29 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 2D field of doubles */
+#define DIMS 2
+#define ZFP_TYPE zfp_type_double
+#define SCALAR double
+/* NOTE(review): N{X,Y}/S{X,Y} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define SX 2
+#define SY 3
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef SX
+#undef SY
diff --git a/tests/src/misc/testZfpField2f.c b/tests/src/misc/testZfpField2f.c
new file mode 100644
index 000000000..a302d34b7
--- /dev/null
+++ b/tests/src/misc/testZfpField2f.c
@@ -0,0 +1,29 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 2D field of floats */
+#define DIMS 2
+#define ZFP_TYPE zfp_type_float
+#define SCALAR float
+/* NOTE(review): N{X,Y}/S{X,Y} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define SX 2
+#define SY 3
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef SX
+#undef SY
diff --git a/tests/src/misc/testZfpField3d.c b/tests/src/misc/testZfpField3d.c
new file mode 100644
index 000000000..c8a6fc674
--- /dev/null
+++ b/tests/src/misc/testZfpField3d.c
@@ -0,0 +1,33 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 3D field of doubles */
+#define DIMS 3
+#define ZFP_TYPE zfp_type_double
+#define SCALAR double
+/* NOTE(review): N{X,Y,Z}/S{X,Y,Z} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define NZ 12
+#define SX 2
+#define SY 3
+#define SZ 4
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef NZ
+#undef SX
+#undef SY
+#undef SZ
diff --git a/tests/src/misc/testZfpField3f.c b/tests/src/misc/testZfpField3f.c
new file mode 100644
index 000000000..caa78f1f5
--- /dev/null
+++ b/tests/src/misc/testZfpField3f.c
@@ -0,0 +1,33 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 3D field of floats */
+#define DIMS 3
+#define ZFP_TYPE zfp_type_float
+#define SCALAR float
+/* NOTE(review): N{X,Y,Z}/S{X,Y,Z} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define NZ 12
+#define SX 2
+#define SY 3
+#define SZ 4
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef NZ
+#undef SX
+#undef SY
+#undef SZ
diff --git a/tests/src/misc/testZfpField4d.c b/tests/src/misc/testZfpField4d.c
new file mode 100644
index 000000000..11c10bd99
--- /dev/null
+++ b/tests/src/misc/testZfpField4d.c
@@ -0,0 +1,37 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 4D field of doubles */
+#define DIMS 4
+#define ZFP_TYPE zfp_type_double
+#define SCALAR double
+/* NOTE(review): N{X,Y,Z,W}/S{X,Y,Z,W} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define NZ 12
+#define NW 6
+#define SX 2
+#define SY 3
+#define SZ 4
+#define SW 2
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef NZ
+#undef NW
+#undef SX
+#undef SY
+#undef SZ
+#undef SW
diff --git a/tests/src/misc/testZfpField4f.c b/tests/src/misc/testZfpField4f.c
new file mode 100644
index 000000000..87f7dc30e
--- /dev/null
+++ b/tests/src/misc/testZfpField4f.c
@@ -0,0 +1,37 @@
+#include "zfp.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdlib.h>
+/* configuration macros for the zfpFieldBase.c include below: 4D field of floats */
+#define DIMS 4
+#define ZFP_TYPE zfp_type_float
+#define SCALAR float
+/* NOTE(review): N{X,Y,Z,W}/S{X,Y,Z,W} look like per-dimension extents and strides -- confirm in zfpFieldBase.c */
+#define NX 20
+#define NY 21
+#define NZ 12
+#define NW 6
+#define SX 2
+#define SY 3
+#define SZ 4
+#define SW 2
+
+#include "zfpFieldBase.c"
+/* undefine the configuration macros again */
+#undef DIMS
+#undef ZFP_TYPE
+#undef SCALAR
+#undef NX
+#undef NY
+#undef NZ
+#undef NW
+#undef SX
+#undef SY
+#undef SZ
+#undef SW
diff --git a/tests/src/misc/testZfpHeader.c b/tests/src/misc/testZfpHeader.c
index b7e450f72..17904be39 100644
--- a/tests/src/misc/testZfpHeader.c
+++ b/tests/src/misc/testZfpHeader.c
@@ -3,6 +3,7 @@
 
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
@@ -148,6 +149,34 @@ when_zfpFieldSetMetadataCalled_expect_arrayDimensionsSet(void **state)
   assert_int_equal(field->nz, 0);
 }
 
+// zfp_field_metadata() is expected to return ZFP_META_NULL for a 2^25 x 2^25 field
+static void
+when_zfpFieldMetadataCalled_onInvalidSize_expect_ZFP_META_NULL(void **state)
+{
+  struct setupVars *bundle = *state;
+  zfp_field* field = bundle->field;
+
+  // setup uses a 2d field
+  field->nx = 1 << 25;
+  field->ny = 1 << 25;
+
+  uint64 meta = zfp_field_metadata(field);
+
+  assert_int_equal(meta, ZFP_META_NULL);
+}
+
+static void
+when_zfpFieldSetMetadataCalled_forInvalidMeta_expect_false(void **state)
+{
+  struct setupVars *vars = *state;
+  // only bit (ZFP_META_BITS + 1) is set in this value
+  const uint64 invalidMeta = 1ULL << (ZFP_META_BITS + 1);
+
+  // the setter is expected to reject it
+  zfp_bool accepted = zfp_field_set_metadata(vars->field, invalidMeta);
+  assert_int_equal(accepted, zfp_false);
+}
+
 static void
 when_zfpWriteHeaderMagic_expect_numBitsWrittenEqualToZFP_MAGIC_BITS(void **state)
 {
@@ -436,6 +465,8 @@ int main()
 
     // write header
     cmocka_unit_test_setup_teardown(when_zfpWriteHeaderMagic_expect_numBitsWrittenEqualToZFP_MAGIC_BITS, setup, teardown),
+    cmocka_unit_test_setup_teardown(when_zfpFieldMetadataCalled_onInvalidSize_expect_ZFP_META_NULL, setup, teardown),
+    cmocka_unit_test_setup_teardown(when_zfpFieldSetMetadataCalled_forInvalidMeta_expect_false, setup, teardown),
     cmocka_unit_test_setup_teardown(when_zfpWriteHeaderMagic_expect_24BitsAreCharsZfpFollowedBy8BitsZfpCodecVersion, setup, teardown),
 
     cmocka_unit_test_setup_teardown(when_zfpWriteHeaderMetadata_expect_numBitsWrittenEqualToZFP_META_BITS, setup, teardown),
diff --git a/tests/src/misc/testZfpPromote.c b/tests/src/misc/testZfpPromote.c
new file mode 100644
index 000000000..6f40c45a6
--- /dev/null
+++ b/tests/src/misc/testZfpPromote.c
@@ -0,0 +1,142 @@
+#include "zfp.h"
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+/* round trip int8 -> int32 -> int8 with dims = 3 must reproduce the input values */
+static void
+given_int8_when_promoteToInt32_expect_demoteToInt8Matches(void **state)
+{
+  uint dims = 3;
+  uint sz = 1u << (2 * dims); /* 4^dims values */
+  int8* iblock8 = (int8*)malloc(sizeof(int8)*sz);
+  int8* oblock8 = (int8*)calloc(sz, sizeof(int8));
+  int32* block32 = (int32*)malloc(sizeof(int32)*sz);
+
+  assert_non_null(iblock8);
+  assert_non_null(oblock8);
+  assert_non_null(block32);
+
+  uint i;
+  for (i = 0; i < sz; i++)
+    iblock8[i] = (int8)i;
+
+  zfp_promote_int8_to_int32(block32, iblock8, dims);
+  zfp_demote_int32_to_int8(oblock8, block32, dims);
+
+  for (i = 0; i < sz; i++)
+    assert_int_equal(iblock8[i], oblock8[i]);
+
+  free(iblock8);
+  free(oblock8);
+  free(block32);
+}
+
+/* round trip uint8 -> int32 -> uint8 with dims = 3 must reproduce the input values */
+static void
+given_uint8_when_promoteToInt32_expect_demoteToUInt8Matches(void **state)
+{
+  uint dims = 3;
+  uint sz = 1u << (2 * dims); /* 4^dims values */
+  uint8* iblock8 = (uint8*)malloc(sizeof(uint8)*sz);
+  uint8* oblock8 = (uint8*)calloc(sz, sizeof(uint8));
+  int32* block32 = (int32*)malloc(sizeof(int32)*sz);
+
+  assert_non_null(iblock8);
+  assert_non_null(oblock8);
+  assert_non_null(block32);
+
+  uint i;
+  for (i = 0; i < sz; i++)
+    iblock8[i] = (uint8)i;
+
+  zfp_promote_uint8_to_int32(block32, iblock8, dims);
+  zfp_demote_int32_to_uint8(oblock8, block32, dims);
+
+  for (i = 0; i < sz; i++)
+    assert_int_equal(iblock8[i], oblock8[i]);
+
+  free(iblock8);
+  free(oblock8);
+  free(block32);
+}
+
+/* round trip int16 -> int32 -> int16 with dims = 3 must reproduce the input values */
+static void
+given_int16_when_promoteToInt32_expect_demoteToInt16Matches(void **state)
+{
+  uint dims = 3;
+  uint sz = 1u << (2 * dims); /* 4^dims values */
+  int16* iblock16 = (int16*)malloc(sizeof(int16)*sz);
+  int16* oblock16 = (int16*)calloc(sz, sizeof(int16));
+  int32* block32 = (int32*)malloc(sizeof(int32)*sz);
+
+  assert_non_null(iblock16);
+  assert_non_null(oblock16);
+  assert_non_null(block32);
+
+  uint i;
+  for (i = 0; i < sz; i++)
+    iblock16[i] = (int16)i;
+
+  zfp_promote_int16_to_int32(block32, iblock16, dims);
+  zfp_demote_int32_to_int16(oblock16, block32, dims);
+
+  for (i = 0; i < sz; i++)
+    assert_int_equal(iblock16[i], oblock16[i]);
+
+  free(iblock16);
+  free(oblock16);
+  free(block32);
+}
+
+/* round trip uint16 -> int32 -> uint16 with dims = 3 must reproduce the input values */
+static void
+given_uint16_when_promoteToInt32_expect_demoteToUInt16Matches(void **state)
+{
+  uint dims = 3;
+  uint sz = 1u << (2 * dims); /* 4^dims values */
+  uint16* iblock16 = (uint16*)malloc(sizeof(uint16)*sz);
+  uint16* oblock16 = (uint16*)calloc(sz, sizeof(uint16));
+  int32* block32 = (int32*)malloc(sizeof(int32)*sz);
+
+  assert_non_null(iblock16);
+  assert_non_null(oblock16);
+  assert_non_null(block32);
+
+  uint i;
+  for (i = 0; i < sz; i++)
+    iblock16[i] = (uint16)i;
+
+  zfp_promote_uint16_to_int32(block32, iblock16, dims);
+  zfp_demote_int32_to_uint16(oblock16, block32, dims);
+
+  for (i = 0; i < sz; i++)
+    assert_int_equal(iblock16[i], oblock16[i]);
+
+  free(iblock16);
+  free(oblock16);
+  free(block32);
+}
+
+/* run the four promote/demote round-trip tests as one cmocka group */
+int main()
+{
+  const struct CMUnitTest tests[] = {
+    cmocka_unit_test(given_int8_when_promoteToInt32_expect_demoteToInt8Matches),
+    cmocka_unit_test(given_uint8_when_promoteToInt32_expect_demoteToUInt8Matches),
+    cmocka_unit_test(given_int16_when_promoteToInt32_expect_demoteToInt16Matches),
+    cmocka_unit_test(given_uint16_when_promoteToInt32_expect_demoteToUInt16Matches),
+  };
+
+  return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/tests/src/misc/testZfpStream.c b/tests/src/misc/testZfpStream.c
index e5d59211d..3c5a0744c 100644
--- a/tests/src/misc/testZfpStream.c
+++ b/tests/src/misc/testZfpStream.c
@@ -2,6 +2,7 @@
 
 #include <stdarg.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <setjmp.h>
 #include <cmocka.h>
 
@@ -545,6 +546,343 @@ given_invalidCompressParamsModeVal_when_zfpStreamSetMode_expect_returnsNullMode_
   assertCompressParamsBehaviorThroughSetMode(state, zfp_mode_null);
 }
 
+static void
+testStreamAlignSizeMatches(void **state, int dim, zfp_type type)
+{
+  struct setupVars *bundle = *state;
+  zfp_stream* stream = bundle->stream;
+  zfp_field* field;
+  /* encode one zeroed block, flush, rewind, decode and align; the sizes returned by flush and align must match */
+  size_t arrsize = 4 << 2*(dim-1); /* 4^dim values per block */
+  size_t dimsize = 4;
+  size_t flushsize;
+  size_t alignsize;
+
+  if (type == zfp_type_float)
+  {
+    float* array;
+    float* block = (float*)calloc(arrsize, sizeof(float));
+
+    if (dim == 1)
+    {
+      array = (float*)calloc(dimsize, sizeof(float));
+      field = zfp_field_1d(array, type, dimsize);
+    }
+    else if (dim == 2)
+    {
+      array = (float*)calloc(dimsize*dimsize, sizeof(float));
+      field = zfp_field_2d(array, type, dimsize, dimsize);
+    }
+    else if (dim == 3)
+    {
+      array = (float*)calloc(dimsize*dimsize*dimsize, sizeof(float));
+      field = zfp_field_3d(array, type, dimsize, dimsize, dimsize);
+    }
+    else if (dim == 4)
+    {
+      array = (float*)calloc(dimsize*dimsize*dimsize*dimsize, sizeof(float));
+      field = zfp_field_4d(array, type, dimsize, dimsize, dimsize, dimsize);
+    }
+    /* NOTE(review): buffer and s are not released here (nor in the double branch) -- confirm whether teardown covers them */
+    size_t bufsize = zfp_stream_maximum_size(stream, field);
+    void* buffer = malloc(bufsize);
+    bitstream* s = stream_open(buffer, bufsize);
+    zfp_stream_set_bit_stream(stream, s);
+    zfp_stream_rewind(stream);
+
+    if (dim == 1)
+    {
+      zfp_encode_block_float_1(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_float_1(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 2)
+    {
+      zfp_encode_block_float_2(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_float_2(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 3)
+    {
+      zfp_encode_block_float_3(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_float_3(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 4)
+    {
+      zfp_encode_block_float_4(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_float_4(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    zfp_field_free(field);
+    free(array);
+    free(block);
+  }
+  else if (type == zfp_type_double)
+  {
+    double* array;
+    double* block = (double*)calloc(arrsize, sizeof(double));
+
+    if (dim == 1)
+    {
+      array = (double*)calloc(dimsize, sizeof(double));
+      field = zfp_field_1d(array, type, dimsize);
+    }
+    else if (dim == 2)
+    {
+      array = (double*)calloc(dimsize*dimsize, sizeof(double));
+      field = zfp_field_2d(array, type, dimsize, dimsize);
+    }
+    else if (dim == 3)
+    {
+      array = (double*)calloc(dimsize*dimsize*dimsize, sizeof(double));
+      field = zfp_field_3d(array, type, dimsize, dimsize, dimsize);
+    }
+    else if (dim == 4)
+    {
+      array = (double*)calloc(dimsize*dimsize*dimsize*dimsize, sizeof(double));
+      field = zfp_field_4d(array, type, dimsize, dimsize, dimsize, dimsize);
+    }
+
+    size_t bufsize = zfp_stream_maximum_size(stream, field);
+    void* buffer = malloc(bufsize);
+    bitstream* s = stream_open(buffer, bufsize);
+    zfp_stream_set_bit_stream(stream, s);
+    zfp_stream_rewind(stream);
+
+    if (dim == 1)
+    {
+      zfp_encode_block_double_1(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_double_1(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 2)
+    {
+      zfp_encode_block_double_2(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_double_2(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 3)
+    {
+      zfp_encode_block_double_3(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_double_3(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    else if (dim == 4)
+    {
+      zfp_encode_block_double_4(stream, block);
+      flushsize = zfp_stream_flush(stream);
+      zfp_stream_rewind(stream);
+      zfp_decode_block_double_4(stream, block);
+      alignsize = zfp_stream_align(stream);
+    }
+    zfp_field_free(field);
+    free(array);
+    free(block);
+  }
+
+  assert_true(flushsize > 0);
+  assert_true(flushsize == alignsize);
+}
+
+static void
+given_block1f_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 1, zfp_type_float); /* dim = 1, float */
+}
+
+static void
+given_block2f_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 2, zfp_type_float); /* dim = 2, float */
+}
+
+static void
+given_block3f_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 3, zfp_type_float); /* dim = 3, float */
+}
+
+static void
+given_block4f_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 4, zfp_type_float); /* dim = 4, float */
+}
+
+static void
+given_block1d_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 1, zfp_type_double); /* dim = 1, double */
+}
+
+static void
+given_block2d_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 2, zfp_type_double); /* dim = 2, double */
+}
+
+static void
+given_block3d_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 3, zfp_type_double); /* dim = 3, double */
+}
+
+static void
+given_block4d_when_StreamFlush_expect_StreamAlignSizeMatches(void **state)
+{
+  testStreamAlignSizeMatches(state, 4, zfp_type_double); /* dim = 4, double */
+}
+
+static void
+testStreamCompressedSizeIncreasedCorrectly(void **state, int dim, zfp_type type)
+{
+  struct setupVars *bundle = *state;
+  zfp_stream* stream = bundle->stream;
+  zfp_field* field;
+  /* encoding one zeroed block must grow zfp_stream_compressed_size() by blocksize * (rate/8) */
+  /* use fixed rate mode to simplify size calculation */
+  double rate = zfp_stream_set_rate(stream, 64, type, dim, 0);
+
+  size_t blocksize = 4 << 2*(dim-1); /* 4^dim values per block */
+  size_t dimsize = 4;
+  size_t startsize;
+  size_t endsize;
+
+  if (type == zfp_type_float)
+  {
+    float* array = (float*)calloc(blocksize, sizeof(float));
+    float* block = (float*)calloc(blocksize, sizeof(float));
+
+    if (dim == 1)
+      field = zfp_field_1d(array, type, dimsize);
+    else if (dim == 2)
+      field = zfp_field_2d(array, type, dimsize, dimsize);
+    else if (dim == 3)
+      field = zfp_field_3d(array, type, dimsize, dimsize, dimsize);
+    else if (dim == 4)
+      field = zfp_field_4d(array, type, dimsize, dimsize, dimsize, dimsize);
+    /* NOTE(review): buffer and s are not released here (nor in the double branch) -- confirm whether teardown covers them */
+    size_t bufsize = zfp_stream_maximum_size(stream, field);
+    void* buffer = malloc(bufsize);
+    bitstream* s = stream_open(buffer, bufsize);
+    zfp_stream_set_bit_stream(stream, s);
+    zfp_stream_rewind(stream);
+    startsize = zfp_stream_compressed_size(stream);
+
+    if (dim == 1)
+      zfp_encode_block_float_1(stream, block);
+    else if (dim == 2)
+      zfp_encode_block_float_2(stream, block);
+    else if (dim == 3)
+      zfp_encode_block_float_3(stream, block);
+    else if (dim == 4)
+      zfp_encode_block_float_4(stream, block);
+    endsize = zfp_stream_compressed_size(stream);
+    zfp_field_free(field);
+    free(array);
+    free(block);
+  }
+  else if (type == zfp_type_double)
+  {
+    double* array = (double*)calloc(blocksize, sizeof(double));
+    double* block = (double*)calloc(blocksize, sizeof(double));
+
+    if (dim == 1)
+      field = zfp_field_1d(array, type, dimsize);
+    else if (dim == 2)
+      field = zfp_field_2d(array, type, dimsize, dimsize);
+    else if (dim == 3)
+      field = zfp_field_3d(array, type, dimsize, dimsize, dimsize);
+    else if (dim == 4)
+      field = zfp_field_4d(array, type, dimsize, dimsize, dimsize, dimsize);
+
+    size_t bufsize = zfp_stream_maximum_size(stream, field);
+    void* buffer = malloc(bufsize);
+    bitstream* s = stream_open(buffer, bufsize);
+    zfp_stream_set_bit_stream(stream, s);
+    zfp_stream_rewind(stream);
+    startsize = zfp_stream_compressed_size(stream);
+
+    if (dim == 1)
+      zfp_encode_block_double_1(stream, block);
+    else if (dim == 2)
+      zfp_encode_block_double_2(stream, block);
+    else if (dim == 3)
+      zfp_encode_block_double_3(stream, block);
+    else if (dim == 4)
+      zfp_encode_block_double_4(stream, block);
+    endsize = zfp_stream_compressed_size(stream);
+    zfp_field_free(field);
+    free(array);
+    free(block);
+  }
+
+  assert_true(endsize > 0);
+  assert_true(endsize == startsize + blocksize * (size_t)(rate/8));
+}
+
+static void
+given_block1f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 1, zfp_type_float); /* dim = 1, float */
+}
+
+static void
+given_block2f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 2, zfp_type_float); /* dim = 2, float */
+}
+
+static void
+given_block3f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 3, zfp_type_float); /* dim = 3, float */
+}
+
+static void
+given_block4f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 4, zfp_type_float); /* dim = 4, float */
+}
+
+static void
+given_block1d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 1, zfp_type_double); /* dim = 1, double */
+}
+
+static void
+given_block2d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 2, zfp_type_double); /* dim = 2, double */
+}
+
+static void
+given_block3d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 3, zfp_type_double); /* dim = 3, double */
+}
+
+static void
+given_block4d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly(void **state)
+{
+  testStreamCompressedSizeIncreasedCorrectly(state, 4, zfp_type_double); /* dim = 4, double */
+}
+
 int main()
 {
   const struct CMUnitTest tests[] = {
@@ -571,6 +909,26 @@ int main()
     cmocka_unit_test_setup_teardown(given_zfpStreamSetReversibleModeVal_when_zfpStreamSetMode_expect_returnsReversible_and_compressParamsConserved, setup, teardown),
     cmocka_unit_test_setup_teardown(given_customCompressParamsModeVal_when_zfpStreamSetMode_expect_returnsExpert_and_compressParamsConserved, setup, teardown),
     cmocka_unit_test_setup_teardown(given_invalidCompressParamsModeVal_when_zfpStreamSetMode_expect_returnsNullMode_and_paramsNotSet, setup, teardown),
+
+    /* test other zfp_stream_align() */
+    cmocka_unit_test_setup_teardown(given_block1f_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block2f_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block3f_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block4f_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block1d_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block2d_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block3d_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block4d_when_StreamFlush_expect_StreamAlignSizeMatches, setup, teardown),
+
+    /* test zfp_stream_compressed_size() */
+    cmocka_unit_test_setup_teardown(given_block1f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block2f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block3f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block4f_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block1d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block2d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block3d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
+    cmocka_unit_test_setup_teardown(given_block4d_when_WriteBlock_expect_StreamCompressedSizeIncreasedCorrectly, setup, teardown),
   };
 
   return cmocka_run_group_tests(tests, NULL, NULL);
diff --git a/tests/src/misc/zfpFieldBase.c b/tests/src/misc/zfpFieldBase.c
new file mode 100644
index 000000000..fa2ddc206
--- /dev/null
+++ b/tests/src/misc/zfpFieldBase.c
@@ -0,0 +1,256 @@
+struct setupVars {
+  zfp_field* field; /* field under test; released by teardown() via zfp_field_free() */
+  SCALAR* data; /* buffer the field points at, or NULL when the fixture allocates none */
+};
+
+/* fixture: DIMS-dimensional field of type ZFP_TYPE created with a NULL data pointer */
+static int
+setupBasic(void **state)
+{
+  struct setupVars *bundle = malloc(sizeof(struct setupVars));
+  assert_non_null(bundle);
+
+#if DIMS == 1
+  zfp_field* field = zfp_field_1d(NULL, ZFP_TYPE, NX);
+#elif DIMS == 2
+  zfp_field* field = zfp_field_2d(NULL, ZFP_TYPE, NX, NY);
+#elif DIMS == 3
+  zfp_field* field = zfp_field_3d(NULL, ZFP_TYPE, NX, NY, NZ);
+#elif DIMS == 4
+  zfp_field* field = zfp_field_4d(NULL, ZFP_TYPE, NX, NY, NZ, NW);
+#endif
+  assert_non_null(field); /* fail in setup rather than crash inside a test body */
+
+  bundle->field = field;
+  bundle->data = NULL; /* tells teardown() there is no buffer to free */
+  *state = bundle;
+  return 0;
+}
+
+/* fixture: field (default strides) over a buffer holding one SCALAR per element */
+static int
+setupContiguous(void **state)
+{
+  struct setupVars *bundle = malloc(sizeof(struct setupVars));
+  assert_non_null(bundle);
+
+#if DIMS == 1
+  zfp_field* field = zfp_field_1d(NULL, ZFP_TYPE, NX);
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR)*NX);
+#elif DIMS == 2
+  zfp_field* field = zfp_field_2d(NULL, ZFP_TYPE, NX, NY);
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR)*NX*NY);
+#elif DIMS == 3
+  zfp_field* field = zfp_field_3d(NULL, ZFP_TYPE, NX, NY, NZ);
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR)*NX*NY*NZ);
+#elif DIMS == 4
+  zfp_field* field = zfp_field_4d(NULL, ZFP_TYPE, NX, NY, NZ, NW);
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR)*NX*NY*NZ*NW);
+#endif
+  assert_non_null(field); /* checked before zfp_field_set_pointer() uses it */
+  assert_non_null(data);
+
+  zfp_field_set_pointer(field, data);
+  bundle->field = field;
+  bundle->data = data; /* kept so teardown() can free the buffer */
+  *state = bundle;
+  return 0;
+}
+
+/* fixture: field with strides (SX, ...) over a buffer reaching its farthest element */
+static int
+setupStrided(void **state)
+{
+  struct setupVars *bundle = malloc(sizeof(struct setupVars));
+  assert_non_null(bundle);
+
+#if DIMS == 1
+  zfp_field* field = zfp_field_1d(NULL, ZFP_TYPE, NX);
+  zfp_field_set_stride_1d(field, SX);
+  size_t count = (SX*(NX-1)) + 1;
+#elif DIMS == 2
+  zfp_field* field = zfp_field_2d(NULL, ZFP_TYPE, NX, NY);
+  zfp_field_set_stride_2d(field, SX, SY);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + 1;
+#elif DIMS == 3
+  zfp_field* field = zfp_field_3d(NULL, ZFP_TYPE, NX, NY, NZ);
+  zfp_field_set_stride_3d(field, SX, SY, SZ);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + (SZ*(NZ-1)) + 1;
+#elif DIMS == 4
+  zfp_field* field = zfp_field_4d(NULL, ZFP_TYPE, NX, NY, NZ, NW);
+  zfp_field_set_stride_4d(field, SX, SY, SZ, SW);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + (SZ*(NZ-1)) + (SW*(NW-1)) + 1;
+#endif
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR) * count); /* count = scalars spanned */
+  assert_non_null(data);
+
+  zfp_field_set_pointer(field, data);
+  bundle->field = field;
+  bundle->data = data;
+  *state = bundle;
+  return 0;
+}
+
+/* fixture: field with negated strides (-SX, ...); buffer sized as in setupStrided() */
+static int
+setupNegativeStrided(void **state)
+{
+  struct setupVars *bundle = malloc(sizeof(struct setupVars));
+  assert_non_null(bundle);
+
+#if DIMS == 1
+  zfp_field* field = zfp_field_1d(NULL, ZFP_TYPE, NX);
+  zfp_field_set_stride_1d(field, -SX);
+  size_t count = (SX*(NX-1)) + 1;
+#elif DIMS == 2
+  zfp_field* field = zfp_field_2d(NULL, ZFP_TYPE, NX, NY);
+  zfp_field_set_stride_2d(field, -SX, -SY);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + 1;
+#elif DIMS == 3
+  zfp_field* field = zfp_field_3d(NULL, ZFP_TYPE, NX, NY, NZ);
+  zfp_field_set_stride_3d(field, -SX, -SY, -SZ);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + (SZ*(NZ-1)) + 1;
+#elif DIMS == 4
+  zfp_field* field = zfp_field_4d(NULL, ZFP_TYPE, NX, NY, NZ, NW);
+  zfp_field_set_stride_4d(field, -SX, -SY, -SZ, -SW);
+  size_t count = (SX*(NX-1)) + (SY*(NY-1)) + (SZ*(NZ-1)) + (SW*(NW-1)) + 1;
+#endif
+  SCALAR* data = (SCALAR*)malloc(sizeof(SCALAR) * count);
+  assert_non_null(data);
+
+  zfp_field_set_pointer(field, data); /* NOTE(review): negative strides address memory below this base pointer - confirm nothing dereferences it */
+  bundle->field = field;
+  bundle->data = data;
+  *state = bundle;
+  return 0;
+}
+
+/* release everything a setup routine stored in *state; always returns 0 */
+static int
+teardown(void **state)
+{
+  struct setupVars *vars = *state;
+  zfp_field* field = vars->field;
+  SCALAR* data = vars->data;
+
+  zfp_field_free(field);
+  free(data); /* free(NULL) is a no-op, so no guard is needed */
+  free(vars);
+
+  return 0;
+}
+
+static void
+given_contiguousData_isContiguousReturnsTrue(void **state)
+{
+  /* fixture field; zfp is expected to report it as contiguous */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+  assert_true(zfp_field_is_contiguous(field));
+}
+
+static void
+given_noncontiguousData_isContiguousReturnsFalse(void **state)
+{
+  /* fixture field; zfp is expected to report it as not contiguous */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+  assert_false(zfp_field_is_contiguous(field));
+}
+
+static void
+when_noFieldData_fieldBeginReturnsNull(void **state)
+{
+  /* fixture field; its begin address is expected to be NULL */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+  assert_null(zfp_field_begin(field));
+}
+
+static void
+when_contiguousData_fieldBeginsAtDataPointer(void **state)
+{
+  /* begin address is expected to equal the field's own data pointer */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+  assert_true(zfp_field_begin(field) == zfp_field_pointer(field));
+}
+
+static void
+when_noncontiguousDataWithNegativeStride_fieldBeginsAtCorrectLocation(void **state)
+{
+  /* expected begin: data pointer displaced by the sum of -stride * (extent - 1) scalars */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+  const ptrdiff_t bytes = (ptrdiff_t)zfp_type_size(field->type);
+#if DIMS == 1
+  ptrdiff_t offset = ((int)-SX * (ptrdiff_t)(NX - 1));
+#elif DIMS == 2
+  ptrdiff_t offset = ((int)-SX * (ptrdiff_t)(NX - 1)) + ((int)-SY * (ptrdiff_t)(NY - 1));
+#elif DIMS == 3
+  ptrdiff_t offset = ((int)-SX * (ptrdiff_t)(NX - 1)) + ((int)-SY * (ptrdiff_t)(NY - 1)) + ((int)-SZ * (ptrdiff_t)(NZ - 1));
+#elif DIMS == 4
+  ptrdiff_t offset = ((int)-SX * (ptrdiff_t)(NX - 1)) + ((int)-SY * (ptrdiff_t)(NY - 1)) + ((int)-SZ * (ptrdiff_t)(NZ - 1)) + ((int)-SW * (ptrdiff_t)(NW - 1));
+#endif
+  void* begin = (void*)((uchar*)field->data + offset * bytes);
+  assert_true(zfp_field_begin(field) == begin);
+}
+
+static void
+given_field_precisionCorrect(void **state)
+{
+  /* expected precision: the bit width of SCALAR */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+  assert_true(zfp_field_precision(field) == sizeof(SCALAR) * CHAR_BIT);
+}
+
+static void
+given_contiguousData_fieldSizeBytesCorrect(void **state)
+{
+  /* expected byte size: product of the extents times sizeof(SCALAR) */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+#if DIMS == 1
+  assert_true(zfp_field_size_bytes(field) == NX * sizeof(SCALAR));
+#elif DIMS == 2
+  assert_true(zfp_field_size_bytes(field) == NX * NY * sizeof(SCALAR));
+#elif DIMS == 3
+  assert_true(zfp_field_size_bytes(field) == NX * NY * NZ * sizeof(SCALAR));
+#elif DIMS == 4
+  assert_true(zfp_field_size_bytes(field) == NX * NY * NZ * NW * sizeof(SCALAR));
+#endif
+}
+
+static void
+given_noncontiguousData_fieldSizeBytesCorrect(void **state)
+{
+  /* expected byte size: (sum of stride * (extent - 1), plus 1) scalars of SCALAR */
+  zfp_field* field = ((struct setupVars*)*state)->field;
+
+#if DIMS == 1
+  assert_true(zfp_field_size_bytes(field) == ((SX*(NX-1) + 1) * sizeof(SCALAR)));
+#elif DIMS == 2
+  assert_true(zfp_field_size_bytes(field) == ((SX*(NX-1) + SY*(NY-1) + 1) * sizeof(SCALAR)));
+#elif DIMS == 3
+  assert_true(zfp_field_size_bytes(field) == ((SX*(NX-1) + SY*(NY-1) + SZ*(NZ-1) + 1) * sizeof(SCALAR)));
+#elif DIMS == 4
+  assert_true(zfp_field_size_bytes(field) == ((SX*(NX-1) + SY*(NY-1) + SZ*(NZ-1) + SW*(NW-1) + 1) * sizeof(SCALAR)));
+#endif
+}
+
+
+
+int main()
+{
+  const struct CMUnitTest tests[] = { /* each test is paired with the setup routine that builds the field it inspects */
+    cmocka_unit_test_setup_teardown(given_contiguousData_isContiguousReturnsTrue, setupContiguous, teardown),
+    cmocka_unit_test_setup_teardown(given_noncontiguousData_isContiguousReturnsFalse, setupStrided, teardown),
+    cmocka_unit_test_setup_teardown(when_noFieldData_fieldBeginReturnsNull, setupBasic, teardown),
+    cmocka_unit_test_setup_teardown(when_contiguousData_fieldBeginsAtDataPointer, setupContiguous, teardown),
+    cmocka_unit_test_setup_teardown(when_noncontiguousDataWithNegativeStride_fieldBeginsAtCorrectLocation, setupNegativeStrided, teardown),
+    cmocka_unit_test_setup_teardown(given_field_precisionCorrect, setupBasic, teardown),
+    cmocka_unit_test_setup_teardown(given_contiguousData_fieldSizeBytesCorrect, setupContiguous, teardown),
+    cmocka_unit_test_setup_teardown(given_noncontiguousData_fieldSizeBytesCorrect, setupStrided, teardown),
+  };
+  return cmocka_run_group_tests(tests, NULL, NULL); /* NULL, NULL: no group-level setup or teardown */
+}
diff --git a/tests/testviews.cpp b/tests/testviews.cpp
index f01b7cd16..f6f09f3c0 100644
--- a/tests/testviews.cpp
+++ b/tests/testviews.cpp
@@ -1,17 +1,20 @@
-#include <cassert>
 #include <cmath>
-#include <cstdio>
 #include <cstdlib>
-#include "zfparray2.h"
-#include "zfparray3.h"
+#include <iostream>
+#include <sstream>
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 #define EPSILON 1e-3
 
-// random integer in {begin, ..., end - 1}
+// random integer in {begin, ..., end}
 static size_t
 rand(size_t begin, size_t end)
 {
-  return begin + size_t(rand()) % (end - begin);
+  return begin + size_t(rand()) % (end - begin + 1);
 }
 
 // ensure f and g are sufficiently close
@@ -19,47 +22,96 @@ static void
 verify(double f, double g)
 {
   if (std::fabs(f - g) > EPSILON) {
-    fprintf(stderr, "error: %g != %g\n", f, g);
+#ifdef _OPENMP
+    #pragma omp critical
+#endif
+    std::cerr << "error: " << f << " != " << g << std::endl;
     exit(EXIT_FAILURE);
   }
 }
 
+// output filter: nonzero size re-arms it; else true for first head/last tail calls
+static bool
+filter_output(size_t head = 0, size_t tail = 0, size_t size = 0)
+{
+  static size_t calls = 0;      // query calls seen since the filter was armed
+  static size_t head_count = 0; // number of leading calls to display
+  static size_t tail_count = 0; // number of trailing calls to display
+  static size_t total = 0;      // expected number of query calls
+
+  if (size != 0) {
+    calls = 0;
+    head_count = head;
+    tail_count = tail;
+    total = size;
+    return false;
+  }
+
+  const bool display = calls < head_count || calls + tail_count >= total;
+  // emit a single ellipsis at the first suppressed call
+  if (!display && calls == head_count)
+    std::cout << "..." << std::endl;
+  calls++;
+  return display;
+}
+
+static int
+usage()
+{
+  std::cerr << "Usage: testviews [nx ny nz [x0 y0 z0 mx my mz]]" << std::endl; // synopsis goes to stderr
+  return EXIT_FAILURE; // lets main() write 'return usage();'
+}
+
 int main(int argc, char* argv[])
 {
-  size_t nx = 16;
-  size_t ny = 16;
-  size_t nz = 16;
-  size_t x0 = rand(0, nx);
-  size_t y0 = rand(0, ny);
-  size_t z0 = rand(0, nz);
-  size_t mx = rand(1, nx - x0);
-  size_t my = rand(1, ny - y0);
-  size_t mz = rand(1, nz - z0);
+  size_t nx = 8;
+  size_t ny = 48;
+  size_t nz = 32;
+  size_t x0, y0, z0;
+  size_t mx, my, mz;
   double rate = 16;
 
-  // Usage: test [nx ny nz [x0 y0 z0 mx my mz]]
+  // parse command-line arguments
   switch (argc) {
     case 10:
-      if (sscanf(argv[4], "%zu", &x0) != 1 ||
-          sscanf(argv[5], "%zu", &y0) != 1 ||
-          sscanf(argv[6], "%zu", &z0) != 1 ||
-          sscanf(argv[7], "%zu", &mx) != 1 ||
-          sscanf(argv[8], "%zu", &my) != 1 ||
-          sscanf(argv[9], "%zu", &mz) != 1)
-        return EXIT_FAILURE;
-      // FALLTHROUGH
+      if ((std::istringstream(argv[4]) >> x0).fail() ||
+          (std::istringstream(argv[5]) >> y0).fail() ||
+          (std::istringstream(argv[6]) >> z0).fail() ||
+          (std::istringstream(argv[7]) >> mx).fail() || !mx ||
+          (std::istringstream(argv[8]) >> my).fail() || !my ||
+          (std::istringstream(argv[9]) >> mz).fail() || !mz)
+        return usage();
+      fallthrough_
     case 4:
-      if (sscanf(argv[1], "%zu", &nx) != 1 ||
-          sscanf(argv[2], "%zu", &ny) != 1 ||
-          sscanf(argv[3], "%zu", &nz) != 1)
-        return EXIT_FAILURE;
-      // FALLTHROUGH
+      if ((std::istringstream(argv[1]) >> nx).fail() || !nx ||
+          (std::istringstream(argv[2]) >> ny).fail() || !ny ||
+          (std::istringstream(argv[3]) >> nz).fail() || !nz)
+        return usage();
+      fallthrough_
     case 1:
       break;
+    default:
+      return usage();
   }
 
-  printf("a(%zu, %zu, %zu)\n", nx, ny, nz);
-  printf("v(%zu, %zu, %zu) + (%zu, %zu, %zu)\n", mx, my, mz, x0, y0, z0);
+  if (argc < 10) {
+    // generate random view
+    x0 = rand(0, nx - 1);
+    y0 = rand(0, ny - 1);
+    z0 = rand(0, nz - 1);
+    mx = rand(1, nx - x0);
+    my = rand(1, ny - y0);
+    mz = rand(1, nz - z0);
+  }
+
+  // validate arguments
+  if (x0 + mx > nx || y0 + my > ny || z0 + mz > nz) {
+    std::cerr << "invalid view parameters" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  std::cout << "a(" << nx << ", " << ny << ", " << nz << ")" << std::endl;
+  std::cout << "v(" << mx << ", " << my << ", " << mz << ") + (" << x0 << ", " << y0 << ", " << z0 << ")" << std::endl;
 
   // initialize 3D array to linear function
   zfp::array3<double> a(nx, ny, nz, rate);
@@ -69,106 +121,168 @@ int main(int argc, char* argv[])
         a(x, y, z) = static_cast<double>(x + nx * (y + ny * z));
 
   // rectangular view into a
-  printf("\n3D view\n");
+  std::cout << std::endl << "3D view" << std::endl;
   zfp::array3<double>::view v(&a, x0, y0, z0, mx, my, mz);
+  filter_output(v.size_x() + 2, 3, v.size());
   for (size_t z = 0; z < v.size_z(); z++)
     for (size_t y = 0; y < v.size_y(); y++)
       for (size_t x = 0; x < v.size_x(); x++) {
-        printf("%zu %zu %zu: %g %g\n", x, y, z, (double)a(x0 + x, y0 + y, z0 + z), (double)v(x, y, z));
+        if (filter_output())
+          std::cout << x << " " << y << " " << z << ": " << a(x0 + x, y0 + y, z0 + z) << " " << v(x, y, z) << std::endl;
         verify(a(x0 + x, y0 + y, z0 + z), v(x, y, z));
       }
 
   // flat view of all of a
-  printf("\n3D flat view\n");
+  std::cout << std::endl << "3D flat view" << std::endl;
   zfp::array3<double>::flat_view fv(&a);
+  filter_output(fv.size_x() + 2, 3, fv.size());
   for (size_t z = 0; z < fv.size_z(); z++)
     for (size_t y = 0; y < fv.size_y(); y++)
       for (size_t x = 0; x < fv.size_x(); x++) {
-        printf("%zu %zu %zu: %g %g\n", x, y, z, (double)a(x, y, z), (double)fv[fv.index(x, y, z)]);
+        if (filter_output())
+          std::cout << x << " " << y << " " << z << ": " << a(x, y, z) << " " << fv[fv.index(x, y, z)] << std::endl;
         verify(a(x, y, z), fv[fv.index(x, y, z)]);
       }
 
   // nested view of all of a
-  printf("\n3D nested view\n");
+  std::cout << std::endl << "3D nested view" << std::endl;
   zfp::array3<double>::nested_view nv(&a);
-  for (size_t z = 0; z < v.size_z(); z++)
-    for (size_t y = 0; y < v.size_y(); y++)
-      for (size_t x = 0; x < v.size_x(); x++) {
-        printf("%zu %zu %zu: %g %g\n", x, y, z, (double)a(x, y, z), (double)nv[z][y][x]);
+  filter_output(nv.size_x() + 2, 3, nv.size());
+  for (size_t z = 0; z < nv.size_z(); z++)
+    for (size_t y = 0; y < nv.size_y(); y++)
+      for (size_t x = 0; x < nv.size_x(); x++) {
+        if (filter_output())
+          std::cout << x << " " << y << " " << z << ": " << a(x, y, z) << " " << nv[z][y][x] << std::endl;
         verify(a(x, y, z), nv[z][y][x]);
       }
 
   // pointers and iterators into a via view v
-  printf("\n3D view pointers and iterators\n");
+  std::cout << std::endl << "3D view pointers and iterators" << std::endl;
   zfp::array3<double>::view::const_reference vr = v(0, 0, 0);
   zfp::array3<double>::view::const_pointer p = &vr;
   p = &v(0, 0, 0);
+  filter_output(v.size_x() + 2, 3, v.size());
   for (zfp::array3<double>::view::const_iterator it = v.begin(); it != v.end(); it++) {
     size_t x = it.i();
     size_t y = it.j();
     size_t z = it.k();
+    if (filter_output())
+      std::cout << x << " " << y << " " << z << ": " << *it << " " << p[x + mx * (y + my * z)] << std::endl;
     verify(*it, p[x + mx * (y + my * z)]);
   }
 
   // pointers and iterators into a via flat view fv
-  printf("\n3D flat view pointers and iterators\n");
+  std::cout << std::endl << "3D flat view pointers and iterators" << std::endl;
   zfp::array3<double>::flat_view::const_reference fvr = fv[0];
   zfp::array3<double>::flat_view::const_pointer fp = &fvr;
   fp = &fv(0, 0, 0);
+  filter_output(fv.size_x() + 2, 3, fv.size());
   for (zfp::array3<double>::flat_view::const_iterator it = fv.begin(); it != fv.end(); it++) {
     size_t x = it.i();
     size_t y = it.j();
     size_t z = it.k();
+    if (filter_output())
+      std::cout << x << " " << y << " " << z << ": " << *it << " " << fp[x + nx * (y + ny * z)] << std::endl;
     verify(*it, fp[x + nx * (y + ny * z)]);
   }
 
   // 2D slice of a
-  printf("\n2D slice\n");
-  size_t z = rand(0, nv.size_z());
+  std::cout << std::endl << "2D slice" << std::endl;
+  size_t z = rand(0, nv.size_z() - 1);
   zfp::array3<double>::nested_view2 slice2(nv[z]);
+  filter_output(slice2.size_x() + 2, 3, slice2.size());
   for (size_t y = 0; y < slice2.size_y(); y++)
     for (size_t x = 0; x < slice2.size_x(); x++) {
-      printf("%zu %zu %zu: %g %g\n", x, y, z, (double)a(x, y, z), (double)slice2[y][x]);
+      if (filter_output())
+        std::cout << x << " " << y << " " << z << ": " << a(x, y, z) << " " << slice2[y][x] << std::endl;
       verify(a(x, y, z), slice2[y][x]);
     }
 
   // 2D array constructed from 2D slice (exercises deep copy via iterator)
-  printf("\n2D array from 2D slice\n");
+  std::cout << std::endl << "2D array from 2D slice" << std::endl;
   zfp::array2<double> b(slice2);
+  filter_output(b.size_x() + 2, 3, b.size());
   for (size_t y = 0; y < b.size_y(); y++)
     for (size_t x = 0; x < b.size_x(); x++) {
-      printf("%zu %zu: %g %g\n", x, y, (double)b(x, y), (double)slice2[y][x]);
+      if (filter_output())
+        std::cout << x << " " << y << ": " << b(x, y) << " " << slice2[y][x] << std::endl;
       verify(b(x, y), slice2[y][x]);
     }
 
   // 1D slice of a
-  printf("\n1D slice\n");
-  size_t y = rand(0, slice2.size_y());
+  std::cout << std::endl << "1D slice" << std::endl;
+  size_t y = rand(0, slice2.size_y() - 1);
   zfp::array3<double>::nested_view1 slice1 = slice2[y];
   for (size_t x = 0; x < slice1.size_x(); x++) {
-    printf("%zu %zu %zu: %g %g\n", x, y, z, (double)a(x, y, z), (double)slice1[x]);
+    std::cout << x << " " << y << " " << z << ": " << a(x, y, z) << " " << slice1[x] << std::endl;
     verify(a(x, y, z), slice1[x]);
   }
 
   // 2D array constructed from 2D slice of 3D array (exercises deep copy via iterator)
-  printf("\n2D array from 2D slice of 3D array\n");
+  std::cout << std::endl << "2D array from 2D slice of 3D array" << std::endl;
   zfp::array2<double> c(slice2);
+  filter_output(c.size_x() + 2, 3, c.size());
   for (size_t y = 0; y < c.size_y(); y++)
     for (size_t x = 0; x < c.size_x(); x++) {
-      printf("%zu %zu: %g %g\n", x, y, (double)c(x, y), (double)slice2[y][x]);
+      if (filter_output())
+        std::cout << x << " " << y << ": " << c(x, y) << " " << slice2[y][x] << std::endl;
       verify(c(x, y), slice2[y][x]);
     }
 
-  // 2D thread-safe view of c
-  printf("\n2D private view\n");
+  // 2D thread-safe read-only view of c
+  std::cout << std::endl << "2D private read-only view" << std::endl;
   zfp::array2<double>::private_const_view d(&c);
+  filter_output(c.size_x() + 2, 3, c.size());
   for (size_t y = 0; y < c.size_y(); y++)
     for (size_t x = 0; x < c.size_x(); x++) {
-      printf("%zu %zu: %g %g\n", x, y, (double)c(x, y), (double)d(x, y));
+      if (filter_output())
+        std::cout << x << " " << y << ": " << c(x, y) << " " << d(x, y) << std::endl;
       verify(c(x, y), d(x, y));
     }
 
-  printf("\nall tests passed\n");
+#ifdef _OPENMP
+  std::cout << std::endl << "multithreaded 2D private read-only views" << std::endl;
+  // copy c for verification; direct accesses to c are not thread-safe
+  double* data = new double[c.size()];
+  c.get(data);
+  #pragma omp parallel
+  {
+    // make a thread-local view into c
+    zfp::array2<double>::private_const_view d(&c);
+    if (omp_get_thread_num() == 0)
+      filter_output(d.size_x() + 2, 3, d.size());
+    for (size_t y = 0; y < d.size_y(); y++)
+      for (size_t x = 0; x < d.size_x(); x++) {
+        double val = data[x + nx * y];
+        if (omp_get_thread_num() == 0 && filter_output())
+          std::cout << x << " " << y << ": " << val << " " << d(x, y) << std::endl;
+        verify(val, d(x, y));
+      }
+  }
+
+  std::cout << std::endl << "multithreaded 2D private read-write views" << std::endl;
+  #pragma omp parallel
+  {
+    // partition c into disjoint views
+    zfp::array2<double>::private_view d(&c);
+    d.partition(omp_get_thread_num(), omp_get_num_threads());
+    if (omp_get_thread_num() == 0)
+      filter_output(d.size_x() + 2, 3, d.size());
+    for (size_t j = 0; j < d.size_y(); j++)
+      for (size_t i = 0; i < d.size_x(); i++) {
+        d(i, j) += 1;
+        size_t x = d.global_x(i);
+        size_t y = d.global_y(j);
+        double val = data[x + nx * y] + 1;
+        if (omp_get_thread_num() == 0 && filter_output())
+          std::cout << x << " " << y << ": " << val << " " << d(i, j) << std::endl;
+        verify(val, d(i, j));
+      }
+  }
+  delete[] data;
+#endif
+
+  std::cout << std::endl << "all tests passed" << std::endl;
 
   return 0;
 }
diff --git a/tests/testzfp.cpp b/tests/testzfp.cpp
index 9c0bdb7f4..6e0db89e1 100644
--- a/tests/testzfp.cpp
+++ b/tests/testzfp.cpp
@@ -11,10 +11,10 @@
 #include <sstream>
 #include <string>
 #include "zfp.h"
-#include "zfparray1.h"
-#include "zfparray2.h"
-#include "zfparray3.h"
-#include "zfparray4.h"
+#include "zfp/array1.hpp"
+#include "zfp/array2.hpp"
+#include "zfp/array3.hpp"
+#include "zfp/array4.hpp"
 
 enum ArraySize {
   Small  = 0, // 2^12 = 4096 scalars (2^12 = (2^6)^2 = (2^4)^3 = (2^3)^4)
@@ -44,16 +44,16 @@ test_size(ArraySize size)
 inline void
 refine1d(int* g, const int* f, size_t m)
 {
-  const int weight[4] = { -1, 9, 9, -1 };
+  const int64 weight[4] = { -1, 9, 9, -1 };
   const size_t n = 2 * m;
 
   for (size_t x = 0; x < n; x++) {
-    int s = 0;
+    int64 s = 0;
     for (size_t i = 0; i < 4; i++) {
       size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2;
       s += weight[i] * f[xx];
     }
-    g[x] = s / 16;
+    g[x] = static_cast<int>(s / 16);
   }
 }
 
@@ -61,12 +61,12 @@ refine1d(int* g, const int* f, size_t m)
 inline void
 refine2d(int* g, const int* f, size_t m)
 {
-  const int weight[4] = { -1, 9, 9, -1 };
+  const int64 weight[4] = { -1, 9, 9, -1 };
   const size_t n = 2 * m;
 
   for (size_t y = 0; y < n; y++)
     for (size_t x = 0; x < n; x++) {
-      int s = 0;
+      int64 s = 0;
       for (size_t j = 0; j < 4; j++) {
         size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2;
         for (size_t i = 0; i < 4; i++) {
@@ -74,7 +74,7 @@ refine2d(int* g, const int* f, size_t m)
           s += weight[i] * weight[j] * f[xx + m * yy];
         }
       }
-      g[x + n * y] = s / (16 * 16);
+      g[x + n * y] = static_cast<int>(s / (16 * 16));
     }
 }
 
@@ -82,13 +82,13 @@ refine2d(int* g, const int* f, size_t m)
 inline void
 refine3d(int* g, const int* f, size_t m)
 {
-  const int weight[4] = { -1, 9, 9, -1 };
+  const int64 weight[4] = { -1, 9, 9, -1 };
   const size_t n = 2 * m;
 
   for (size_t z = 0; z < n; z++)
     for (size_t y = 0; y < n; y++)
       for (size_t x = 0; x < n; x++) {
-        int s = 0;
+        int64 s = 0;
         for (size_t k = 0; k < 4; k++) {
           size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2;
           for (size_t j = 0; j < 4; j++) {
@@ -99,7 +99,7 @@ refine3d(int* g, const int* f, size_t m)
             }
           }
         }
-        g[x + n * (y + n * z)] = s / (16 * 16 * 16);
+        g[x + n * (y + n * z)] = static_cast<int>(s / (16 * 16 * 16));
       }
 }
 
@@ -107,14 +107,14 @@ refine3d(int* g, const int* f, size_t m)
 inline void
 refine4d(int* g, const int* f, size_t m)
 {
-  const int weight[4] = { -1, 9, 9, -1 };
+  const int64 weight[4] = { -1, 9, 9, -1 };
   const size_t n = 2 * m;
 
   for (size_t w = 0; w < n; w++)
     for (size_t z = 0; z < n; z++)
       for (size_t y = 0; y < n; y++)
         for (size_t x = 0; x < n; x++) {
-          int s = 0;
+          int64 s = 0;
           for (size_t l = 0; l < 4; l++) {
             size_t ww = w & 1u ? (w / 2 + l - 1 + m) % m : w / 2;
             for (size_t k = 0; k < 4; k++) {
@@ -128,7 +128,7 @@ refine4d(int* g, const int* f, size_t m)
               }
             }
           }
-          g[x + n * (y + n * (z + n * w))] = s / (16 * 16 * 16 * 16);
+          g[x + n * (y + n * (z + n * w))] = static_cast<int>(s / (16 * 16 * 16 * 16));
         }
 }
 
@@ -759,7 +759,7 @@ test(uint dims, ArraySize array_size)
   // determine array size
   uint nx, ny, nz, nw;
   zfp_field* field = zfp_field_alloc();
-  zfp_field_set_type(field, zfp::trait<Scalar>::type);
+  zfp_field_set_type(field, zfp::internal::trait<Scalar>::type);
   zfp_field_set_pointer(field, f);
   switch (dims) {
     case 1:
@@ -792,11 +792,11 @@ test(uint dims, ArraySize array_size)
   // test data integrity
   uint32 checksum[2][2][4] = { // [size][type][dims]
     // small
-    {{ 0x54174c44u, 0x86609589u, 0xfc0a6a76u, 0xa3481e00u },
-     { 0x7d257bb6u, 0x294bb210u, 0x68614d26u, 0xf6bd3a21u }},
+    {{ 0x54174c44u, 0x86609589u, 0xfc0a6a76u, 0x28708a2bu },
+     { 0x7d257bb6u, 0x294bb210u, 0x68614d26u, 0xd58a5fe7u }},
     // large
-    {{ 0xd1ce1aceu, 0x644274dau, 0xc0ad63fau, 0x700de480u },
-     { 0xc3ed7116u, 0x644e2117u, 0xd7464b07u, 0x2516382eu }},
+    {{ 0xd1ce1aceu, 0x644274dau, 0xc0ad63fau, 0xdc65b02eu },
+     { 0xc3ed7116u, 0x644e2117u, 0xd7464b07u, 0xe4b60fbbu }},
   };
   uint32 h = hash(f, n * sizeof(Scalar));
   if (h != checksum[array_size][t][dims - 1])
@@ -815,13 +815,13 @@ test(uint dims, ArraySize array_size)
           {1.627e+01, 8.277e-02, 0.000e+00},
           {1.500e+00, 3.663e-03, 0.000e+00},
           {1.500e+00, 9.583e-03, 0.000e+00},
-          {1.373e+01, 6.633e-01, 0.000e+00},
+          {6.750e+00, 1.931e-01, 0.000e+00},
         },
         {
           {1.627e+01, 1.601e+01, 1.832e-04, 0.000e+00},
           {2.376e+01, 1.797e-01, 8.584e-06, 0.000e+00},
           {5.210e+00, 2.002e-01, 3.338e-05, 0.000e+00},
-          {1.016e+01, 8.985e+00, 3.312e-03, 0.000e+00},
+          {9.594e+00, 2.264e+00, 8.282e-04, 0.000e+00},
         },
       },
       // large
@@ -830,13 +830,13 @@ test(uint dims, ArraySize array_size)
           {1.627e+01, 2.100e-02, 0.000e+00},
           {1.624e-01, 7.439e-05, 0.000e+00},
           {1.001e-02, 7.248e-05, 0.000e+00},
-          {2.527e-02, 2.460e-04, 0.000e+00},
+          {1.038e-02, 1.078e-04, 0.000e+00},
         },
         {
           {1.627e+01, 1.601e+01, 2.289e-05, 0.000e+00},
           {1.607e+01, 2.076e-03, 0.000e+00, 0.000e+00},
           {1.407e-01, 7.344e-04, 0.000e+00, 0.000e+00},
-          {1.436e-01, 2.659e-03, 8.801e-08, 0.000e+00},
+          {8.130e-02, 1.515e-03, 4.401e-08, 0.000e+00},
         }
       }
     };
@@ -856,13 +856,13 @@ test(uint dims, ArraySize array_size)
           {2192, 3280, 6328},
           { 592, 1328, 4384},
           { 152, 1040, 4600},
-          {  64, 1760, 5856},
+          {  32,  352, 4168},
         },
         {
           {3664, 6712, 14104},
           {1424, 4480, 12616},
           {1064, 4624, 12808},
-          {1768, 5864, 14056},
+          { 360, 4168, 12360},
         },
       },
       // large
@@ -871,13 +871,13 @@ test(uint dims, ArraySize array_size)
           {8965672, 13160560, 21835352},
           {2235560,  3512848, 10309240},
           { 568456,  1361056,  8759696},
-          { 134344,   739632,  8896360},
+          { 135344,   706600,  8207768},
         },
         {
           {14733112, 23407904, 44997832},
           { 3905240, 10701640, 40856544},
           { 1458368,  8857008, 41270184},
-          {  763928,  8920656, 41574712},
+          {  730896,  8232056, 40581448},
         },
       }
     };
@@ -895,13 +895,13 @@ test(uint dims, ArraySize array_size)
           {6328, 11944, 13720},
           {4936, 11064, 12520},
           {6104, 11752, 12784},
-          {9440, 14048, 14048},
+          {8776, 12360, 12360},
         },
         {
           {6712, 25888, 29064},
           {5032, 26016, 28984},
           {6128, 27120, 29192},
-          {9448, 30440, 30440},
+          {8776, 28744, 28744},
         },
       },
       // large
@@ -910,13 +910,13 @@ test(uint dims, ArraySize array_size)
           {21815976, 38285256, 43425280},
           { 9187232, 32695984, 40464144},
           { 8914336, 33364208, 41172864},
-          {12109200, 35921784, 41550416},
+          {11394368, 34992872, 40557152},
         },
         {
           {23388528, 79426016,  88659304},
           { 9579632, 89770896, 103388072},
           { 9011648, 94009072, 107606336},
-          {12133496, 97126288, 107911568},
+          {11418664, 96325984, 106922328},
         },
       }
     };
@@ -933,13 +933,13 @@ test(uint dims, ArraySize array_size)
           7272,
           5104,
           6096,
-          6864,
+          7208,
         },
         {
           7784,
           5232,
           6128,
-          6872,
+          7216,
         },
       },
       // large
@@ -948,13 +948,13 @@ test(uint dims, ArraySize array_size)
           25037288,
           12792440,
           14187128,
-          17135704,
+          17222720,
         },
         {
           27134024,
           13315632,
           14316880,
-          17168096,
+          17255112,
         },
       }
     };
@@ -965,25 +965,25 @@ test(uint dims, ArraySize array_size)
   double emax[2][2][4] = { // [size][type][dims] (construct test)
     // small
     {
-      {4.578e-05, 7.630e-06, 3.148e-05, 3.598e-03},
-      {1.832e-04, 8.584e-06, 3.338e-05, 3.312e-03},
+      {4.578e-05, 7.630e-06, 3.148e-05, 8.197e-04},
+      {1.832e-04, 8.584e-06, 3.338e-05, 8.282e-04},
     },
     // large
     {
-      {0.000e+00, 0.000e+00, 0.000e+00, 1.193e-07},
-      {2.289e-05, 0.000e+00, 0.000e+00, 8.801e-08},
+      {0.000e+00, 0.000e+00, 0.000e+00, 2.981e-08},
+      {2.289e-05, 0.000e+00, 0.000e+00, 4.401e-08},
     }
   };
   double dfmax[2][2][4] = { // [size][type][dims] (update test)
     // small
     {
-      {2.155e-02, 3.755e-01, 1.846e+00, 4.843e+01},
-      {2.155e-02, 3.755e-01, 1.846e+00, 4.844e+01},
+      {2.155e-02, 3.755e-01, 1.846e+00, 1.601e+01},
+      {2.155e-02, 3.755e-01, 1.846e+00, 1.601e+01},
     },
     // large
     {
-      {2.441e-04, 7.801e-04, 3.599e-03, 2.793e-02},
-      {2.670e-04, 9.075e-04, 3.694e-03, 2.779e-02},
+      {2.441e-04, 4.883e-04, 1.222e-03, 8.794e-03},
+      {2.670e-04, 4.883e-04, 1.222e-03, 8.795e-03},
     }
   };
   double rate = 16;
@@ -1115,6 +1115,7 @@ int main(int argc, char* argv[])
       break;
   }
   std::cout << std::endl;
+  std::cout << "word size " << stream_word_bits << std::endl;
   std::cout << std::endl;
 
   uint sizes = 0;
diff --git a/tests/utils/fixedpoint96.h b/tests/utils/fixedpoint96.h
index 7a701b747..b48ebdab5 100644
--- a/tests/utils/fixedpoint96.h
+++ b/tests/utils/fixedpoint96.h
@@ -1,7 +1,7 @@
 #ifndef FIXEDPT_H
 #define FIXEDPT_H
 
-#include "include/zfp/types.h"
+#include "include/zfp/internal/zfp/types.h"
 
 typedef struct {
   // the number represented = i + (2^-32)*f
diff --git a/tests/utils/genSmoothRandNums.c b/tests/utils/genSmoothRandNums.c
index 20c2ba47a..1c12ca280 100644
--- a/tests/utils/genSmoothRandNums.c
+++ b/tests/utils/genSmoothRandNums.c
@@ -5,6 +5,7 @@
 #include "genSmoothRandNums.h"
 #include "fixedpoint96.h"
 #include "rand64.h"
+#include "zfp.h"
 
 #define FLOAT_MANTISSA_BITS 23
 #define DOUBLE_MANTISSA_BITS 52
@@ -30,15 +31,19 @@ computeOffset(size_t l, size_t k, size_t j, size_t i, size_t sideLen, int numDim
   switch (numDims) {
     case 4:
       result += l * sideLen * sideLen * sideLen;
+      fallthrough_
 
     case 3:
       result += k * sideLen * sideLen;
+      fallthrough_
 
     case 2:
       result += j * sideLen;
+      fallthrough_
 
     case 1:
       result += i;
+      break;
   }
 
   return result;
diff --git a/tests/utils/genSmoothRandNums.h b/tests/utils/genSmoothRandNums.h
index b8969d8f1..9d43c72fa 100644
--- a/tests/utils/genSmoothRandNums.h
+++ b/tests/utils/genSmoothRandNums.h
@@ -1,7 +1,7 @@
 #ifndef GEN_SMOOTH_RAND_INTS_H
 #define GEN_SMOOTH_RAND_INTS_H
 
-#include "include/zfp/types.h"
+#include "zfp/internal/zfp/types.h"
 
 // used to compute (square) array sizes
 size_t
diff --git a/tests/utils/rand32.h b/tests/utils/rand32.h
index de82cce6b..a47328db3 100644
--- a/tests/utils/rand32.h
+++ b/tests/utils/rand32.h
@@ -1,7 +1,7 @@
 #ifndef RAND_32_H
 #define RAND_32_H
 
-#include "include/zfp/types.h"
+#include "include/zfp/internal/zfp/types.h"
 
 // reset seed
 void
diff --git a/tests/utils/rand64.h b/tests/utils/rand64.h
index 78c526184..4c4ec161a 100644
--- a/tests/utils/rand64.h
+++ b/tests/utils/rand64.h
@@ -1,7 +1,7 @@
 #ifndef RAND_64_H
 #define RAND_64_H
 
-#include "include/zfp/types.h"
+#include "include/zfp/internal/zfp/types.h"
 
 // reset seed
 void
diff --git a/tests/utils/testMacros.h b/tests/utils/testMacros.h
index 0b2be136a..4791512d0 100644
--- a/tests/utils/testMacros.h
+++ b/tests/utils/testMacros.h
@@ -23,8 +23,8 @@
   // key2: identifies array dimensions
   // value: checksum
   // (macro substitutes "printf() && 0" because we want conditional to fail after executing printf)
-  #define ASSERT_EQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) printf("{UINT64C(0x%"PRIx64"), UINT64C(0x%"PRIx64"), UINT64C(0x%"PRIx64")},\n", key1, key2, computedChecksum)
-  #define COMPARE_NEQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) printf("{UINT64C(0x%"PRIx64"), UINT64C(0x%"PRIx64"), UINT64C(0x%"PRIx64")},\n", key1, key2, computedChecksum) && 0
+  #define ASSERT_EQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) printf("{UINT64C(0x%" PRIx64 "), UINT64C(0x%" PRIx64 "), UINT64C(0x%" PRIx64 ")},\n", key1, key2, computedChecksum)
+  #define COMPARE_NEQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) printf("{UINT64C(0x%" PRIx64 "), UINT64C(0x%" PRIx64 "), UINT64C(0x%" PRIx64 ")},\n", key1, key2, computedChecksum) && 0
 #else
   #define ASSERT_EQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) assert_int_equal(computedChecksum, getChecksumByKey(dims, zfpType, key1, key2))
   #define COMPARE_NEQ_CHECKSUM(dims, zfpType, computedChecksum, key1, key2) (computedChecksum != getChecksumByKey(dims, zfpType, key1, key2))
diff --git a/tests/utils/zfpChecksums.c b/tests/utils/zfpChecksums.c
index cf9b7e62e..f9b1f7376 100644
--- a/tests/utils/zfpChecksums.c
+++ b/tests/utils/zfpChecksums.c
@@ -1,4 +1,4 @@
-#include "zfp/types.h"
+#include "zfp/internal/zfp/types.h"
 #include "zfpChecksums.h"
 
 #define NUM_INT_CHECKSUMS 19
diff --git a/tests/utils/zfpHash.h b/tests/utils/zfpHash.h
index 446065725..5718050f3 100644
--- a/tests/utils/zfpHash.h
+++ b/tests/utils/zfpHash.h
@@ -2,7 +2,7 @@
 #define ZFP_HASH_H
 
 #include <stddef.h>
-#include "include/zfp/types.h"
+#include "include/zfp/internal/zfp/types.h"
 
 uint64
 hashBitstream(uint64* ptrStart, size_t bufsizeBytes);
diff --git a/travis.sh b/travis.sh
deleted file mode 100755
index e73383cde..000000000
--- a/travis.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env sh
-set -e
-
-# pass additional args in $1 (starting with whitespace character)
-run_all () {
-  run_all_cmd="ctest -V -C Debug -DC_STANDARD=${C_STANDARD:-99} -DCXX_STANDARD=${CXX_STANDARD:-98} -S \"$TRAVIS_BUILD_DIR/cmake/travis.cmake\""
-  eval "${run_all_cmd}$1"
-}
-
-mkdir build
-cd build
-
-# technically, flags are passed on to cmake/* and actually set there
-BUILD_FLAGS=""
-
-if [ -n "${COVERAGE}" ]; then
-  # build (linux)
-
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_UTILITIES=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_EXAMPLES=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CFP=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFPY=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFORP=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DZFP_WITH_ALIGNED_ALLOC=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CUDA=OFF"
-  BUILD_FLAGS="$BUILD_FLAGS -DWITH_COVERAGE=ON"
-
-  run_all "$BUILD_FLAGS"
-else
-  # build/test without OpenMP, with CFP (and custom namespace), with zfPy, with Fortran (linux only)
-  if [[ "$OSTYPE" == "darwin"* ]]; then
-    BUILD_ZFORP=OFF
-  else
-    BUILD_ZFORP=ON
-  fi
-
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_UTILITIES=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_EXAMPLES=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CFP=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DCFP_NAMESPACE=cfp2"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFPY=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFORP=$BUILD_ZFORP"
-  BUILD_FLAGS="$BUILD_FLAGS -DZFP_WITH_ALIGNED_ALLOC=ON"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=OFF"
-  BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CUDA=OFF"
-  run_all "$BUILD_FLAGS"
-
-  rm -rf ./* ;
-
-  # if OpenMP available, start a 2nd build with it
-  if cmake ../tests/ci-utils/ ; then
-    rm -rf ./* ;
-
-    # build/test with OpenMP
-    BUILD_FLAGS=""
-    BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=ON"
-    run_all "$BUILD_FLAGS"
-  fi
-fi
diff --git a/utils/Makefile b/utils/Makefile
index 84edc6e95..dc7ef3e95 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -2,7 +2,7 @@ include ../Config
 
 TARGET = ../bin/zfp
 INCS = -I../include
-LIBS = -L../lib -lzfp -lm
+LIBS = -L../lib -lzfp $(LDFLAGS) -lm
 
 all: $(TARGET)
 
diff --git a/utils/zfp.c b/utils/zfp.c
index 79608db77..9744f896a 100644
--- a/utils/zfp.c
+++ b/utils/zfp.c
@@ -5,7 +5,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "zfp.h"
-#include "zfp/macros.h"
+#include "zfp/internal/zfp/macros.h"
 
 /*
 File I/O is done using the following combinations of i, o, s, and z:
@@ -30,14 +30,14 @@ The 7 major tasks to be accomplished are:
 static void
 print_error(const void* fin, const void* fout, zfp_type type, size_t n)
 {
-  const int32* i32i = (const int32*)fin;
-  const int64* i64i = (const int64*)fin;
-  const float* f32i = (const float*)fin;
-  const double* f64i = (const double*)fin;
-  const int32* i32o = (const int32*)fout;
-  const int64* i64o = (const int64*)fout;
-  const float* f32o = (const float*)fout;
-  const double* f64o = (const double*)fout;
+  const int32* i32i = fin;
+  const int64* i64i = fin;
+  const float* f32i = fin;
+  const double* f64i = fin;
+  const int32* i32o = fout;
+  const int64* i64o = fout;
+  const float* f32o = fout;
+  const double* f64o = fout;
   double fmin = +DBL_MAX;
   double fmax = -DBL_MAX;
   double erms = 0;
@@ -80,7 +80,7 @@ print_error(const void* fin, const void* fout, zfp_type type, size_t n)
 }
 
 static void
-usage()
+usage(void)
 {
   fprintf(stderr, "%s\n", zfp_version_string);
   fprintf(stderr, "Usage: zfp <options>\n");
diff --git a/zfp-config-version.cmake.in b/zfp-config-version.cmake.in
index 4a77db0a1..449327023 100644
--- a/zfp-config-version.cmake.in
+++ b/zfp-config-version.cmake.in
@@ -1,6 +1,8 @@
 set(PACKAGE_VERSION_MAJOR @ZFP_VERSION_MAJOR@)
 set(PACKAGE_VERSION_MINOR @ZFP_VERSION_MINOR@)
 set(PACKAGE_VERSION_PATCH @ZFP_VERSION_PATCH@)
+set(PACKAGE_VERSION_TWEAK @ZFP_VERSION_TWEAK@)
+
 set(PACKAGE_VERSION @ZFP_VERSION@)
 
 # Check whether the requested PACKAGE_FIND_VERSION is compatible
diff --git a/zfp-config.cmake.in b/zfp-config.cmake.in
index 204665264..c98096546 100644
--- a/zfp-config.cmake.in
+++ b/zfp-config.cmake.in
@@ -3,11 +3,19 @@
 # It defines the following variables
 #  ZFP_INCLUDE_DIRS - include directories for zfp
 #  ZFP_LIBRARIES    - libraries to link against
-#  ZFP_WITH_OPENMP  - Indicates if the zfp library has been built with OpenMP support.
+#  ZFP_WITH_OPENMP  - indicates if the zfp library has been built with OpenMP support
+#  ZFP_WITH_CUDA    - indicates if the zfp library has been built with CUDA support
+#  ZFP_CFP_ENABLED  - indicates if the cfp library has been built
 #
 # And the following imported targets:
 #   zfp::zfp
 #
+# If cfp is enabled the following variables are also defined
+#  CFP_INCLUDE_DIRS - include directories for cfp
+#  CFP_LIBRARIES    - libraries to link against (cfp only)
+#
+# As well as the following imported targets:
+#   zfp::cfp
 
 include("${CMAKE_CURRENT_LIST_DIR}/zfp-config-version.cmake")
 
@@ -17,10 +25,16 @@ find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE)
 
 if(NOT TARGET zfp::zfp)
   include("${CMAKE_CURRENT_LIST_DIR}/zfp-targets.cmake")
+  set(ZFP_LIBRARIES "zfp::zfp")
+  get_target_property(ZFP_INCLUDE_DIRS zfp::zfp INTERFACE_INCLUDE_DIRECTORIES)
 endif()
 
-set(ZFP_LIBRARIES zfp::zfp)
-get_target_property(ZFP_INCLUDE_DIRS zfp::zfp INTERFACE_INCLUDE_DIRECTORIES)
+set(ZFP_CFP_ENABLED @BUILD_CFP@)
+if(ZFP_CFP_ENABLED AND NOT TARGET zfp::cfp)
+  include("${CMAKE_CURRENT_LIST_DIR}/cfp-targets.cmake")
+  set(CFP_LIBRARIES "zfp::cfp")
+  get_target_property(CFP_INCLUDE_DIRS zfp::cfp INTERFACE_INCLUDE_DIRECTORIES)
+endif()
 
 set(ZFP_WITH_OPENMP @ZFP_WITH_OPENMP@)
 if(ZFP_WITH_OPENMP)