diff --git a/.github/workflows/update-mcp-registry.yml b/.github/workflows/update-mcp-registry.yml index 507c329b11e771..6968661381b0cd 100644 --- a/.github/workflows/update-mcp-registry.yml +++ b/.github/workflows/update-mcp-registry.yml @@ -2,6 +2,7 @@ name: Update MCP Registry on: release: types: [released] + workflow_dispatch: concurrency: group: update-mcp-registry-${{ github.ref }}-${{ github.event_name }} @@ -16,6 +17,14 @@ jobs: name: Publish to MCP Registry runs-on: ubuntu-latest steps: + - name: Abort if manual run is not on a v* tag + if: > + github.event_name == 'workflow_dispatch' && + !startsWith(github.ref, 'refs/tags/v') + run: | + echo "❌ This workflow can only be run manually on a tag starting with 'v'." + echo "Ref: ${{ github.ref }}" + exit 1 - name: Checkout id: checkout uses: actions/checkout@v5 @@ -39,6 +48,11 @@ jobs: sudo mv mcp-publisher /usr/local/bin/ mcp-publisher --version + - name: Authenticate mcp-publisher + id: mcp_auth + run: | + mcp-publisher login github-oidc + - name: Validate server.json (dry run) id: validate env: @@ -65,6 +79,7 @@ jobs: Checkout: ${{ steps.checkout.outcome }} Update version: ${{ steps.update_version.outcome }} Install mcp-publisher: ${{ steps.install_publisher.outcome }} + MCP Auth: ${{ steps.mcp_auth.outcome }} Validate: ${{ steps.validate.outcome }} Publish: ${{ steps.publish.outcome }} SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e921935fd455f..a050884207796c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,76 @@ ### Merged Pull Requests: +- Fix(go.d/vnodes): add additionalProperties to config_schema.json ([#21503](https://github.com/netdata/netdata/issues/21503)) +- Fix(edit-config): ignore container variable inherited from /etc/profile ([#21505](https://github.com/netdata/netdata/issues/21505)) +- Fix: add procfile parsing for non-seekable files ([#21507](https://github.com/netdata/netdata/issues/21507)) +- Fix(go.d/proxysql): correct backend 
status metric mapping ([#21524](https://github.com/netdata/netdata/issues/21524)) +- Fix(go.d/pkg/logs): fix ParserConfig.CSV omitempty behavior ([#21526](https://github.com/netdata/netdata/issues/21526)) +- Use the same type for all time_group_value values ([#21528](https://github.com/netdata/netdata/issues/21528)) +- Fix(diskspace.plugin): exclude ZFS datasets ([#21532](https://github.com/netdata/netdata/issues/21532)) + +## [2.8.4] - 2025-12-17 + +### Merged Pull Requests: + +- Build(deps): update go toolchain to v1.25.5 ([#21459](https://github.com/netdata/netdata/issues/21459)) +- Chore(go.d): include Go version in build info ([#21473](https://github.com/netdata/netdata/issues/21473)) + +## [2.8.3] - 2025-12-15 + +### Merged Pull Requests: + +- Update bundled components for static builds. ([#21401](https://github.com/netdata/netdata/issues/21401)) +- Disable MongoDB exporter for Ubuntu 20.04, 22.04, and 24.04 packages. ([#21403](https://github.com/netdata/netdata/issues/21403)) +- Change chart variable (Windows.plugin) ([#21412](https://github.com/netdata/netdata/issues/21412)) +- Fix shutdown timeout handling in completion wait function ([#21395](https://github.com/netdata/netdata/issues/21395)) +- Fix validation of extent index in journal file ([#21400](https://github.com/netdata/netdata/issues/21400)) +- Improve check for DBENGINE max datafile size allowed ([#21390](https://github.com/netdata/netdata/issues/21390)) +- Fix(go.d/pkg/ndexec): return the output along with the error ([#21405](https://github.com/netdata/netdata/issues/21405)) +- Fix(rabbitmq): remove "rabbitmq_version" check ([#21411](https://github.com/netdata/netdata/issues/21411)) +- Chore(go.d/snmp): add collection stats ([#21409](https://github.com/netdata/netdata/issues/21409)) +- Chore(go.d): log skipped data collection ([#21423](https://github.com/netdata/netdata/issues/21423)) +- Chore(go.d/snmp): remove non-default update_every from sd config 
([#21424](https://github.com/netdata/netdata/issues/21424)) +- Chore(go.d): log data collection duration when skipping tick ([#21425](https://github.com/netdata/netdata/issues/21425)) +- Fix(go.d): avoid blocking all jobs when stopping a slow job ([#21448](https://github.com/netdata/netdata/issues/21448)) +- Use improved compression algorithms when building DEB packages. ([#21310](https://github.com/netdata/netdata/issues/21310)) +- Fix(go.d): correct Windows install paths ([#21451](https://github.com/netdata/netdata/issues/21451)) +- Adjust hardware collection (windows.plugin) ([#21433](https://github.com/netdata/netdata/issues/21433)) +- AD fixes (windows.plugin) ([#21454](https://github.com/netdata/netdata/issues/21454)) +- Fix(go.d/ap): handle unknown values in station statistics gracefully ([#21461](https://github.com/netdata/netdata/issues/21461)) +- Improve streaming connection loss detection ([#21430](https://github.com/netdata/netdata/issues/21430)) + +## [2.8.2] - 2025-12-03 + +### Merged Pull Requests: + +- Improve replication logic by checking if parent is caught up with child ([#21352](https://github.com/netdata/netdata/issues/21352)) +- Improve stale replication ([#21357](https://github.com/netdata/netdata/issues/21357)) +- Fix(docker): add netdata user to nvidia device group on non-Debian systems ([#21358](https://github.com/netdata/netdata/issues/21358)) +- Add netdata user to video group ([#21359](https://github.com/netdata/netdata/issues/21359)) +- Fix(docker): fix bugs and refactor Docker entrypoint script ([#21364](https://github.com/netdata/netdata/issues/21364)) +- Adjust windows sensors initialization ([#21374](https://github.com/netdata/netdata/issues/21374)) +- Ci: fix MCP Registry publishing workflow ([#21342](https://github.com/netdata/netdata/issues/21342)) +- Chore(go.d): disable redis lib logging ([#21344](https://github.com/netdata/netdata/issues/21344)) +- Fix(go.d): prefer env-provided dirs over build-time defaults 
([#21345](https://github.com/netdata/netdata/issues/21345)) +- Remove retention check during datafile initialization ([#21387](https://github.com/netdata/netdata/issues/21387)) +- Replace dots with slashes in OTEL metric families for hierarchical grouping ([#21371](https://github.com/netdata/netdata/issues/21371)) + +## [2.8.1] - 2025-11-21 + +### Merged Pull Requests: + +- Remove log message ([#21325](https://github.com/netdata/netdata/issues/21325)) +- Fix compilation when DBENGINE is disabled ([#21323](https://github.com/netdata/netdata/issues/21323)) +- Fix(cgroups): improve podman container name resolution ([#21317](https://github.com/netdata/netdata/issues/21317)) +- Improve(health/dyncfg): Add source unit field to alert value ([#21326](https://github.com/netdata/netdata/issues/21326)) +- Disable apps plugin pss mem by default ([#21330](https://github.com/netdata/netdata/issues/21330)) +- Fix(go.d): fix dyncfg vnodes configs ([#21332](https://github.com/netdata/netdata/issues/21332)) + +## [2.8.0] - 2025-11-19 + +### Merged Pull Requests: + - Improve(go.d/rabbitmq): add support for old RabbitMQ whoami tags format ([#21049](https://github.com/netdata/netdata/issues/21049)) - Ai-docs ([#21043](https://github.com/netdata/netdata/issues/21043)) - Feat(go.d/snmp): add optional ICMP ping metrics ([#21052](https://github.com/netdata/netdata/issues/21052)) diff --git a/netdata.spec.in b/netdata.spec.in index df0b4fe9c59de5..20cedff08077ac 100644 --- a/netdata.spec.in +++ b/netdata.spec.in @@ -3249,7 +3249,7 @@ else fi %endif -for item in docker ceph I2C; do +for item in docker ceph I2C video; do if getent group $item > /dev/null 2>&1; then usermod -a -G ${item} %{name} fi diff --git a/packaging/build-package.sh b/packaging/build-package.sh index 7aad99268fec22..131deacbf0b2e4 100755 --- a/packaging/build-package.sh +++ b/packaging/build-package.sh @@ -18,6 +18,8 @@ SCRIPT_SOURCE="$( )" SOURCE_DIR="$(dirname "$(dirname "${SCRIPT_SOURCE}")")" +. 
/etc/os-release + CMAKE_ARGS="-S ${SOURCE_DIR} -B ${BUILD_DIR}" add_cmake_option() { @@ -46,7 +48,6 @@ add_cmake_option ENABLE_PLUGIN_SYSTEMD_JOURNAL On add_cmake_option ENABLE_PLUGIN_SYSTEMD_UNITS On add_cmake_option ENABLE_EXPORTER_PROMETHEUS_REMOTE_WRITE On -add_cmake_option ENABLE_EXPORTER_MONGODB On add_cmake_option ENABLE_BUNDLED_PROTOBUF Off add_cmake_option ENABLE_BUNDLED_JSONC Off @@ -80,6 +81,15 @@ case "${PKG_TYPE}" in add_cmake_option ENABLE_PLUGIN_IBM Off ;; esac + + if [ "${ID}" = "ubuntu" ]; then + case "${VERSION_ID}" in + 20.04|22.04|24.04) add_cmake_option ENABLE_EXPORTER_MONGODB Off ;; + *) add_cmake_option ENABLE_EXPORTER_MONGODB On ;; + esac + else + add_cmake_option ENABLE_EXPORTER_MONGODB On + fi ;; RPM) ;; *) echo "Unrecognized package type ${PKG_TYPE}." ; exit 1 ;; diff --git a/packaging/check-for-go-toolchain.sh b/packaging/check-for-go-toolchain.sh index 6ea357548de67b..1cf4c8a354e4e1 100644 --- a/packaging/check-for-go-toolchain.sh +++ b/packaging/check-for-go-toolchain.sh @@ -51,32 +51,32 @@ install_go_toolchain() { Linux) case "$(uname -m)" in i?86) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-386.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="d03cdcbc9bd8baf5cf028de390478e9e2b3e4d0afe5a6582dedc19bfe6a263b2" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-386.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="db908a86e888574ed3432355ba5372ad3ef2c0821ba9b91ceaa0f6634620c40c" ;; x86_64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-amd64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="7716a0d940a0f6ae8e1f3b3f4f36299dc53e31b16840dbd171254312c41ca12e" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-amd64.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="9e9b755d63b36acf30c12a9a3fc379243714c1c6d3dd72861da637f336ebb35b" ;; aarch64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-arm64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="65a3e34fb2126f55b34e1edfc709121660e1be2dee6bdf405fc399a63a95a87d" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-arm64.tar.gz" + 
GOLANG_ARCHIVE_CHECKSUM="b00b694903d126c588c378e72d3545549935d3982635ba3f7a964c9fa23fe3b9" ;; armv*) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-armv6l.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="eb949be683e82a99e9861dafd7057e31ea40b161eae6c4cd18fdc0e8c4ae6225" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-armv6l.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="0b27e3dec8d04899d6941586d2aa2721c3dee67c739c1fc1b528188f3f6e8ab5" ;; ppc64le) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-ppc64le.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="8b0c8d3ee5b1b5c28b6bd63dc4438792012e01d03b4bf7a61d985c87edab7d1f" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-ppc64le.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="f0904b647b5b8561efc5d48bb59a34f2b7996afab83ccd41c93b1aeb2c0067e4" ;; riscv64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-riscv64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="22fe934a9d0c9c57275716c55b92d46ebd887cec3177c9140705efa9f84ba1e2" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-riscv64.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="05de84b319bc91b9cecbc6bf8eb5fcd814cf8a9d16c248d293dbd96f6cc0151b" ;; s390x) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.linux-s390x.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="9cfe517ba423f59f3738ca5c3d907c103253cffbbcc2987142f79c5de8c1bf93" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.linux-s390x.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="a5d0a72b0dfd57f9c2c0cdd8b7e0f401e0afb9e8c304d3410f9b0982ce0953da" ;; *) GOLANG_FAILURE_REASON="Linux $(uname -m) platform is not supported out-of-box by Go, you must install a toolchain for it yourself." 
@@ -87,24 +87,24 @@ install_go_toolchain() { FreeBSD) case "$(uname -m)" in 386) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.freebsd-386.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="dc0198dd4ec520e13f26798def8750544edf6448d8e9c43fd2a814e4885932af" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.freebsd-386.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="f8ff9fa5309fbbbd7d52f5d3f7181feb830dfd044d23c38746a2ada091f751b5" ;; amd64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.freebsd-amd64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="c4f1a7e7b258406e6f3b677ecdbd97bbb23ff9c0d44be4eb238a07d360f69ac8" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.freebsd-amd64.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="a2d2b2aeb218bd646fd8708bacc96c9d4de1b6c9ea48ceb9171e9e784f676650" ;; arm) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.freebsd-arm.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="7772fc5ff71ed39297ec0c1599fc54e399642c9b848eac989601040923b0de9c" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.freebsd-arm.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="b83a5cb1695c7185a13840661aef6aa1b46202d41a72528ecde51735765c6641" ;; arm64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.freebsd-arm64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="5bb011d5d5b6218b12189f07aa0be618ab2002662fff1ca40afba7389735c207" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.freebsd-arm64.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="938fc0204f853c24ab03967105146af6590903dd14f869fe912db7a735f654f6" ;; riscv64) - GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.1.freebsd-riscv64.tar.gz" - GOLANG_ARCHIVE_CHECKSUM="ccac716240cb049bebfafcb7eebc3758512178a4c51fc26da9cc032035d850c8" + GOLANG_ARCHIVE_URL="https://go.dev/dl/go1.25.5.freebsd-riscv64.tar.gz" + GOLANG_ARCHIVE_CHECKSUM="7b0cc61246cf6fc9e576135cfcd2b95e870b0f2ee5bf057325b2d76119001e4e" ;; *) GOLANG_FAILURE_REASON="FreeBSD $(uname -m) platform is not supported out-of-box by Go, you must install a toolchain for it yourself." 
diff --git a/packaging/cmake/Modules/Packaging.cmake b/packaging/cmake/Modules/Packaging.cmake index 4e47ff5cc2c5bf..96774339560906 100644 --- a/packaging/cmake/Modules/Packaging.cmake +++ b/packaging/cmake/Modules/Packaging.cmake @@ -15,6 +15,45 @@ set(CPACK_STRIP_FILES NO) set(CPACK_DEBIAN_DEBUGINFO_PACKAGE NO) set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS YES) +set(CPACK_DEBIAN_COMPRESSION_TYPE "xz") + +if(OS_LINUX) + set(OS_DISTRO_ID "unknown") + set(OS_VERSION_ID "unknown") + + find_file(OS_RELEASE_PATH os-release PATHS /etc /usr/lib + NO_DEFAULT_PATH + NO_PACKAGE_ROOT_PATH + NO_CMAKE_PATH + NO_CMAKE_ENVIRONMENT_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_INSTALL_PREFIX) + + if(NOT OS_RELEASE_PATH STREQUAL OS_RELEASE_PATH-NOTFOUND) + file(STRINGS "${OS_RELEASE_PATH}" OS_RELEASE_LINES) + + foreach(_line IN LISTS OS_RELEASE_LINES) + if(_line MATCHES "^ID=.*$") + string(SUBSTRING "${_line}" 3 -1 OS_DISTRO_ID) + string(REPLACE "\"" "" OS_DISTRO_ID "${OS_DISTRO_ID}") + elseif(_line MATCHES "^VERSION_ID=.*$") + string(SUBSTRING "${_line}" 11 -1 OS_VERSION_ID) + string(REPLACE "\"" "" OS_VERSION_ID "${OS_VERSION_ID}") + endif() + endforeach() + endif() + + if(OS_DISTRO_ID STREQUAL "debian") + if(OS_VERSION_ID VERSION_GREATER_EQUAL 12) + set(CPACK_DEBIAN_COMPRESSION_TYPE "zstd") + endif() + elseif(OS_DISTRO_ID STREQUAL "ubuntu") + if(OS_VERSION_ID VERSION_GREATER_EQUAL 21.10) + set(CPACK_DEBIAN_COMPRESSION_TYPE "zstd") + endif() + endif() +endif() set(CPACK_PACKAGING_INSTALL_PREFIX "/") diff --git a/packaging/cmake/pkg-files/deb/user/postinst b/packaging/cmake/pkg-files/deb/user/postinst index 0e88334bd1d4df..adb71b743bf3f9 100755 --- a/packaging/cmake/pkg-files/deb/user/postinst +++ b/packaging/cmake/pkg-files/deb/user/postinst @@ -21,6 +21,9 @@ case "${1}" in if [ -d "/etc/pve" ]; then groups="${groups} www-data" fi + if [ -e "/dev/nvidiactl" ]; then + groups="${groups} video" + fi for item in ${groups}; do if getent group "${item}" > /dev/null 2>&1; 
then diff --git a/packaging/docker/run.sh b/packaging/docker/run.sh index fd38710108eda2..4b3d420a9ca4a8 100755 --- a/packaging/docker/run.sh +++ b/packaging/docker/run.sh @@ -5,41 +5,77 @@ set -e +DOCKER_USR="${DOCKER_USR:-netdata}" + if [ ! -w / ] && [ "${EUID}" -eq 0 ]; then echo >&2 "WARNING: This Docker host appears to not properly support newer stat system calls. This is known to cause issues with Netdata (most notably, nodes running on such hosts **cannot be claimed**)." echo >&2 "WARNING: For more information, see https://learn.netdata.cloud/docs/agent/claim#known-issues-on-older-hosts-with-seccomp-enabled" fi -# Needed to read Proxmox VMs and (LXC) containers configuration files (name resolution + CPU and memory limits) -function add_netdata_to_proxmox_conf_files_group() { - group_guid="$(stat -c %g /host/etc/pve 2>/dev/null || true)" - [ -z "${group_guid}" ] && return +# Check if user is a member of a group by GID +# Arguments: $1 = GID, $2 = username +is_user_in_group() { + local gid="$1" + local user="$2" + getent group "${gid}" 2>/dev/null | awk -F: '{print $4}' | tr ',' '\n' | grep -qx "${user}" +} + +# Add user to a group by GID, creating the group if necessary +# Arguments: $1 = GID, $2 = group name (for creation) +add_user_to_gid() { + local gid="$1" + local group_name="$2" - if ! getent group "${group_guid}" >/dev/null; then - echo "Creating proxmox-etc-pve group with GID ${group_guid}" - if ! addgroup --gid "${group_guid}" "proxmox-etc-pve"; then - echo >&2 "Failed to add group proxmox-etc-pve with GID ${group_guid}." - return + [ -z "${gid}" ] && return 1 + + if ! getent group "${gid}" > /dev/null; then + echo "Creating ${group_name} group with GID ${gid}" + if ! addgroup --gid "${gid}" "${group_name}"; then + echo >&2 "Failed to add group ${group_name} with GID ${gid}." + return 1 fi fi - if ! getent group "${group_guid}" | grep -q "${DOCKER_USR}"; then - echo "Assigning ${DOCKER_USR} user to group ${group_guid}" - if ! 
usermod --apend --groups "${group_guid}" "${DOCKER_USR}"; then - echo >&2 "Failed to add ${DOCKER_USR} user to group with GID ${group_guid}." - return + if ! is_user_in_group "${gid}" "${DOCKER_USR}"; then + echo "Assigning ${DOCKER_USR} user to group ${gid}" + if ! usermod --append --groups "${gid}" "${DOCKER_USR}"; then + echo >&2 "Failed to add ${DOCKER_USR} user to group with GID ${gid}." + return 1 fi fi } -if [ ! "${DISABLE_TELEMETRY:-0}" -eq 0 ] || - [ -n "$DISABLE_TELEMETRY" ] || - [ ! "${DO_NOT_TRACK:-0}" -eq 0 ] || - [ -n "$DO_NOT_TRACK" ]; then +# Needed to read Proxmox VMs and (LXC) containers configuration files +add_netdata_to_proxmox_conf_files_group() { + [ "${DOCKER_USR}" = "root" ] && return 0 + + local group_gid + group_gid="$(stat -c %g /host/etc/pve 2> /dev/null || true)" + [ -z "${group_gid}" ] && return 0 + + add_user_to_gid "${group_gid}" "proxmox-etc-pve" +} + +# Needed to access NVIDIA GPU monitoring +add_netdata_to_nvidia_group() { + [ "${DOCKER_USR}" = "root" ] && return 0 + + local group_gid + group_gid="$(stat -c %g /dev/nvidiactl 2> /dev/null || true)" + [ -z "${group_gid}" ] && return 0 + + # Skip if the device is owned by root group + [ "${group_gid}" -eq 0 ] && return 0 + + add_user_to_gid "${group_gid}" "nvidia-dev" +} + +if [ "${DISABLE_TELEMETRY:-0}" != "0" ] || + [ "${DO_NOT_TRACK:-0}" != "0" ]; then touch /etc/netdata/.opt-out-from-anonymous-statistics fi -chmod o+rX / 2>/dev/null || echo "Unable to change permissions without errors." +chmod o+rX / 2> /dev/null || echo "Unable to change permissions without errors." 
if [ "${EUID}" -eq 0 ]; then if [ -n "${NETDATA_EXTRA_APK_PACKAGES}" ]; then @@ -62,8 +98,8 @@ if [ "${EUID}" -eq 0 ]; then fi fi - BALENA_PGID=$(stat -c %g /var/run/balena.sock 2>/dev/null || true) - DOCKER_PGID=$(stat -c %g /var/run/docker.sock 2>/dev/null || true) + BALENA_PGID=$(stat -c %g /var/run/balena.sock 2> /dev/null || true) + DOCKER_PGID=$(stat -c %g /var/run/docker.sock 2> /dev/null || true) re='^[0-9]+$' if [[ $BALENA_PGID =~ $re ]]; then @@ -75,19 +111,26 @@ if [ "${EUID}" -eq 0 ]; then DOCKER_HOST="unix:///var/run/docker.sock" PGID="$DOCKER_PGID" fi - export PGID - export DOCKER_HOST if [ -n "${PGID}" ]; then - echo "Creating docker group with GID ${PGID}" - addgroup --gid "${PGID}" "docker" || echo >&2 "Failed to add group docker with GID ${PGID}, probably one already exists." - echo "Assigning ${DOCKER_USR} user to group with GID ${PGID}" - usermod --append --groups "${PGID}" "${DOCKER_USR}" || echo >&2 "Failed to add ${DOCKER_USR} user to group with GID ${PGID}." + export PGID + fi + if [ -n "${DOCKER_HOST}" ]; then + export DOCKER_HOST + fi + + if [ -n "${PGID}" ]; then + echo "Configuring docker group (GID ${PGID}) for ${DOCKER_USR}" + add_user_to_gid "${PGID}" "docker" || true fi if [ -d "/host/etc/pve" ]; then add_netdata_to_proxmox_conf_files_group || true fi + + if [ -e "/dev/nvidiactl" ]; then + add_netdata_to_nvidia_group || true + fi else echo >&2 "WARNING: Entrypoint started as non-root user. This is not officially supported and some features may not be available." 
fi @@ -100,10 +143,10 @@ fi if [ -w "/etc/netdata" ]; then if mountpoint -q /etc/netdata; then - hostname >/etc/netdata/.container-hostname + hostname > /etc/netdata/.container-hostname else rm -f /etc/netdata/.container-hostname fi fi -exec /usr/sbin/netdata -u "${DOCKER_USR}" -D -s /host -p "${NETDATA_LISTENER_PORT}" "$@" +exec /usr/sbin/netdata -u "${DOCKER_USR}" -D -s /host -p "${NETDATA_LISTENER_PORT:-19999}" "$@" diff --git a/packaging/installer/functions.sh b/packaging/installer/functions.sh index 2a5935f80dc6a4..e7073e5ea7ee98 100644 --- a/packaging/installer/functions.sh +++ b/packaging/installer/functions.sh @@ -1045,6 +1045,9 @@ create_netdata_accounts() { if [ -d "/etc/pve" ]; then NETDATA_WANTED_GROUPS="${NETDATA_WANTED_GROUPS} www-data" fi + if [ -e "/dev/nvidiactl" ]; then + NETDATA_WANTED_GROUPS="${NETDATA_WANTED_GROUPS} video" + fi if command -v systemd-sysusers >/dev/null 2>&1; then install -m 644 -o root -g root "${NETDATA_PREFIX}/usr/lib/netdata/system/systemd/sysusers/netdata.conf" /usr/lib/sysusers.d/netdata.conf diff --git a/packaging/makeself/bundled-packages.version b/packaging/makeself/bundled-packages.version index e15997568332c5..f6b5d6aa79a7a4 100644 --- a/packaging/makeself/bundled-packages.version +++ b/packaging/makeself/bundled-packages.version @@ -1,17 +1,17 @@ # Source of truth for all the packages we bundle in static builds PACKAGES=("OPENSSL" "CURL" "BASH" "IOPING" "LIBNETFILTER_ACT") SOURCE_TYPES=("GH_REPO_CLONE" "GH_REPO_CLONE" "DW_TARBALL" "GH_REPO_SOURCE" "DW_TARBALL") -OPENSSL_VERSION="openssl-3.5.4" +OPENSSL_VERSION="openssl-3.6.0" OPENSSL_SOURCE="https://github.com/openssl/openssl" -LIBUCONTEXT_VERSION="libucontext-1.3.2" +LIBUCONTEXT_VERSION="libucontext-1.3.3" LIBUCONTEXT_SOURCE="https://github.com/kaniini/libucontext" -LIBUNWIND_VERSION="177deb5f89c5d792c9618db54fdcebd260e271e8" # Should be updated to a stable version once https://github.com/libunwind/libunwind/issues/742 is fixed. 
+LIBUNWIND_VERSION="v1.8.3" LIBUNWIND_SOURCE="https://github.com/libunwind/libunwind" -CURL_VERSION="curl-8_16_0" +CURL_VERSION="curl-8_17_0" CURL_SOURCE="https://github.com/curl/curl" -BASH_VERSION="5.2.37" +BASH_VERSION="5.3" BASH_ARTIFACT_SOURCE="http://ftp.gnu.org/gnu/bash" -BASH_ARTIFACT_SHA256="9599b22ecd1d5787ad7d3b7bf0c59f312b3396d1e281175dd1f8a4014da621ff" +BASH_ARTIFACT_SHA256="0d5cd86965f869a26cf64f4b71be7b96f90a3ba8b3d74e27e8e9d9d5550f31ba" IOPING_VERSION="1.3" IOPING_SOURCE="https://github.com/koct9i/ioping" IOPING_ARTIFACT_SHA256="7aa48e70aaa766bc112dea57ebbe56700626871052380709df3a26f46766e8c8" diff --git a/packaging/makeself/jobs/30-curl.install.sh b/packaging/makeself/jobs/30-curl.install.sh index e1c027e5469325..d722156134ca9d 100755 --- a/packaging/makeself/jobs/30-curl.install.sh +++ b/packaging/makeself/jobs/30-curl.install.sh @@ -46,6 +46,7 @@ if [ "${CACHE_HIT:-0}" -eq 0 ]; then --with-openssl \ --with-ca-bundle=/opt/netdata/etc/ssl/certs/ca-certificates.crt \ --with-ca-path=/opt/netdata/etc/ssl/certs \ + --without-brotli \ --disable-dependency-tracking # Curl autoconf does not honour the curl_LDFLAGS environment variable diff --git a/packaging/version b/packaging/version index b77e3cc58c33b6..3f03d9f19a1a71 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v2.8.0 +v2.8.5 diff --git a/src/collectors/apps.plugin/README.md b/src/collectors/apps.plugin/README.md index 2e12604f5ee38d..f3ea587dfbb5f5 100644 --- a/src/collectors/apps.plugin/README.md +++ b/src/collectors/apps.plugin/README.md @@ -146,10 +146,10 @@ interpreters: process1 process2 process3 #### Unix-like systems (Linux, FreeBSD, macOS) -| Field | Description | Example | -|----------|----------------------------------------------------------------------|----------------------| -| comm | Process name (command) | `chrome` | -| cmdline | Full command line with arguments | `/usr/bin/chrome --enable-features=...` | +| Field | Description | Example | 
+|---------|----------------------------------|-----------------------------------------| +| comm | Process name (command) | `chrome` | +| cmdline | Full command line with arguments | `/usr/bin/chrome --enable-features=...` | > **Note:** On Linux specifically, the **comm** field is limited to 15 characters from `/proc/{PID}/comm`. > `apps.plugin` attempts to obtain the full process name by searching for it in the **cmdline**. @@ -157,11 +157,11 @@ interpreters: process1 process2 process3 #### Windows process fields -| Field | Description | Example | -|----------|----------------------------------------------------------------------|----------------------| -| comm | Performance Monitor instance name (may include instance numbers) | `chrome#12` | -| cmdline | Full path to the executable (without command line arguments) | `C:\Program Files\Google\Chrome\Application\chrome.exe` | -| name | Friendly name from file description or service display name | `Google Chrome` | +| Field | Description | Example | +|---------|------------------------------------------------------------------|---------------------------------------------------------| +| comm | Performance Monitor instance name (may include instance numbers) | `chrome#12` | +| cmdline | Full path to the executable (without command line arguments) | `C:\Program Files\Google\Chrome\Application\chrome.exe` | +| name | Friendly name from file description or service display name | `Google Chrome` | > On Windows: > - All pattern types (exact, prefix, suffix, substring) also match against the **name** field @@ -180,14 +180,15 @@ You can use asterisks (`*`) to create patterns: > - **Netdata v2.5.2 and earlier**: Windows patterns match against `comm` and `cmdline` fields > - **Netdata v2.5.3 and later**: Windows patterns match against `comm`, `cmdline`, and `name` (friendly name) fields -| Mode | Pattern | Description | Unix-like | Windows | 
-|-----------|-------------|---------------------------------------------|-----------|---------| -| exact | `firefox` | Matches **comm** exactly | ✓ Yes | ✓ Yes | -| prefix | `firefox*` | Matches **comm** starting with firefox | ✓ Yes | ✓ Yes | -| suffix | `*fox` | Matches **comm** ending with fox | ✓ Yes | ✓ Yes | -| substring | `*firefox*` | Searches within **cmdline** | ✓ Yes (full command line) | ✓ Yes (full path) | +| Mode | Pattern | Description | Unix-like | Windows | +|-----------|-------------|----------------------------------------|---------------------------|-------------------| +| exact | `firefox` | Matches **comm** exactly | ✓ Yes | ✓ Yes | +| prefix | `firefox*` | Matches **comm** starting with firefox | ✓ Yes | ✓ Yes | +| suffix | `*fox` | Matches **comm** ending with fox | ✓ Yes | ✓ Yes | +| substring | `*firefox*` | Searches within **cmdline** | ✓ Yes (full command line) | ✓ Yes (full path) | **Note on substring matching (`*pattern*`):** + - On Unix-like systems: Searches within the full command line including arguments - On Windows: Searches within the full executable path (e.g., `C:\Program Files\Mozilla Firefox\firefox.exe`) @@ -200,11 +201,13 @@ You can use asterisks (`*`) to create patterns: #### Windows default grouping behavior On Windows, when a process doesn't match any pattern in `apps_groups.conf`: + - The **name** field (friendly name from file description or service display name) is used as the default group/category if available - If no **name** field exists, the **comm** field is used - This provides better default grouping for Windows services and applications with descriptive names For example, a process might have: + - **comm**: `svchost` - **name**: `Windows Update` - **Default category**: `Windows Update` (uses the friendly name) @@ -251,15 +254,15 @@ You can use the Netdata `processes` function to verify that your `apps_groups.co 1. **Access the processes function** through Netdata Cloud (required for security reasons) 2. 
**Review the output** to see: - - Current running processes with their `comm`, `cmdline`, and (on Windows) `name` fields - - The **Category** column shows which group from `apps_groups.conf` each process has been assigned to - - Resource utilization for each process + - Current running processes with their `comm`, `cmdline`, and (on Windows) `name` fields + - The **Category** column shows which group from `apps_groups.conf` each process has been assigned to + - Resource utilization for each process 3. **Troubleshooting tips**: - - If a process shows the wrong Category, check the exact process name in the function output - - On Windows, remember that the `name` field is used for default categories but NOT for pattern matching - - Remember that the first matching pattern wins - check your pattern order - - For inherited groups, verify the parent process has the correct Category + - If a process shows the wrong Category, check the exact process name in the function output + - On Windows, remember that the `name` field is used for default categories but NOT for pattern matching + - Remember that the first matching pattern wins - check your pattern order + - For inherited groups, verify the parent process has the correct Category There are a few command line options you can pass to `apps.plugin`. The list of available options can be acquired with the `--help` flag. The options can be set in the `netdata.conf` using the [`edit-config` script](/docs/netdata-agent/configuration/README.md). @@ -275,7 +278,7 @@ For example, to disable user and user group charts you would set: On Linux systems with kernel 4.14 or later, `apps.plugin` uses Proportional Set Size (PSS) data from `/proc//smaps_rollup` to provide more accurate memory usage estimates for processes that heavily use shared memory (e.g., databases, shared memory applications). 
-**By default, PSS sampling is enabled with a 5-minute refresh interval.** This provides better accuracy than raw RSS (Resident Set Size), which can overstate memory usage for processes sharing memory pages. The plugin periodically samples PSS values and uses them to scale the shared memory portion of RSS, providing a more accurate estimate without the overhead of reading smaps on every iteration. +**By default, PSS sampling is disabled**. When disabled, memory charts show traditional RSS (Resident Set Size), which may overstate usage for processes sharing memory pages. Enabling PSS sampling allows the plugin to periodically sample PSS values and use them to scale the shared portion of RSS, providing a significantly more accurate estimate without the overhead of reading smaps on every iteration. #### Configuration @@ -287,28 +290,32 @@ The `--pss` option controls PSS sampling behavior: ``` **Valid values:** + - Duration (e.g., `5m`, `300s`, `10m`): Sets the refresh interval for PSS sampling. Lower values provide more accurate estimates but increase CPU overhead. - `off` or `0`: Completely disables PSS sampling. Memory charts will show traditional RSS-based measurements. 
-**Default:** `5m` (5 minutes) +**Default:** `off` **How it works:** + - `apps.plugin` uses adaptive sampling that alternates between two strategies each iteration: - - **Delta-based**: Prioritizes processes with largest shared memory changes (refreshes big memory consumers within seconds) - - **Age-based**: Prioritizes processes with oldest samples (ensures all processes refreshed within 2× the interval) + - **Delta-based**: Prioritizes processes with largest shared memory changes (refreshes big memory consumers within seconds) + - **Age-based**: Prioritizes processes with oldest samples (ensures all processes refreshed within 2× the interval) - The sampled PSS/RSS ratio is cached and applied to subsequent RSS readings to estimate current memory usage - This approach ensures rapid response to significant memory changes while guaranteeing bounded staleness for all processes - When disabled (`--pss 0` or `--pss off`), no PSS sampling occurs and estimated memory charts are not shown **Performance considerations:** + - Reading `/proc//smaps_rollup` is more expensive than reading `/proc//status` - Shorter refresh periods provide more accurate estimates but increase CPU usage - On systems with thousands of processes, consider increasing the refresh period (e.g., `10m` or `15m`) - For systems without significant shared memory usage, disabling PSS sampling (`--pss off`) reduces overhead **Chart behavior:** -- **Default (PSS enabled):** Shows both "Estimated memory usage (RSS with shared scaling)" and "Memory RSS usage" charts -- **When disabled (`--pss 0`):** Shows only "Memory RSS usage" charts + +- **When PSS is enabled:** Shows both "Estimated memory usage (RSS with shared scaling)" and "Memory RSS usage" charts +- **Default (PSS disabled):** Shows only "Memory RSS usage" charts - The `processes` function API exposes additional columns (PSS, PssAge, SharedRatio) when PSS is enabled ### Integration with eBPF diff --git a/src/collectors/apps.plugin/apps_plugin.c 
b/src/collectors/apps.plugin/apps_plugin.c index af15b9dc285ee3..741100bcced63c 100644 --- a/src/collectors/apps.plugin/apps_plugin.c +++ b/src/collectors/apps.plugin/apps_plugin.c @@ -381,7 +381,7 @@ int check_proc_1_io() { static bool profile_speed = false; static bool print_tree_and_exit = false; #if (PROCESSES_HAVE_SMAPS_ROLLUP == 1) -int pss_refresh_period = 300; // seconds +int pss_refresh_period = 0; // disabled by default #endif static void parse_args(int argc, char **argv) @@ -586,9 +586,9 @@ static void parse_args(int argc, char **argv) " (default is %d seconds)\n" "\n" #if (PROCESSES_HAVE_SMAPS_ROLLUP == 1) - " --pss TIME estimated memory interval (e.g. 5m, 300s)\n" - " use 'off' to disable smaps sampling\n" - " (default is 5 minutes)\n" + " --pss TIME enable estimated memory using PSS sampling at the given interval\n" + " (e.g. 5m, 300s). Use 'off' or '0' to disable.\n" + " (default is off)\n" "\n" #endif #endif diff --git a/src/collectors/cgroups.plugin/cgroup-name.sh.in b/src/collectors/cgroups.plugin/cgroup-name.sh.in index ce1c60c0921f58..4f451632378089 100755 --- a/src/collectors/cgroups.plugin/cgroup-name.sh.in +++ b/src/collectors/cgroups.plugin/cgroup-name.sh.in @@ -683,7 +683,7 @@ if [ -z "${NAME}" ]; then docker_validate_id "${DOCKERID}" elif [[ ${CGROUP} =~ ^.*libpod-[a-fA-F0-9]+.*$ ]]; then # Podman - PODMANID="$(echo "${CGROUP}" | sed "s|^.*libpod-\([a-fA-F0-9]\+\).*$|\1|")" + PODMANID="$(echo "${CGROUP}" | sed "s|^.*libpod-\(conmon-\)\?\([a-fA-F0-9]\+\).*$|\2|")" podman_validate_id "${PODMANID}" elif [[ ${CGROUP} =~ machine.slice[_/].*\.service ]]; then diff --git a/src/collectors/diskspace.plugin/plugin_diskspace.c b/src/collectors/diskspace.plugin/plugin_diskspace.c index 363b65be3934b4..83f8f4419e099d 100644 --- a/src/collectors/diskspace.plugin/plugin_diskspace.c +++ b/src/collectors/diskspace.plugin/plugin_diskspace.c @@ -299,7 +299,26 @@ static void calculate_values_and_show_charts( m->collected++; } +// Check if a ZFS filesystem 
entry is a dataset (not a pool) +static inline bool is_zfs_dataset(struct mountinfo *mi) { + if(!mi || !mi->filesystem || !mi->mount_source || !mi->mount_source[0]) + return false; + + if(strcmp(mi->filesystem, "zfs") != 0) + return false; + + // For ZFS, the mount_source contains the dataset name (e.g., "tank" or "tank/install") + // Pools have no slash, datasets have at least one slash + return strchr(mi->mount_source, '/') != NULL; +} + static inline void do_disk_space_stats(struct mountinfo *mi, int update_every) { + // Skip ZFS datasets, only monitor ZFS pools + // This prevents alert floods when a pool fills up + if (is_zfs_dataset(mi)) { + return; + } + const char *disk = mi->persistent_id; static SIMPLE_PATTERN *excluded_mountpoints = NULL; diff --git a/src/collectors/windows.plugin/GetHardwareInfo.c b/src/collectors/windows.plugin/GetHardwareInfo.c index 4dc1999b87cee6..191f0608b20b39 100644 --- a/src/collectors/windows.plugin/GetHardwareInfo.c +++ b/src/collectors/windows.plugin/GetHardwareInfo.c @@ -236,7 +236,7 @@ static void netdata_detect_cpu() temperature_fcnt = netdata_amd_cpu_temp; } -static int initialize(int update_every) +static int initialize() { netdata_detect_cpu(); if (!temperature_fcnt) { @@ -254,7 +254,7 @@ static int initialize(int update_every) ncpus = os_get_system_cpus(); cpus = callocz(ncpus, sizeof(struct cpu_data)); - hardware_info_thread = nd_thread_create("hi_threads", NETDATA_THREAD_OPTION_DEFAULT, get_hardware_info_thread, &update_every); + hardware_info_thread = nd_thread_create("hi_threads", NETDATA_THREAD_OPTION_DEFAULT, get_hardware_info_thread, NULL); return 0; } @@ -270,7 +270,7 @@ static RRDSET *netdata_publish_cpu_chart(int update_every) "temperature", "cpu.temperature", "Core temperature", - "Celcius", + "Celsius", PLUGIN_WINDOWS_NAME, "GetHardwareInfo", NETDATA_CHART_PRIO_CPU_TEMPERATURE, @@ -284,11 +284,11 @@ static RRDSET *netdata_publish_cpu_chart(int update_every) static void netdata_loop_cpu_chart(int 
update_every) { RRDSET *chart = netdata_publish_cpu_chart(update_every); - for (size_t i = 0; i < ncpus; i++) { + for (int i = 0; i < (int)ncpus; i++) { struct cpu_data *lcpu = &cpus[i]; if (!lcpu->rd_cpu_temp) { char id[RRD_ID_LENGTH_MAX + 1]; - snprintfz(id, RRD_ID_LENGTH_MAX, "cpu%lu.temp", i); + snprintfz(id, RRD_ID_LENGTH_MAX, "cpu%d.temp", i); lcpu->rd_cpu_temp = rrddim_add(chart, id, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } rrddim_set_by_pointer(chart, lcpu->rd_cpu_temp, lcpu->cpu_temp); @@ -301,7 +301,7 @@ int do_GetHardwareInfo(int update_every, usec_t dt __maybe_unused) static bool initialized = false; if (unlikely(!initialized)) { initialized = true; - if (initialize(update_every)) { + if (initialize()) { return -1; } } @@ -314,7 +314,7 @@ int do_GetHardwareInfo(int update_every, usec_t dt __maybe_unused) void do_GetHardwareInfo_cleanup() { if (nd_thread_join(hardware_info_thread)) - nd_log_daemon(NDLP_ERR, "Failed to join mssql queries thread"); + nd_log_daemon(NDLP_ERR, "Failed to join Get Hardware Info thread"); netdata_stop_driver(); } diff --git a/src/collectors/windows.plugin/GetPowerSupply.c b/src/collectors/windows.plugin/GetPowerSupply.c index 623b9d2c56acc7..3976f1d87bda00 100644 --- a/src/collectors/windows.plugin/GetPowerSupply.c +++ b/src/collectors/windows.plugin/GetPowerSupply.c @@ -40,7 +40,7 @@ static inline void netdata_update_power_supply_values( if (bs.Capacity != BATTERY_UNKNOWN_CAPACITY) { NETDATA_DOUBLE num = bs.Capacity; NETDATA_DOUBLE den = bi->FullChargedCapacity; - num /= den; + num = (den) ? 
num/ den : 0; power_supply_root->capacity->value = (unsigned long long)(num * 100.0); } @@ -84,7 +84,7 @@ int do_GetPowerSupply(int update_every, usec_t dt __maybe_unused) SP_DEVICE_INTERFACE_DATA did = {0}; did.cbSize = sizeof(did); - for (LONG i = 0; i < 32 && SetupDiEnumDeviceInterfaces(hdev, 0, &GUID_DEVCLASS_BATTERY, 0, &did); i++) { + for (LONG i = 0; i < 32 && SetupDiEnumDeviceInterfaces(hdev, 0, &GUID_DEVCLASS_BATTERY, i, &did); i++) { DWORD cbRequired = 0; PSP_DEVICE_INTERFACE_DETAIL_DATA pdidd = NULL; HANDLE hBattery = NULL; @@ -124,8 +124,8 @@ int do_GetPowerSupply(int update_every, usec_t dt __maybe_unused) &bqi.BatteryTag, sizeof(bqi.BatteryTag), &dwOut, - NULL) && - bqi.BatteryTag) + NULL) || + !bqi.BatteryTag) goto endPowerSupply; BATTERY_INFORMATION bi = {0}; @@ -141,8 +141,14 @@ int do_GetPowerSupply(int update_every, usec_t dt __maybe_unused) char name[RRD_ID_LENGTH_MAX + 1]; snprintfz(name, sizeof(name), "BAT%d", i + 1); - power_supply_root->name = name; - power_supply_root->capacity->filename = power_supply_root->name; + if (likely(power_supply_root->name)) + freez(power_supply_root->name); + if (likely(power_supply_root->capacity->filename)) + freez(power_supply_root->capacity->filename); + + power_supply_root->name = power_supply_root->capacity->filename = NULL; + power_supply_root->name = strdupz(name); + power_supply_root->capacity->filename = strdupz(power_supply_root->name); netdata_update_power_supply_values(hBattery, &voltage, &bi, &bqi); diff --git a/src/collectors/windows.plugin/GetSensors.c b/src/collectors/windows.plugin/GetSensors.c index 8e64c186b01c78..d356dd9dafaf24 100644 --- a/src/collectors/windows.plugin/GetSensors.c +++ b/src/collectors/windows.plugin/GetSensors.c @@ -310,23 +310,37 @@ struct sensor_data { DICTIONARY *sensors; // Microsoft appends additional data -#define ADDTIONAL_UUID_STR_LEN (UUID_STR_LEN + 8) +#define ADDTIONAL_UUID_STR_LEN (UUID_STR_LEN + 17) -static void netdata_clsid_to_char(char *output, const GUID 
*pguid) +static bool netdata_clsid_to_char(char *output, size_t output_len, const GUID *pguid) { + if (unlikely(!output)) + return false; + LPWSTR wguid = NULL; - if (SUCCEEDED(StringFromCLSID(pguid, &wguid)) && wguid) { - size_t len = wcslen(wguid); - wcstombs(output, wguid, len); - CoTaskMemFree(wguid); + HRESULT hr = StringFromCLSID(pguid, &wguid); + output[0] = '\0'; + if (FAILED(hr) || unlikely(!wguid)) + return false; + + size_t converted = wcstombs(output, wguid, output_len - 1); + CoTaskMemFree(wguid); + + if (unlikely(converted == (size_t)-1)) { + output[0] = '\0'; + return false; } + + output[converted] = '\0'; + + return true; } static inline char *netdata_convert_guid_to_string(HRESULT hr, GUID *value) { if (SUCCEEDED(hr)) { char cguid[ADDTIONAL_UUID_STR_LEN]; - netdata_clsid_to_char(cguid, value); + netdata_clsid_to_char(cguid, ADDTIONAL_UUID_STR_LEN, value); return strdupz(cguid); } return NULL; @@ -501,6 +515,7 @@ static void netdata_get_sensors() ULONG count = 0; hr = pSensorCollection->lpVtbl->GetCount(pSensorCollection, &count); if (FAILED(hr)) { + pSensorCollection->lpVtbl->Release(pSensorCollection); return; } @@ -516,9 +531,14 @@ static void netdata_get_sensors() GUID id = {0}; hr = pSensor->lpVtbl->GetID(pSensor, &id); if (FAILED(hr)) { + pSensor->lpVtbl->Release(pSensor); + continue; + } + + if (!netdata_clsid_to_char(thread_values, sizeof(thread_values), &id)) { + pSensor->lpVtbl->Release(pSensor); continue; } - netdata_clsid_to_char(thread_values, &id); struct sensor_data *sd = dictionary_set(sensors, thread_values, NULL, sizeof(*sd)); @@ -578,6 +598,30 @@ static void netdata_get_sensors() static void netdata_sensors_monitor(void *ptr __maybe_unused) { + // Initialize COM for this thread - sensor thread only needs COM for Sensor API, not WMI + HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED); + bool com_initialized = SUCCEEDED(hr); + + // RPC_E_CHANGED_MODE means COM was already initialized in a different mode (e.g., 
COINIT_APARTMENTTHREADED) + // In this case, COM is available but we didn't increment the reference count + if (FAILED(hr) && hr != RPC_E_CHANGED_MODE) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "Sensor thread: cannot initialize COM interface (error 0x%lX)", (unsigned long)hr); + return; + } + + // Create sensor manager instance for this thread + hr = CoCreateInstance( + &CLSID_SensorManager, NULL, CLSCTX_INPROC_SERVER, &IID_ISensorManager, (void **)&pSensorManager); + if (FAILED(hr)) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "Sensor thread: cannot create ISensorManager (error 0x%lX)", (unsigned long)hr); + // Only uninitialize if we successfully initialized COM ourselves + if (com_initialized) + CoUninitialize(); + return; + } + heartbeat_t hb; heartbeat_init(&hb, USEC_PER_SEC); @@ -589,6 +633,16 @@ static void netdata_sensors_monitor(void *ptr __maybe_unused) netdata_get_sensors(); } + + // Thread cleanup - release sensor manager and uninitialize COM + if (pSensorManager) { + pSensorManager->lpVtbl->Release(pSensorManager); + pSensorManager = NULL; + } + + // Only uninitialize if we successfully initialized COM ourselves + if (com_initialized) + CoUninitialize(); } void dict_sensor_insert(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) @@ -605,24 +659,9 @@ void dict_sensor_insert(const DICTIONARY_ITEM *item __maybe_unused, void *value, static int initialize(int update_every) { - // This is an internal plugin, if we initialize these two times, collector will fail. To avoid this - // we call InitializeWMI to verify COM interface was already initialized. 
- HRESULT hr = InitializeWMI(); - if (hr != S_OK) { - hr = CoInitializeEx(NULL, COINIT_MULTITHREADED); - if (FAILED(hr)) { - nd_log(NDLS_COLLECTORS, NDLP_ERR, "Collector cannot initialize COM interface."); - return -1; - } - } - - hr = CoCreateInstance( - &CLSID_SensorManager, NULL, CLSCTX_INPROC_SERVER, &IID_ISensorManager, (void **)&pSensorManager); - if (FAILED(hr)) { - nd_log(NDLS_COLLECTORS, NDLP_ERR, "Collector cannot initialize sensor API."); - CoUninitialize(); - return -1; - } + // Note: COM and Sensor API initialization is now done in the sensor thread + // (netdata_sensors_monitor) because COM must be initialized per-thread. + // The sensor thread owns the pSensorManager instance and handles its cleanup. sensors = dictionary_create_advanced( DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, NULL, sizeof(struct sensor_data)); @@ -803,12 +842,11 @@ int do_GetSensors(int update_every, usec_t dt __maybe_unused) void do_Sensors_cleanup() { + // Wait for sensor thread to finish + // The thread handles its own COM cleanup (CoUninitialize) and pSensorManager release if (nd_thread_join(sensors_thread_update)) nd_log_daemon(NDLP_ERR, "Failed to join sensors thread update"); - if (pSensorManager) { - pSensorManager->lpVtbl->Release(pSensorManager); - - CoUninitialize(); - } + // Note: pSensorManager is owned and cleaned up by the sensor thread itself + // No additional cleanup needed here since the thread has already released resources } diff --git a/src/collectors/windows.plugin/GetServicesStatus.c b/src/collectors/windows.plugin/GetServicesStatus.c index 19b21a6b1ab037..6d9937e6d5a909 100644 --- a/src/collectors/windows.plugin/GetServicesStatus.c +++ b/src/collectors/windows.plugin/GetServicesStatus.c @@ -41,13 +41,13 @@ static void initialize(void) static BOOL fill_dictionary_with_content() { - static PVOID buffer = NULL; + PVOID buffer = NULL; static DWORD bytes_needed = 0; LPENUM_SERVICE_STATUS_PROCESS service, services; DWORD total_services = 0; 
SC_HANDLE ndSCMH = OpenSCManager(NULL, NULL, SC_MANAGER_ENUMERATE_SERVICE | SC_MANAGER_CONNECT); - if (!ndSCMH) { + if (unlikely(!ndSCMH)) { return FALSE; } @@ -64,11 +64,24 @@ static BOOL fill_dictionary_with_content() NULL, NULL); - if (GetLastError() == ERROR_MORE_DATA) { - if (!buffer) + DWORD test = GetLastError(); + if (test == ERROR_MORE_DATA) { + if (unlikely(!buffer)) buffer = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, bytes_needed); else buffer = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, buffer, bytes_needed); + } else { + switch (test) { + case ERROR_ACCESS_DENIED: + case ERROR_INVALID_PARAMETER: + case ERROR_INVALID_HANDLE: + case ERROR_INVALID_LEVEL: + case ERROR_SHUTDOWN_IN_PROGRESS: + ret = FALSE; + goto endServiceCollection; + default: + ret = TRUE; + } } if (!buffer) { @@ -112,6 +125,8 @@ static BOOL fill_dictionary_with_content() ret = TRUE; endServiceCollection: + if (buffer) + HeapFree(GetProcessHeap(), 0, buffer); CloseServiceHandle(ndSCMH); return ret; diff --git a/src/collectors/windows.plugin/perflib-ad.c b/src/collectors/windows.plugin/perflib-ad.c index 265e8ca5ce98a1..97500201e52ac1 100644 --- a/src/collectors/windows.plugin/perflib-ad.c +++ b/src/collectors/windows.plugin/perflib-ad.c @@ -187,9 +187,12 @@ static void netdata_ad_searches(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE *p static RRDSET *st_ldap_searches_total = NULL; static RRDDIM *rd_ldap_searches_total = NULL; - if (perflibGetObjectCounter(pDataBlock, pObjectType, &ldapSearchesTotal)) { - if (unlikely(!st_ldap_searches_total)) { - st_ldap_searches_total = rrdset_create_localhost( + if (!perflibGetObjectCounter(pDataBlock, pObjectType, &ldapSearchesTotal)) { + return; + } + + if (unlikely(!st_ldap_searches_total)) { + st_ldap_searches_total = rrdset_create_localhost( "ad", "ldap_searches", NULL, @@ -203,14 +206,13 @@ static void netdata_ad_searches(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE *p update_every, RRDSET_TYPE_LINE); - rd_ldap_searches_total = + 
rd_ldap_searches_total = rrddim_add(st_ldap_searches_total, "searches", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - } + } - rrddim_set_by_pointer( + rrddim_set_by_pointer( st_ldap_searches_total, rd_ldap_searches_total, (collected_number)ldapSearchesTotal.current.Data); - rrdset_done(st_ldap_searches_total); - } + rrdset_done(st_ldap_searches_total); } static void netdata_ad_properties(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE *pObjectType, int update_every) @@ -490,9 +492,12 @@ netdata_ad_service_threads_in_use(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE static RRDSET *st_directory_services_threads = NULL; static RRDDIM *rd_directory_services_threads = NULL; - if (perflibGetObjectCounter(pDataBlock, pObjectType, &directoryServiceThreads)) { - if (unlikely(!st_directory_services_threads)) { - st_directory_services_threads = rrdset_create_localhost( + if (!perflibGetObjectCounter(pDataBlock, pObjectType, &directoryServiceThreads)) { + return; + } + + if (unlikely(!st_directory_services_threads)) { + st_directory_services_threads = rrdset_create_localhost( "ad", "ds_threads", NULL, @@ -506,16 +511,15 @@ netdata_ad_service_threads_in_use(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE update_every, RRDSET_TYPE_LINE); - rd_directory_services_threads = + rd_directory_services_threads = rrddim_add(st_directory_services_threads, "thread", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - } + } - rrddim_set_by_pointer( + rrddim_set_by_pointer( st_directory_services_threads, rd_directory_services_threads, (collected_number)directoryServiceThreads.current.Data); - rrdset_done(st_directory_services_threads); - } + rrdset_done(st_directory_services_threads); } static void netdata_ad_bind(PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE *pObjectType, int update_every) diff --git a/src/collectors/windows.plugin/perflib-adcs.c b/src/collectors/windows.plugin/perflib-adcs.c index 5d1dcf629fec9c..f9bc93b57a8a8c 100644 --- a/src/collectors/windows.plugin/perflib-adcs.c +++ 
b/src/collectors/windows.plugin/perflib-adcs.c @@ -381,9 +381,9 @@ static void netdata_adcs_retrieval_processing( return; } - if (!ac->st_adcs_challenge_response_processing_time_seconds) { + if (!ac->st_adcs_retrievals_processing_time_seconds) { snprintfz(id, RRD_ID_LENGTH_MAX, "cert_%s_retrievals_processing_time", ac->name); - ac->st_adcs_challenge_response_processing_time_seconds = rrdset_create_localhost( + ac->st_adcs_retrievals_processing_time_seconds = rrdset_create_localhost( "adcs", id, NULL, @@ -397,8 +397,8 @@ static void netdata_adcs_retrieval_processing( update_every, RRDSET_TYPE_LINE); - ac->rd_adcs_challenge_response_processing_time_seconds = rrddim_add( - ac->st_adcs_challenge_response_processing_time_seconds, + ac->rd_adcs_retrievals_processing_time_seconds = rrddim_add( + ac->st_adcs_retrievals_processing_time_seconds, "processing_time", NULL, 1, @@ -406,17 +406,17 @@ static void netdata_adcs_retrieval_processing( RRD_ALGORITHM_ABSOLUTE); rrdlabels_add( - ac->st_adcs_challenge_response_processing_time_seconds->rrdlabels, "cert", ac->name, RRDLABEL_SRC_AUTO); + ac->st_adcs_retrievals_processing_time_seconds->rrdlabels, "cert", ac->name, RRDLABEL_SRC_AUTO); } rrddim_set_by_pointer( - ac->st_adcs_challenge_response_processing_time_seconds, - ac->rd_adcs_challenge_response_processing_time_seconds, + ac->st_adcs_retrievals_processing_time_seconds, + ac->rd_adcs_retrievals_processing_time_seconds, (collected_number)ac->ADCSRetrievalsProcessingTime.current.Data); - rrdset_done(ac->st_adcs_challenge_response_processing_time_seconds); + rrdset_done(ac->st_adcs_retrievals_processing_time_seconds); } -static void netdata_adcs_crypto_singing_time( +static void netdata_adcs_crypto_signing_time( struct adcs_certificate *ac, PERF_DATA_BLOCK *pDataBlock, PERF_OBJECT_TYPE *pObjectType, @@ -458,7 +458,7 @@ static void netdata_adcs_crypto_singing_time( rrddim_set_by_pointer( ac->st_adcs_request_cryptographic_signing_time_seconds, 
ac->rd_adcs_request_cryptographic_signing_time_seconds, - (collected_number)ac->ADCSRetrievalsProcessingTime.current.Data); + (collected_number)ac->ADCSRequestCryptoSigningTime.current.Data); rrdset_done(ac->st_adcs_request_cryptographic_signing_time_seconds); } @@ -656,20 +656,19 @@ static bool do_ADCS(PERF_DATA_BLOCK *pDataBlock, int update_every) return false; static void (*doADCS[])(struct adcs_certificate *, PERF_DATA_BLOCK *, PERF_OBJECT_TYPE *, int) = { - netdata_adcs_requests, - netdata_adcs_requests_processing_time, - netdata_adcs_retrievals, - netdata_adcs_failed_requets, - netdata_adcs_issued_requets, - netdata_adcs_pending_requets, - netdata_adcs_challenge_response, - netdata_adcs_retrieval_processing, - netdata_adcs_crypto_singing_time, - netdata_adcs_policy_mod_processing_time, - netdata_adcs_challenge_response_processing_time, - netdata_adcs_signed_certificate_timetamp_list, - netdata_adcs_signed_certificate_timetamp_list_processing, - netdata_adcs_retrieval_processing, + netdata_adcs_requests, + netdata_adcs_requests_processing_time, + netdata_adcs_retrievals, + netdata_adcs_failed_requets, + netdata_adcs_issued_requets, + netdata_adcs_pending_requets, + netdata_adcs_challenge_response, + netdata_adcs_retrieval_processing, + netdata_adcs_crypto_signing_time, + netdata_adcs_policy_mod_processing_time, + netdata_adcs_challenge_response_processing_time, + netdata_adcs_signed_certificate_timetamp_list, + netdata_adcs_signed_certificate_timetamp_list_processing, // This must be the end NULL}; diff --git a/src/collectors/windows.plugin/perflib-adfs.c b/src/collectors/windows.plugin/perflib-adfs.c index 1eae0c253bc968..08e026ebceedc3 100644 --- a/src/collectors/windows.plugin/perflib-adfs.c +++ b/src/collectors/windows.plugin/perflib-adfs.c @@ -239,8 +239,8 @@ struct adfs_certificate { .ADFSOauthClientCredentialsFailure.key = "OAuth Client Credentials Request Failures", .ADFSOauthClientPrivkeyJwtAuthenticationSuccess.key = "OAuth Client Private Key Jwt 
Authentications", .ADFSOauthClientPrivkeyJwtAuthenticationFailure.key = "OAuth Client Private Key Jwt Authentication Failures", - .ADFSOauthClientSecretBasicAuthenticationsSuccess.key = "OAuth Client Secret Post Authentication", - .ADFSOauthClientSecretBasicAuthenticationsFailure.key = "OAuth Client Secret Post Authentication Failures", + .ADFSOauthClientSecretBasicAuthenticationsSuccess.key = "OAuth Client Secret Basic Authentications", + .ADFSOauthClientSecretBasicAuthenticationsFailure.key = "OAuth Client Secret Basic Authentication Failures", .ADFSOauthClientSecretPostAuthenticationsSuccess.key = "OAuth Client Secret Post Authentication", .ADFSOauthClientSecretPostAuthenticationsFailure.key = "OAuth Client Secret Post Authentication Failures", .ADFSOauthClientWindowsAuthenticationsSuccess.key = "OAuth Client Windows Integrated Authentication", @@ -261,8 +261,8 @@ struct adfs_certificate { .ADFSSSOAuthenticationsSuccess.key = "SSO Authentications", .ADFSSSOAuthenticationsFailure.key = "SSO Authentication Failures", .ADFSTokenRequests.key = "Token Requests", - .ADFSUserPasswordAuthenticationsSuccess.key = "SSO Authentications", - .ADFSUserPasswordAuthenticationsFailure.key = "SSO Authentication Failures", + .ADFSUserPasswordAuthenticationsSuccess.key = "U/P Authentications", + .ADFSUserPasswordAuthenticationsFailure.key = "U/P Authentication Failures", .ADFSWindowsIntegratedAuthentications.key = "Windows Integrated Authentications", .ADFSWSFedTokenRequestsSuccess.key = "WS-Fed Token Requests", }; @@ -577,7 +577,7 @@ void netdata_adfs_federated_authentications(PERF_DATA_BLOCK *pDataBlock, PERF_OB adfs.rd_adfs_federation_authentications, (collected_number)adfs.ADFSFederationAuthentications.current.Data); - rrdset_done(adfs.st_adfs_external_authentications); + rrdset_done(adfs.st_adfs_federation_authentications); } void netdata_adfs_federation_metadata_authentications( @@ -935,11 +935,11 @@ void netdata_adfs_oauth_logon_certificate_request( if 
(!adfs.st_adfs_oauth_logon_certificate_requests) { adfs.st_adfs_oauth_logon_certificate_requests = rrdset_create_localhost( "adfs", - "oauth_client_windows_authentications", + "oauth_logon_certificate_requests", NULL, "oauth", - "adfs.oauth_client_windows_authentications", - "OAuth client windows integrated authentications", + "adfs.oauth_logon_certificate_requests", + "OAuth logon certificate requests", "requests/s", PLUGIN_WINDOWS_NAME, "PerflibADFS", @@ -980,11 +980,11 @@ void netdata_adfs_oauth_password_grant_requests( if (!adfs.st_adfs_oauth_password_grant_requests) { adfs.st_adfs_oauth_password_grant_requests = rrdset_create_localhost( "adfs", - "oauth_client_windows_authentications", + "oauth_password_grant_requests", NULL, "oauth", - "adfs.oauth_client_windows_authentications", - "OAuth client windows integrated authentications", + "adfs.oauth_password_grant_requests", + "OAuth password grant requests", "authentications/s", PLUGIN_WINDOWS_NAME, "PerflibADFS", diff --git a/src/collectors/windows.plugin/perflib-mssql.c b/src/collectors/windows.plugin/perflib-mssql.c index 03cdf67c2401ac..b50821d14baf19 100644 --- a/src/collectors/windows.plugin/perflib-mssql.c +++ b/src/collectors/windows.plugin/perflib-mssql.c @@ -313,7 +313,7 @@ void do_mssql_errors(PERF_DATA_BLOCK *pDataBlock, struct mssql_instance *mi, int } rrddim_set_by_pointer( - mi->st_sql_errors, mi->rd_sql_errors, (collected_number)mi->MSSQLAccessMethodPageSplits.current.Data); + mi->st_sql_errors, mi->rd_sql_errors, (collected_number)mi->MSSQLSQLErrorsTotal.current.Data); rrdset_done(mi->st_sql_errors); } } diff --git a/src/crates/jf/otel-plugin/src/netdata_chart.rs b/src/crates/jf/otel-plugin/src/netdata_chart.rs index 84e051e945a75a..388710091c1512 100644 --- a/src/crates/jf/otel-plugin/src/netdata_chart.rs +++ b/src/crates/jf/otel-plugin/src/netdata_chart.rs @@ -149,7 +149,7 @@ impl NetdataChart { let title = &self.metric_description; let units = &self.metric_unit; let context = 
format!("otel.{}", &self.metric_name); - let family = context.clone(); + let family = self.metric_name.replace('.', "/"); let chart_type = if self.is_histogram() { "heatmap" } else { diff --git a/src/database/engine/datafile.c b/src/database/engine/datafile.c index 218185dbd812cc..8563eae21c8878 100644 --- a/src/database/engine/datafile.c +++ b/src/database/engine/datafile.c @@ -604,15 +604,6 @@ int init_data_files(struct rrdengine_instance *ctx) else { if (ctx->loading.create_new_datafile_pair) create_new_datafile_pair(ctx); - - while(rrdeng_ctx_tier_cap_exceeded(ctx)) { - Word_t Index = 0; - Pvoid_t *PValue = JudyLFirst(ctx->datafiles.JudyL, &Index, PJE0); - if (PValue && *PValue) { - struct rrdengine_datafile *datafile = *PValue; - datafile_delete(ctx, datafile, false, true, false); - } - } } pgc_reset_hot_max(open_cache); diff --git a/src/database/engine/pagecache.c b/src/database/engine/pagecache.c index 7ea19786727577..8ddb7e9b863e68 100644 --- a/src/database/engine/pagecache.c +++ b/src/database/engine/pagecache.c @@ -575,7 +575,7 @@ static NOT_INLINE_HOT size_t get_page_list_from_journal_v2(struct rrdengine_inst break; // Make sure index is valid for this file - if (page_entry_in_journal->extent_index > extent_entries) { + if (page_entry_in_journal->extent_index >= extent_entries) { nd_log_limit_static_thread_var(erl, 60, 0); nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, "DBENGINE: Invalid extent index in journalfile %u", diff --git a/src/database/engine/rrdengine.c b/src/database/engine/rrdengine.c index df6a7464930a67..fffff53c897feb 100644 --- a/src/database/engine/rrdengine.c +++ b/src/database/engine/rrdengine.c @@ -740,7 +740,7 @@ extent_flush_to_open(struct rrdengine_instance *ctx, struct extent_io_descriptor // Main event loop callback -static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { +static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, uint64_t extent_size) 
{ bool ret = false; spinlock_lock(&datafile->writers.spinlock); @@ -748,14 +748,14 @@ static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_da #ifdef OS_WINDOWS time_t now = now_realtime_sec(); if (now - datafile->writers.last_sync_time > 60) { - nd_log_daemon(NDLP_INFO, "DBENGINE: datafile %d, last sync time: %ld, now: %ld", datafile->fileno, datafile->writers.last_sync_time, now); sync_uv_file_data(datafile->file); sync_uv_file_data(datafile->journalfile->file); datafile->writers.last_sync_time = now_realtime_sec(); } #endif - if(datafile->pos > rrdeng_target_data_file_size(ctx)) + // Check if adding this extent would exceed the target size + if(datafile->pos + extent_size > rrdeng_target_data_file_size(ctx)) ret = true; spinlock_unlock(&datafile->writers.spinlock); @@ -848,9 +848,14 @@ static void __attribute__((destructor)) destroy_mutex(void) { netdata_mutex_destroy(&mutex); } -static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_instance *ctx) { +static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_instance *ctx, uint64_t extent_size) { struct rrdengine_datafile *datafile; + // Acquire the mutex at the beginning to make the entire check-and-act atomic + // This prevents the race condition where multiple threads pass the "is full" check + // before any of them increments the position, causing files to grow beyond limits + netdata_mutex_lock(&mutex); + // get the latest datafile netdata_rwlock_rdlock(&ctx->datafiles.rwlock); @@ -861,23 +866,15 @@ static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_ spinlock_unlock(&datafile->writers.spinlock); netdata_rwlock_rdunlock(&ctx->datafiles.rwlock); - if(datafile_is_full(ctx, datafile)) { + if(datafile_is_full(ctx, datafile, extent_size)) { // remember the datafile we have become writers to struct rrdengine_datafile *old_datafile = datafile; - // only 1 datafile creation at a time - - netdata_mutex_lock(&mutex); - 
- // take the latest datafile again - without this, multiple threads may create multiple files - datafile = get_last_ctx_datafile(ctx, false); - - if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx) == 0) + // Create a new datafile - since we hold the mutex, no other thread can interfere + if(create_new_datafile_pair(ctx) == 0) __atomic_store_n(&ctx->atomic.needs_indexing, true, __ATOMIC_RELAXED); - netdata_mutex_unlock(&mutex); - - // get the new latest datafile again, like above + // get the new datafile netdata_rwlock_rdlock(&ctx->datafiles.rwlock); datafile = get_last_ctx_datafile(ctx, true); // become a writer on this datafile, to prevent it from vanishing @@ -898,6 +895,8 @@ static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_ spinlock_unlock(&old_datafile->writers.spinlock); } + netdata_mutex_unlock(&mutex); + return datafile; } @@ -1008,7 +1007,8 @@ datafile_extent_build(struct rrdengine_instance *ctx, struct page_descr_with_dat real_io_size = ALIGN_BYTES_CEILING(size_bytes); - datafile = get_datafile_to_write_extent(ctx); + // Pass the extent size so the check can determine if this extent will fit + datafile = get_datafile_to_write_extent(ctx, real_io_size); spinlock_lock(&datafile->writers.spinlock); xt_io_descr->datafile = datafile; xt_io_descr->pos = datafile->pos; diff --git a/src/database/rrdset-index-id.c b/src/database/rrdset-index-id.c index 632905b4e91c6c..a4cf694174ef0b 100644 --- a/src/database/rrdset-index-id.c +++ b/src/database/rrdset-index-id.c @@ -103,6 +103,9 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v rw_spinlock_init(&st->alerts.spinlock); + // Initialize replication stuck detection counter + st->replication_empty_response_count = 0; + // initialize the db tiers { for(size_t tier = 0; tier < nd_profile.storage_tiers; tier++) { diff --git a/src/database/rrdset.h b/src/database/rrdset.h index c52d757dd610a7..00e60a84ab55d6 100644 --- a/src/database/rrdset.h 
+++ b/src/database/rrdset.h @@ -228,6 +228,9 @@ struct rrdset { } replay; #endif // NETDATA_LOG_REPLICATION_REQUESTS + // Replication stuck detection - outside debug flag for production safety + uint8_t replication_empty_response_count; // track consecutive empty responses + SPINLOCK destroy_lock; }; diff --git a/src/go/cmd/godplugin/main.go b/src/go/cmd/godplugin/main.go index 6af176a2086409..c3221e5e8d6a4e 100644 --- a/src/go/cmd/godplugin/main.go +++ b/src/go/cmd/godplugin/main.go @@ -60,7 +60,7 @@ func main() { DumpSummary: opts.DumpSummary, }) - a.Debugf("plugin: name=%s, %s", a.Name, buildinfo.Info()) + a.Infof("plugin: name=%s, %s", a.Name, buildinfo.Info()) if u, err := user.Current(); err == nil { a.Debugf("current user: name=%s, uid=%s", u.Username, u.Uid) } diff --git a/src/go/pkg/buildinfo/buildinfo.go b/src/go/pkg/buildinfo/buildinfo.go index 22737fb6f287c0..e78b9e42cf7c53 100644 --- a/src/go/pkg/buildinfo/buildinfo.go +++ b/src/go/pkg/buildinfo/buildinfo.go @@ -2,7 +2,14 @@ package buildinfo -import "fmt" +import ( + "fmt" + "path/filepath" + "runtime" + "strings" + + "github.com/netdata/netdata/go/plugins/pkg/executable" +) // The following variables are set at build time using linker flags. @@ -24,11 +31,110 @@ var NetdataBinDir = "/usr/sbin" // Info returns all build information as a single line with snake_case keys. func Info() string { return fmt.Sprintf( - "version=%s user_config_dir=%s stock_config_dir=%s plugins_dir=%s netdata_bin_dir=%s", + "version=%s go_version=%s user_config_dir=%s stock_config_dir=%s plugins_dir=%s netdata_bin_dir=%s", Version, + runtime.Version(), UserConfigDir, StockConfigDir, PluginsDir, NetdataBinDir, ) } + +func init() { + if runtime.GOOS != "windows" { + return + } + + execDir := executable.Directory + if execDir == "" || PluginsDir == "" { + return + } + + // ---------------------------------------------------------------------------- + // 1. 
Detect install prefix on Windows + // + // We assume that on Windows the *running binary* lives inside PluginsDir. + // + // Example: + // execDir = "C:/Program Files/Netdata/usr/libexec/netdata/plugins.d" + // PluginsDir = "/usr/libexec/netdata/plugins.d" + // + // By normalizing both paths to forward-slash format, we can test: + // + // strings.HasSuffix(execDir, PluginsDir) → true + // + // From that, we strip the suffix and recover the actual installation prefix: + // + // prefix = "C:/Program Files/Netdata" + // + // If execDir does *not* end with PluginsDir, we simply do nothing — this keeps + // development/testing environments safe where binaries are run outside the + // expected layout. + // ---------------------------------------------------------------------------- + normalized := filepath.ToSlash(execDir) + suffix := filepath.ToSlash(PluginsDir) + + if !strings.HasSuffix(normalized, suffix) { + return + } + + // Extract the prefix by removing the suffix. + // + // Example: + // normalized = "C:/Program Files/Netdata/usr/libexec/netdata/plugins.d" + // suffix = "/usr/libexec/netdata/plugins.d" + // + // → prefix = "C:/Program Files/Netdata" + prefix := strings.TrimSuffix(normalized, suffix) + prefix = strings.TrimSuffix(prefix, "/") + + installPrefix := filepath.FromSlash(prefix) + + // ---------------------------------------------------------------------------- + // 2. Rewrite all buildinfo paths as: + // + + // + // Example: + // NetdataBinDir build-time: "/usr/sbin" + // After rewrite: + // "C:\Program Files\Netdata\usr\sbin" + // + // Notes: + // • We must remove the *leading slash* from the build-time path, otherwise + // filepath.Join would treat it as absolute and ignore the prefix. + // + // Example of what we avoid: + // filepath.Join("C:\\Program Files\\Netdata", "/usr/sbin") → + // "\usr\sbin" (WRONG — prefix lost!) + // + // • Paths that were empty at build time should remain empty. 
+ // ---------------------------------------------------------------------------- + rebuild := func(p string) string { + if p == "" { + return "" + } + + // Convert to slash form and trim the leading '/' so it becomes relative. + // + // Example: + // p = "/usr/sbin" + // → s = "usr/sbin" + // + s := filepath.ToSlash(p) + s = strings.TrimPrefix(s, "/") + + // Now prefix + relative suffix works reliably on Windows. + // + // Example: + // installPrefix = "C:\\Program Files\\Netdata" + // s = "usr/sbin" + // → result = "C:\\Program Files\\Netdata\\usr\\sbin" + return filepath.Join(installPrefix, s) + } + + UserConfigDir = rebuild(UserConfigDir) + StockConfigDir = rebuild(StockConfigDir) + PluginsDir = rebuild(PluginsDir) + NetdataBinDir = rebuild(NetdataBinDir) +} diff --git a/src/go/pkg/pluginconfig/pluginconfig.go b/src/go/pkg/pluginconfig/pluginconfig.go index 5c2d85b023f154..9fba0f7445e9af 100644 --- a/src/go/pkg/pluginconfig/pluginconfig.go +++ b/src/go/pkg/pluginconfig/pluginconfig.go @@ -19,6 +19,8 @@ import ( "github.com/netdata/netdata/go/plugins/pkg/cli" "github.com/netdata/netdata/go/plugins/pkg/executable" "github.com/netdata/netdata/go/plugins/pkg/multipath" + + "github.com/mattn/go-isatty" ) var ( @@ -130,11 +132,11 @@ func (d *directories) initUserRoots(opts *cli.Option, env envData, execDir strin roots = append(roots, p) } - // 2) NETDATA_USER_CONFIG_DIR - if buildinfo.UserConfigDir != "" { - roots = append(roots, safePathClean(buildinfo.UserConfigDir)) - } else if dir := safePathClean(env.userDir); dir != "" { + // 2) NETDATA_USER_CONFIG_DIR (env has priority over buildinfo) + if dir := safePathClean(env.userDir); dir != "" { roots = append(roots, dir) + } else if buildinfo.UserConfigDir != "" { + roots = append(roots, safePathClean(buildinfo.UserConfigDir)) } if len(roots) != 0 { @@ -164,14 +166,15 @@ func (d *directories) initUserRoots(opts *cli.Option, env envData, execDir strin // Build step 2: initialize single "stock" root: env, common 
locations, build-relative fallback. func (d *directories) initStockRoot(env envData, execDir string) { - if buildinfo.StockConfigDir != "" { - d.stockConfigDir = safePathClean(buildinfo.StockConfigDir) - return - } + // env.stockDir has priority if stock := safePathClean(env.stockDir); stock != "" { d.stockConfigDir = stock return } + if buildinfo.StockConfigDir != "" { + d.stockConfigDir = safePathClean(buildinfo.StockConfigDir) + return + } relDir := safePathClean(filepath.Join(execDir, "..", "..", "..", "..", "usr", "lib", "netdata", "conf.d")) if isDirExists(relDir) { @@ -254,19 +257,28 @@ func (d *directories) validate() error { return nil } +var isTerm = isatty.IsTerminal(os.Stderr.Fd()) || isatty.IsTerminal(os.Stdout.Fd()) + func readEnvFromOS(execDir string) envData { e := envData{ cygwinBase: os.Getenv("NETDATA_CYGWIN_BASE_PATH"), userDir: os.Getenv("NETDATA_USER_CONFIG_DIR"), stockDir: os.Getenv("NETDATA_STOCK_CONFIG_DIR"), - varLibDir: os.Getenv("NETDATA_LIB_DIR"), watchPath: os.Getenv("NETDATA_PLUGINS_GOD_WATCH_PATH"), + varLibDir: os.Getenv("NETDATA_LIB_DIR"), logLevel: os.Getenv("NETDATA_LOG_LEVEL"), } e.userDir = handleDirOnWin(e.cygwinBase, safePathClean(e.userDir), execDir) e.stockDir = handleDirOnWin(e.cygwinBase, safePathClean(e.stockDir), execDir) e.varLibDir = handleDirOnWin(e.cygwinBase, safePathClean(e.varLibDir), execDir) e.watchPath = handleDirOnWin(e.cygwinBase, safePathClean(e.watchPath), execDir) + + if isTerm { + e.userDir = "" + e.stockDir = "" + e.watchPath = "" + } + return e } diff --git a/src/go/plugin/go.d/agent/jobmgr/dyncfg.go b/src/go/plugin/go.d/agent/jobmgr/dyncfg.go index ef8b8536406276..85b1f8db51f6ae 100644 --- a/src/go/plugin/go.d/agent/jobmgr/dyncfg.go +++ b/src/go/plugin/go.d/agent/jobmgr/dyncfg.go @@ -29,7 +29,13 @@ func (m *Manager) dyncfgConfig(fn functions.Function) { //m.Infof("QQ FN: '%s'", fn) - switch id := fn.Args[0]; true { + m.dyncfgQueuedExec(fn) +} + +func (m *Manager) dyncfgQueuedExec(fn 
functions.Function) { + id := fn.Args[0] + + switch { case strings.HasPrefix(id, m.dyncfgCollectorPrefixValue()): m.dyncfgCollectorExec(fn) case strings.HasPrefix(id, m.dyncfgVnodePrefixValue()): @@ -39,6 +45,19 @@ func (m *Manager) dyncfgConfig(fn functions.Function) { } } +func (m *Manager) dyncfgSeqExec(fn functions.Function) { + id := fn.Args[0] + + switch { + case strings.HasPrefix(id, m.dyncfgCollectorPrefixValue()): + m.dyncfgCollectorSeqExec(fn) + case strings.HasPrefix(id, m.dyncfgVnodePrefixValue()): + m.dyncfgVnodeSeqExec(fn) + default: + m.dyncfgApi.SendCodef(fn, 503, "unknown function '%s' (%s).", fn.Name, id) + } +} + func unmarshalPayload(dst any, fn functions.Function) error { if v := reflect.ValueOf(dst); v.Kind() != reflect.Ptr || v.IsNil() { return fmt.Errorf("invalid config: expected a pointer to a struct, got a %s", v.Type()) diff --git a/src/go/plugin/go.d/agent/jobmgr/manager.go b/src/go/plugin/go.d/agent/jobmgr/manager.go index cfcc7e2c62ec15..d78bd49f53275d 100644 --- a/src/go/plugin/go.d/agent/jobmgr/manager.go +++ b/src/go/plugin/go.d/agent/jobmgr/manager.go @@ -161,7 +161,7 @@ func (m *Manager) run() { case <-m.ctx.Done(): return case fn := <-m.dyncfgCh: - m.dyncfgCollectorSeqExec(fn) + m.dyncfgSeqExec(fn) } } else { select { @@ -172,14 +172,7 @@ func (m *Manager) run() { case cfg := <-m.rmCh: m.removeConfig(cfg) case fn := <-m.dyncfgCh: - switch id := fn.Args[0]; true { - case strings.HasPrefix(id, m.dyncfgCollectorPrefixValue()): - m.dyncfgCollectorSeqExec(fn) - case strings.HasPrefix(id, m.dyncfgVnodePrefixValue()): - m.dyncfgVnodeSeqExec(fn) - default: - m.dyncfgApi.SendCodef(fn, 503, "unknown function '%s' (%s).", fn.Name, id) - } + m.dyncfgSeqExec(fn) } } } @@ -266,24 +259,25 @@ func (m *Manager) runNotifyRunningJobs() { } func (m *Manager) startRunningJob(job *module.Job) { + m.stopRunningJob(job.FullName()) + m.runningJobs.lock() defer m.runningJobs.unlock() - if job, ok := m.runningJobs.lookup(job.FullName()); ok { - job.Stop() 
- } - go job.Start() m.runningJobs.add(job.FullName(), job) } func (m *Manager) stopRunningJob(name string) { m.runningJobs.lock() - defer m.runningJobs.unlock() + job, ok := m.runningJobs.lookup(name) + if ok { + m.runningJobs.remove(name) + } + m.runningJobs.unlock() - if job, ok := m.runningJobs.lookup(name); ok { + if ok { job.Stop() - m.runningJobs.remove(name) } } diff --git a/src/go/plugin/go.d/agent/module/job.go b/src/go/plugin/go.d/agent/module/job.go index 6127c6a58cbd66..e41ddd92faaa1f 100644 --- a/src/go/plugin/go.d/agent/module/job.go +++ b/src/go/plugin/go.d/agent/module/job.go @@ -175,6 +175,11 @@ type Job struct { // Dump mode support dumpMode bool dumpAnalyzer interface{} // Will be *agent.DumpAnalyzer but avoid circular dependency + + skipStateMu sync.Mutex + consecutiveSkips int + collectStartTime time.Time // when current collection started + collectStopTime time.Time // when current collection finished } // NetdataChartIDMaxLength is the chart ID max length. See RRD_ID_LENGTH_MAX in the netdata source code. 
@@ -287,7 +292,21 @@ func (j *Job) Tick(clock int) { select { case j.tick <- clock: default: - j.Debug("skip the tick due to previous run hasn't been finished") + if j.shouldCollect(clock) { + j.skipStateMu.Lock() + j.consecutiveSkips++ + consecutiveSkips := j.consecutiveSkips + startTime := j.collectStartTime + j.skipStateMu.Unlock() + + if startTime.IsZero() { + j.Infof("skipping data collection: waiting for first collection to start (interval %ds)", j.updateEvery) + } else if consecutiveSkips >= 2 { + j.Warningf("skipping data collection: previous run is still in progress for %s (skipped %d times in a row, interval %ds)", time.Since(startTime), consecutiveSkips, j.updateEvery) + } else { + j.Infof("skipping data collection: previous run is still in progress for %s (interval %ds)", time.Since(startTime), j.updateEvery) + } + } } } @@ -302,8 +321,24 @@ LOOP: case <-j.stop: break LOOP case t := <-j.tick: - if t%(j.updateEvery+j.penalty()) == 0 { + if j.shouldCollect(t) { + j.skipStateMu.Lock() + if j.consecutiveSkips > 0 { + if j.collectStopTime.IsZero() { + j.Infof("data collection resumed (skipped %d times)", j.consecutiveSkips) + } else { + j.Infof("data collection resumed after %s (skipped %d times)", j.collectStopTime.Sub(j.collectStartTime), j.consecutiveSkips) + } + j.consecutiveSkips = 0 + } + j.collectStartTime = time.Now() + j.skipStateMu.Unlock() + j.runOnce() + + j.skipStateMu.Lock() + j.collectStopTime = time.Now() + j.skipStateMu.Unlock() } } } @@ -319,6 +354,10 @@ func (j *Job) Stop() { <-j.stop } +func (j *Job) shouldCollect(clock int) bool { + return clock%(j.updateEvery+j.penalty()) == 0 +} + func (j *Job) disableAutoDetection() { j.AutoDetectEvery = 0 } diff --git a/src/go/plugin/go.d/agent/vnodes/config_schema.json b/src/go/plugin/go.d/agent/vnodes/config_schema.json index 1a22a541bd68ec..f5256b5bb896df 100644 --- a/src/go/plugin/go.d/agent/vnodes/config_schema.json +++ b/src/go/plugin/go.d/agent/vnodes/config_schema.json @@ -20,7 +20,10 @@ 
"type": [ "object", "null" - ] + ], + "additionalProperties": { + "type": "string" + } } }, "required": [ diff --git a/src/go/plugin/go.d/collector/ap/collect.go b/src/go/plugin/go.d/collector/ap/collect.go index 01614921b003a4..13e283edfb3965 100644 --- a/src/go/plugin/go.d/collector/ap/collect.go +++ b/src/go/plugin/go.d/collector/ap/collect.go @@ -22,16 +22,16 @@ type iwInterface struct { } type stationStats struct { - clients int64 - rxBytes int64 - rxPackets int64 - txBytes int64 - txPackets int64 - txRetries int64 - txFailed int64 - signalAvg int64 - txBitrate float64 - rxBitrate float64 + clients *int64 + rxBytes *int64 + rxPackets *int64 + txBytes *int64 + txPackets *int64 + txRetries *int64 + txFailed *int64 + signalAvg *int64 + txBitrate *float64 + rxBitrate *float64 } func (c *Collector) collect() (map[string]int64, error) { @@ -59,10 +59,7 @@ func (c *Collector) collect() (map[string]int64, error) { return nil, fmt.Errorf("getting station statistics for %s: %v", iface, err) } - stats, err := parseIwStationStatistics(bs) - if err != nil { - return nil, fmt.Errorf("parsing station statistics for %s: %v", iface, err) - } + stats := parseIwStationStatistics(bs) key := fmt.Sprintf("%s-%s", iface.name, iface.ssid) @@ -75,18 +72,39 @@ func (c *Collector) collect() (map[string]int64, error) { px := fmt.Sprintf("ap_%s_%s_", iface.name, iface.ssid) - mx[px+"clients"] = stats.clients - mx[px+"bw_received"] = stats.rxBytes - mx[px+"bw_sent"] = stats.txBytes - mx[px+"packets_received"] = stats.rxPackets - mx[px+"packets_sent"] = stats.txPackets - mx[px+"issues_retries"] = stats.txRetries - mx[px+"issues_failures"] = stats.txFailed - mx[px+"average_signal"], mx[px+"bitrate_receive"], mx[px+"bitrate_transmit"] = 0, 0, 0 - if clients := float64(stats.clients); clients > 0 { - mx[px+"average_signal"] = int64(float64(stats.signalAvg) / clients * precision) - mx[px+"bitrate_receive"] = int64(stats.rxBitrate / clients * precision) - mx[px+"bitrate_transmit"] = 
int64(stats.txBitrate / clients * precision) + if stats.clients != nil { + mx[px+"clients"] = *stats.clients + } + if stats.rxBytes != nil { + mx[px+"bw_received"] = *stats.rxBytes + } + if stats.txBytes != nil { + mx[px+"bw_sent"] = *stats.txBytes + } + if stats.rxPackets != nil { + mx[px+"packets_received"] = *stats.rxPackets + } + if stats.txPackets != nil { + mx[px+"packets_sent"] = *stats.txPackets + } + if stats.txRetries != nil { + mx[px+"issues_retries"] = *stats.txRetries + } + if stats.txFailed != nil { + mx[px+"issues_failures"] = *stats.txFailed + } + + if stats.clients != nil && *stats.clients > 0 { + clients := float64(*stats.clients) + if stats.signalAvg != nil { + mx[px+"average_signal"] = int64(float64(*stats.signalAvg) / clients * precision) + } + if stats.rxBitrate != nil { + mx[px+"bitrate_receive"] = int64(*stats.rxBitrate / clients * precision) + } + if stats.txBitrate != nil { + mx[px+"bitrate_transmit"] = int64(*stats.txBitrate / clients * precision) + } } } @@ -146,7 +164,7 @@ func parseIwDevices(resp []byte) ([]*iwInterface, error) { return apIfaces, nil } -func parseIwStationStatistics(resp []byte) (*stationStats, error) { +func parseIwStationStatistics(resp []byte) *stationStats { var stats stationStats sc := bufio.NewScanner(bytes.NewReader(resp)) @@ -154,58 +172,49 @@ func parseIwStationStatistics(resp []byte) (*stationStats, error) { for sc.Scan() { line := strings.TrimSpace(sc.Text()) - var v float64 - var err error - switch { case strings.HasPrefix(line, "Station"): - stats.clients++ + stats.addInt64(&stats.clients, 1) case strings.HasPrefix(line, "rx bytes:"): - if v, err = get3rdValue(line); err == nil { - stats.rxBytes += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.rxBytes, int64(v)) } case strings.HasPrefix(line, "rx packets:"): - if v, err = get3rdValue(line); err == nil { - stats.rxPackets += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.rxPackets, int64(v)) 
} case strings.HasPrefix(line, "tx bytes:"): - if v, err = get3rdValue(line); err == nil { - stats.txBytes += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.txBytes, int64(v)) } case strings.HasPrefix(line, "tx packets:"): - if v, err = get3rdValue(line); err == nil { - stats.txPackets += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.txPackets, int64(v)) } case strings.HasPrefix(line, "tx retries:"): - if v, err = get3rdValue(line); err == nil { - stats.txRetries += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.txRetries, int64(v)) } case strings.HasPrefix(line, "tx failed:"): - if v, err = get3rdValue(line); err == nil { - stats.txFailed += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.txFailed, int64(v)) } case strings.HasPrefix(line, "signal avg:"): - if v, err = get3rdValue(line); err == nil { - stats.signalAvg += int64(v) + if v, err := get3rdValue(line); err == nil { + stats.addInt64(&stats.signalAvg, int64(v)) } case strings.HasPrefix(line, "tx bitrate:"): - if v, err = get3rdValue(line); err == nil { - stats.txBitrate += v + if v, err := get3rdValue(line); err == nil { + stats.addFloat64(&stats.txBitrate, v) } case strings.HasPrefix(line, "rx bitrate:"): - if v, err = get3rdValue(line); err == nil { - stats.rxBitrate += v + if v, err := get3rdValue(line); err == nil { + stats.addFloat64(&stats.rxBitrate, v) } - default: - continue - } - - if err != nil { - return nil, fmt.Errorf("parsing line '%s': %v", line, err) } } - return &stats, nil + return &stats } func get3rdValue(line string) (float64, error) { @@ -221,3 +230,17 @@ func get3rdValue(line string) (float64, error) { } return strconv.ParseFloat(v, 64) } + +func (s *stationStats) addInt64(dst **int64, v int64) { + if *dst == nil { + *dst = new(int64) + } + **dst += v +} + +func (s *stationStats) addFloat64(dst **float64, v float64) { + if *dst == nil { + *dst = 
new(float64) + } + **dst += v +} diff --git a/src/go/plugin/go.d/collector/ap/collector_test.go b/src/go/plugin/go.d/collector/ap/collector_test.go index 651da39465c6a4..d81a0417833eeb 100644 --- a/src/go/plugin/go.d/collector/ap/collector_test.go +++ b/src/go/plugin/go.d/collector/ap/collector_test.go @@ -166,26 +166,26 @@ func TestCollector_Collect(t *testing.T) { prepareMock: prepareMockOk, wantCharts: len(apChartsTmpl) * 2, wantMetrics: map[string]int64{ - "ap_wlp1s0_testing_average_signal": -34000, - "ap_wlp1s0_testing_bitrate_receive": 65500, - "ap_wlp1s0_testing_bitrate_transmit": 65000, - "ap_wlp1s0_testing_bw_received": 95117, - "ap_wlp1s0_testing_bw_sent": 8270, - "ap_wlp1s0_testing_clients": 2, + "ap_wlp1s0_testing_average_signal": -50666, + "ap_wlp1s0_testing_bitrate_receive": 49400, + "ap_wlp1s0_testing_bitrate_transmit": 43333, + "ap_wlp1s0_testing_bw_received": 101822, + "ap_wlp1s0_testing_bw_sent": 9284, + "ap_wlp1s0_testing_clients": 3, "ap_wlp1s0_testing_issues_failures": 1, "ap_wlp1s0_testing_issues_retries": 1, - "ap_wlp1s0_testing_packets_received": 2531, - "ap_wlp1s0_testing_packets_sent": 38, - "ap_wlp1s1_testing_average_signal": -34000, - "ap_wlp1s1_testing_bitrate_receive": 65500, - "ap_wlp1s1_testing_bitrate_transmit": 65000, - "ap_wlp1s1_testing_bw_received": 95117, - "ap_wlp1s1_testing_bw_sent": 8270, - "ap_wlp1s1_testing_clients": 2, + "ap_wlp1s0_testing_packets_received": 2670, + "ap_wlp1s0_testing_packets_sent": 51, + "ap_wlp1s1_testing_average_signal": -50666, + "ap_wlp1s1_testing_bitrate_receive": 49400, + "ap_wlp1s1_testing_bitrate_transmit": 43333, + "ap_wlp1s1_testing_bw_received": 101822, + "ap_wlp1s1_testing_bw_sent": 9284, + "ap_wlp1s1_testing_clients": 3, "ap_wlp1s1_testing_issues_failures": 1, "ap_wlp1s1_testing_issues_retries": 1, - "ap_wlp1s1_testing_packets_received": 2531, - "ap_wlp1s1_testing_packets_sent": 38, + "ap_wlp1s1_testing_packets_received": 2670, + "ap_wlp1s1_testing_packets_sent": 51, }, }, "no ap devices": { 
diff --git a/src/go/plugin/go.d/collector/ap/testdata/station_dump.txt b/src/go/plugin/go.d/collector/ap/testdata/station_dump.txt index 683a6818df3408..c86a67d519bd2a 100644 --- a/src/go/plugin/go.d/collector/ap/testdata/station_dump.txt +++ b/src/go/plugin/go.d/collector/ap/testdata/station_dump.txt @@ -56,3 +56,32 @@ Station fa:50:db:c1:1c:18 (on wlp1s0) associated at [boottime]: 13440.167s associated at: 1720706069520 ms current time: 1720706075344 ms +Station a7:2f:9e:d3:f0:34 (on wlp1s0) + inactive time: 42324 ms + rx bytes: 6705 + rx packets: 139 + tx bytes: 1014 + tx packets: 13 + tx retries: 0 + tx failed: 0 + rx drop misc: 0 + signal: -87 [-87] dBm + signal avg: -84 [-84] dBm + tx bitrate: (unknown) + tx duration: 0 us + rx bitrate: 17.2 MBit/s HE-MCS 1 HE-NSS 1 HE-GI 0 HE-DCM 0 + rx duration: 0 us + authorized: yes + authenticated: yes + associated: yes + preamble: long + WMM/WME: yes + MFP: no + TDLS peer: no + DTIM period: 2 + beacon interval:100 + short slot time:yes + connected time: 2215 seconds + associated at [boottime]: 3810.003s + associated at: ms + current time: ms diff --git a/src/go/plugin/go.d/collector/proxysql/collect.go b/src/go/plugin/go.d/collector/proxysql/collect.go index 11cb150d3f84b1..f4089434dd50f3 100644 --- a/src/go/plugin/go.d/collector/proxysql/collect.go +++ b/src/go/plugin/go.d/collector/proxysql/collect.go @@ -163,10 +163,9 @@ func (c *Collector) collectStatsMySQLConnectionPool(mx map[string]int64) error { c.cache.getBackend(hg, host, port).updated = true px = "backend_" + backendID(hg, host, port) + "_" case "status": - mx[px+"status_ONLINE"] = metrix.Bool(value == "1") - mx[px+"status_SHUNNED"] = metrix.Bool(value == "2") - mx[px+"status_OFFLINE_SOFT"] = metrix.Bool(value == "3") - mx[px+"status_OFFLINE_HARD"] = metrix.Bool(value == "4") + for _, st := range []string{"ONLINE", "SHUNNED", "OFFLINE_SOFT", "OFFLINE_HARD"} { + mx[px+"status_"+st] = metrix.Bool(value == st) + } default: mx[px+column] = parseInt(value) } diff 
--git a/src/go/plugin/go.d/collector/proxysql/collector_test.go b/src/go/plugin/go.d/collector/proxysql/collector_test.go index edbf7a5a39335e..ee27a7121cc122 100644 --- a/src/go/plugin/go.d/collector/proxysql/collector_test.go +++ b/src/go/plugin/go.d/collector/proxysql/collector_test.go @@ -308,7 +308,7 @@ func TestCollector_Collect(t *testing.T) { "backend_10_back001-db-master_6001_Queries": 8970367, "backend_10_back001-db-master_6001_status_OFFLINE_HARD": 0, "backend_10_back001-db-master_6001_status_OFFLINE_SOFT": 0, - "backend_10_back001-db-master_6001_status_ONLINE": 0, + "backend_10_back001-db-master_6001_status_ONLINE": 1, "backend_10_back001-db-master_6001_status_SHUNNED": 0, "backend_11_back001-db-master_6002_Bytes_data_recv": 2903, "backend_11_back001-db-master_6002_Bytes_data_sent": 187675, @@ -320,7 +320,7 @@ func TestCollector_Collect(t *testing.T) { "backend_11_back001-db-master_6002_Queries": 69, "backend_11_back001-db-master_6002_status_OFFLINE_HARD": 0, "backend_11_back001-db-master_6002_status_OFFLINE_SOFT": 0, - "backend_11_back001-db-master_6002_status_ONLINE": 0, + "backend_11_back001-db-master_6002_status_ONLINE": 1, "backend_11_back001-db-master_6002_status_SHUNNED": 0, "backend_11_back001-db-reader_6003_Bytes_data_recv": 4994101, "backend_11_back001-db-reader_6003_Bytes_data_sent": 163690013, @@ -332,7 +332,7 @@ func TestCollector_Collect(t *testing.T) { "backend_11_back001-db-reader_6003_Queries": 63488, "backend_11_back001-db-reader_6003_status_OFFLINE_HARD": 0, "backend_11_back001-db-reader_6003_status_OFFLINE_SOFT": 0, - "backend_11_back001-db-reader_6003_status_ONLINE": 0, + "backend_11_back001-db-reader_6003_status_ONLINE": 1, "backend_11_back001-db-reader_6003_status_SHUNNED": 0, "backend_20_back002-db-master_6004_Bytes_data_recv": 266034339, "backend_20_back002-db-master_6004_Bytes_data_sent": 1086994186, @@ -344,7 +344,7 @@ func TestCollector_Collect(t *testing.T) { "backend_20_back002-db-master_6004_Queries": 849461, 
"backend_20_back002-db-master_6004_status_OFFLINE_HARD": 0, "backend_20_back002-db-master_6004_status_OFFLINE_SOFT": 0, - "backend_20_back002-db-master_6004_status_ONLINE": 0, + "backend_20_back002-db-master_6004_status_ONLINE": 1, "backend_20_back002-db-master_6004_status_SHUNNED": 0, "backend_21_back002-db-reader_6005_Bytes_data_recv": 984, "backend_21_back002-db-reader_6005_Bytes_data_sent": 6992, @@ -354,7 +354,7 @@ func TestCollector_Collect(t *testing.T) { "backend_21_back002-db-reader_6005_ConnUsed": 0, "backend_21_back002-db-reader_6005_Latency_us": 230, "backend_21_back002-db-reader_6005_Queries": 8, - "backend_21_back002-db-reader_6005_status_OFFLINE_HARD": 0, + "backend_21_back002-db-reader_6005_status_OFFLINE_HARD": 1, "backend_21_back002-db-reader_6005_status_OFFLINE_SOFT": 0, "backend_21_back002-db-reader_6005_status_ONLINE": 0, "backend_21_back002-db-reader_6005_status_SHUNNED": 0, @@ -367,7 +367,7 @@ func TestCollector_Collect(t *testing.T) { "backend_31_back003-db-master_6006_Latency_us": 231, "backend_31_back003-db-master_6006_Queries": 3276, "backend_31_back003-db-master_6006_status_OFFLINE_HARD": 0, - "backend_31_back003-db-master_6006_status_OFFLINE_SOFT": 0, + "backend_31_back003-db-master_6006_status_OFFLINE_SOFT": 1, "backend_31_back003-db-master_6006_status_ONLINE": 0, "backend_31_back003-db-master_6006_status_SHUNNED": 0, "backend_31_back003-db-reader_6007_Bytes_data_recv": 115810708275, @@ -381,7 +381,7 @@ func TestCollector_Collect(t *testing.T) { "backend_31_back003-db-reader_6007_status_OFFLINE_HARD": 0, "backend_31_back003-db-reader_6007_status_OFFLINE_SOFT": 0, "backend_31_back003-db-reader_6007_status_ONLINE": 0, - "backend_31_back003-db-reader_6007_status_SHUNNED": 0, + "backend_31_back003-db-reader_6007_status_SHUNNED": 1, "backend_lagging_during_query": 8880, "backend_offline_during_query": 8, "generated_error_packets": 231, diff --git a/src/go/plugin/go.d/collector/proxysql/testdata/v2.0.10/stats_mysql_connection_pool .txt 
b/src/go/plugin/go.d/collector/proxysql/testdata/v2.0.10/stats_mysql_connection_pool .txt index 80b53e1af156c1..f37bc068bf3b04 100644 --- a/src/go/plugin/go.d/collector/proxysql/testdata/v2.0.10/stats_mysql_connection_pool .txt +++ b/src/go/plugin/go.d/collector/proxysql/testdata/v2.0.10/stats_mysql_connection_pool .txt @@ -1,11 +1,11 @@ -+----+-------------------+--------+--------+----------+----------+--------+---------+---------+-----------------+-----------------+------------+ -| hostgroup | srv_host | srv_port | status | ConnUsed | ConnFree | ConnOK | ConnERR | Queries | Bytes_data_sent | Bytes_data_recv | Latency_us | -+-----------+-------------------+----------+--------+----------+----------+--------+---------+---------+-----------------+-----------------+------------+ -| 10 | back001-db-master | 6001 | ONLINE | 69 | 423 | 524 | 0 | 8970367 | 9858463664 | 145193069937 | 17684 | -| 11 | back001-db-master | 6002 | ONLINE | 0 | 1 | 1 | 0 | 69 | 187675 | 2903 | 17684 | -| 11 | back001-db-reader | 6003 | ONLINE | 0 | 11 | 11 | 0 | 63488 | 163690013 | 4994101 | 113 | -| 20 | back002-db-master | 6004 | ONLINE | 9 | 188 | 197 | 2 | 849461 | 1086994186 | 266034339 | 101981 | -| 21 | back002-db-reader | 6005 | ONLINE | 0 | 1 | 1 | 0 | 8 | 6992 | 984 | 230 | -| 31 | back003-db-master | 6006 | ONLINE | 0 | 3 | 3 | 0 | 3276 | 712803 | 81438709 | 231 | -| 31 | back003-db-reader | 6007 | ONLINE | 1 | 70 | 71 | 0 | 2356904 | 411900849 | 115810708275 | 230 | -+-----------+-------------------+--------+--------+----------+----------+--------+---------+---------+-----------------+-----------------+--------------+ \ No newline at end of file ++-----------+---------------------+----------+--------------+----------+----------+--------+---------+----------+------------------+------------------+------------+ +| hostgroup | srv_host | srv_port | status | ConnUsed | ConnFree | ConnOK | ConnERR | Queries | Bytes_data_sent | Bytes_data_recv | Latency_us | 
++-----------+---------------------+----------+--------------+----------+----------+--------+---------+----------+------------------+------------------+------------+ +| 10 | back001-db-master | 6001 | ONLINE | 69 | 423 | 524 | 0 | 8970367 | 9858463664 | 145193069937 | 17684 | +| 11 | back001-db-master | 6002 | ONLINE | 0 | 1 | 1 | 0 | 69 | 187675 | 2903 | 17684 | +| 11 | back001-db-reader | 6003 | ONLINE | 0 | 11 | 11 | 0 | 63488 | 163690013 | 4994101 | 113 | +| 20 | back002-db-master | 6004 | ONLINE | 9 | 188 | 197 | 2 | 849461 | 1086994186 | 266034339 | 101981 | +| 21 | back002-db-reader | 6005 | OFFLINE_HARD | 0 | 1 | 1 | 0 | 8 | 6992 | 984 | 230 | +| 31 | back003-db-master | 6006 | OFFLINE_SOFT | 0 | 3 | 3 | 0 | 3276 | 712803 | 81438709 | 231 | +| 31 | back003-db-reader | 6007 | SHUNNED | 1 | 70 | 71 | 0 | 2356904 | 411900849 | 115810708275 | 230 | ++-----------+---------------------+----------+--------------+----------+----------+--------+---------+----------+------------------+------------------+------------+ \ No newline at end of file diff --git a/src/go/plugin/go.d/collector/rabbitmq/collect.go b/src/go/plugin/go.d/collector/rabbitmq/collect.go index 19917de45d6429..9b53b9492e2376 100644 --- a/src/go/plugin/go.d/collector/rabbitmq/collect.go +++ b/src/go/plugin/go.d/collector/rabbitmq/collect.go @@ -83,10 +83,6 @@ func (c *Collector) getClusterMeta() (id string, name string, err error) { return "", "", err } - if resp.RabbitmqVersion == "" { - return "", "", fmt.Errorf("unexpected response: rabbitmq version is empty") - } - id = "unknown" name = "unset" diff --git a/src/go/plugin/go.d/collector/rabbitmq/restapi.go b/src/go/plugin/go.d/collector/rabbitmq/restapi.go index b54419a1f116a7..5dfd135d162684 100644 --- a/src/go/plugin/go.d/collector/rabbitmq/restapi.go +++ b/src/go/plugin/go.d/collector/rabbitmq/restapi.go @@ -40,8 +40,7 @@ func (a *apiWhoamiTags) UnmarshalJSON(data []byte) error { } type apiDefinitionsResp struct { - RabbitmqVersion string 
`json:"rabbitmq_version"` - GlobalParams []struct { + GlobalParams []struct { Name string `json:"name"` Value any `json:"value"` } `json:"global_parameters"` diff --git a/src/go/plugin/go.d/collector/redis/collector.go b/src/go/plugin/go.d/collector/redis/collector.go index 293512b1455ad3..f1ecf40276a6ed 100644 --- a/src/go/plugin/go.d/collector/redis/collector.go +++ b/src/go/plugin/go.d/collector/redis/collector.go @@ -22,7 +22,13 @@ import ( //go:embed "config_schema.json" var configSchema string +type noopLogger struct{} + +func (noopLogger) Printf(context.Context, string, ...any) {} + func init() { + redis.SetLogger(noopLogger{}) + module.Register("redis", module.Creator{ JobConfigSchema: configSchema, Create: func() module.Module { return New() }, diff --git a/src/go/plugin/go.d/collector/snmp/charts.go b/src/go/plugin/go.d/collector/snmp/charts.go index 967433a3a680eb..63d98a0b432d2c 100644 --- a/src/go/plugin/go.d/collector/snmp/charts.go +++ b/src/go/plugin/go.d/collector/snmp/charts.go @@ -16,6 +16,12 @@ const ( prioProfileChart = module.Priority + iota prioPingRtt prioPingStdDev + + prioInternalStatsTimings + prioInternalStatsSnmpOps + prioInternalStatsMetrics + prioInternalStatsTableCache + prioInternalStatsErrors ) var ( @@ -66,6 +72,115 @@ func (c *Collector) addPingCharts() { } } +var ( + profileStatsChartsTmpl = module.Charts{ + profileStatsTimingsChartTmpl.Copy(), + profileStatsSnmpChartTmpl.Copy(), + profileStatsMetricsChartTmpl.Copy(), + profileStatsTableCacheChartTmpl.Copy(), + profileStatsErrorsChartTmpl.Copy(), + } + + profileStatsTimingsChartTmpl = module.Chart{ + ID: "snmp_device_prof_%s_stats_timings", + Title: "SNMP profile collection timings", + Units: "milliseconds", + Fam: "Internal/Stats", + Ctx: "snmp.device_prof_stats_timings", + Priority: prioInternalStatsTimings, + Dims: module.Dims{ + {ID: "snmp_device_prof_%s_stats_timings_scalar", Name: "scalar"}, + {ID: "snmp_device_prof_%s_stats_timings_table", Name: "table"}, + {ID: 
"snmp_device_prof_%s_stats_timings_virtual", Name: "virtual"}, + }, + } + + profileStatsSnmpChartTmpl = module.Chart{ + ID: "snmp_device_prof_%s_stats_snmp", + Title: "SNMP profile operations", + Units: "operations", + Fam: "Internal/Stats", + Ctx: "snmp.device_prof_stats_snmp", + Priority: prioInternalStatsSnmpOps, + Dims: module.Dims{ + {ID: "snmp_device_prof_%s_stats_snmp_get_requests", Name: "get_requests"}, + {ID: "snmp_device_prof_%s_stats_snmp_get_oids", Name: "get_oids"}, + {ID: "snmp_device_prof_%s_stats_snmp_walk_requests", Name: "walk_requests"}, + {ID: "snmp_device_prof_%s_stats_snmp_walk_pdus", Name: "walk_pdus"}, + {ID: "snmp_device_prof_%s_stats_snmp_tables_walked", Name: "tables_walked"}, + {ID: "snmp_device_prof_%s_stats_snmp_tables_cached", Name: "tables_cached"}, + }, + } + + profileStatsMetricsChartTmpl = module.Chart{ + ID: "snmp_device_prof_%s_stats_metrics", + Title: "SNMP profile metric counts", + Units: "metrics", + Fam: "Internal/Stats", + Ctx: "snmp.device_prof_stats_metrics", + Priority: prioInternalStatsMetrics, + Dims: module.Dims{ + {ID: "snmp_device_prof_%s_stats_metrics_scalar", Name: "scalar"}, + {ID: "snmp_device_prof_%s_stats_metrics_table", Name: "table"}, + {ID: "snmp_device_prof_%s_stats_metrics_virtual", Name: "virtual"}, + {ID: "snmp_device_prof_%s_stats_metrics_tables", Name: "tables"}, + {ID: "snmp_device_prof_%s_stats_metrics_rows", Name: "rows"}, + }, + } + + profileStatsTableCacheChartTmpl = module.Chart{ + ID: "snmp_device_prof_%s_stats_table_cache", + Title: "SNMP profile table cache", + Units: "tables", + Fam: "Internal/Stats", + Ctx: "snmp.device_prof_stats_table_cache", + Priority: prioInternalStatsTableCache, + Dims: module.Dims{ + {ID: "snmp_device_prof_%s_stats_table_cache_hits", Name: "hits"}, + {ID: "snmp_device_prof_%s_stats_table_cache_misses", Name: "misses"}, + }, + } + + profileStatsErrorsChartTmpl = module.Chart{ + ID: "snmp_device_prof_%s_stats_errors", + Title: "SNMP profile errors", + Units: "errors", 
+ Fam: "Internal/Stats", + Ctx: "snmp.device_prof_stats_errors", + Priority: prioInternalStatsErrors, + Dims: module.Dims{ + {ID: "snmp_device_prof_%s_stats_errors_snmp", Name: "snmp"}, + {ID: "snmp_device_prof_%s_stats_errors_processing_scalar", Name: "processing_scalar"}, + {ID: "snmp_device_prof_%s_stats_errors_processing_table", Name: "processing_table"}, + }, + } +) + +func (c *Collector) addProfileStatsCharts(name string) { + if name == "" { + return + } + + charts := profileStatsChartsTmpl.Copy() + + labels := c.chartBaseLabels() + labels["profile"] = name + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, name) + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, name) + } + for k, v := range labels { + chart.Labels = append(chart.Labels, module.Label{Key: k, Value: v}) + } + } + + if err := c.Charts().Add(*charts...); err != nil { + c.Warningf("failed to add profile stats charts for %s: %v", name, err) + } +} + func (c *Collector) addProfileScalarMetricChart(m ddsnmp.Metric) { if m.Name == "" { return diff --git a/src/go/plugin/go.d/collector/snmp/collect_snmp.go b/src/go/plugin/go.d/collector/snmp/collect_snmp.go index dbb7af2f680c26..18962015ad3f4e 100644 --- a/src/go/plugin/go.d/collector/snmp/collect_snmp.go +++ b/src/go/plugin/go.d/collector/snmp/collect_snmp.go @@ -4,6 +4,7 @@ package snmp import ( "fmt" + "path/filepath" "sort" "strings" @@ -22,6 +23,7 @@ func (c *Collector) collectSNMP(mx map[string]int64) error { c.collectProfileScalarMetrics(mx, pms) c.collectProfileTableMetrics(mx, pms) + c.collectProfileStats(mx, pms) return nil } @@ -92,6 +94,38 @@ func (c *Collector) collectProfileTableMetrics(mx map[string]int64, pms []*ddsnm } } +func (c *Collector) collectProfileStats(mx map[string]int64, pms []*ddsnmp.ProfileMetrics) { + for _, pm := range pms { + name := stripFileNameExt(pm.Source) + + if !c.seenProfiles[name] { + c.seenProfiles[name] = true + c.addProfileStatsCharts(name) + } + + px := 
fmt.Sprintf("snmp_device_prof_%s_stats_", name) + mx[px+"timings_scalar"] = pm.Stats.Timing.Scalar.Milliseconds() + mx[px+"timings_table"] = pm.Stats.Timing.Table.Milliseconds() + mx[px+"timings_virtual"] = pm.Stats.Timing.VirtualMetrics.Milliseconds() + mx[px+"snmp_get_requests"] = pm.Stats.SNMP.GetRequests + mx[px+"snmp_get_oids"] = pm.Stats.SNMP.GetOIDs + mx[px+"snmp_walk_pdus"] = pm.Stats.SNMP.WalkPDUs + mx[px+"snmp_walk_requests"] = pm.Stats.SNMP.WalkRequests + mx[px+"snmp_tables_walked"] = pm.Stats.SNMP.TablesWalked + mx[px+"snmp_tables_cached"] = pm.Stats.SNMP.TablesCached + mx[px+"metrics_scalar"] = pm.Stats.Metrics.Scalar + mx[px+"metrics_table"] = pm.Stats.Metrics.Table + mx[px+"metrics_virtual"] = pm.Stats.Metrics.Virtual + mx[px+"metrics_tables"] = pm.Stats.Metrics.Tables + mx[px+"metrics_rows"] = pm.Stats.Metrics.Rows + mx[px+"table_cache_hits"] = pm.Stats.TableCache.Hits + mx[px+"table_cache_misses"] = pm.Stats.TableCache.Misses + mx[px+"errors_snmp"] = pm.Stats.Errors.SNMP + mx[px+"errors_processing_scalar"] = pm.Stats.Errors.Processing.Scalar + mx[px+"errors_processing_table"] = pm.Stats.Errors.Processing.Table + } +} + func tableMetricKey(m ddsnmp.Metric) string { if m.Name == "" { return "" @@ -124,3 +158,7 @@ func tableMetricKey(m ddsnmp.Metric) string { return sb.String() } + +func stripFileNameExt(path string) string { + return strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)) +} diff --git a/src/go/plugin/go.d/collector/snmp/collector.go b/src/go/plugin/go.d/collector/snmp/collector.go index 30452995a41e14..6c78e04349e55a 100644 --- a/src/go/plugin/go.d/collector/snmp/collector.go +++ b/src/go/plugin/go.d/collector/snmp/collector.go @@ -67,6 +67,7 @@ func New() *Collector { charts: &module.Charts{}, seenScalarMetrics: make(map[string]bool), seenTableMetrics: make(map[string]bool), + seenProfiles: make(map[string]bool), newProber: ping.NewProber, newSnmpClient: gosnmp.NewHandler, @@ -86,6 +87,7 @@ type ( charts *module.Charts 
seenScalarMetrics map[string]bool seenTableMetrics map[string]bool + seenProfiles map[string]bool prober ping.Prober newProber func(ping.ProberConfig, *logger.Logger) ping.Prober diff --git a/src/go/plugin/go.d/collector/snmp/collector_test.go b/src/go/plugin/go.d/collector/snmp/collector_test.go index e5fe5e9f33f96d..b7d01a85a1d6d5 100644 --- a/src/go/plugin/go.d/collector/snmp/collector_test.go +++ b/src/go/plugin/go.d/collector/snmp/collector_test.go @@ -226,6 +226,7 @@ func TestCollector_Collect(t *testing.T) { collr.newDdSnmpColl = func(ddsnmpcollector.Config) ddCollector { return &mockDdSnmpCollector{pms: []*ddsnmp.ProfileMetrics{ { + Source: "test", Metrics: []ddsnmp.Metric{ { Name: "uptime", @@ -243,7 +244,26 @@ func TestCollector_Collect(t *testing.T) { }, want: map[string]int64{ // scalar → "snmp_device_prof_" - "snmp_device_prof_uptime": 123, + "snmp_device_prof_test_stats_errors_processing_scalar": 0, + "snmp_device_prof_test_stats_errors_processing_table": 0, + "snmp_device_prof_test_stats_errors_snmp": 0, + "snmp_device_prof_test_stats_metrics_rows": 0, + "snmp_device_prof_test_stats_metrics_scalar": 0, + "snmp_device_prof_test_stats_metrics_table": 0, + "snmp_device_prof_test_stats_metrics_tables": 0, + "snmp_device_prof_test_stats_metrics_virtual": 0, + "snmp_device_prof_test_stats_snmp_get_oids": 0, + "snmp_device_prof_test_stats_snmp_get_requests": 0, + "snmp_device_prof_test_stats_snmp_tables_cached": 0, + "snmp_device_prof_test_stats_snmp_tables_walked": 0, + "snmp_device_prof_test_stats_snmp_walk_pdus": 0, + "snmp_device_prof_test_stats_snmp_walk_requests": 0, + "snmp_device_prof_test_stats_table_cache_hits": 0, + "snmp_device_prof_test_stats_table_cache_misses": 0, + "snmp_device_prof_test_stats_timings_scalar": 0, + "snmp_device_prof_test_stats_timings_table": 0, + "snmp_device_prof_test_stats_timings_virtual": 0, + "snmp_device_prof_uptime": 123, }, }, "collects table multivalue metric": { @@ -260,6 +280,7 @@ func TestCollector_Collect(t 
*testing.T) { collr.newDdSnmpColl = func(ddsnmpcollector.Config) ddCollector { return &mockDdSnmpCollector{pms: []*ddsnmp.ProfileMetrics{ { + Source: "test", Metrics: []ddsnmp.Metric{ { Name: "if_octets", @@ -281,8 +302,27 @@ func TestCollector_Collect(t *testing.T) { want: map[string]int64{ // table key: "snmp_device_prof___" // here tags = {"ifName":"eth0"} → key part becomes "_eth0" - "snmp_device_prof_if_octets_eth0_in": 1, - "snmp_device_prof_if_octets_eth0_out": 2, + "snmp_device_prof_test_stats_errors_processing_scalar": 0, + "snmp_device_prof_test_stats_errors_processing_table": 0, + "snmp_device_prof_test_stats_errors_snmp": 0, + "snmp_device_prof_test_stats_metrics_rows": 0, + "snmp_device_prof_test_stats_metrics_scalar": 0, + "snmp_device_prof_test_stats_metrics_table": 0, + "snmp_device_prof_test_stats_metrics_tables": 0, + "snmp_device_prof_test_stats_metrics_virtual": 0, + "snmp_device_prof_test_stats_snmp_get_oids": 0, + "snmp_device_prof_test_stats_snmp_get_requests": 0, + "snmp_device_prof_test_stats_snmp_tables_cached": 0, + "snmp_device_prof_test_stats_snmp_tables_walked": 0, + "snmp_device_prof_test_stats_snmp_walk_pdus": 0, + "snmp_device_prof_test_stats_snmp_walk_requests": 0, + "snmp_device_prof_test_stats_table_cache_hits": 0, + "snmp_device_prof_test_stats_table_cache_misses": 0, + "snmp_device_prof_test_stats_timings_scalar": 0, + "snmp_device_prof_test_stats_timings_table": 0, + "snmp_device_prof_test_stats_timings_virtual": 0, + "snmp_device_prof_if_octets_eth0_in": 1, + "snmp_device_prof_if_octets_eth0_out": 2, }, }, } diff --git a/src/go/plugin/go.d/collector/snmp/config_schema.json b/src/go/plugin/go.d/collector/snmp/config_schema.json index df3c6bd6f893ec..ca9b53af22cdbc 100644 --- a/src/go/plugin/go.d/collector/snmp/config_schema.json +++ b/src/go/plugin/go.d/collector/snmp/config_schema.json @@ -78,6 +78,7 @@ "type": "string", "enum": [ "1", + "2", "2c", "3" ], diff --git 
a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector.go index 73ac2b7f2778e0..b5f687ccb1bb27 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector.go @@ -63,10 +63,12 @@ type ( vmetricsCollector *vmetricsCollector } profileState struct { - profile *ddsnmp.Profile - initialized bool - globalTags map[string]string - deviceMetadata map[string]ddsnmp.MetaTag + profile *ddsnmp.Profile + initialized bool + cache struct { + globalTags map[string]string + deviceMetadata map[string]ddsnmp.MetaTag + } } ) @@ -74,7 +76,7 @@ func (c *Collector) CollectDeviceMetadata() (map[string]ddsnmp.MetaTag, error) { meta := make(map[string]ddsnmp.MetaTag) for _, prof := range c.profiles { - profDeviceMeta, err := c.deviceMetadataCollector.Collect(prof.profile) + profDeviceMeta, err := c.deviceMetadataCollector.collect(prof.profile) if err != nil { return nil, err } @@ -91,7 +93,8 @@ func (c *Collector) Collect() ([]*ddsnmp.ProfileMetrics, error) { var metrics []*ddsnmp.ProfileMetrics var errs []error - if expired := c.tableCache.clearExpired(); len(expired) > 0 { + expired := c.tableCache.clearExpired() + if len(expired) > 0 { c.log.Debugf("Cleared %d expired table cache entries", len(expired)) } @@ -106,13 +109,16 @@ func (c *Collector) Collect() ([]*ddsnmp.ProfileMetrics, error) { metrics = append(metrics, pm) - if vmetrics := c.vmetricsCollector.Collect(prof.profile.Definition, pm.Metrics); len(vmetrics) > 0 { + now := time.Now() + if vmetrics := c.vmetricsCollector.collect(prof.profile.Definition, pm.Metrics); len(vmetrics) > 0 { for i := range vmetrics { vmetrics[i].Profile = pm } pm.Metrics = slices.DeleteFunc(pm.Metrics, func(m ddsnmp.Metric) bool { return strings.HasPrefix(m.Name, "_") }) pm.Metrics = append(pm.Metrics, vmetrics...) 
+ pm.Stats.Metrics.Virtual += int64(len(vmetrics)) + pm.Stats.Timing.VirtualMetrics = time.Since(now) } } @@ -142,42 +148,47 @@ func (c *Collector) SetSNMPClient(snmpClient gosnmp.Handler) { } func (c *Collector) collectProfile(ps *profileState) (*ddsnmp.ProfileMetrics, error) { + pm := &ddsnmp.ProfileMetrics{ + Source: ps.profile.SourceFile, + } + if !ps.initialized { - globalTag, err := c.globalTagsCollector.Collect(ps.profile) + globalTag, err := c.globalTagsCollector.collect(ps.profile) if err != nil { return nil, fmt.Errorf("failed to collect global tags: %w", err) } + ps.cache.globalTags = globalTag - deviceMeta, err := c.deviceMetadataCollector.Collect(ps.profile) + deviceMeta, err := c.deviceMetadataCollector.collect(ps.profile) if err != nil { return nil, fmt.Errorf("failed to collect device metadata: %w", err) } + ps.cache.deviceMetadata = deviceMeta - ps.globalTags = globalTag - ps.deviceMetadata = deviceMeta ps.initialized = true } - var metrics []ddsnmp.Metric + pm.Tags = maps.Clone(ps.cache.globalTags) + pm.DeviceMetadata = maps.Clone(ps.cache.deviceMetadata) - scalarMetrics, err := c.scalarCollector.Collect(ps.profile) + now := time.Now() + scalarMetrics, err := c.scalarCollector.collect(ps.profile, &pm.Stats) if err != nil { return nil, err } - metrics = append(metrics, scalarMetrics...) + pm.Metrics = append(pm.Metrics, scalarMetrics...) + pm.Stats.Timing.Scalar = time.Since(now) + pm.Stats.Metrics.Scalar += int64(len(scalarMetrics)) - tableMetrics, err := c.tableCollector.Collect(ps.profile) + now = time.Now() + tableMetrics, err := c.tableCollector.collect(ps.profile, &pm.Stats) if err != nil { return nil, err } - metrics = append(metrics, tableMetrics...) + pm.Metrics = append(pm.Metrics, tableMetrics...) 
+ pm.Stats.Timing.Table = time.Since(now) + pm.Stats.Metrics.Table += int64(len(tableMetrics)) - pm := &ddsnmp.ProfileMetrics{ - Source: ps.profile.SourceFile, - DeviceMetadata: maps.Clone(ps.deviceMetadata), - Tags: maps.Clone(ps.globalTags), - Metrics: metrics, - } for i := range pm.Metrics { pm.Metrics[i].Profile = pm } diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta.go index 1134b7f376337b..691daab0649c05 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta.go @@ -31,7 +31,7 @@ func newDeviceMetadataCollector(snmpClient gosnmp.Handler, missingOIDs map[strin } } -func (dc *deviceMetadataCollector) Collect(prof *ddsnmp.Profile) (map[string]ddsnmp.MetaTag, error) { +func (dc *deviceMetadataCollector) collect(prof *ddsnmp.Profile) (map[string]ddsnmp.MetaTag, error) { if len(prof.Definition.Metadata) == 0 && len(prof.Definition.SysobjectIDMetadata) == 0 { return nil, nil } diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta_test.go index b33dfd9b7f6014..96d5fedc966224 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta_test.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_device_meta_test.go @@ -915,7 +915,7 @@ func TestDeviceMetadataCollector_Collect(t *testing.T) { missingOIDs := make(map[string]bool) collector := newDeviceMetadataCollector(mockHandler, missingOIDs, logger.New(), tc.sysobjectid) - result, err := collector.Collect(tc.profile) + result, err := collector.collect(tc.profile) if tc.expectedError { assert.Error(t, err) diff --git 
a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags.go index 56cc95eb7cb178..f2586ae6876a5f 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags.go @@ -32,7 +32,7 @@ func newGlobalTagsCollector(snmpClient gosnmp.Handler, missingOIDs map[string]bo } // Collect gathers all global tags from the profile -func (gc *globalTagsCollector) Collect(prof *ddsnmp.Profile) (map[string]string, error) { +func (gc *globalTagsCollector) collect(prof *ddsnmp.Profile) (map[string]string, error) { if len(prof.Definition.MetricTags) == 0 && len(prof.Definition.StaticTags) == 0 { return nil, nil } diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags_test.go index 622f085f3d11be..acb6aef101fa70 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags_test.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_global_tags_test.go @@ -389,7 +389,7 @@ func TestGlobalTagsCollector_Collect(t *testing.T) { missingOIDs := make(map[string]bool) collector := newGlobalTagsCollector(mockHandler, missingOIDs, logger.New()) - result, err := collector.Collect(tc.profile) + result, err := collector.collect(tc.profile) if tc.expectedError { assert.Error(t, err) diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar.go index cabe52902a8924..10030aab961c96 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar.go @@ -32,23 +32,25 @@ func newScalarCollector(snmpClient 
gosnmp.Handler, missingOIDs map[string]bool, } // Collect gathers all scalar metrics from the profile -func (sc *scalarCollector) Collect(prof *ddsnmp.Profile) ([]ddsnmp.Metric, error) { +func (sc *scalarCollector) collect(prof *ddsnmp.Profile, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { oids, missingOIDs := sc.identifyScalarOIDs(prof.Definition.Metrics) if len(missingOIDs) > 0 { sc.log.Debugf("scalar metrics missing OIDs: %v", missingOIDs) + stats.Errors.MissingOIDs += int64(len(missingOIDs)) } if len(oids) == 0 { return nil, nil } - pdus, err := sc.getScalarValues(oids) + pdus, err := sc.getScalarValues(oids, stats) if err != nil { + stats.Errors.SNMP++ return nil, err } - return sc.processScalarMetrics(prof.Definition.Metrics, pdus) + return sc.processScalarMetrics(prof.Definition.Metrics, pdus, stats) } // identifyScalarOIDs returns OIDs to collect and OIDs that are known to be missing @@ -77,11 +79,14 @@ func (sc *scalarCollector) identifyScalarOIDs(configs []ddprofiledefinition.Metr return oids, missingOIDs } -func (sc *scalarCollector) getScalarValues(oids []string) (map[string]gosnmp.SnmpPDU, error) { +func (sc *scalarCollector) getScalarValues(oids []string, stats *ddsnmp.CollectionStats) (map[string]gosnmp.SnmpPDU, error) { pdus := make(map[string]gosnmp.SnmpPDU) maxOids := sc.snmpClient.MaxOids() for chunk := range slices.Chunk(oids, maxOids) { + stats.SNMP.GetOIDs += int64(len(chunk)) + stats.SNMP.GetRequests++ + result, err := sc.snmpClient.Get(chunk) if err != nil { return nil, err @@ -90,6 +95,7 @@ func (sc *scalarCollector) getScalarValues(oids []string) (map[string]gosnmp.Snm for _, pdu := range result.Variables { if !isPduWithData(pdu) { sc.missingOIDs[trimOID(pdu.Name)] = true + stats.Errors.MissingOIDs++ continue } pdus[trimOID(pdu.Name)] = pdu @@ -100,7 +106,7 @@ func (sc *scalarCollector) getScalarValues(oids []string) (map[string]gosnmp.Snm } // processScalarMetrics converts PDUs into metrics -func (sc *scalarCollector) 
processScalarMetrics(configs []ddprofiledefinition.MetricsConfig, pdus map[string]gosnmp.SnmpPDU) ([]ddsnmp.Metric, error) { +func (sc *scalarCollector) processScalarMetrics(configs []ddprofiledefinition.MetricsConfig, pdus map[string]gosnmp.SnmpPDU, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { var metrics []ddsnmp.Metric var errs []error @@ -113,6 +119,7 @@ func (sc *scalarCollector) processScalarMetrics(configs []ddprofiledefinition.Me if err != nil { errs = append(errs, fmt.Errorf("metric '%s': %w", cfg.Symbol.Name, err)) sc.log.Debugf("Error processing scalar metric '%s': %v", cfg.Symbol.Name, err) + stats.Errors.Processing.Scalar++ continue } diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar_test.go index 2ee1059b8c2397..cb4e0f8c0f425f 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar_test.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_scalar_test.go @@ -709,7 +709,8 @@ func TestScalarCollector_Collect(t *testing.T) { missingOIDs := make(map[string]bool) collector := newScalarCollector(mockHandler, missingOIDs, logger.New()) - result, err := collector.Collect(tc.profile) + var stats ddsnmp.CollectionStats + result, err := collector.collect(tc.profile, &stats) if tc.expectedError { assert.Error(t, err) diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table.go index 6cb3a797c7c48f..3155a053db8831 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table.go @@ -40,13 +40,13 @@ func newTableCollector(snmpClient gosnmp.Handler, missingOIDs map[string]bool, t } // Collect gathers all table metrics from the profile -func (tc *tableCollector) Collect(prof 
*ddsnmp.Profile) ([]ddsnmp.Metric, error) { - walkResults, err := tc.walkTablesAsNeeded(prof) +func (tc *tableCollector) collect(prof *ddsnmp.Profile, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { + walkResults, err := tc.walkTablesAsNeeded(prof, stats) if err != nil { return nil, err } - return tc.processWalkResults(walkResults) + return tc.processWalkResults(walkResults, stats) } // tableWalkResult holds the walked data for a single table @@ -159,10 +159,10 @@ type cacheProcessingContext struct { } // walkTablesAsNeeded walks only tables that aren't fully cached -func (tc *tableCollector) walkTablesAsNeeded(prof *ddsnmp.Profile) ([]tableWalkResult, error) { - toWalk := tc.identifyTablesToWalk(prof) +func (tc *tableCollector) walkTablesAsNeeded(prof *ddsnmp.Profile, stats *ddsnmp.CollectionStats) ([]tableWalkResult, error) { + toWalk := tc.identifyTablesToWalk(prof, stats) - walkedData, errs := tc.walkTables(toWalk.tablesToWalk) + walkedData, errs := tc.walkTables(toWalk.tablesToWalk, stats) results := tc.buildWalkResults(walkedData, toWalk) @@ -181,7 +181,7 @@ type tablesToWalkInfo struct { } // identifyTablesToWalk determines which tables need to be walked -func (tc *tableCollector) identifyTablesToWalk(prof *ddsnmp.Profile) *tablesToWalkInfo { +func (tc *tableCollector) identifyTablesToWalk(prof *ddsnmp.Profile, stats *ddsnmp.CollectionStats) *tablesToWalkInfo { info := &tablesToWalkInfo{ tablesToWalk: make(map[string]bool), tableConfigs: make(map[string][]ddprofiledefinition.MetricsConfig), @@ -194,6 +194,7 @@ func (tc *tableCollector) identifyTablesToWalk(prof *ddsnmp.Profile) *tablesToWa tableOID := cfg.Table.OID if tc.missingOIDs[trimOID(tableOID)] { + stats.Errors.MissingOIDs++ info.missingOIDs = append(info.missingOIDs, tableOID) continue } @@ -202,6 +203,9 @@ func (tc *tableCollector) identifyTablesToWalk(prof *ddsnmp.Profile) *tablesToWa if !tc.tableCache.isConfigCached(cfg) { info.tablesToWalk[tableOID] = true + stats.TableCache.Misses++ + 
} else { + stats.TableCache.Hits++ } } @@ -216,18 +220,20 @@ func (tc *tableCollector) identifyTablesToWalk(prof *ddsnmp.Profile) *tablesToWa } // walkTables performs SNMP walks for the specified tables -func (tc *tableCollector) walkTables(tablesToWalk map[string]bool) (map[string]map[string]gosnmp.SnmpPDU, []error) { +func (tc *tableCollector) walkTables(tablesToWalk map[string]bool, stats *ddsnmp.CollectionStats) (map[string]map[string]gosnmp.SnmpPDU, []error) { walkedData := make(map[string]map[string]gosnmp.SnmpPDU) var errs []error for tableOID := range tablesToWalk { - pdus, err := tc.snmpWalk(tableOID) + pdus, err := tc.snmpWalk(tableOID, stats) if err != nil { + stats.Errors.SNMP++ errs = append(errs, fmt.Errorf("failed to walk table OID '%s': %w", tableOID, err)) continue } if len(pdus) > 0 { + stats.SNMP.TablesWalked++ walkedData[tableOID] = pdus } } @@ -263,7 +269,7 @@ func (tc *tableCollector) buildWalkResults(walkedData map[string]map[string]gosn } // processWalkResults processes all table walk results -func (tc *tableCollector) processWalkResults(walkResults []tableWalkResult) ([]ddsnmp.Metric, error) { +func (tc *tableCollector) processWalkResults(walkResults []tableWalkResult, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { // Build lookup maps walkedData := tc.buildWalkedDataMap(walkResults) tableNameToOID := tc.buildTableNameMap(walkResults) @@ -271,14 +277,19 @@ func (tc *tableCollector) processWalkResults(walkResults []tableWalkResult) ([]d var metrics []ddsnmp.Metric var errs []error + tablesSeen := make(map[string]bool) + for _, result := range walkResults { - tableMetrics, err := tc.processTableResult(result, walkedData, tableNameToOID) + tableMetrics, err := tc.processTableResult(result, walkedData, tableNameToOID, stats) if err != nil { + stats.Errors.Processing.Table++ errs = append(errs, fmt.Errorf("table '%s': %w", result.config.Table.Name, err)) continue } metrics = append(metrics, tableMetrics...) 
+ tablesSeen[result.tableOID] = true } + stats.Metrics.Tables = int64(len(tablesSeen)) if len(metrics) == 0 && len(errs) > 0 { return nil, errors.Join(errs...) @@ -310,9 +321,10 @@ func (tc *tableCollector) buildTableNameMap(walkResults []tableWalkResult) map[s } // processTableResult processes a single table result -func (tc *tableCollector) processTableResult(result tableWalkResult, walkedData map[string]map[string]gosnmp.SnmpPDU, tableNameToOID map[string]string) ([]ddsnmp.Metric, error) { +func (tc *tableCollector) processTableResult(result tableWalkResult, walkedData map[string]map[string]gosnmp.SnmpPDU, tableNameToOID map[string]string, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { // Try cache first - if metrics := tc.tryCollectFromCache(result.config); metrics != nil { + if metrics := tc.tryCollectFromCache(result.config, stats); metrics != nil { + stats.SNMP.TablesCached++ return metrics, nil } @@ -324,14 +336,16 @@ func (tc *tableCollector) processTableResult(result tableWalkResult, walkedData walkedData: walkedData, tableNameToOID: tableNameToOID, } - return tc.processTableData(ctx) + metrics, err := tc.processTableData(ctx, stats) + stats.Metrics.Rows += int64(len(ctx.rows)) + return metrics, err } return nil, nil } // tryCollectFromCache attempts to collect metrics using cached data -func (tc *tableCollector) tryCollectFromCache(cfg ddprofiledefinition.MetricsConfig) []ddsnmp.Metric { +func (tc *tableCollector) tryCollectFromCache(cfg ddprofiledefinition.MetricsConfig, stats *ddsnmp.CollectionStats) []ddsnmp.Metric { cachedOIDs, cachedTags, ok := tc.tableCache.getCachedData(cfg) if !ok { return nil @@ -347,18 +361,19 @@ func (tc *tableCollector) tryCollectFromCache(cfg ddprofiledefinition.MetricsCon tableName: cfg.Table.Name, } - metrics, err := tc.collectWithCache(ctx) - if err == nil { - tc.log.Debugf("Successfully collected table %s using cache", cfg.Table.Name) - return metrics + metrics, err := tc.collectWithCache(ctx, stats) + if err 
!= nil { + tc.log.Debugf("Cached collection failed for table %s: %v", cfg.Table.Name, err) + return nil } - tc.log.Debugf("Cached collection failed for table %s: %v", cfg.Table.Name, err) - return nil + stats.Metrics.Rows += int64(len(cachedOIDs)) + tc.log.Debugf("Successfully collected table %s using cache", cfg.Table.Name) + return metrics } // processTableData processes walked table data -func (tc *tableCollector) processTableData(ctx *tableProcessingContext) ([]ddsnmp.Metric, error) { +func (tc *tableCollector) processTableData(ctx *tableProcessingContext, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { ctx.columnOIDs = buildColumnOIDs(ctx.config) ctx.orderedTags = buildOrderedTags(ctx.config) @@ -367,7 +382,7 @@ func (tc *tableCollector) processTableData(ctx *tableProcessingContext) ([]ddsnm ctx.staticTags = parseStaticTags(ctx.config.StaticTags) - metrics, err := tc.processRows(ctx) + metrics, err := tc.processRows(ctx, stats) // Cache the processed data deps := extractTableDependencies(ctx.config, ctx.tableNameToOID) @@ -417,7 +432,7 @@ func (tc *tableCollector) organizePDUsByRow(ctx *tableProcessingContext) (rows m } // processRows processes all rows and returns metrics -func (tc *tableCollector) processRows(ctx *tableProcessingContext) ([]ddsnmp.Metric, error) { +func (tc *tableCollector) processRows(ctx *tableProcessingContext, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { var metrics []ddsnmp.Metric var errs []error @@ -444,6 +459,7 @@ func (tc *tableCollector) processRows(ctx *tableProcessingContext) ([]ddsnmp.Met } rowMetrics, err := tc.rowProcessor.processRow(row, rowCtx) if err != nil { + stats.Errors.Processing.Table++ errs = append(errs, err) continue } @@ -464,7 +480,7 @@ func (tc *tableCollector) processRows(ctx *tableProcessingContext) ([]ddsnmp.Met } // collectWithCache collects metrics using cached structure -func (tc *tableCollector) collectWithCache(ctx *cacheProcessingContext) ([]ddsnmp.Metric, error) { +func (tc 
*tableCollector) collectWithCache(ctx *cacheProcessingContext, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { // Build list of OIDs to GET var oidsToGet []string for _, columns := range ctx.cachedOIDs { @@ -480,7 +496,7 @@ func (tc *tableCollector) collectWithCache(ctx *cacheProcessingContext) ([]ddsnm } // GET current values - pdus, err := tc.snmpGet(oidsToGet) + pdus, err := tc.snmpGet(oidsToGet, stats) if err != nil { return nil, fmt.Errorf("failed to get cached OIDs: %w", err) } @@ -492,11 +508,11 @@ func (tc *tableCollector) collectWithCache(ctx *cacheProcessingContext) ([]ddsnm // Add PDUs to context and build metrics ctx.pdus = pdus - return tc.buildMetricsFromCache(ctx) + return tc.buildMetricsFromCache(ctx, stats) } // buildMetricsFromCache builds metrics from cached structure and current values -func (tc *tableCollector) buildMetricsFromCache(ctx *cacheProcessingContext) ([]ddsnmp.Metric, error) { +func (tc *tableCollector) buildMetricsFromCache(ctx *cacheProcessingContext, stats *ddsnmp.CollectionStats) ([]ddsnmp.Metric, error) { staticTags := parseStaticTags(ctx.config.StaticTags) var metrics []ddsnmp.Metric var errs []error @@ -525,12 +541,14 @@ func (tc *tableCollector) buildMetricsFromCache(ctx *cacheProcessingContext) ([] value, err := tc.valProc.processValue(sym, pdu) if err != nil { + stats.Errors.Processing.Table++ tc.log.Debugf("Error processing value for %s: %v", sym.Name, err) continue } metric, err := buildTableMetric(sym, pdu, value, rowTags, staticTags, ctx.tableName) if err != nil { + stats.Errors.Processing.Table++ errs = append(errs, err) continue } @@ -548,12 +566,14 @@ func (tc *tableCollector) buildMetricsFromCache(ctx *cacheProcessingContext) ([] // SNMP operations -func (tc *tableCollector) snmpWalk(oid string) (map[string]gosnmp.SnmpPDU, error) { +func (tc *tableCollector) snmpWalk(oid string, stats *ddsnmp.CollectionStats) (map[string]gosnmp.SnmpPDU, error) { pdus := make(map[string]gosnmp.SnmpPDU) var resp 
[]gosnmp.SnmpPDU var err error + stats.SNMP.WalkRequests++ + if tc.snmpClient.Version() == gosnmp.Version1 || tc.disableBulkWalk { resp, err = tc.snmpClient.WalkAll(oid) } else { @@ -563,9 +583,13 @@ func (tc *tableCollector) snmpWalk(oid string) (map[string]gosnmp.SnmpPDU, error return nil, err } + stats.SNMP.WalkPDUs += int64(len(resp)) + for _, pdu := range resp { if isPduWithData(pdu) { pdus[trimOID(pdu.Name)] = pdu + } else { + stats.Errors.MissingOIDs++ } } @@ -576,17 +600,22 @@ func (tc *tableCollector) snmpWalk(oid string) (map[string]gosnmp.SnmpPDU, error return pdus, nil } -func (tc *tableCollector) snmpGet(oids []string) (map[string]gosnmp.SnmpPDU, error) { +func (tc *tableCollector) snmpGet(oids []string, stats *ddsnmp.CollectionStats) (map[string]gosnmp.SnmpPDU, error) { pdus := make(map[string]gosnmp.SnmpPDU) for chunk := range slices.Chunk(oids, tc.snmpClient.MaxOids()) { + stats.SNMP.GetRequests++ + stats.SNMP.GetOIDs += int64(len(chunk)) + result, err := tc.snmpClient.Get(chunk) if err != nil { + stats.Errors.SNMP++ return nil, err } for _, pdu := range result.Variables { if !isPduWithData(pdu) { + stats.Errors.MissingOIDs++ tc.missingOIDs[trimOID(pdu.Name)] = true continue } diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table_test.go index c82bb05455a445..bc820334233346 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table_test.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_table_test.go @@ -3820,10 +3820,11 @@ func TestTableCollector_Collect(t *testing.T) { } missingOIDs := make(map[string]bool) - tableCache := newTableCache(0, 0) // Cache disabled - collector := newTableCollector(mockHandler, missingOIDs, tableCache, logger.New(), false) + tcache := newTableCache(0, 0) // Cache disabled + collector := newTableCollector(mockHandler, missingOIDs, tcache, logger.New(), 
false) - result, err := collector.Collect(tc.profile) + var stats ddsnmp.CollectionStats + result, err := collector.collect(tc.profile, &stats) if tc.expectedError { assert.Error(t, err) diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_test.go new file mode 100644 index 00000000000000..9bffe46ed2e17e --- /dev/null +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_test.go @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package ddsnmpcollector + +import ( + "testing" + + "github.com/gosnmp/gosnmp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/netdata/netdata/go/plugins/logger" + "github.com/netdata/netdata/go/plugins/plugin/go.d/collector/snmp/ddsnmp" + "github.com/netdata/netdata/go/plugins/plugin/go.d/collector/snmp/ddsnmp/ddprofiledefinition" +) + +func TestCollector_Collect_StatsSnapshot(t *testing.T) { + ctrl, mockHandler := setupMockHandler(t) + defer ctrl.Finish() + + // --- SNMP expectations --------------------------------------------------- + + // Scalar: sysUpTime.0 + expectSNMPGet(mockHandler, + []string{"1.3.6.1.2.1.1.3.0"}, + []gosnmp.SnmpPDU{ + createTimeTicksPDU("1.3.6.1.2.1.1.3.0", 123456), + }, + ) + + // Table: ifTable, we only care about ifInOctets with 2 rows + expectSNMPWalk(mockHandler, + gosnmp.Version2c, + "1.3.6.1.2.1.2.2", + []gosnmp.SnmpPDU{ + // Row 1 + createCounter32PDU("1.3.6.1.2.1.2.2.1.10.1", 1000), // ifInOctets.1 + // Row 2 + createCounter32PDU("1.3.6.1.2.1.2.2.1.10.2", 2000), // ifInOctets.2 + }, + ) + + // --- Profile definition -------------------------------------------------- + + profile := &ddsnmp.Profile{ + SourceFile: "stats-toy-profile.yaml", + Definition: &ddprofiledefinition.ProfileDefinition{ + Metrics: []ddprofiledefinition.MetricsConfig{ + // Simple scalar metric + { + Symbol: ddprofiledefinition.SymbolConfig{ + OID: 
"1.3.6.1.2.1.1.3.0", + Name: "sysUpTime", + }, + }, + // Simple table metric: ifInOctets over ifTable + { + Table: ddprofiledefinition.SymbolConfig{ + OID: "1.3.6.1.2.1.2.2", + Name: "ifTable", + }, + Symbols: []ddprofiledefinition.SymbolConfig{ + { + OID: "1.3.6.1.2.1.2.2.1.10", + Name: "ifInOctets", + }, + }, + }, + }, + // One virtual metric that sums ifInOctets across the table. + VirtualMetrics: []ddprofiledefinition.VirtualMetricConfig{ + { + Name: "ifInOctets_total", + Sources: []ddprofiledefinition.VirtualMetricSourceConfig{ + { + Metric: "ifInOctets", + Table: "ifTable", + }, + }, + }, + }, + }, + } + + handleCrossTableTagsWithoutMetrics(profile) + require.NoError(t, ddsnmp.CompileTransforms(profile)) + + collector := New(Config{ + SnmpClient: mockHandler, + Profiles: []*ddsnmp.Profile{profile}, + Log: logger.New(), + SysObjectID: "", + }) + + // --- Run collection ------------------------------------------------------ + + results, err := collector.Collect() + require.NoError(t, err) + require.Len(t, results, 1) + + pm := results[0] + + // --- Sanity check on actual metrics ------------------------------------- + + // We expect: + // - 1 scalar metric (sysUpTime) + // - 2 table metrics (ifInOctets for 2 rows) + // - 1 virtual metric (ifInOctets_total) + require.Len(t, pm.Metrics, 4, "total number of metrics") + + // --- Assert CollectionStats as a snapshot ------------------------------- + + // Ignore timing (it's inherently variable). 
+ stats := pm.Stats + stats.Timing = ddsnmp.TimingStats{} + pm.Stats = stats + + expected := ddsnmp.CollectionStats{ + SNMP: ddsnmp.SNMPOperationStats{ + // Scalar: 1 GET with 1 OID + GetRequests: 1, + GetOIDs: 1, + + // Table: 1 WALK with 2 PDUs, 1 table walked, no cached tables + WalkRequests: 1, + WalkPDUs: 2, + TablesWalked: 1, + // TablesCached should be 0 on first run + }, + Metrics: ddsnmp.MetricCountStats{ + Scalar: 1, // sysUpTime + Table: 2, // ifInOctets.1, ifInOctets.2 + Virtual: 1, // ifInOctets_total + Tables: 1, // ifTable + Rows: 2, // 2 interfaces + }, + TableCache: ddsnmp.TableCacheStats{ + Hits: 0, // first run → no cache hits + Misses: 1, // one table config had to be walked + // Expired intentionally ignored / omitted + }, + Errors: ddsnmp.ErrorStats{ + SNMP: 0, + MissingOIDs: 0, + }, + // Timing left as zero-value for comparison + Timing: ddsnmp.TimingStats{}, + } + + assert.Equal(t, expected, pm.Stats) +} diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics.go index 5e80e98dc18963..83639ac55cb545 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics.go @@ -22,7 +22,7 @@ func newVirtualMetricsCollector(log *logger.Logger) *vmetricsCollector { } } -func (p *vmetricsCollector) Collect(profDef *ddprofiledefinition.ProfileDefinition, collected []ddsnmp.Metric) []ddsnmp.Metric { +func (p *vmetricsCollector) collect(profDef *ddprofiledefinition.ProfileDefinition, collected []ddsnmp.Metric) []ddsnmp.Metric { if len(profDef.VirtualMetrics) == 0 { return nil } @@ -418,8 +418,7 @@ type aggregatorsBuilder struct { aggregators []*vmetricsAggregator } -func newAggregatorsBuilder( - log *logger.Logger, prof *ddprofiledefinition.ProfileDefinition, existingNames map[string]bool) *aggregatorsBuilder { +func 
newAggregatorsBuilder(log *logger.Logger, prof *ddprofiledefinition.ProfileDefinition, existingNames map[string]bool) *aggregatorsBuilder { return &aggregatorsBuilder{ log: log, prof: prof, diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics_test.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics_test.go index fb4040f31d0ce8..fa3d38179abc07 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics_test.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/ddsnmpcollector/collector_vmetrics_test.go @@ -1517,7 +1517,7 @@ func TestVirtualMetricsCollector_Collect(t *testing.T) { for name, tc := range tests { t.Run(name, func(t *testing.T) { vmc := newVirtualMetricsCollector(logger.New()) - result := vmc.Collect(tc.profileDef, tc.collectedMetrics) + result := vmc.collect(tc.profileDef, tc.collectedMetrics) // Sort both slices for consistent comparison assert.ElementsMatch(t, tc.expected, result) diff --git a/src/go/plugin/go.d/collector/snmp/ddsnmp/metric.go b/src/go/plugin/go.d/collector/snmp/ddsnmp/metric.go index d08d540a3504d3..cdaccf7cdc4dae 100644 --- a/src/go/plugin/go.d/collector/snmp/ddsnmp/metric.go +++ b/src/go/plugin/go.d/collector/snmp/ddsnmp/metric.go @@ -1,6 +1,8 @@ package ddsnmp import ( + "time" + "github.com/netdata/netdata/go/plugins/plugin/go.d/collector/snmp/ddsnmp/ddprofiledefinition" ) @@ -9,6 +11,7 @@ type ProfileMetrics struct { DeviceMetadata map[string]MetaTag Tags map[string]string Metrics []Metric + Stats CollectionStats } type Metric struct { @@ -32,3 +35,77 @@ type MetaTag struct { Value string IsExactMatch bool // whether this value is from an exact match context } + +// CollectionStats contains statistics for a single profile collection cycle. 
+type CollectionStats struct { + Timing TimingStats + SNMP SNMPOperationStats + Metrics MetricCountStats + TableCache TableCacheStats + Errors ErrorStats +} + +// TimingStats captures duration of each collection phase. +type TimingStats struct { + // Scalar is time spent collecting scalar (non-table) metrics. + Scalar time.Duration + // Table is time spent collecting table metrics. + Table time.Duration + // VirtualMetrics is time spent computing derived/aggregated metrics. + VirtualMetrics time.Duration +} + +func (s TimingStats) Total() time.Duration { + return s.Scalar + s.Table + s.VirtualMetrics +} + +// SNMPOperationStats captures SNMP protocol-level operations. +type SNMPOperationStats struct { + // GetRequests is the number of SNMP GET operations performed. + GetRequests int64 + // GetOIDs is the total number of OIDs requested across all GETs. + GetOIDs int64 + // WalkRequests is the number of SNMP Walk/BulkWalk operations. + WalkRequests int64 + // WalkPDUs is the total number of PDUs returned from all walks. + WalkPDUs int64 + // TablesWalked is the count of tables that required walking. + TablesWalked int64 + // TablesCached is the count of tables served from cache. + TablesCached int64 +} + +// MetricCountStats captures the number of metrics produced. +type MetricCountStats struct { + // Scalar is the count of scalar (non-table) metrics. + Scalar int64 + // Table is the count of table metrics. + Table int64 + // Virtual is the count of computed/derived metrics. + Virtual int64 + // Tables is the count of unique tables with metrics. + Tables int64 + // Rows is the total number of table rows across all tables. + Rows int64 +} + +// TableCacheStats captures table cache performance. +type TableCacheStats struct { + // Hits is the number of table configs served from cache. + Hits int64 + // Misses is the number of table configs that required walking. + Misses int64 +} + +// ErrorStats captures categorized error counts. 
+type ErrorStats struct { + // SNMP is the count of SNMP-level errors (timeouts, network issues). + SNMP int64 + // Processing is the count of value conversion/transform errors. + Processing struct { + Scalar int64 + Table int64 + } + // MissingOIDs is the count of NoSuchObject/NoSuchName responses. + MissingOIDs int64 +} diff --git a/src/go/plugin/go.d/config/go.d/sd/snmp.conf b/src/go/plugin/go.d/config/go.d/sd/snmp.conf index 031de97d41e723..928721975deeb4 100644 --- a/src/go/plugin/go.d/config/go.d/sd/snmp.conf +++ b/src/go/plugin/go.d/config/go.d/sd/snmp.conf @@ -62,7 +62,6 @@ services: - id: "snmp" match: '{{ true }}' config_template: | - update_every: 5 {{- if .SysInfo.Name }} name: {{ .SysInfo.Name }}-ip-{{ .IPAddress }} {{- else }} diff --git a/src/go/plugin/go.d/pkg/logs/csv.go b/src/go/plugin/go.d/pkg/logs/csv.go index d57aecda2c70f6..72ef9b0362d42d 100644 --- a/src/go/plugin/go.d/pkg/logs/csv.go +++ b/src/go/plugin/go.d/pkg/logs/csv.go @@ -12,13 +12,17 @@ import ( "strings" ) +type CSVCheckFieldFunc func(string) (string, int, bool) + +func (f CSVCheckFieldFunc) IsZero() bool { return true } + type ( CSVConfig struct { - FieldsPerRecord int `yaml:"fields_per_record,omitempty" json:"fields_per_record"` - Delimiter string `yaml:"delimiter,omitempty" json:"delimiter"` - TrimLeadingSpace bool `yaml:"trim_leading_space,omitempty" json:"trim_leading_space"` - Format string `yaml:"format,omitempty" json:"format"` - CheckField func(string) (string, int, bool) `yaml:"-" json:"-"` + FieldsPerRecord int `yaml:"fields_per_record,omitempty" json:"fields_per_record"` + Delimiter string `yaml:"delimiter,omitempty" json:"delimiter"` + TrimLeadingSpace bool `yaml:"trim_leading_space,omitempty" json:"trim_leading_space"` + Format string `yaml:"format,omitempty" json:"format"` + CheckField CSVCheckFieldFunc `yaml:"-" json:"-"` } CSVParser struct { diff --git a/src/go/plugin/go.d/pkg/ndexec/ndexec.go b/src/go/plugin/go.d/pkg/ndexec/ndexec.go index 
b80b789ee820da..bedcfd9fa4cb00 100644 --- a/src/go/plugin/go.d/pkg/ndexec/ndexec.go +++ b/src/go/plugin/go.d/pkg/ndexec/ndexec.go @@ -87,7 +87,7 @@ func (r *runner) run(log *logger.Logger, timeout time.Duration, helperPath, labe err = ctx.Err() } - return nil, cmdStr, fmt.Errorf("%s: %v: %w (stderr: %s)", label, ex, err, strings.TrimSpace(s)) + return out, cmdStr, fmt.Errorf("%s: %v: %w (stderr: %s)", label, ex, err, strings.TrimSpace(s)) } return out, cmdStr, nil diff --git a/src/health/schema.d/health%3Aalert%3Aprototype.json b/src/health/schema.d/health%3Aalert%3Aprototype.json index c29c7c340435ef..2a5160a5026375 100644 --- a/src/health/schema.d/health%3Aalert%3Aprototype.json +++ b/src/health/schema.d/health%3Aalert%3Aprototype.json @@ -23,8 +23,12 @@ "title": "The instance this rule should be applied to.", "description": "You can find the instance names on all charts at the instances drop down menu. Do not include the host name in this field." }, - "host_labels": { "$ref": "#/definitions/matchHostLabels" }, - "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" } + "host_labels": { + "$ref": "#/definitions/matchHostLabels" + }, + "instance_labels": { + "$ref": "#/definitions/matchInstanceLabels" + } }, "required": [ "on", @@ -43,8 +47,12 @@ "title": "The context of the instances this rule should be applied to.", "description": "The context is the code-name of each chart on the dashboard, that appears at the chart title bar, between the chart title and its unit of measurement, like: system.cpu, disk.io, etc." 
}, - "host_labels": { "$ref": "#/definitions/matchHostLabels" }, - "instance_labels": { "$ref": "#/definitions/matchInstanceLabels" } + "host_labels": { + "$ref": "#/definitions/matchHostLabels" + }, + "instance_labels": { + "$ref": "#/definitions/matchInstanceLabels" + } }, "required": [ "on", @@ -90,9 +98,21 @@ "data_source": { "type": "string", "oneOf": [ - { "const": "samples", "title": "Samples", "description": "Use the time-series values for each dimension" }, - { "const": "percentages", "title": "Percentages", "description": "Use the percentage of each dimension vs the sum of all dimensions" }, - { "const": "anomalies", "title": "Anomalies", "description": "Use the anomaly rate of each dimension" } + { + "const": "samples", + "title": "Samples", + "description": "Use the time-series values for each dimension" + }, + { + "const": "percentages", + "title": "Percentages", + "description": "Use the percentage of each dimension vs the sum of all dimensions" + }, + { + "const": "anomalies", + "title": "Anomalies", + "description": "Use the anomaly rate of each dimension" + } ], "default": "samples", "title": " ", @@ -101,20 +121,62 @@ "time_group": { "type": "string", "oneOf": [ - { "const": "average", "title": "Average" }, - { "const": "median", "title": "Median" }, - { "const": "min", "title": "Minimum" }, - { "const": "max", "title": "Maximum" }, - { "const": "sum", "title": "Sum" }, - { "const": "incremental_sum", "title": "Incremental Sum" }, - { "const": "stddev", "title": "Standard Deviation" }, - { "const": "cv", "title": "Coefficient of Variation" }, - { "const": "trimmed-mean", "title": "Trimmed Mean" }, - { "const": "trimmed-median", "title": "Trimmed Median" }, - { "const": "percentile", "title": "Percentile" }, - { "const": "ses", "title": "Simple Exponential Smoothing" }, - { "const": "des", "title": "Double Exponential Smoothing" }, - { "const": "countif", "title": "Count If" } + { + "const": "average", + "title": "Average" + }, + { + "const": 
"median", + "title": "Median" + }, + { + "const": "min", + "title": "Minimum" + }, + { + "const": "max", + "title": "Maximum" + }, + { + "const": "sum", + "title": "Sum" + }, + { + "const": "incremental_sum", + "title": "Incremental Sum" + }, + { + "const": "stddev", + "title": "Standard Deviation" + }, + { + "const": "cv", + "title": "Coefficient of Variation" + }, + { + "const": "trimmed-mean", + "title": "Trimmed Mean" + }, + { + "const": "trimmed-median", + "title": "Trimmed Median" + }, + { + "const": "percentile", + "title": "Percentile" + }, + { + "const": "ses", + "title": "Simple Exponential Smoothing" + }, + { + "const": "des", + "title": "Double Exponential Smoothing" + }, + { + "const": "countif", + "title": "Count If" + } ], "default": "average", "title": "Time Aggregation", @@ -135,11 +197,31 @@ "dims_group": { "type": "string", "oneOf": [ - { "const": "average", "title": "Average", "description": "The average of all dimensions" }, - { "const": "min", "title": "Minimum", "description": "The minimum of all dimensions" }, - { "const": "max", "title": "Maximum", "description": "The maximum of all dimensions" }, - { "const": "sum", "title": "Sum", "description": "The sum of all dimensions" }, - { "const": "min2max", "title": "Min-to-Max", "description": "The delta between the minimum of the maximum of the dimensions" } + { + "const": "average", + "title": "Average", + "description": "The average of all dimensions" + }, + { + "const": "min", + "title": "Minimum", + "description": "The minimum of all dimensions" + }, + { + "const": "max", + "title": "Maximum", + "description": "The maximum of all dimensions" + }, + { + "const": "sum", + "title": "Sum", + "description": "The sum of all dimensions" + }, + { + "const": "min2max", + "title": "Min-to-Max", + "description": "The delta between the minimum of the maximum of the dimensions" + } ], "default": "sum", "title": "Dims Aggregation", @@ -158,14 +240,31 @@ "uniqueItems": true, "items": { "oneOf": [ - { 
"const": "unaligned", "title": "Do not shift the time-frame for visual presentation" }, - { "const": "absolute", "title": "Make all values positive before using them" }, - { "const": "null2zero", "title": "Treat gaps in the time-series as a zero value" }, - { "const": "match_ids", "title": "Match only dimension IDs, not Names" }, - { "const": "match_names", "title": "Match only dimension Names, not IDs" } + { + "const": "unaligned", + "title": "Do not shift the time-frame for visual presentation" + }, + { + "const": "absolute", + "title": "Make all values positive before using them" + }, + { + "const": "null2zero", + "title": "Treat gaps in the time-series as a zero value" + }, + { + "const": "match_ids", + "title": "Match only dimension IDs, not Names" + }, + { + "const": "match_names", + "title": "Match only dimension Names, not IDs" + } ] }, - "default": [ "unaligned" ] + "default": [ + "unaligned" + ] } }, "allOf": [ @@ -173,60 +272,72 @@ "if": { "properties": { "time_group": { - "enum": ["trimmed-mean"] + "enum": [ + "trimmed-mean" + ] } } }, "then": { "properties": { "time_group_value": { - "type": "integer", + "type": "number", "default": 1, "title": "Trim %", "description": "" } }, - "required": ["time_group_value"] + "required": [ + "time_group_value" + ] } }, { "if": { "properties": { "time_group": { - "enum": ["trimmed-median"] + "enum": [ + "trimmed-median" + ] } } }, "then": { "properties": { "time_group_value": { - "type": "integer", + "type": "number", "default": 1, "title": "Trim %", "description": "" } }, - "required": ["time_group_value"] + "required": [ + "time_group_value" + ] } }, { "if": { "properties": { "time_group": { - "enum": ["percentile"] + "enum": [ + "percentile" + ] } } }, "then": { "properties": { "time_group_value": { - "type": "integer", + "type": "number", "default": 95, "title": "Percentage", "description": "" } }, - "required": ["time_group_value"] + "required": [ + "time_group_value" + ] } }, { @@ -242,12 +353,30 @@ 
"time_group_condition": { "type": "string", "oneOf": [ - { "const": "!=", "title": "!=" }, - { "const": "=", "title": "==" }, - { "const": ">=", "title": ">=" }, - { "const": ">", "title": ">" }, - { "const": "<=", "title": "<=" }, - { "const": "<", "title": "<" } + { + "const": "!=", + "title": "!=" + }, + { + "const": "=", + "title": "==" + }, + { + "const": ">=", + "title": ">=" + }, + { + "const": ">", + "title": ">" + }, + { + "const": "<=", + "title": "<=" + }, + { + "const": "<", + "title": "<" + } ], "default": "equal", "title": "Condition", @@ -260,7 +389,10 @@ "description": "" } }, - "required": ["time_group_condition", "time_group_value"] + "required": [ + "time_group_condition", + "time_group_value" + ] } } ] @@ -270,6 +402,12 @@ "title": "Calculation", "description": "An expression to transform the value" }, + "units_placeholder": { + "type": "string", + "default": "-", + "title": "Source unit", + "description": "before scaling in UI" + }, "units": { "type": "string", "title": "Unit", @@ -319,7 +457,10 @@ "uniqueItems": true, "items": { "oneOf": [ - { "const": "no-clear-notification", "title": "Do not perform any action when the alert is cleared"} + { + "const": "no-clear-notification", + "title": "Do not perform any action when the alert is cleared" + } ] }, "default": [] @@ -372,15 +513,33 @@ "title": "Alert Configuration", "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.", "properties": { - "match": { "$ref": "#/definitions/matchInstance" }, - "summary": { "$ref": "#/definitions/configSummary" }, - "info": { "$ref": "#/definitions/configInfo" }, - "type": { "$ref": "#/definitions/configType" }, - "component": { "$ref": "#/definitions/configComponent" }, - "classification": { "$ref": "#/definitions/configClassification" }, - "value": { "$ref": "#/definitions/configValue" }, - "conditions": { "$ref": 
"#/definitions/configConditions" }, - "action": { "$ref": "#/definitions/configAction" } + "match": { + "$ref": "#/definitions/matchInstance" + }, + "summary": { + "$ref": "#/definitions/configSummary" + }, + "info": { + "$ref": "#/definitions/configInfo" + }, + "type": { + "$ref": "#/definitions/configType" + }, + "component": { + "$ref": "#/definitions/configComponent" + }, + "classification": { + "$ref": "#/definitions/configClassification" + }, + "value": { + "$ref": "#/definitions/configValue" + }, + "conditions": { + "$ref": "#/definitions/configConditions" + }, + "action": { + "$ref": "#/definitions/configAction" + } }, "required": [] }, @@ -389,15 +548,33 @@ "title": "Alert Configuration", "description": "The properties that control the value the alert will get, the conditions it will trigger, the back-off for notifications, the auto-repeating of notifications, etc.", "properties": { - "match": { "$ref": "#/definitions/matchTemplate" }, - "summary": { "$ref": "#/definitions/configSummary" }, - "info": { "$ref": "#/definitions/configInfo" }, - "type": { "$ref": "#/definitions/configType" }, - "component": { "$ref": "#/definitions/configComponent" }, - "classification": { "$ref": "#/definitions/configClassification" }, - "value": { "$ref": "#/definitions/configValue" }, - "conditions": { "$ref": "#/definitions/configConditions" }, - "action": { "$ref": "#/definitions/configAction" } + "match": { + "$ref": "#/definitions/matchTemplate" + }, + "summary": { + "$ref": "#/definitions/configSummary" + }, + "info": { + "$ref": "#/definitions/configInfo" + }, + "type": { + "$ref": "#/definitions/configType" + }, + "component": { + "$ref": "#/definitions/configComponent" + }, + "classification": { + "$ref": "#/definitions/configClassification" + }, + "value": { + "$ref": "#/definitions/configValue" + }, + "conditions": { + "$ref": "#/definitions/configConditions" + }, + "action": { + "$ref": "#/definitions/configAction" + } }, "required": [] } @@ -425,28 +602,43 @@ 
"type": { "type": "string", "oneOf": [ - { "const": "instance" , "title": "A specific Instance" }, - { "const": "template" , "title": "Each of the Instances" } + { + "const": "instance", + "title": "A specific Instance" + }, + { + "const": "template", + "title": "Each of the Instances" + } ], "default": "template", "title": "Apply this rule to:", "description": "" } }, - "required": [ "type", "enabled" ], + "required": [ + "type", + "enabled" + ], "if": { "properties": { - "type": { "const": "instance" } + "type": { + "const": "instance" + } } }, "then": { "properties": { - "config": { "$ref": "#/definitions/configInstance" } + "config": { + "$ref": "#/definitions/configInstance" + } } }, "else": { "properties": { - "config": { "$ref": "#/definitions/configTemplate" } + "config": { + "$ref": "#/definitions/configTemplate" + } } } } @@ -467,7 +659,7 @@ "ui:widget": "hidden" }, "rules": { - "ui:openEmptyItem": true, + "ui:openEmptyItem": true, "items": { "ui:classNames": "dyncfg-grid dyncfg-grid-col-6", "enabled": { @@ -555,8 +747,25 @@ }, "value": { "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", + "ui:order": [ + "database_lookup", + "calculation", + "units", + "update_every", + "units_placeholder" + ], "database_lookup": { - "ui:order": ["data_source", "time_group", "time_group_condition", "time_group_value", "after", "before", "dims_group", "dimensions", "options"], + "ui:order": [ + "data_source", + "time_group", + "time_group_condition", + "time_group_value", + "after", + "before", + "dims_group", + "dimensions", + "options" + ], "ui:classNames": "dyncfg-grid dyncfg-grid-col-6 dyncfg-grid-col-span-1-6", "ui:collapsible": true, "ui:initiallyExpanded": true, @@ -608,7 +817,12 @@ }, "update_every": { "ui:help": "The frequency this alarm is to be evaluated, in seconds.", - "ui:classNames": "dyncfg-grid-col-span-6-1" + "ui:classNames": "dyncfg-grid-col-span-1-2" + }, + "units_placeholder": { + "ui:widget": "unitsPlaceholder", + 
"ui:classNames": "dyncfg-grid-col-span-5-1", + "ui:help": "Netdata may display scaled units (e.g. bytes → GiB). Use the metric’s original unit to avoid confusion when writing alert expressions." } }, "conditions": { diff --git a/src/libnetdata/completion/completion.c b/src/libnetdata/completion/completion.c index 323dca01fcbc0a..8c6410d524e231 100644 --- a/src/libnetdata/completion/completion.c +++ b/src/libnetdata/completion/completion.c @@ -36,33 +36,33 @@ ALWAYS_INLINE void completion_wait_for(struct completion *p) ALWAYS_INLINE bool completion_timedwait_for(struct completion *p, uint64_t timeout_s) { - timeout_s *= NSEC_PER_SEC; + uint64_t timeout_ns = timeout_s * NSEC_PER_SEC; + if (timeout_ns == 0) timeout_ns = 1; - uint64_t start_time = uv_hrtime(); + uint64_t deadline_ns = uv_hrtime() + timeout_ns; bool result = true; netdata_mutex_lock(&p->mutex); - while (!p->completed) { - int rc = netdata_cond_timedwait(&p->cond, &p->mutex, timeout_s); + while (!p->completed && result) { + uint64_t current_time_ns = uv_hrtime(); - if (rc == 0) { - result = true; - break; - } else if (rc == UV_ETIMEDOUT) { + // Check if we've already exceeded the deadline + if (current_time_ns >= deadline_ns) { result = false; break; } - /* - * handle spurious wakeups - */ + uint64_t remaining_timeout_ns = deadline_ns - current_time_ns; - uint64_t elapsed = uv_hrtime() - start_time; - if (elapsed >= timeout_s) { + int rc = netdata_cond_timedwait(&p->cond, &p->mutex, remaining_timeout_ns); + + if (rc == UV_ETIMEDOUT) result = false; - break; - } - timeout_s -= elapsed; + + // Condition was signaled (or spurious wakeup). + // The loop condition `!p->completed` will be re-evaluated. + // If p->completed is true, the loop exits. + // If p->completed is false (spurious wakeup), the loop continues with a new remaining_timeout_ns. 
} netdata_mutex_unlock(&p->mutex); diff --git a/src/libnetdata/procfile/procfile.c b/src/libnetdata/procfile/procfile.c index fb6b0f8c3dce13..640ee018457264 100644 --- a/src/libnetdata/procfile/procfile.c +++ b/src/libnetdata/procfile/procfile.c @@ -330,14 +330,29 @@ procfile *procfile_readall(procfile *ff) { ff->len += r; } - - // netdata_log_debug(D_PROCFILE, "Rewinding file '%s'", ff->filename); - if(unlikely(lseek(ff->fd, 0, SEEK_SET) == -1)) { - if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) collector_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); - else if(unlikely(ff->flags & PROCFILE_FLAG_ERROR_ON_ERROR_LOG)) - netdata_log_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); - procfile_close(ff); - return NULL; + + if (unlikely(ff->flags & PROCFILE_FLAG_NONSEEKABLE)) { + char *fn = procfile_filename(ff); + ff = procfile_reopen(ff, fn, NULL, ff->flags); + if (unlikely(!ff)) + return NULL; + } else if (unlikely(lseek(ff->fd, 0, SEEK_SET) == -1)) { + // Some procfs files (Ubuntu HWE 24.04 / kernel 6.14) may be non-seekable. + // In that case, "rewind" by reopening. 
+ if (errno == ESPIPE || errno == EINVAL) { + ff->flags |= PROCFILE_FLAG_NONSEEKABLE; + char *fn = procfile_filename(ff); + ff = procfile_reopen(ff, fn, NULL, ff->flags); + if (unlikely(!ff)) + return NULL; + } else { + if (unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) + collector_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); + else if (unlikely(ff->flags & PROCFILE_FLAG_ERROR_ON_ERROR_LOG)) + netdata_log_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); + procfile_close(ff); + return NULL; + } } procfile_lines_reset(ff->lines); @@ -498,9 +513,8 @@ procfile *procfile_reopen(procfile *ff, const char *filename, const char *separa } ff->stats.opens++; - // netdata_log_info("PROCFILE: opened '%s' on fd %d", filename, ff->fd); - - //strncpyz(ff->filename, filename, FILENAME_MAX); + // IMPORTANT: 'filename' parameter must not be used after this point + // as it may point to ff->filename which we're about to free freez(ff->filename); ff->filename = NULL; ff->flags = flags; diff --git a/src/libnetdata/procfile/procfile.h b/src/libnetdata/procfile/procfile.h index 25b976988222a1..356d9e42a6fe4f 100644 --- a/src/libnetdata/procfile/procfile.h +++ b/src/libnetdata/procfile/procfile.h @@ -36,6 +36,7 @@ typedef struct { #define PROCFILE_FLAG_DEFAULT 0x00000000 // To store inside `collector.log` #define PROCFILE_FLAG_NO_ERROR_ON_FILE_IO 0x00000001 // Do not log anything #define PROCFILE_FLAG_ERROR_ON_ERROR_LOG 0x00000002 // Store inside `error.log` +#define PROCFILE_FLAG_NONSEEKABLE 0x00000004 // File doesn't support lseek(), reopen instead typedef enum __attribute__ ((__packed__)) procfile_separator { PF_CHAR_IS_SEPARATOR, diff --git a/src/libnetdata/socket/nd-sock.h b/src/libnetdata/socket/nd-sock.h index 5ea9c852bc95cd..fd20fd59fd06b3 100644 --- a/src/libnetdata/socket/nd-sock.h +++ b/src/libnetdata/socket/nd-sock.h @@ -124,6 +124,14 @@ static ssize_t nd_sock_revc_nowait(ND_SOCK *s, void *buf, size_t num) { 
return recv(s->fd, buf, num, MSG_DONTWAIT); } +ALWAYS_INLINE +static ssize_t nd_sock_peek_nowait(ND_SOCK *s, void *buf, size_t num) { + if (nd_sock_is_ssl(s)) + return netdata_ssl_peek(&s->ssl, buf, num); + else + return recv(s->fd, buf, num, MSG_PEEK | MSG_DONTWAIT); +} + ALWAYS_INLINE static ssize_t nd_sock_send_nowait(ND_SOCK *s, void *buf, size_t num) { if (nd_sock_is_ssl(s)) diff --git a/src/libnetdata/socket/security.c b/src/libnetdata/socket/security.c index 060ab7ec04a471..47fb2496c059d1 100644 --- a/src/libnetdata/socket/security.c +++ b/src/libnetdata/socket/security.c @@ -286,8 +286,10 @@ ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num) { errno = 0; ssl->ssl_errno = 0; - if(unlikely(!is_handshake_complete(ssl, "read"))) + if(unlikely(!is_handshake_complete(ssl, "read"))) { + errno = ENOTCONN; return -1; + } int bytes = SSL_read(ssl->conn, buf, (int)num); @@ -302,8 +304,14 @@ ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num) { ssl->ssl_errno = err; errno = EWOULDBLOCK; } - else + else { + // For SSL_ERROR_SYSCALL, errno contains the underlying socket error + // (e.g., ECONNRESET). Save it before calling netdata_ssl_log_error_queue() + // which may corrupt errno through subsequent function calls. + int saved_errno = errno; netdata_ssl_log_error_queue("SSL_read", ssl, err); + errno = saved_errno; + } bytes = -1; // according to read() or recv() } @@ -311,6 +319,56 @@ ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num) { return bytes; } +/* + * netdata_ssl_peek() - peek at incoming SSL data without consuming it + * + * This function is identical to netdata_ssl_read() but uses SSL_peek() + * instead of SSL_read(), leaving the data in the SSL buffer for a + * subsequent read operation. Useful for probing connection status. 
+ * + * Returns: + * > 0: Number of bytes available to peek + * 0: Connection closed (SSL_ERROR_ZERO_RETURN) + * -1: Error (check errno: EWOULDBLOCK means no data available) + */ +ALWAYS_INLINE +ssize_t netdata_ssl_peek(NETDATA_SSL *ssl, void *buf, size_t num) { + errno = 0; + ssl->ssl_errno = 0; + + if(unlikely(!is_handshake_complete(ssl, "peek"))) { + errno = ENOTCONN; + return -1; + } + + int bytes = SSL_peek(ssl->conn, buf, (int)num); + + if(unlikely(bytes <= 0)) { + int err = SSL_get_error(ssl->conn, bytes); + if (err == SSL_ERROR_ZERO_RETURN) { + ssl->ssl_errno = err; + return 0; // Connection closed + } + + if (err == SSL_ERROR_WANT_READ || err == SSL_ERROR_WANT_WRITE) { + ssl->ssl_errno = err; + errno = EWOULDBLOCK; + } + else { + // For SSL_ERROR_SYSCALL, errno contains the underlying socket error + // (e.g., ECONNRESET). Save it before calling netdata_ssl_log_error_queue() + // which may corrupt errno through subsequent function calls. + int saved_errno = errno; + netdata_ssl_log_error_queue("SSL_peek", ssl, err); + errno = saved_errno; + } + + bytes = -1; + } + + return bytes; +} + /* * netdata_ssl_write() should return the same as write(): * @@ -329,8 +387,10 @@ ssize_t netdata_ssl_write(NETDATA_SSL *ssl, const void *buf, size_t num) { errno = 0; ssl->ssl_errno = 0; - if(unlikely(!is_handshake_complete(ssl, "write"))) + if(unlikely(!is_handshake_complete(ssl, "write"))) { + errno = ENOTCONN; return -1; + } int bytes = SSL_write(ssl->conn, (uint8_t *)buf, (int)num); @@ -340,8 +400,14 @@ ssize_t netdata_ssl_write(NETDATA_SSL *ssl, const void *buf, size_t num) { ssl->ssl_errno = err; errno = EWOULDBLOCK; } - else + else { + // For SSL_ERROR_SYSCALL, errno contains the underlying socket error + // (e.g., ECONNRESET). Save it before calling netdata_ssl_log_error_queue() + // which may corrupt errno through subsequent function calls. 
+ int saved_errno = errno; netdata_ssl_log_error_queue("SSL_write", ssl, err); + errno = saved_errno; + } bytes = -1; // according to write() or send() } diff --git a/src/libnetdata/socket/security.h b/src/libnetdata/socket/security.h index 7deb1d79763ffa..6a2e0cec4e7013 100644 --- a/src/libnetdata/socket/security.h +++ b/src/libnetdata/socket/security.h @@ -48,6 +48,7 @@ void netdata_ssl_close(NETDATA_SSL *ssl); ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num); ssize_t netdata_ssl_write(NETDATA_SSL *ssl, const void *buf, size_t num); +ssize_t netdata_ssl_peek(NETDATA_SSL *ssl, void *buf, size_t num); ssize_t netdata_ssl_pending(NETDATA_SSL *ssl); bool netdata_ssl_has_pending(NETDATA_SSL *ssl); diff --git a/src/plugins.d/pluginsd_replication.c b/src/plugins.d/pluginsd_replication.c index 0d036244d886dc..b8135350d10f5d 100644 --- a/src/plugins.d/pluginsd_replication.c +++ b/src/plugins.d/pluginsd_replication.c @@ -359,6 +359,9 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_rrdset_collection_state(char **words, si ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARSER *parser) { if (num_words < 7) { // accepts 7, but the 7th is optional nd_log(NDLS_DAEMON, NDLP_ERR, "REPLAY: malformed " PLUGINSD_KEYWORD_REPLAY_END " command"); + RRDSET *st = pluginsd_get_scope_chart(parser); + if(st) + st->replication_empty_response_count = 0; return PARSER_RC_ERROR; } @@ -402,6 +405,10 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS parser->user.data_collections_count++; + // Reset empty response counter when we receive actual data + if(parser->user.replay.rset_enabled && st) + st->replication_empty_response_count = 0; + if(parser->user.replay.rset_enabled && st->rrdhost->receiver) { time_t now = now_realtime_sec(); time_t started = st->rrdhost->receiver->replication.first_time_s; @@ -433,6 +440,9 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS 
st->replay.log_next_data_collection = true; #endif + if(start_streaming) + st->replication_empty_response_count = 0; + if (start_streaming) { #ifdef REPLICATION_TRACKING st->stream.rcv.who = REPLAY_WHO_FINISHED; @@ -465,6 +475,111 @@ ALWAYS_INLINE PARSER_RC pluginsd_replay_end(char **words, size_t num_words, PARS return PARSER_RC_OK; } + // ======================================================================== + // SAFETY NET: Detect stuck replication loops + // ======================================================================== + // + // We received start_streaming=false, which means we need to send another + // replication request. However, we need to detect if we're stuck in an + // infinite retry loop where no progress is being made. + // + // This can happen when: + // 1. Parent already has newer data than child + // 2. Child keeps splitting responses due to buffer constraints + // 3. Network issues causing repeated empty/failed responses + + // Check parent's current retention to detect if we're already caught up + time_t local_first_entry = 0, local_last_entry = 0; + rrdset_get_retention_of_tier_for_collected_chart( + st, &local_first_entry, &local_last_entry, now_realtime_sec(), 0); + + // Detect suspicious pattern: parent requested data but is already caught up + // This indicates we're in a loop where child keeps splitting responses + // even though parent doesn't need more data. 
+ bool parent_already_caught_up = (local_last_entry >= last_entry_child); + bool requested_non_empty_range = (first_entry_requested != 0 || last_entry_requested != 0); + bool is_suspicious_response = (requested_non_empty_range && parent_already_caught_up); + + bool should_check_for_stuck_replication = false; + + // Track consecutive suspicious responses - applies to all builds + if(is_suspicious_response) { + st->replication_empty_response_count++; + // After 3 consecutive suspicious responses, we need to investigate + if(st->replication_empty_response_count >= 3) { + should_check_for_stuck_replication = true; + } + } else { + // Reset counter if this was a legitimate response (parent still catching up) + st->replication_empty_response_count = 0; + } + + if (should_check_for_stuck_replication) { + // We already have local_first_entry and local_last_entry from above + + // Check multiple conditions to ensure we're truly stuck: + // + // Condition 1: Parent has data that covers or exceeds child's retention + // (We already checked this in parent_already_caught_up, but verify again) + bool parent_has_equal_or_newer_data = (local_last_entry >= last_entry_child); + + // Calculate the gap for logging purposes + time_t gap_to_child = (last_entry_child > local_last_entry) ? + (last_entry_child - local_last_entry) : 0; + + // Condition 2: Parent's data is reasonably recent + time_t wall_clock = now_realtime_sec(); + bool parent_data_is_recent = (local_last_entry > 0 && + (wall_clock - local_last_entry) < 300); + + // Only finish replication if parent has equal or newer data than child + // Do NOT terminate if there's any gap, as that would cause data loss + if (parent_has_equal_or_newer_data) { + + // Log with appropriate level based on confidence + ND_LOG_FIELD_PRIORITY level = (parent_has_equal_or_newer_data && parent_data_is_recent) ? + NDLP_INFO : NDLP_WARNING; + + nd_log(NDLS_DAEMON, level, + "PLUGINSD REPLAY: 'host:%s/chart:%s' detected stuck replication loop. 
" + "Parent last entry: %llu, Child last entry: %llu, Gap: %llu seconds, " + "Empty responses: %u. Forcing replication to finish.", + rrdhost_hostname(host), rrdset_id(st), + (unsigned long long)local_last_entry, + (unsigned long long)last_entry_child, + (unsigned long long)gap_to_child, + (unsigned int)st->replication_empty_response_count + ); + + st->replication_empty_response_count = 0; + + // IMPORTANT: Mark as finished and decrement counter NOW, before sending final request. + // This prevents infinite loops even if child continues to respond with start_streaming=false. + // The next REPLAY_END will see FINISHED flag and handle accordingly. + RRDSET_FLAGS old = rrdset_flag_set_and_clear( + st, RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED, + RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS | RRDSET_FLAG_SYNC_CLOCK); + + if(!(old & RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED)) { + if(rrdhost_receiver_replicating_charts_minus_one(st->rrdhost) == 0) + pulse_host_status(host, PULSE_HOST_STATUS_RCV_RUNNING, 0); + } + + pluginsd_clear_scope_chart(parser, PLUGINSD_KEYWORD_REPLAY_END); + host->stream.rcv.status.replication.percent = 100.0; + worker_set_metric(WORKER_RECEIVER_JOB_REPLICATION_COMPLETION, host->stream.rcv.status.replication.percent); + + // Send one final request to notify child. If child responds with start_streaming=true, + // it will start streaming. If it responds with start_streaming=false, the next + // REPLAY_END will see the FINISHED flag and log a warning but not loop forever. + bool ok = replicate_chart_request(send_to_plugin, parser, host, st, + first_entry_child, last_entry_child, child_world_time, + 0, 0); // prev_wanted = 0,0 to trigger empty request path + + return ok ? 
PARSER_RC_OK : PARSER_RC_ERROR; + } + } + #ifdef REPLICATION_TRACKING st->stream.rcv.who = REPLAY_WHO_ME; #endif diff --git a/src/streaming/stream-receiver-connection.c b/src/streaming/stream-receiver-connection.c index 7d4e07a60cf89d..2405a1e787af08 100644 --- a/src/streaming/stream-receiver-connection.c +++ b/src/streaming/stream-receiver-connection.c @@ -5,6 +5,14 @@ #include "stream-receiver-internals.h" #include "stream-replication-sender.h" +#if defined(__APPLE__) && !defined(TCP_KEEPIDLE) +#define TCP_KEEPIDLE TCP_KEEPALIVE +#endif + +#define CONNECTION_PROBE_AFTER_SECONDS (30) +#define CONNECTION_PROBE_INTERVAL_SECONDS (10) +#define CONNECTION_PROBE_COUNT (3) + void svc_rrdhost_obsolete_all_charts(RRDHOST *host); // -------------------------------------------------------------------------------------------------------------------- @@ -286,6 +294,33 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) { nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RCV '%s' [from [%s]:%s]: cannot set timeout for socket %d", rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, rpt->sock.fd); + + // Enable TCP keepalive to detect dead connections faster + // When a child vanishes (e.g., VM powered off), the socket won't close normally. + // TCP keepalive will probe the connection and detect it's dead. 
+ int enable = 1; + int idle = CONNECTION_PROBE_AFTER_SECONDS; + int interval = CONNECTION_PROBE_INTERVAL_SECONDS; + int count = CONNECTION_PROBE_COUNT; + + if (setsockopt(rpt->sock.fd, SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(enable)) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM RCV '%s' [from [%s]:%s]: cannot enable SO_KEEPALIVE on socket %d", + rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, rpt->sock.fd); +#ifdef TCP_KEEPIDLE + if (setsockopt(rpt->sock.fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM RCV '%s' [from [%s]:%s]: cannot set TCP_KEEPIDLE on socket %d", + rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, rpt->sock.fd); + if (setsockopt(rpt->sock.fd, IPPROTO_TCP, TCP_KEEPINTVL, &interval, sizeof(interval)) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM RCV '%s' [from [%s]:%s]: cannot set TCP_KEEPINTVL on socket %d", + rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, rpt->sock.fd); + if (setsockopt(rpt->sock.fd, IPPROTO_TCP, TCP_KEEPCNT, &count, sizeof(count)) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, + "STREAM RCV '%s' [from [%s]:%s]: cannot set TCP_KEEPCNT on socket %d", + rrdhost_hostname(rpt->host), rpt->remote_ip, rpt->remote_port, rpt->sock.fd); +#endif } netdata_log_debug(D_STREAM, "Initial response to %s: %s", rpt->remote_ip, initial_response); diff --git a/src/streaming/stream-receiver.c b/src/streaming/stream-receiver.c index d1af7b2ba7a785..546a3a34682bd7 100644 --- a/src/streaming/stream-receiver.c +++ b/src/streaming/stream-receiver.c @@ -896,6 +896,58 @@ void stream_receiver_check_all_nodes_from_poll(struct stream_thread *sth, usec_t if (m->type != POLLFD_TYPE_RECEIVER) continue; struct receiver_state *rpt = m->rpt; + // Probe socket to detect dead connections (e.g., from TCP keepalive) + // Uses nd_sock_peek_nowait() which handles both SSL and plain TCP: + // - For SSL: uses SSL_peek() to avoid corrupting SSL state + // - For 
plain TCP: uses recv(MSG_PEEK | MSG_DONTWAIT) + char probe_byte; + ssize_t probe_rc = nd_sock_peek_nowait(&rpt->sock, &probe_byte, 1); + if (probe_rc == 0 || (probe_rc < 0 && errno == ECONNRESET)) { + // Connection closed by remote (gracefully or via reset) + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->remote_ip), + ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->remote_port), + ND_LOG_FIELD_TXT(NDF_NIDL_NODE, rpt->hostname), + ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt), + ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_receiver_log_capabilities, rpt), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_REMOTE_CLOSED); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RCV[%zu] '%s' [from %s]: socket closed by remote - closing connection", + sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip); + + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE); + continue; + } + if (probe_rc < 0 && errno != EAGAIN && errno != EWOULDBLOCK) { + // Socket error detected (keepalive timeout, etc.) 
+ // Save errno immediately as subsequent calls may modify it + int saved_errno = errno; + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->remote_ip), + ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->remote_port), + ND_LOG_FIELD_TXT(NDF_NIDL_NODE, rpt->hostname), + ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt), + ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_receiver_log_capabilities, rpt), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + worker_is_busy(WORKER_STREAM_JOB_DISCONNECT_SOCKET_ERROR); + nd_log(NDLS_DAEMON, NDLP_ERR, + "STREAM RCV[%zu] '%s' [from %s]: socket error detected: %s - closing connection", + sth->id, rrdhost_hostname(rpt->host), rpt->remote_ip, strerror(saved_errno)); + + stream_receiver_remove(sth, rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR); + continue; + } + // probe_rc > 0: data available (normal) + // probe_rc < 0 with EAGAIN/EWOULDBLOCK: no data but connection alive + spinlock_lock(&rpt->thread.send_to_child.spinlock); STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(rpt->thread.send_to_child.scb); spinlock_unlock(&rpt->thread.send_to_child.spinlock); diff --git a/src/streaming/stream-replication-receiver.c b/src/streaming/stream-replication-receiver.c index 0305af037dfc06..d550fcffcdbf74 100644 --- a/src/streaming/stream-replication-receiver.c +++ b/src/streaming/stream-replication-receiver.c @@ -218,8 +218,16 @@ bool replicate_chart_request(send_command callback, struct parser *parser, RRDHO if (unlikely(r.child_db.first_entry_t > r.child_db.last_entry_t)) return send_replay_chart_cmd(&r, "sending empty replication request, child timings are invalid (first entry > last entry)", true); - if (unlikely(r.local_db.last_entry_t > r.child_db.last_entry_t)) - return send_replay_chart_cmd(&r, "sending empty replication request, local last entry is later than the child one", false); + // Check if parent is already caught up with or ahead of child + // This check uses >= (not just >) 
to handle the case where parent and child are exactly equal
+ // When equal, there's no gap to replicate, so we should finish replication
+ if (unlikely(r.local_db.last_entry_t >= r.child_db.last_entry_t)) {
+ // Parent is at or ahead of child - no replication needed
+ // Send an empty replication request (after=0, before=0) so replication for this chart can finish
+ // NOTE(review): the final argument passed here is 'false' - confirm the child's handling of an
+ // empty request still replies with start_streaming=true and terminates replication cleanly
+ return send_replay_chart_cmd(&r, "sending empty replication request, local last entry is at or later than the child one", false);
+ }
 
 // let's find what the child can provide to fill that gap
 
diff --git a/src/streaming/stream-replication-sender.c b/src/streaming/stream-replication-sender.c
index 2034da4ef972dc..c56d827f3bb37e 100644
--- a/src/streaming/stream-replication-sender.c
+++ b/src/streaming/stream-replication-sender.c
@@ -442,6 +442,16 @@ static bool replication_query_execute(BUFFER *wb, struct replication_query *q, s
 if(buffer_strlen(wb) > max_msg_size && last_end_time_in_buffer) {
 q->query.before = last_end_time_in_buffer;
+
+ // CRITICAL: When splitting a response due to buffer overflow, we MUST set
+ // enable_streaming=false to prevent data loss. If we send start_streaming=true
+ // with partial data, the parent will think replication is complete when we've
+ // actually truncated the requested interval, causing permanent data loss.
+ //
+ // The parent-side stuck detection will handle infinite loops differently by:
+ // 1. Detecting when no progress is made after multiple rounds
+ // 2. Explicitly marking replication as FINISHED before sending final request
+ // 3. 
Handling the child's response appropriately whether it says true or false q->query.enable_streaming = false; internal_error( @@ -1198,8 +1208,47 @@ static bool replication_execute_request(struct replication_request *rq, bool wor if(!rq->st) { __atomic_add_fetch(&replication_globals.atomic.error_not_found, 1, __ATOMIC_RELAXED); nd_log(NDLS_DAEMON, NDLP_ERR, - "STREAM SND REPLAY ERROR: 'host:%s/chart:%s' not found", + "STREAM SND REPLAY ERROR: 'host:%s/chart:%s' not found, sending empty response to unblock parent", rrdhost_hostname(rq->sender->host), string2str(rq->chart_id)); + + // CRITICAL: Parent is waiting for a response! We MUST send REPLAY_END even if chart not found + // Otherwise parent will wait forever with chart stuck in replicating state. + // Send empty response with start_streaming=true to finish replication for this non-existent chart. + BUFFER *wb = sender_thread_buffer(rq->sender, REPLICATION_THREAD_BUFFER_INITIAL_SIZE); + + bool with_slots = (rq->sender->capabilities & STREAM_CAP_SLOTS) ? true : false; + NUMBER_ENCODING integer_encoding = (rq->sender->capabilities & STREAM_CAP_IEEE754) ? 
+ NUMBER_ENCODING_BASE64 : NUMBER_ENCODING_DECIMAL;
+
+ buffer_fast_strcat(wb, PLUGINSD_KEYWORD_REPLAY_BEGIN, sizeof(PLUGINSD_KEYWORD_REPLAY_BEGIN) - 1);
+ if(with_slots) {
+ buffer_fast_strcat(wb, " "PLUGINSD_KEYWORD_SLOT":", sizeof(PLUGINSD_KEYWORD_SLOT) - 1 + 2);
+ buffer_print_uint64_encoded(wb, integer_encoding, 0); // slot 0 for unknown chart
+ }
+ buffer_fast_strcat(wb, " '", 2);
+ buffer_fast_strcat(wb, string2str(rq->chart_id), string_strlen(rq->chart_id));
+ buffer_fast_strcat(wb, "'\n", 2);
+
+ // Send REPLAY_END with empty data and start_streaming=true to unblock parent
+ buffer_fast_strcat(wb, PLUGINSD_KEYWORD_REPLAY_END " ", sizeof(PLUGINSD_KEYWORD_REPLAY_END) - 1 + 1);
+ buffer_print_int64_encoded(wb, integer_encoding, 0); // update_every
+ buffer_fast_strcat(wb, " ", 1);
+ buffer_print_uint64_encoded(wb, integer_encoding, 0); // db_first_entry
+ buffer_fast_strcat(wb, " ", 1);
+ buffer_print_uint64_encoded(wb, integer_encoding, 0); // db_last_entry
+ buffer_fast_strcat(wb, " true ", 6); // start_streaming=true (force finish); " true " is 6 bytes - length 7 would also copy the NUL terminator into the stream
+ buffer_print_uint64_encoded(wb, integer_encoding, 0); // after
+ buffer_fast_strcat(wb, " ", 1);
+ buffer_print_uint64_encoded(wb, integer_encoding, 0); // before
+ buffer_fast_strcat(wb, " ", 1);
+ buffer_print_uint64_encoded(wb, integer_encoding, now_realtime_sec()); // wall_clock_time
+ buffer_fast_strcat(wb, "\n", 1);
+
+ sender_commit(rq->sender, wb, STREAM_TRAFFIC_TYPE_REPLICATION);
+ __atomic_add_fetch(&rq->sender->host->stream.snd.status.replication.counter_out, 1, __ATOMIC_RELAXED);
+ replication_replied_add();
+
+ ret = true; // Consider this a successful response
 goto cleanup;
 }
 }
diff --git a/src/web/api/queries/weights.c b/src/web/api/queries/weights.c
index d249ee19d4af02..ffe6f80167accb 100644
--- a/src/web/api/queries/weights.c
+++ b/src/web/api/queries/weights.c
@@ -2226,6 +2226,12 @@ static ssize_t 
query_scope_foreach_host_parallel(SIMPLE_PATTERN *scope_hosts_sp, SIMPLE_PATTERN *hosts_sp, struct query_weights_data *qwd) { +#ifndef ENABLE_DBENGINE + return query_scope_foreach_host(scope_hosts_sp, hosts_sp, + weights_do_node_callback, qwd, + &qwd->versions, NULL); + +#else size_t host_count = dictionary_entries(rrdhost_root_index); qwd->hosts_array = mallocz(sizeof(RRDHOST *) * host_count); qwd->hosts_array_capacity = host_count; @@ -2305,6 +2311,7 @@ static ssize_t query_scope_foreach_host_parallel(SIMPLE_PATTERN *scope_hosts_sp, freez(qwd->hosts_array); return total_added; +#endif } static ssize_t weights_do_node_callback(void *data, RRDHOST *host, bool queryable) { diff --git a/system/edit-config b/system/edit-config index 3944810d23a057..5d2020434bbf1f 100755 --- a/system/edit-config +++ b/system/edit-config @@ -1,8 +1,18 @@ #!/usr/bin/env sh +if [ "${container+x}" = "x" ]; then + _container_saved=$container +fi + # shellcheck disable=SC1091 [ -f /etc/profile ] && . /etc/profile +if [ "${_container_saved+x}" = "x" ]; then + container=$_container_saved +else + unset container +fi + set -e script_dir="$(CDPATH="" cd -- "$(dirname -- "$0")" && pwd -P)"