diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9588d36020..8c4a0a8db5 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,6 +9,9 @@ contact_links: - name: Chat with us on Matrix? url: https://matrix.to/#/#valkey:matrix.org about: We are on Matrix too! + - name: Chat with us on Slack? + url: https://join.slack.com/t/valkey-oss-developer/shared_invite/zt-2nxs51chx-EB9hu9Qdch3GMfRcztTSkQ + about: We are on Slack too! - name: Documentation issue? url: https://github.com/valkey-io/valkey-doc/issues about: Report it on the valkey-doc repo. diff --git a/.github/actions/generate-package-build-matrix/action.yml b/.github/actions/generate-package-build-matrix/action.yml index 7e90f27be5..2494a71118 100644 --- a/.github/actions/generate-package-build-matrix/action.yml +++ b/.github/actions/generate-package-build-matrix/action.yml @@ -24,11 +24,11 @@ runs: - name: Get targets run: | - x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' utils/releasetools/build-config.json) + x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' .github/actions/generate-package-build-matrix/build-config.json) x86_matrix=$(echo "{ \"distro\" : $x86_arch }" | jq -c .) echo "X86_MATRIX=$x86_matrix" >> $GITHUB_ENV - arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' utils/releasetools/build-config.json) + arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' .github/actions/generate-package-build-matrix/build-config.json) arm_matrix=$(echo "{ \"distro\" : $arm_arch }" | jq -c .) echo "ARM_MATRIX=$arm_matrix" >> $GITHUB_ENV shell: bash diff --git a/utils/releasetools/build-config.json b/.github/actions/generate-package-build-matrix/build-config.json similarity index 57% rename from utils/releasetools/build-config.json rename to .github/actions/generate-package-build-matrix/build-config.json index 5ee9308b3b..f64bf601ca 100644 --- a/utils/releasetools/build-config.json +++ b/.github/actions/generate-package-build-matrix/build-config.json @@ -1,28 +1,35 @@ { "linux_targets": [ + { "arch": "x86_64", - "target": "ubuntu18.04", + "target": "ubuntu-20.04", "type": "deb", - "platform": "bionic" + "platform": "focal" }, { "arch": "x86_64", - "target": "ubuntu20.04", + "target": "ubuntu-22.04", "type": "deb", - "platform": "focal" + "platform": "jammy" }, { - "arch": "arm64", - "target": "ubuntu18.04", + "arch": "x86_64", + "target": "ubuntu-24.04", "type": "deb", - "platform": "bionic" + "platform": "noble" }, { "arch": "arm64", "target": "ubuntu20.04", "type": "deb", "platform": "focal" + }, + { + "arch": "arm64", + "target": "ubuntu22.04", + "type": "deb", + "platform": "jammy" } ] } \ No newline at end of file diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 094d82de08..d7ab8e57d6 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -3,7 +3,12 @@ name: Build Release Packages on: release: types: [published] - + push: + paths: + - '.github/workflows/build-release-packages.yml' + - '.github/workflows/call-build-linux-arm-packages.yml' + - '.github/workflows/call-build-linux-x86-packages.yml' + - '.github/actions/generate-package-build-matrix/build-config.json' workflow_dispatch: inputs: version: @@ -11,17 +16,19 @@ on: required: true permissions: + id-token: write contents: read jobs: # This job provides the version metadata from the tag for the other jobs to use. 
release-build-get-meta: name: Get metadata to build + if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} + is_test: ${{ steps.check-if-testing.outputs.IS_TEST }} steps: - - run: | echo "Version: ${{ inputs.version || github.ref_name }}" shell: bash @@ -32,8 +39,13 @@ jobs: - name: Get the version id: get_version run: | - VERSION="${INPUT_VERSION}" + if [[ "${{ github.event_name }}" == "push" ]]; then + VERSION=${{ github.ref_name }} + else + VERSION="${INPUT_VERSION}" + fi if [ -z "${VERSION}" ]; then + echo "Error: No version specified" exit 1 fi echo "VERSION=$VERSION" >> $GITHUB_OUTPUT @@ -43,8 +55,21 @@ jobs: # only ever be a tag INPUT_VERSION: ${{ inputs.version || github.ref_name }} + - name: Check if we are testing + id: check-if-testing + run: | + if [[ "${{ github.event_name }}" == "push" ]]; then + echo "This is a test workflow -> We will upload to the Test S3 Bucket" + echo "IS_TEST=true" >> $GITHUB_OUTPUT + else + echo "This is a Release workflow -> We will upload to the Release S3 Bucket" + echo "IS_TEST=false" >> $GITHUB_OUTPUT + fi + shell: bash + generate-build-matrix: name: Generating build matrix + if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }} @@ -56,7 +81,7 @@ jobs: - uses: ./.github/actions/generate-package-build-matrix id: set-matrix with: - ref: ${{ inputs.version || github.ref_name }} + ref: ${{ needs.release-build-get-meta.outputs.version }} release-build-linux-x86-packages: needs: @@ -67,11 +92,10 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} release-build-linux-arm-packages: needs: @@ -82,8 +106,7 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 2a7bcc533f..65445a83c8 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to push packages into. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. 
+ bucket_name: + description: The S3 bucket to push packages into. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Make Valkey uses: uraimo/run-on-arch-action@v2 with: @@ -65,15 +70,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 9e438fa61a..4e68bf85f0 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -15,28 +15,27 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to upload the packages to. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The name of the S3 bucket to upload the packages to. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: "ubuntu-latest" + runs-on: ${{matrix.distro.target}} strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Install dependencies run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev @@ -63,15 +68,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. 
- run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bc946b7193..df3eaa1905 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes + run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes USE_FAST_FLOAT=yes - name: test run: | sudo apt-get install tcl8.6 tclx @@ -77,10 +77,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make + - name: prepare-development-libraries + run: sudo apt-get install librdmacm-dev libibverbs-dev + - name: make-rdma-module + run: make -j4 BUILD_RDMA=module + - name: make-rdma-builtin run: | - sudo apt-get install librdmacm-dev libibverbs-dev - make -j4 BUILD_RDMA=module + make distclean + make -j4 BUILD_RDMA=yes - name: clone-rxe-kmod run: | mkdir -p tests/rdma/rxe @@ -108,23 +112,30 @@ jobs: steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' + # Build with additional upcoming features + run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes build-32bit: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make + # Fast float requires C++ 32-bit libraries to compile on 64-bit ubuntu + # machine i.e. "-cross" suffixed version. Cross-compiling c++ to 32-bit + # also requires multilib support for g++ compiler i.e. "-multilib" + # suffixed version of g++. g++-multilib generally includes libstdc++. + # *cross version as well, but it is also added explicitly just in case. 
run: | - sudo apt-get update && sudo apt-get install libc6-dev-i386 - make -j4 SERVER_CFLAGS='-Werror' 32bit + sudo apt-get update + sudo apt-get install libc6-dev-i386 libstdc++-11-dev-i386-cross gcc-multilib g++-multilib + make -j4 SERVER_CFLAGS='-Werror' 32bit USE_FAST_FLOAT=yes build-libc-malloc: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc + run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc USE_FAST_FLOAT=yes build-almalinux8-jemalloc: runs-on: ubuntu-latest @@ -134,8 +145,8 @@ jobs: - name: make run: | - dnf -y install epel-release gcc make procps-ng which - make -j4 SERVER_CFLAGS='-Werror' + dnf -y install epel-release gcc gcc-c++ make procps-ng which + make -j4 SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes format-yaml: runs-on: ubuntu-latest diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 8e9045fe4b..e1d577b51b 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -86,7 +86,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'fortify') - container: ubuntu:lunar + container: ubuntu:plucky timeout-minutes: 14400 steps: - name: prep @@ -319,7 +319,7 @@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes - name: testprep run: | sudo apt-get install tcl8.6 tclx tcl-tls @@ -375,6 +375,44 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}} + test-ubuntu-tls-io-threads: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && + !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads') + timeout-minutes: 14400 + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: | + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get install tcl8.6 tclx tcl-tls + ./utils/gen-test-certs.sh + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: | + ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: | + ./runtest-cluster --io-threads --tls 
${{github.event.inputs.cluster_test_args}} + test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | @@ -689,6 +727,52 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate + test-sanitizer-force-defrag: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + !contains(github.event.inputs.skipjobs, 'sanitizer') + timeout-minutes: 14400 + strategy: + fail-fast: false + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: make all-with-unit-tests OPT=-O3 SANITIZER=address DEBUG_FORCE_DEFRAG=yes USE_JEMALLOC=no SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get update + sudo apt-get install tcl8.6 tclx -y + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: module api test + if: true && !contains(github.event.inputs.skiptests, 'modules') + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: sentinel tests + if: true && !contains(github.event.inputs.skiptests, 'sentinel') + run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} + - name: unittest + if: true && !contains(github.event.inputs.skiptests, 'unittest') + run: ./src/valkey-unit-tests + test-rpm-distros-jemalloc: if: | (github.event_name == 'workflow_dispatch' || @@ -990,7 +1074,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, macos-14] + os: [macos-13, macos-14] runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' || @@ -1019,7 +1103,7 @@ jobs: run: make SERVER_CFLAGS='-Werror' test-freebsd: - runs-on: macos-12 + runs-on: macos-13 if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 69d9b9cb6a..14db670b24 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Install typos - uses: taiki-e/install-action@cd5df4de2e75f3b819ba55f780f7bb8cd4a05a41 # v2.32.2 + uses: taiki-e/install-action@fe9759bf4432218c779595708e80a1aadc85cedc # v2.46.10 with: tool: typos diff --git a/.gitignore b/.gitignore index b108b4bb92..d5cac316e6 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ tests/rdma/rdma-test tags build-debug/ build-release/ +cmake-build-debug/ +cmake-build-release/ diff --git a/CMakeLists.txt 
b/CMakeLists.txt index ad0bab8896..55b18cb994 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.10) # Must be done first if (APPLE) @@ -41,3 +41,4 @@ unset(BUILD_UNIT_TESTS CACHE) unset(BUILD_TEST_MODULES CACHE) unset(BUILD_EXAMPLE_MODULES CACHE) unset(USE_TLS CACHE) +unset(DEBUG_FORCE_DEFRAG CACHE) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 1c530ec7ba..36764bb81b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -49,7 +49,7 @@ representative at an online or offline event. Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -this email address: placeholderkv@gmail.com. +this email address: maintainers@lists.valkey.io. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. diff --git a/GOVERNANCE.md b/GOVERNANCE.md index 33c3887430..7fd33272cb 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -2,7 +2,9 @@ The Valkey project is managed by a Technical Steering Committee (TSC) composed of the maintainers of the Valkey repository. The Valkey project includes all of the current and future repositories under the Valkey-io organization. -Maintainers are defined as individuals with full commit access to a repository, which shall be in sync with the MAINTAINERS.md file in a given projects repository. +Committers are defined as individuals with write access to the code within a repository. +Maintainers are defined as individuals with full access to a repository and own its governance. +Both maintainers and committers should be clearly listed in the MAINTAINERS.md file in a given projects repository. Maintainers of other repositories within the Valkey project are not members of the TSC unless explicitly added. ## Technical Steering Committee diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 635bf25067..947979eb33 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -16,8 +16,16 @@ Maintainers listed in alphabetical order by their github ID. | Zhao Zhao | [soloestoy](https://github.com/soloestoy) | Alibaba | | Viktor Söderqvist | [zuiderkwast](https://github.com/zuiderkwast) | Ericsson | +## Current Committers -### Former Maintainers +Committers listed in alphabetical order by their github ID. + +| Committer | GitHub ID | Affiliation | +| ------------------- | ----------------------------------------------- | ----------- | +| Harkrishn Patro | [hpatro](https://github.com/hpatro) | Amazon | +| Ran Shidlansik | [ranshid](https://github.com/ranshid) | Amazon | + +### Former Maintainers and Committers | Maintainer | GitHub ID | Affiliation | | ------------------- | ----------------------------------------------- | ----------- | \ No newline at end of file diff --git a/README.md b/README.md index 94f38bccf7..c447cc8d47 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,13 @@ To build TLS as Valkey module: Note that sentinel mode does not support TLS module. To build with experimental RDMA support you'll need RDMA development libraries -(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). For now, Valkey only -supports RDMA as connection module mode. Run: +(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). 
+ +To build RDMA support as Valkey built-in: + + % make BUILD_RDMA=yes + +To build RDMA as Valkey module: % make BUILD_RDMA=module @@ -203,20 +208,27 @@ Note that Valkey Over RDMA is an experimental feature. It may be changed or removed in any minor or major version. Currently, it is only supported on Linux. -To manually run a Valkey server with RDMA mode: +* RDMA built-in mode: + ``` + ./src/valkey-server --protected-mode no \ + --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` - % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 +* RDMA module mode: + ``` + ./src/valkey-server --protected-mode no \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` It's possible to change bind address/port of RDMA by runtime command: - 192.168.122.100:6379> CONFIG SET rdma.port 6380 + 192.168.122.100:6379> CONFIG SET rdma-port 6380 It's also possible to have both RDMA and TCP available, and there is no conflict of TCP(6379) and RDMA(6379), Ex: % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 \ --port 6379 Note that the network card (192.168.122.100 of this example) should support @@ -297,19 +309,19 @@ Other options supported by Valkey's `CMake` build system: ## Special build flags -- `-DBUILD_TLS=` enable TLS build for Valkey -- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported) +- `-DBUILD_TLS=` enable TLS build for Valkey. Default: `no` +- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported). Default: `no` - `-DBUILD_MALLOC=` choose the allocator to use. Default on Linux: `jemalloc`, for other OS: `libc` -- `-DBUILD_SANITIZER=` build with address sanitizer enabled -- `-DBUILD_UNIT_TESTS=[1|0]` when set, the build will produce the executable `valkey-unit-tests` -- `-DBUILD_TEST_MODULES=[1|0]` when set, the build will include the modules located under the `tests/modules` folder -- `-DBUILD_EXAMPLE_MODULES=[1|0]` when set, the build will include the example modules located under the `src/modules` folder +- `-DBUILD_SANITIZER=` build with address sanitizer enabled. Default: disabled (no sanitizer) +- `-DBUILD_UNIT_TESTS=[yes|no]` when set, the build will produce the executable `valkey-unit-tests`. Default: `no` +- `-DBUILD_TEST_MODULES=[yes|no]` when set, the build will include the modules located under the `tests/modules` folder. Default: `no` +- `-DBUILD_EXAMPLE_MODULES=[yes|no]` when set, the build will include the example modules located under the `src/modules` folder. Default: `no` ## Common flags - `-DCMAKE_BUILD_TYPE=` define the build type, see CMake manual for more details - `-DCMAKE_INSTALL_PREFIX=/installation/path` override this value to define a custom install prefix. Default: `/usr/local` -- `-G` generate build files for "Generator Name". By default, CMake will generate `Makefile`s. +- `-G""` generate build files for "Generator Name". By default, CMake will generate `Makefile`s. 
## Verbose build diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index d76f17625e..1a754ff846 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -10,6 +10,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/ae.c ${CMAKE_SOURCE_DIR}/src/anet.c ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/hashtable.c ${CMAKE_SOURCE_DIR}/src/kvstore.c ${CMAKE_SOURCE_DIR}/src/sds.c ${CMAKE_SOURCE_DIR}/src/zmalloc.c @@ -74,6 +75,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/geohash.c ${CMAKE_SOURCE_DIR}/src/geohash_helper.c ${CMAKE_SOURCE_DIR}/src/childinfo.c + ${CMAKE_SOURCE_DIR}/src/allocator_defrag.c ${CMAKE_SOURCE_DIR}/src/defrag.c ${CMAKE_SOURCE_DIR}/src/siphash.c ${CMAKE_SOURCE_DIR}/src/rax.c @@ -87,6 +89,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/tracking.c ${CMAKE_SOURCE_DIR}/src/socket.c ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/rdma.c ${CMAKE_SOURCE_DIR}/src/sha256.c ${CMAKE_SOURCE_DIR}/src/timeout.c ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c diff --git a/cmake/Modules/Utils.cmake b/cmake/Modules/Utils.cmake index 304f39fb2c..59076397de 100644 --- a/cmake/Modules/Utils.cmake +++ b/cmake/Modules/Utils.cmake @@ -100,3 +100,16 @@ function (valkey_parse_build_option OPTION_VALUE OUT_ARG_ENUM) PARENT_SCOPE) endif () endfunction () + +function (valkey_pkg_config PKGNAME OUT_VARIABLE) + if (NOT FOUND_PKGCONFIG) + # Locate pkg-config once + find_package(PkgConfig REQUIRED) + set(FOUND_PKGCONFIG 1) + endif () + pkg_check_modules(__PREFIX REQUIRED ${PKGNAME}) + message(STATUS "Found library for '${PKGNAME}': ${__PREFIX_LIBRARIES}") + set(${OUT_VARIABLE} + "${__PREFIX_LIBRARIES}" + PARENT_SCOPE) +endfunction () diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index e935c3b308..8a4d4da1c9 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -74,9 +74,11 @@ endmacro () macro (valkey_build_and_install_bin target sources ld_flags libs link_name) add_executable(${target} ${sources}) - if (USE_JEMALLOC) - # Using jemalloc - target_link_libraries(${target} jemalloc) + if (USE_JEMALLOC + OR USE_TCMALLOC + OR USE_TCMALLOC_MINIMAL) + # Using custom allocator + target_link_libraries(${target} ${ALLOCATOR_LIB}) endif () # Place this line last to ensure that ${ld_flags} is placed last on the linker line @@ -151,16 +153,23 @@ endif () if (BUILD_MALLOC) if ("${BUILD_MALLOC}" STREQUAL "jemalloc") set(MALLOC_LIB "jemalloc") + set(ALLOCATOR_LIB "jemalloc") add_valkey_server_compiler_options("-DUSE_JEMALLOC") set(USE_JEMALLOC 1) elseif ("${BUILD_MALLOC}" STREQUAL "libc") set(MALLOC_LIB "libc") elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc") set(MALLOC_LIB "tcmalloc") + valkey_pkg_config(libtcmalloc ALLOCATOR_LIB) + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC 1) elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc_minimal") set(MALLOC_LIB "tcmalloc_minimal") + valkey_pkg_config(libtcmalloc_minimal ALLOCATOR_LIB) + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC_MINIMAL 1) else () message(FATAL_ERROR "BUILD_MALLOC can be one of: jemalloc, libc, tcmalloc or tcmalloc_minimal") endif () @@ -199,29 +208,30 @@ if (BUILD_RDMA) # RDMA support (Linux only) if (LINUX AND NOT APPLE) valkey_parse_build_option(${BUILD_RDMA} USE_RDMA) + find_package(PkgConfig REQUIRED) + # Locate librdmacm & libibverbs, fail if we can't find them + valkey_pkg_config(librdmacm RDMACM_LIBS) + valkey_pkg_config(libibverbs IBVERBS_LIBS) + message(STATUS 
"${RDMACM_LIBS};${IBVERBS_LIBS}") + list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") + if (USE_RDMA EQUAL 2) # Module message(STATUS "Building RDMA as module") add_valkey_server_compiler_options("-DUSE_RDMA=2") - find_package(PkgConfig REQUIRED) - - # Locate librdmacm & libibverbs, fail if we can't find them - pkg_check_modules(RDMACM REQUIRED librdmacm) - pkg_check_modules(IBVERBS REQUIRED libibverbs) - - message(STATUS "${RDMACM_LINK_LIBRARIES};${IBVERBS_LINK_LIBRARIES}") - list(APPEND RDMA_LIBS "${RDMACM_LIBRARIES};${IBVERBS_LIBRARIES}") - unset(RDMACM_LINK_LIBRARIES CACHE) - unset(IBVERBS_LINK_LIBRARIES CACHE) - set(BUILD_RDMA_MODULE 1) - elseif (USE_RDMA EQUAL 1) - # RDMA can only be built as a module. So disable it - message(WARNING "BUILD_RDMA can be one of: [NO | 0 | MODULE], but '${BUILD_RDMA}' was provided") - message(STATUS "RDMA build is disabled") - set(USE_RDMA 0) + set(BUILD_RDMA_MODULE 2) + elseif (USE_RDMA EQUAL 1) # Builtin + message(STATUS "Building RDMA as builtin") + add_valkey_server_compiler_options("-DUSE_RDMA=1") + add_valkey_server_compiler_options("-DBUILD_RDMA_MODULE=0") + list(APPEND SERVER_LIBS "${RDMA_LIBS}") endif () else () message(WARNING "RDMA is only supported on Linux platforms") endif () +else () + # By default, RDMA is disabled + message(STATUS "RDMA is disabled") + set(USE_RDMA 0) endif () set(BUILDING_ARM64 0) @@ -266,17 +276,18 @@ endif () # Sanitizer if (BUILD_SANITIZER) - # For best results, force libc - set(MALLOC_LIB, "libc") + # Common CFLAGS + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-sanitize-recover=all") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-omit-frame-pointer") if ("${BUILD_SANITIZER}" STREQUAL "address") - add_valkey_server_compiler_options("-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=address") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=address") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=address") elseif ("${BUILD_SANITIZER}" STREQUAL "thread") - add_valkey_server_compiler_options("-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=thread") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=thread") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=thread") elseif ("${BUILD_SANITIZER}" STREQUAL "undefined") - add_valkey_server_compiler_options("-fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer") - add_valkey_server_linker_option("-fsanitize=undefined") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=undefined") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=undefined") else () message(FATAL_ERROR "Unknown sanitizer: ${BUILD_SANITIZER}") endif () @@ -366,7 +377,6 @@ include(SourceFiles) # Clear the below variables from the cache unset(CMAKE_C_FLAGS CACHE) -unset(BUILD_SANITIZER CACHE) unset(VALKEY_SERVER_LDFLAGS CACHE) unset(VALKEY_SERVER_CFLAGS CACHE) unset(PYTHON_EXE CACHE) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt index c904b94031..3f5b04dc22 100644 --- a/deps/CMakeLists.txt +++ b/deps/CMakeLists.txt @@ -1,4 +1,6 @@ -add_subdirectory(jemalloc) +if (USE_JEMALLOC) + add_subdirectory(jemalloc) +endif () add_subdirectory(lua) # Set hiredis options. We need to disable the defaults set in the OPTION(..) 
we do this by setting them in the CACHE diff --git a/deps/Makefile b/deps/Makefile index f1e4bd6ce2..72389def95 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -42,6 +42,7 @@ distclean: -(cd jemalloc && [ -f Makefile ] && $(MAKE) distclean) > /dev/null || true -(cd hdr_histogram && $(MAKE) clean) > /dev/null || true -(cd fpconv && $(MAKE) clean) > /dev/null || true + -(cd fast_float_c_interface && $(MAKE) clean) > /dev/null || true -(rm -f .make-*) .PHONY: distclean @@ -116,3 +117,9 @@ jemalloc: .make-prerequisites cd jemalloc && $(MAKE) lib/libjemalloc.a .PHONY: jemalloc + +fast_float_c_interface: .make-prerequisites + @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) + cd fast_float_c_interface && $(MAKE) + +.PHONY: fast_float_c_interface diff --git a/deps/README.md b/deps/README.md index b918b47456..97a7baf64b 100644 --- a/deps/README.md +++ b/deps/README.md @@ -6,6 +6,7 @@ should be provided by the operating system. * **linenoise** is a readline replacement. It is developed by the same authors of Valkey but is managed as a separated project and updated as needed. * **lua** is Lua 5.1 with minor changes for security and additional libraries. * **hdr_histogram** Used for per-command latency tracking histograms. +* **fast_float** is a replacement for strtod to convert strings to floats efficiently. How to upgrade the above dependencies === @@ -105,3 +106,17 @@ We use a customized version based on master branch commit e4448cf6d1cd08fff51981 2. Copy updated files from newer version onto files in /hdr_histogram. 3. Apply the changes from 1 above to the updated files. +fast_float +--- +The fast_float library provides fast header-only implementations for the C++ from_chars functions for `float` and `double` types as well as integer types. These functions convert ASCII strings representing decimal values (e.g., `1.3e10`) into binary types. The functions are much faster than comparable number-parsing functions from existing C++ standard libraries. + +Specifically, `fast_float` provides the following function to parse floating-point numbers with a C++17-like syntax (the library itself only requires C++11): + + template ())> + from_chars_result_t from_chars(UC const *first, UC const *last, T &value, chars_format fmt = chars_format::general); + +To upgrade the library, +1. Check out https://github.com/fastfloat/fast_float/tree/main +2. cd fast_float +3. Invoke "python3 ./script/amalgamate.py --output fast_float.h" +4. Copy fast_float.h file to "deps/fast_float/". diff --git a/deps/fast_float/fast_float.h b/deps/fast_float/fast_float.h new file mode 100644 index 0000000000..9ba3bc2e97 --- /dev/null +++ b/deps/fast_float/fast_float.h @@ -0,0 +1,3912 @@ +// fast_float by Daniel Lemire +// fast_float by João Paulo Magalhaes +// +// +// with contributions from Eugene Golushkov +// with contributions from Maksim Kita +// with contributions from Marcin Wojdyr +// with contributions from Neal Richardson +// with contributions from Tim Paine +// with contributions from Fabio Pellacini +// with contributions from Lénárd Szolnoki +// with contributions from Jan Pharago +// with contributions from Maya Warrier +// with contributions from Taha Khokhar +// +// +// Licensed under the Apache License, Version 2.0, or the +// MIT License or the Boost License. This file may not be copied, +// modified, or distributed except according to those terms. 
+// +// MIT License Notice +// +// MIT License +// +// Copyright (c) 2021 The fast_float authors +// +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// Apache License (Version 2.0) Notice +// +// Copyright 2021 The fast_float authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// +// BOOST License Notice +// +// Boost Software License - Version 1.0 - August 17th, 2003 +// +// Permission is hereby granted, free of charge, to any person or organization +// obtaining a copy of the software and accompanying documentation covered by +// this license (the "Software") to use, reproduce, display, distribute, +// execute, and transmit the Software, and to prepare derivative works of the +// Software, and to permit third-parties to whom the Software is furnished to +// do so, all subject to the following: +// +// The copyright notices in the Software and this entire statement, including +// the above license grant, this restriction and the following disclaimer, +// must be included in all copies of the Software, in whole or in part, and +// all derivative works of the Software, unless such copies or derivative +// works are solely in the form of machine-executable object code generated by +// a source language processor. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+// + +#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H +#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if defined(__cpp_lib_is_constant_evaluated) && \ + __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \ + __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#define FASTFLOAT_IS_CONSTEXPR 1 +#else +#define FASTFLOAT_CONSTEXPR20 +#define FASTFLOAT_IS_CONSTEXPR 0 +#endif + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0 +#else +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1 +#endif + +#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifndef FASTFLOAT_FLOAT_COMMON_H +#define FASTFLOAT_FLOAT_COMMON_H + +#include +#include +#include +#include +#include +#include +#ifdef __has_include +#if __has_include() && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include +#endif +#endif + +namespace fast_float { + +#define FASTFLOAT_JSONFMT (1 << 5) +#define FASTFLOAT_FORTRANFMT (1 << 6) + +enum chars_format { + scientific = 1 << 0, + fixed = 1 << 2, + hex = 1 << 3, + no_infnan = 1 << 4, + // RFC 8259: https://datatracker.ietf.org/doc/html/rfc8259#section-6 + json = FASTFLOAT_JSONFMT | fixed | scientific | no_infnan, + // Extension of RFC 8259 where, e.g., "inf" and "nan" are allowed. + json_or_infnan = FASTFLOAT_JSONFMT | fixed | scientific, + fortran = FASTFLOAT_FORTRANFMT | fixed | scientific, + general = fixed | scientific +}; + +template struct from_chars_result_t { + UC const *ptr; + std::errc ec; +}; +using from_chars_result = from_chars_result_t; + +template struct parse_options_t { + constexpr explicit parse_options_t(chars_format fmt = chars_format::general, + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} + + /** Which number formats are accepted */ + chars_format format; + /** The character used as decimal point */ + UC decimal_point; +}; +using parse_options = parse_options_t; + +} // namespace fast_float + +#if FASTFLOAT_HAS_BIT_CAST +#include +#endif + +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) +#define FASTFLOAT_64BIT 1 +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#define FASTFLOAT_32BIT 1 +#else + // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. 
+#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) +#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) +#include +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +#define FASTFLOAT_VISUAL_STUDIO 1 +#endif + +#if defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined _WIN32 +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) +#include +#elif defined(sun) || defined(__sun) +#include +#elif defined(__MVS__) +#include +#else +#ifdef __has_include +#if __has_include() +#include +#endif //__has_include() +#endif //__has_include +#endif +# +#ifndef __BYTE_ORDER__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#define FASTFLOAT_IS_BIG_ENDIAN 1 +#endif +#endif + +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#define FASTFLOAT_SSE2 1 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#define FASTFLOAT_NEON 1 +#endif + +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) +#define FASTFLOAT_HAS_SIMD 1 +#endif + +#if defined(__GNUC__) +// disable -Wcast-align=strict (GCC only) +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#else +#define FASTFLOAT_SIMD_DISABLE_WARNINGS +#endif + +#if defined(__GNUC__) +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") +#else +#define FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif + +#ifdef FASTFLOAT_VISUAL_STUDIO +#define fastfloat_really_inline __forceinline +#else +#define fastfloat_really_inline inline __attribute__((always_inline)) +#endif + +#ifndef FASTFLOAT_ASSERT +#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } +#endif + +#ifndef FASTFLOAT_DEBUG_ASSERT +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } +#endif + +// rust style `try!()` macro, or `?` operator +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } + +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type + +namespace fast_float { + +fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED + return std::is_constant_evaluated(); +#else + return false; +#endif +} + +template +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same::value || std::is_same::value +#if __STDCPP_FLOAT32_T__ + || std::is_same::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same::value +#endif + ; +} + +template +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value; +} + +// Compares two ASCII strings in a case insensitive manner. 
+template +inline FASTFLOAT_CONSTEXPR14 bool +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { + char running_diff{0}; + for (size_t i = 0; i < length; ++i) { + running_diff |= (char(input1[i]) ^ char(input2[i])); + } + return (running_diff == 0) || (running_diff == 32); +} + +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif + +// a pointer and a length to a contiguous block of memory +template struct span { + const T *ptr; + size_t length; + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span() : ptr(nullptr), length(0) {} + + constexpr size_t len() const noexcept { return length; } + + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return ptr[index]; + } +}; + +struct value128 { + uint64_t low; + uint64_t high; + constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + constexpr value128() : low(0), high(0) {} +}; + +/* Helper C++14 constexpr generic implementation of leading_zeroes */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; +} + +/* result might be undefined when input_num is zero */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { + assert(input_num > 0); + if (cpp20_and_in_constexpr()) { + return leading_zeroes_generic(input_num); + } +#ifdef FASTFLOAT_VISUAL_STUDIO +#if defined(_M_X64) || defined(_M_ARM64) + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
+ _BitScanReverse64(&leading_zero, input_num); + return (int)(63 - leading_zero); +#else + return leading_zeroes_generic(input_num); +#endif +#else + return __builtin_clzll(input_num); +#endif +} + +// slow emulation routine for 32-bit +fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = (uint64_t)(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + (uint64_t)(lo < bd); + return lo; +} + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { + return umul128_generic(ab, cd, hi); +} +#endif // !__MINGW64__ + +#endif // FASTFLOAT_32BIT + +// compute 64-bit a*b +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { + if (cpp20_and_in_constexpr()) { + value128 answer; + answer.low = umul128_generic(a, b, &answer.high); + return answer; + } + value128 answer; +#if defined(_M_ARM64) && !defined(__MINGW32__) + // ARM64 has native support for 64-bit multiplications, no need to emulate + // But MinGW on ARM64 doesn't have native support for 64-bit multiplications + answer.high = __umulh(a, b); + answer.low = a * b; +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) + answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) + __uint128_t r = ((__uint128_t)a) * b; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#else + answer.low = umul128_generic(a, b, &answer.high); +#endif + return answer; +} + +struct adjusted_mantissa { + uint64_t mantissa{0}; + int32_t power2{0}; // a negative value indicates an invalid result + adjusted_mantissa() = default; + constexpr bool operator==(const adjusted_mantissa &o) const { + return mantissa == o.mantissa && power2 == o.power2; + } + constexpr bool operator!=(const adjusted_mantissa &o) const { + return mantissa != o.mantissa || power2 != o.power2; + } +}; + +// Bias so we can get the real exponent with an invalid adjusted_mantissa. 
+constexpr static int32_t invalid_am_bias = -0x8000; + +// used for binary_format_lookup_tables::max_mantissa +constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; + +template struct binary_format_lookup_tables; + +template struct binary_format : binary_format_lookup_tables { + using equiv_uint = + typename std::conditional::type; + + static inline constexpr int mantissa_explicit_bits(); + static inline constexpr int minimum_exponent(); + static inline constexpr int infinite_power(); + static inline constexpr int sign_index(); + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int max_exponent_fast_path(); + static inline constexpr int max_exponent_round_to_even(); + static inline constexpr int min_exponent_round_to_even(); + static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int largest_power_of_ten(); + static inline constexpr int smallest_power_of_ten(); + static inline constexpr T exact_power_of_ten(int64_t power); + static inline constexpr size_t max_digits(); + static inline constexpr equiv_uint exponent_mask(); + static inline constexpr equiv_uint mantissa_mask(); + static inline constexpr equiv_uint hidden_bit_mask(); +}; + +template struct binary_format_lookup_tables { + static constexpr double powers_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + + // Largest integer value v so that (5**index * v) <= 1<<53. + // 0x20000000000000 == 1 << 53 + static constexpr uint64_t max_mantissa[] = { + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr double binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template struct 
binary_format_lookup_tables { + static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + + // Largest integer value v so that (5**index * v) <= 1<<24. + // 0x1000000 == 1<<24 + static constexpr uint64_t max_mantissa[] = { + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr float binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -22; +#endif +} + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -10; +#endif +} + +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 52; +} +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 10; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return -4; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return -17; +} + +template <> inline constexpr int binary_format::minimum_exponent() { + return -1023; +} +template <> inline constexpr int binary_format::minimum_exponent() { + return -127; +} + +template <> inline constexpr int binary_format::infinite_power() { + return 0x7FF; +} +template <> inline constexpr int binary_format::infinite_power() { + return 0xFF; +} + +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} + +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 22; +} +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 10; +} + +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 22 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 10 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} + +template <> +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { + // Work around clang bug 
+template <>
+inline constexpr double
+binary_format<double>::exact_power_of_ten(int64_t power) {
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)powers_of_ten[0], powers_of_ten[power];
+}
+template <>
+inline constexpr float
+binary_format<float>::exact_power_of_ten(int64_t power) {
+  // Work around clang bug https://godbolt.org/z/zedh7rrhc
+  return (void)powers_of_ten[0], powers_of_ten[power];
+}
+
+template <>
+inline constexpr int binary_format<double>::largest_power_of_ten() {
+  return 308;
+}
+template <>
+inline constexpr int binary_format<float>::largest_power_of_ten() {
+  return 38;
+}
+
+template <>
+inline constexpr int binary_format<double>::smallest_power_of_ten() {
+  return -342;
+}
+template <>
+inline constexpr int binary_format<float>::smallest_power_of_ten() {
+  return -64;
+}
+
+template <> inline constexpr size_t binary_format<double>::max_digits() {
+  return 769;
+}
+template <> inline constexpr size_t binary_format<float>::max_digits() {
+  return 114;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::exponent_mask() {
+  return 0x7F800000;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::exponent_mask() {
+  return 0x7FF0000000000000;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::mantissa_mask() {
+  return 0x007FFFFF;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::mantissa_mask() {
+  return 0x000FFFFFFFFFFFFF;
+}
+
+template <>
+inline constexpr binary_format<float>::equiv_uint
+binary_format<float>::hidden_bit_mask() {
+  return 0x00800000;
+}
+template <>
+inline constexpr binary_format<double>::equiv_uint
+binary_format<double>::hidden_bit_mask() {
+  return 0x0010000000000000;
+}
+
+template <typename T>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+to_float(bool negative, adjusted_mantissa am, T &value) {
+  using fastfloat_uint = typename binary_format<T>::equiv_uint;
+  fastfloat_uint word = (fastfloat_uint)am.mantissa;
+  word |= fastfloat_uint(am.power2)
+          << binary_format<T>::mantissa_explicit_bits();
+  word |= fastfloat_uint(negative) << binary_format<T>::sign_index();
+#if FASTFLOAT_HAS_BIT_CAST
+  value = std::bit_cast<T>(word);
+#else
+  ::memcpy(&value, &word, sizeof(T));
+#endif
+}
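+// Illustrative note (not part of the upstream fast_float sources): to_float
+// merely packs sign, biased exponent and significand bits. For a double, an
+// adjusted_mantissa with power2 == 1023 and mantissa == 0 assembles the word
+// 0x3FF0000000000000, i.e. exactly 1.0.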
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+template <typename = void> struct space_lut {
+  static constexpr bool value[] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename T> constexpr bool space_lut<T>::value[];
+
+#endif
+
+inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; }
+#endif
+
+template <typename UC> static constexpr uint64_t int_cmp_zeros() {
+  static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4),
+                "Unsupported character size");
+  return (sizeof(UC) == 1) ? 0x3030303030303030
+         : (sizeof(UC) == 2)
+             ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 |
+                uint64_t(UC('0')) << 16 | UC('0'))
+             : (uint64_t(UC('0')) << 32 | UC('0'));
+}
+template <typename UC> static constexpr int int_cmp_len() {
+  return sizeof(uint64_t) / sizeof(UC);
+}
+template <typename UC> static constexpr UC const *str_const_nan() {
+  return nullptr;
+}
+template <> constexpr char const *str_const_nan<char>() { return "nan"; }
+template <> constexpr wchar_t const *str_const_nan<wchar_t>() { return L"nan"; }
+template <> constexpr char16_t const *str_const_nan<char16_t>() {
+  return u"nan";
+}
+template <> constexpr char32_t const *str_const_nan<char32_t>() {
+  return U"nan";
+}
+template <typename UC> static constexpr UC const *str_const_inf() {
+  return nullptr;
+}
+template <> constexpr char const *str_const_inf<char>() { return "infinity"; }
+template <> constexpr wchar_t const *str_const_inf<wchar_t>() {
+  return L"infinity";
+}
+template <> constexpr char16_t const *str_const_inf<char16_t>() {
+  return u"infinity";
+}
+template <> constexpr char32_t const *str_const_inf<char32_t>() {
+  return U"infinity";
+}
+
+template <typename = void> struct int_luts {
+  static constexpr uint8_t chdigit[] = {
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   255, 255,
+      255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
+      20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,
+      35,  255, 255, 255, 255, 255, 255, 10,  11,  12,  13,  14,  15,  16,  17,
+      18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
+      33,  34,  35,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+      255};
+
+  static constexpr size_t maxdigits_u64[] = {
+      64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16,
+      15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13};
+
+  static constexpr uint64_t min_safe_u64[] = {
+      9223372036854775808ull,  12157665459056928801ull, 4611686018427387904,
+      7450580596923828125,     4738381338321616896,     3909821048582988049,
+      9223372036854775808ull,  12157665459056928801ull, 10000000000000000000ull,
+      5559917313492231481,     2218611106740436992,     8650415919381337933,
+      2177953337809371136,     6568408355712890625,     1152921504606846976,
+      2862423051509815793,     6746640616477458432,     15181127029874798299ull,
+      1638400000000000000,     3243919932521508681,     6221821273427820544,
+      11592836324538749809ull, 876488338465357824,      1490116119384765625,
+      2481152873203736576,     4052555153018976267,     6502111422497947648,
+      10260628712958602189ull, 15943230000000000000ull, 787662783788549761,
+      1152921504606846976,     1667889514952984961,     2386420683693101056,
+      3379220508056640625,     4738381338321616896};
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename T> constexpr uint8_t int_luts<T>::chdigit[];
+
+template <typename T> constexpr size_t int_luts<T>::maxdigits_u64[];
+
+template <typename T> constexpr uint64_t int_luts<T>::min_safe_u64[];
+
+#endif
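+// Illustrative note (not part of the upstream fast_float sources): chdigit
+// maps a character to its digit value for bases up to 36, e.g.
+// chdigit['7'] == 7 and chdigit['a'] == chdigit['A'] == 10, while characters
+// that are digits in no supported base map to 255.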
+template <typename UC>
+fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) {
+  return int_luts<>::chdigit[static_cast<unsigned char>(c)];
+}
+
+fastfloat_really_inline constexpr size_t max_digits_u64(int base) {
+  return int_luts<>::maxdigits_u64[base - 2];
+}
+
+// If a u64 is exactly max_digits_u64() in length, this is
+// the value below which it has definitely overflowed.
+fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) {
+  return int_luts<>::min_safe_u64[base - 2];
+}
+
+} // namespace fast_float
+
+#endif
+
+
+#ifndef FASTFLOAT_FAST_FLOAT_H
+#define FASTFLOAT_FAST_FLOAT_H
+
+
+namespace fast_float {
+/**
+ * This function parses the character sequence [first,last) for a number. It
+ * parses floating-point numbers expecting a locale-independent format
+ * equivalent to what is used by std::strtod in the default ("C") locale. The
+ * resulting floating-point value is the closest floating-point value (using
+ * either float or double), using the "round to even" convention for values
+ * that would otherwise fall right in between two values. That is, we provide
+ * exact parsing according to the IEEE standard.
+ *
+ * Given a successful parse, the pointer (`ptr`) in the returned value is set
+ * to point right after the parsed number, and the `value` referenced is set
+ * to the parsed value. In case of error, the returned `ec` contains a
+ * representative error, otherwise the default (`std::errc()`) value is
+ * stored.
+ *
+ * The implementation does not throw and does not allocate memory (e.g., with
+ * `new` or `malloc`).
+ *
+ * Like the C++17 standard, the `fast_float::from_chars` functions take an
+ * optional last argument of the type `fast_float::chars_format`. It is a
+ * bitset value: we check whether `fmt & fast_float::chars_format::fixed` and
+ * `fmt & fast_float::chars_format::scientific` are set to determine whether
+ * we allow the fixed point and scientific notation respectively. The default
+ * is `fast_float::chars_format::general` which allows both `fixed` and
+ * `scientific`.
+ */
+template <typename T, typename UC = char,
+          typename = FASTFLOAT_ENABLE_IF(is_supported_float_type<T>())>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value,
+           chars_format fmt = chars_format::general) noexcept;
+
+/**
+ * Like from_chars, but accepts an `options` argument to govern number
+ * parsing.
+ */
+template <typename T, typename UC = char>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars_advanced(UC const *first, UC const *last, T &value,
+                    parse_options_t<UC> options) noexcept;
+/**
+ * from_chars for integer types.
+ */
+template <typename T, typename UC = char,
+          typename = FASTFLOAT_ENABLE_IF(is_supported_integer_type<T>())>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept;
+
+} // namespace fast_float
+#endif // FASTFLOAT_FAST_FLOAT_H
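+// Usage sketch for the interface declared above (illustrative, not part of
+// the upstream fast_float sources):
+//
+//   double result;
+//   const char input[] = "3.1416 rest";
+//   auto answer = fast_float::from_chars(input, input + 11, result);
+//   // On success, answer.ec == std::errc() and answer.ptr points at the
+//   // space before "rest"; result holds the parsed value.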
+#ifndef FASTFLOAT_ASCII_NUMBER_H
+#define FASTFLOAT_ASCII_NUMBER_H
+
+#include <cctype>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <type_traits>
+
+
+#ifdef FASTFLOAT_SSE2
+#include <emmintrin.h>
+#endif
+
+#ifdef FASTFLOAT_NEON
+#include <arm_neon.h>
+#endif
+
+namespace fast_float {
+
+template <typename UC>
+fastfloat_really_inline constexpr bool has_simd_opt() {
+#ifdef FASTFLOAT_HAS_SIMD
+  return std::is_same<UC, char16_t>::value;
+#else
+  return false;
+#endif
+}
+
+// Next function can be micro-optimized, but compilers are entirely
+// able to optimize it well.
+template <typename UC>
+fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
+  return !(c > UC('9') || c < UC('0'));
+}
+
+fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
+  return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
+         (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
+         (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
+         (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
+}
+
+// Read 8 UC into a u64. Truncates UC if not char.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+read8_to_u64(const UC *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
+    uint64_t val = 0;
+    for (int i = 0; i < 8; ++i) {
+      val |= uint64_t(uint8_t(*chars)) << (i * 8);
+      ++chars;
+    }
+    return val;
+  }
+  uint64_t val;
+  ::memcpy(&val, chars, sizeof(uint64_t));
+#if FASTFLOAT_IS_BIG_ENDIAN == 1
+  // Need to read as-if the number was in little-endian order.
+  val = byteswap(val);
+#endif
+  return val;
+}
+
+#ifdef FASTFLOAT_SSE2
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i packed = _mm_packus_epi16(data, data);
+#ifdef FASTFLOAT_64BIT
+  return uint64_t(_mm_cvtsi128_si64(packed));
+#else
+  uint64_t value;
+  // Visual Studio + older versions of GCC don't support _mm_storeu_si64
+  _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed);
+  return value;
+#endif
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+#elif defined(FASTFLOAT_NEON)
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  uint8x8_t utf8_packed = vmovn_u16(data);
+  return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0);
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      vld1q_u16(reinterpret_cast<const uint16_t *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+}
+
+#endif // FASTFLOAT_SSE2
+
+// MSVC SFINAE is broken pre-VS2017
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+template <typename UC>
+#else
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
+#endif
+// dummy for compile
+uint64_t simd_read8_to_u64(UC const *) {
+  return 0;
+}
+
+// credit @aqrit
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_eight_digits_unrolled(uint64_t val) {
+  const uint64_t mask = 0x000000FF000000FF;
+  const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
+  const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
+  val -= 0x3030303030303030;
+  val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+  val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
+  return uint32_t(val);
+}
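+// Illustrative note (not part of the upstream fast_float sources): the SWAR
+// routine above converts eight ASCII digits in a handful of multiplies. For
+// the input "12345678", read8_to_u64 yields the little-endian word
+// 0x3837363534333231; subtracting 0x3030303030303030 leaves one digit value
+// per byte (0x0807060504030201), and the masked multiplies fold those bytes
+// into the uint32_t 12345678.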
+// Call this if chars are definitely 8 digits.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+parse_eight_digits_unrolled(UC const *chars) noexcept {
+  if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
+    return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
+  }
+  return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
+}
+
+// credit @aqrit
+fastfloat_really_inline constexpr bool
+is_made_of_eight_digits_fast(uint64_t val) noexcept {
+  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
+            0x8080808080808080));
+}
+
+#ifdef FASTFLOAT_HAS_SIMD
+
+// Call this if chars might not be 8 digits.
+// Using this style (instead of is_made_of_eight_digits_fast() then
+// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+simd_parse_if_eight_digits_unrolled(const char16_t *chars,
+                                    uint64_t &i) noexcept {
+  if (cpp20_and_in_constexpr()) {
+    return false;
+  }
+#ifdef FASTFLOAT_SSE2
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars));
+
+  // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
+  const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720));
+  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759));
+
+  if (_mm_movemask_epi8(t1) == 0) {
+    i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
+    return true;
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+#elif defined(FASTFLOAT_NEON)
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(chars));
+
+  // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
+  const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0'));
+  const uint16x8_t mask = vcltq_u16(t0, vmovq_n_u16('9' - '0' + 1));
+
+  if (vminvq_u16(mask) == 0xFFFF) {
+    i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
+    return true;
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
+#else
+  (void)chars;
+  (void)i;
+  return false;
+#endif // FASTFLOAT_SSE2
+}
+
+#endif // FASTFLOAT_HAS_SIMD
+
+// MSVC SFINAE is broken pre-VS2017
+#if defined(_MSC_VER) && _MSC_VER <= 1900
+template <typename UC>
+#else
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
+#endif
+// dummy for compile
+bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) {
+  return 0;
+}
+
+template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value) = 0>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) {
+  if (!has_simd_opt<UC>()) {
+    return;
+  }
+  while ((std::distance(p, pend) >= 8) &&
+         simd_parse_if_eight_digits_unrolled(
+             p, i)) { // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const char *&p, const char *const pend,
+                           uint64_t &i) {
+  // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
+  while ((std::distance(p, pend) >= 8) &&
+         is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 +
+        parse_eight_digits_unrolled(read8_to_u64(
+            p)); // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
+}
+
+enum class parse_error {
+  no_error,
+  // [JSON-only] The minus sign must be followed by an integer.
+  missing_integer_after_sign,
+  // A sign must be followed by an integer or dot.
+  missing_integer_or_dot_after_sign,
+  // [JSON-only] The integer part must not have leading zeros.
+  leading_zeros_in_integer_part,
+  // [JSON-only] The integer part must have at least one digit.
+  no_digits_in_integer_part,
+  // [JSON-only] If there is a decimal point, there must be digits in the
+  // fractional part.
+  no_digits_in_fractional_part,
+  // The mantissa must have at least one digit.
+  no_digits_in_mantissa,
+  // Scientific notation requires an exponential part.
+  missing_exponential_part,
+};
+
+template <typename UC> struct parsed_number_string_t {
+  int64_t exponent{0};
+  uint64_t mantissa{0};
+  UC const *lastmatch{nullptr};
+  bool negative{false};
+  bool valid{false};
+  bool too_many_digits{false};
+  // contains the range of the significant digits
+  span<const UC> integer{};  // non-nullable
+  span<const UC> fraction{}; // nullable
+  parse_error error{parse_error::no_error};
+};
+
+using byte_span = span<const char>;
+using parsed_number_string = parsed_number_string_t<char>;
+
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const *p, parse_error error) {
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.lastmatch = p;
+  answer.error = error;
+  return answer;
+}
+
+// Assuming that you use no more than 19 digits, this will
+// parse an ASCII string.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+parse_number_string(UC const *p, UC const *pend,
+                    parse_options_t<UC> options) noexcept {
+  chars_format const fmt = options.format;
+  UC const decimal_point = options.decimal_point;
+
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.too_many_digits = false;
+  answer.negative = (*p == UC('-'));
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (!(fmt & FASTFLOAT_JSONFMT) && *p == UC('+'))) {
+#else
+  if (*p == UC('-')) { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
+#endif
+    ++p;
+    if (p == pend) {
+      return report_parse_error(
+          p, parse_error::missing_integer_or_dot_after_sign);
+    }
+    if (fmt & FASTFLOAT_JSONFMT) {
+      if (!is_integer(*p)) { // a sign must be followed by an integer
+        return report_parse_error(p, parse_error::missing_integer_after_sign);
+      }
+    } else {
+      if (!is_integer(*p) &&
+          (*p != decimal_point)) { // a sign must be followed by an integer
+                                   // or the dot
+        return report_parse_error(
+            p, parse_error::missing_integer_or_dot_after_sign);
+      }
+    }
+  }
+  UC const *const start_digits = p;
+
+  uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
+
+  while ((p != pend) && is_integer(*p)) {
+    // a multiplication by 10 is cheaper than an arbitrary integer
+    // multiplication
+    i = 10 * i +
+        uint64_t(*p - UC('0')); // might overflow, we will handle it later
+    ++p;
+  }
+  UC const *const end_of_integer_part = p;
+  int64_t digit_count = int64_t(end_of_integer_part - start_digits);
+  answer.integer = span<const UC>(start_digits, size_t(digit_count));
+  if (fmt & FASTFLOAT_JSONFMT) {
+    // at least 1 digit in integer part, without leading zeros
+    if (digit_count == 0) {
+      return report_parse_error(p, parse_error::no_digits_in_integer_part);
+    }
+    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+      return report_parse_error(start_digits,
+                                parse_error::leading_zeros_in_integer_part);
+    }
+  }
+
+  int64_t exponent = 0;
+  const bool has_decimal_point = (p != pend) && (*p == decimal_point);
+  if (has_decimal_point) {
+    ++p;
+    UC const *before = p;
+    // can occur at most twice without overflowing, but let it occur more,
+    // since for integers with many digits, digit parsing is the primary
+    // bottleneck.
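+    // Illustrative note (not part of the upstream fast_float sources): for
+    // an input such as "0.12345678", the call below consumes all eight
+    // fraction digits in one step, leaving i == 12345678 with p at the end.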
+    loop_parse_if_eight_digits(p, pend, i);
+
+    while ((p != pend) && is_integer(*p)) {
+      uint8_t digit = uint8_t(*p - UC('0'));
+      ++p;
+      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+    }
+    exponent = before - p;
+    answer.fraction = span<const UC>(before, size_t(p - before));
+    digit_count -= exponent;
+  }
+  if (fmt & FASTFLOAT_JSONFMT) {
+    // at least 1 digit in fractional part
+    if (has_decimal_point && exponent == 0) {
+      return report_parse_error(p, parse_error::no_digits_in_fractional_part);
+    }
+  } else if (digit_count ==
+             0) { // we must have encountered at least one integer!
+    return report_parse_error(p, parse_error::no_digits_in_mantissa);
+  }
+  int64_t exp_number = 0; // explicit exponential part
+  if (((fmt & chars_format::scientific) && (p != pend) &&
+       ((UC('e') == *p) || (UC('E') == *p))) ||
+      ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) &&
+       ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)))) {
+    UC const *location_of_e = p;
+    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)) {
+      ++p;
+    }
+    bool neg_exp = false;
+    if ((p != pend) && (UC('-') == *p)) {
+      neg_exp = true;
+      ++p;
+    } else if ((p != pend) &&
+               (UC('+') ==
+                *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
+      ++p;
+    }
+    if ((p == pend) || !is_integer(*p)) {
+      if (!(fmt & chars_format::fixed)) {
+        // The exponential part is invalid for scientific notation, so it
+        // must be a trailing token for fixed notation. However, fixed
+        // notation is disabled, so report a scientific notation error.
+        return report_parse_error(p, parse_error::missing_exponential_part);
+      }
+      // Otherwise, we will be ignoring the 'e'.
+      p = location_of_e;
+    } else {
+      while ((p != pend) && is_integer(*p)) {
+        uint8_t digit = uint8_t(*p - UC('0'));
+        if (exp_number < 0x10000000) {
+          exp_number = 10 * exp_number + digit;
+        }
+        ++p;
+      }
+      if (neg_exp) {
+        exp_number = -exp_number;
+      }
+      exponent += exp_number;
+    }
+  } else {
+    // If it is scientific and not fixed, we have to bail out.
+    if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+      return report_parse_error(p, parse_error::missing_exponential_part);
+    }
+  }
+  answer.lastmatch = p;
+  answer.valid = true;
+
+  // If we frequently had to deal with long strings of digits,
+  // we could extend our code by using a 128-bit integer instead
+  // of a 64-bit integer. However, this is uncommon.
+  //
+  // We can deal with up to 19 digits.
+  if (digit_count > 19) { // this is uncommon
+    // It is possible that the integer had an overflow.
+    // We have to handle the case where we have 0.0000somenumber.
+    // We need to be mindful of the case where we only have zeroes...
+    // E.g., 0.000000000...000.
+    UC const *start = start_digits;
+    while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
+      if (*start == UC('0')) {
+        digit_count--;
+      }
+      start++;
+    }
+
+    if (digit_count > 19) {
+      answer.too_many_digits = true;
+      // Let us start again, this time, avoiding overflows.
+      // We don't need to check if is_integer, since we use the
+      // pre-tokenized spans from above.
+      i = 0;
+      p = answer.integer.ptr;
+      UC const *int_end = p + answer.integer.len();
+      const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
+      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
+        i = i * 10 + uint64_t(*p - UC('0'));
+        ++p;
+      }
+      if (i >= minimal_nineteen_digit_integer) { // We have a big integer.
+        exponent = end_of_integer_part - p + exp_number;
+      } else { // We have a value with a fractional component.
+        p = answer.fraction.ptr;
+        UC const *frac_end = p + answer.fraction.len();
+        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+          i = i * 10 + uint64_t(*p - UC('0'));
+          ++p;
+        }
+        exponent = answer.fraction.ptr - p + exp_number;
+      }
+      // We have now corrected both exponent and i, to a truncated value
+    }
+  }
+  answer.exponent = exponent;
+  answer.mantissa = i;
+  return answer;
+}
+
+template <typename T, typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+parse_int_string(UC const *p, UC const *pend, T &value, int base) {
+  from_chars_result_t<UC> answer;
+
+  UC const *const first = p;
+
+  bool negative = (*p == UC('-'));
+  if (!std::is_signed<T>::value && negative) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (*p == UC('+'))) {
+#else
+  if (*p == UC('-')) {
+#endif
+    ++p;
+  }
+
+  UC const *const start_num = p;
+
+  while (p != pend && *p == UC('0')) {
+    ++p;
+  }
+
+  const bool has_leading_zeros = p > start_num;
+
+  UC const *const start_digits = p;
+
+  uint64_t i = 0;
+  if (base == 10) {
+    loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible
+  }
+  while (p != pend) {
+    uint8_t digit = ch_to_digit(*p);
+    if (digit >= base) {
+      break;
+    }
+    i = uint64_t(base) * i + digit; // might overflow, check this later
+    p++;
+  }
+
+  size_t digit_count = size_t(p - start_digits);
+
+  if (digit_count == 0) {
+    if (has_leading_zeros) {
+      value = 0;
+      answer.ec = std::errc();
+      answer.ptr = p;
+    } else {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+    }
+    return answer;
+  }
+
+  answer.ptr = p;
+
+  // check u64 overflow
+  size_t max_digits = max_digits_u64(base);
+  if (digit_count > max_digits) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+  // this check can be eliminated for all other types, but they will all
+  // require a max_digits(base) equivalent
+  if (digit_count == max_digits && i < min_safe_u64(base)) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+
+  // check other types overflow
+  if (!std::is_same<T, uint64_t>::value) {
+    if (i > uint64_t(std::numeric_limits<T>::max()) + uint64_t(negative)) {
+      answer.ec = std::errc::result_out_of_range;
+      return answer;
+    }
+  }
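+  // Illustrative note (not part of the upstream fast_float sources): for
+  // T = int8_t and i = 128 (the magnitude of INT8_MIN), the expression in
+  // the negative branch below evaluates -127 - (128 - 127) == -128 without
+  // ever overflowing a signed intermediate.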
+  if (negative) {
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    // this weird workaround is required because:
+    // - converting unsigned to signed when its value is greater than signed
+    //   max is UB pre-C++23.
+    // - reinterpret_casting (~i + 1) would work, but it is not constexpr
+    // this is always optimized into a neg instruction (note: T is an integer
+    // type)
+    value = T(-std::numeric_limits<T>::max() -
+              T(i - uint64_t(std::numeric_limits<T>::max())));
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(pop)
+#endif
+  } else {
+    value = T(i);
+  }
+
+  answer.ec = std::errc();
+  return answer;
+}
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_FAST_TABLE_H
+#define FASTFLOAT_FAST_TABLE_H
+
+#include <cstdint>
+
+namespace fast_float {
+
+/**
+ * When mapping numbers from decimal to binary,
+ * we go from w * 10^q to m * 2^p but we have
+ * 10^q = 5^q * 2^q, so effectively
+ * we are trying to match
+ * w * 2^q * 5^q to m * 2^p. Thus the powers of two
+ * are not a concern since they can be represented
+ * exactly using the binary notation, only the powers of five
+ * affect the binary significand.
+ */
+
+/**
+ * The smallest non-zero float (binary64) is 2^-1074.
+ * We take as input numbers of the form w x 10^q where w < 2^64.
+ * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076.
+ * However, we have that
+ * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074.
+ * Thus it is possible for a number of the form w * 10^-342 where
+ * w is a 64-bit value to be a non-zero floating-point number.
+ *********
+ * Any number of the form w * 10^309 where w >= 1 is going to be
+ * infinite in binary64 so we never need to worry about powers
+ * of 5 greater than 308.
+ */
+template <class unused = void> struct powers_template {
+
+  constexpr static int smallest_power_of_five =
+      binary_format<double>::smallest_power_of_ten();
+  constexpr static int largest_power_of_five =
+      binary_format<double>::largest_power_of_ten();
+  constexpr static int number_of_entries =
+      2 * (largest_power_of_five - smallest_power_of_five + 1);
+  // Powers of five from 5^-342 all the way to 5^308 rounded toward one.
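+  // Illustrative note (not part of the upstream fast_float sources): entries
+  // come in (high, low) 64-bit pairs, one pair per q from -342 to 308, each
+  // holding the most significant 128 bits of 5^q normalized so that the top
+  // bit of the high word is set. For q == 0 the pair is
+  // (0x8000000000000000, 0x0): the value 1 shifted up to bit 63.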
+ constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 
0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 
0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 
0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 
0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 
0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 
0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 
+      0x8533285c936b35de, 0xd53a88958f87275f,
+      0xa67ff273b8460356, 0x8a892abaf368f137,
+      0xd01fef10a657842c, 0x2d2b7569b0432d85,
+      0x8213f56a67f6b29b, 0x9c3b29620e29fc73,
+      0xa298f2c501f45f42, 0x8349f3ba91b47b8f,
+      0xcb3f2f7642717713, 0x241c70a936219a73,
+      0xfe0efb53d30dd4d7, 0xed238cd383aa0110,
+      0x9ec95d1463e8a506, 0xf4363804324a40aa,
+      0xc67bb4597ce2ce48, 0xb143c6053edcd0d5,
+      0xf81aa16fdc1b81da, 0xdd94b7868e94050a,
+      0x9b10a4e5e9913128, 0xca7cf2b4191c8326,
+      0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0,
+      0xf24a01a73cf2dccf, 0xbc633b39673c8cec,
+      0x976e41088617ca01, 0xd5be0503e085d813,
+      0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18,
+      0xec9c459d51852ba2, 0xddf8e7d60ed1219e,
+      0x93e1ab8252f33b45, 0xcabb90e5c942b503,
+      0xb8da1662e7b00a17, 0x3d6a751f3b936243,
+      0xe7109bfba19c0c9d, 0xcc512670a783ad4,
+      0x906a617d450187e2, 0x27fb2b80668b24c5,
+      0xb484f9dc9641e9da, 0xb1f9f660802dedf6,
+      0xe1a63853bbd26451, 0x5e7873f8a0396973,
+      0x8d07e33455637eb2, 0xdb0b487b6423e1e8,
+      0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62,
+      0xdc5c5301c56b75f7, 0x7641a140cc7810fb,
+      0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d,
+      0xac2820d9623bf429, 0x546345fa9fbdcd44,
+      0xd732290fbacaf133, 0xa97c177947ad4095,
+      0x867f59a9d4bed6c0, 0x49ed8eabcccc485d,
+      0xa81f301449ee8c70, 0x5c68f256bfff5a74,
+      0xd226fc195c6a2f8c, 0x73832eec6fff3111,
+      0x83585d8fd9c25db7, 0xc831fd53c5ff7eab,
+      0xa42e74f3d032f525, 0xba3e7ca8b77f5e55,
+      0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb,
+      0x80444b5e7aa7cf85, 0x7980d163cf5b81b3,
+      0xa0555e361951c366, 0xd7e105bcc332621f,
+      0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7,
+      0xfa856334878fc150, 0xb14f98f6f0feb951,
+      0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3,
+      0xc3b8358109e84f07, 0xa862f80ec4700c8,
+      0xf4a642e14c6262c8, 0xcd27bb612758c0fa,
+      0x98e7e9cccfbd7dbd, 0x8038d51cb897789c,
+      0xbf21e44003acdd2c, 0xe0470a63e6bd56c3,
+      0xeeea5d5004981478, 0x1858ccfce06cac74,
+      0x95527a5202df0ccb, 0xf37801e0c43ebc8,
+      0xbaa718e68396cffd, 0xd30560258f54e6ba,
+      0xe950df20247c83fd, 0x47c6b82ef32a2069,
+      0x91d28b7416cdd27e, 0x4cdc331d57fa5441,
+      0xb6472e511c81471d, 0xe0133fe4adf8e952,
+      0xe3d8f9e563a198e5, 0x58180fddd97723a6,
+      0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648,
+  };
+};
+
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <class unused>
+constexpr uint64_t
+    powers_template<unused>::power_of_five_128[number_of_entries];
+
+#endif
+
+using powers = powers_template<>;
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_DECIMAL_TO_BINARY_H
+#define FASTFLOAT_DECIMAL_TO_BINARY_H
+
+#include <cfloat>
+#include <cinttypes>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace fast_float {
+
+// This will compute or rather approximate w * 5**q and return a pair of
+// 64-bit words approximating the result, with the "high" part corresponding
+// to the most significant bits and the low part corresponding to the least
+// significant bits.
+//
+template <int bit_precision>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128
+compute_product_approximation(int64_t q, uint64_t w) {
+  const int index = 2 * int(q - powers::smallest_power_of_five);
+  // For small values of q, e.g., q in [0,27], the answer is always exact
+  // because the line
+  //   value128 firstproduct = full_multiplication(w, power_of_five_128[index]);
+  // gives the exact answer.
+  value128 firstproduct =
+      full_multiplication(w, powers::power_of_five_128[index]);
+  static_assert((bit_precision >= 0) && (bit_precision <= 64),
+                " precision should be in (0,64]");
+  constexpr uint64_t precision_mask =
+      (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
+                           : uint64_t(0xFFFFFFFFFFFFFFFF);
+  if ((firstproduct.high & precision_mask) ==
+      precision_mask) { // could further guard with (lower + w < lower)
+    // regarding the second product, we only need secondproduct.high, but our
+    // expectation is that the compiler will optimize this extra work away if
+    // needed.
+    value128 secondproduct =
+        full_multiplication(w, powers::power_of_five_128[index + 1]);
+    firstproduct.low += secondproduct.high;
+    if (secondproduct.high > firstproduct.low) {
+      firstproduct.high++;
+    }
+  }
+  return firstproduct;
+}
+
+namespace detail {
+/**
+ * For q in (0,350), we have that
+ *  f = (((152170 + 65536) * q) >> 16);
+ * is equal to
+ *  floor(p) + q
+ * where
+ *  p = log(5**q)/log(2) = q * log(5)/log(2)
+ *
+ * For negative values of q in (-400,0), we have that
+ *  f = (((152170 + 65536) * q) >> 16);
+ * is equal to
+ *  -ceil(p) + q
+ * where
+ *  p = log(5**-q)/log(2) = -q * log(5)/log(2)
+ */
+constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
+  return (((152170 + 65536) * q) >> 16) + 63;
+}
+} // namespace detail
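+// Illustrative note (not part of the upstream fast_float sources): the
+// constant (152170 + 65536) / 65536 = 217706 / 65536 approximates
+// log(10)/log(2). For q == 10, ((217706 * 10) >> 16) == 33, which matches
+// floor(10 * log(5)/log(2)) + 10 == 23 + 10; detail::power then adds 63.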
+  // Mathematical proof:
+  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to
+  // appear). See script/mushtak_lemire.py
+
+  // The "compute_product_approximation" function can be slightly slower than a
+  // branchless approach: value128 product = compute_product(q, w); but in
+  // practice, we can win big with the compute_product_approximation if its
+  // additional branch is easily predicted. Which is best is data specific.
+  int upperbit = int(product.high >> 63);
+  int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3;
+
+  answer.mantissa = product.high >> shift;
+
+  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
+                          binary::minimum_exponent());
+  if (answer.power2 <= 0) { // we have a subnormal?
+    // Here we have that answer.power2 <= 0 so -answer.power2 >= 0
+    if (-answer.power2 + 1 >=
+        64) { // if we have more than 64 bits below the minimum exponent, you
+              // have a zero for sure.
+      answer.power2 = 0;
+      answer.mantissa = 0;
+      // result should be zero
+      return answer;
+    }
+    // next line is safe because -answer.power2 + 1 < 64
+    answer.mantissa >>= -answer.power2 + 1;
+    // Thankfully, we can't have both "round-to-even" and subnormals because
+    // "round-to-even" only occurs for powers close to 0.
+    answer.mantissa += (answer.mantissa & 1); // round up
+    answer.mantissa >>= 1;
+    // There is a weird scenario where we don't end up with a subnormal, but
+    // only barely. Suppose we start with 2.2250738585072013e-308: we end up
+    // with 0x3fffffffffffff x 2^-1023-53, which is technically subnormal,
+    // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
+    // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
+    // subnormal, but we can only know this after rounding.
+    // So we only declare a subnormal if we are smaller than the threshold.
+    answer.power2 =
+        (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits()))
+            ? 0
+            : 1;
+    return answer;
+  }
+
+  // usually, we round *up*, but if we fall right in between and we have an
+  // even basis, we need to round down.
+  // We are only concerned with the cases where 5**q fits in a single 64-bit
+  // word.
+  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) &&
+      (q <= binary::max_exponent_round_to_even()) &&
+      ((answer.mantissa & 3) == 1)) { // we may fall between two floats!
+    // To be in-between two floats we need that in doing
+    //   answer.mantissa = product.high >> (upperbit + 64 -
+    //                                      binary::mantissa_explicit_bits() - 3);
+    // ... we dropped out only zeroes. But if this happened, then we can go
+    // back!!!
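+    // For instance, the decimal 9007199254740993 (2^53 + 1) sits exactly
+    // halfway between the doubles 2^53 and 2^53 + 2: the shifted-out bits
+    // are all zeroes, so the test below detects the tie, clears the low
+    // bit, and the final round-to-nearest step settles on the even
+    // neighbor 2^53.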
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up + } + } + + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) { + answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits()); + answer.power2++; // undo previous addition + } + + answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits()); + if (answer.power2 >= binary::infinite_power()) { // infinity + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + } + return answer; +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_BIGINT_H +#define FASTFLOAT_BIGINT_H + +#include +#include +#include +#include + + +namespace fast_float { + +// the limb width: we want efficient multiplication of double the bits in +// limb, or for 64-bit limbs, at least 64-bit multiplication where we can +// extract the high and low parts efficiently. this is every 64-bit +// architecture except for sparc, which emulates 128-bit multiplication. +// we might have platforms where `CHAR_BIT` is not 8, so let's avoid +// doing `8 * sizeof(limb)`. +#if defined(FASTFLOAT_64BIT) && !defined(__sparc) +#define FASTFLOAT_64BIT_LIMB 1 +typedef uint64_t limb; +constexpr size_t limb_bits = 64; +#else +#define FASTFLOAT_32BIT_LIMB +typedef uint32_t limb; +constexpr size_t limb_bits = 32; +#endif + +typedef span limb_span; + +// number of bits in a bigint. this needs to be at least the number +// of bits required to store the largest bigint, which is +// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or +// ~3600 bits, so we round to 4000. +constexpr size_t bigint_bits = 4000; +constexpr size_t bigint_limbs = bigint_bits / limb_bits; + +// vector-like type that is allocated on the stack. the entire +// buffer is pre-allocated, and only the length changes. +template struct stackvec { + limb data[size]; + // we never need more than 150 limbs + uint16_t length{0}; + + stackvec() = default; + stackvec(const stackvec &) = delete; + stackvec &operator=(const stackvec &) = delete; + stackvec(stackvec &&) = delete; + stackvec &operator=(stackvec &&other) = delete; + + // create stack vector from existing limb span. + FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) { + FASTFLOAT_ASSERT(try_extend(s)); + } + + FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + // index from the end of the container + FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + size_t rindex = length - index - 1; + return data[rindex]; + } + + // set the length, without bounds checking. 
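+  // e.g., set_len(0) logically clears the vector without touching the
+  // storage; long_mul below relies on this to re-use a single scratch
+  // buffer across iterations.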
+ FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { + length = uint16_t(len); + } + constexpr size_t len() const noexcept { return length; } + constexpr bool is_empty() const noexcept { return length == 0; } + constexpr size_t capacity() const noexcept { return size; } + // append item to vector, without bounds checking + FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { + data[length] = value; + length++; + } + // append item to vector, returning if item was added + FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept { + if (len() < capacity()) { + push_unchecked(value); + return true; + } else { + return false; + } + } + // add items to the vector, from a span, without bounds checking + FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { + limb *ptr = data + length; + std::copy_n(s.ptr, s.len(), ptr); + set_len(len() + s.len()); + } + // try to add items to the vector, returning if items were added + FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept { + if (len() + s.len() <= capacity()) { + extend_unchecked(s); + return true; + } else { + return false; + } + } + // resize the vector, without bounds checking + // if the new size is longer than the vector, assign value to each + // appended item. + FASTFLOAT_CONSTEXPR20 + void resize_unchecked(size_t new_len, limb value) noexcept { + if (new_len > len()) { + size_t count = new_len - len(); + limb *first = data + len(); + limb *last = first + count; + ::std::fill(first, last, value); + set_len(new_len); + } else { + set_len(new_len); + } + } + // try to resize the vector, returning if the vector was resized. + FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept { + if (new_len > capacity()) { + return false; + } else { + resize_unchecked(new_len, value); + return true; + } + } + // check if any limbs are non-zero after the given index. + // this needs to be done in reverse order, since the index + // is relative to the most significant limbs. + FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept { + while (index < len()) { + if (rindex(index) != 0) { + return true; + } + index++; + } + return false; + } + // normalize the big integer, so most-significant zero limbs are removed. 
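+  // e.g., the little-endian limbs {1, 0, 0} normalize to {1}, so len()
+  // reflects the true magnitude and bigint::compare() can order values by
+  // limb count first.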
+ FASTFLOAT_CONSTEXPR14 void normalize() noexcept { + while (len() > 0 && rindex(0) == 0) { + length--; + } + } +}; + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +empty_hi64(bool &truncated) noexcept { + truncated = false; + return 0; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, bool &truncated) noexcept { + truncated = false; + int shl = leading_zeroes(r0); + return r0 << shl; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept { + int shl = leading_zeroes(r0); + if (shl == 0) { + truncated = r1 != 0; + return r0; + } else { + int shr = 64 - shl; + truncated = (r1 << shl) != 0; + return (r0 << shl) | (r1 >> shr); + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, bool &truncated) noexcept { + return uint64_hi64(r0, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + return uint64_hi64((x0 << 32) | x1, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + uint64_t x2 = r2; + return uint64_hi64(x0, (x1 << 32) | x2, truncated); +} + +// add two small integers, checking for overflow. +// we want an efficient operation. for msvc, where +// we don't have built-in intrinsics, this is still +// pretty fast. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_add(limb x, limb y, bool &overflow) noexcept { + limb z; +// gcc and clang +#if defined(__has_builtin) +#if __has_builtin(__builtin_add_overflow) + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } +#endif +#endif + + // generic, this still optimizes correctly on MSVC. + z = x + y; + overflow = z < x; + return z; +} + +// multiply two small integers, getting both the high and low bits. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_mul(limb x, limb y, limb &carry) noexcept { +#ifdef FASTFLOAT_64BIT_LIMB +#if defined(__SIZEOF_INT128__) + // GCC and clang both define it as an extension. + __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#else + // fallback, no native 128-bit integer multiplication with carry. + // on msvc, this optimizes identically, somehow. + value128 z = full_multiplication(x, y); + bool overflow; + z.low = scalar_add(z.low, carry, overflow); + z.high += uint64_t(overflow); // cannot overflow + carry = z.high; + return z.low; +#endif +#else + uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#endif +} + +// add scalar value to bigint starting from offset. +// used in grade school multiplication +template +inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec &vec, limb y, + size_t start) noexcept { + size_t index = start; + limb carry = y; + bool overflow; + while (carry != 0 && index < vec.len()) { + vec[index] = scalar_add(vec[index], carry, overflow); + carry = limb(overflow); + index += 1; + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add scalar value to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +small_add(stackvec &vec, limb y) noexcept { + return small_add_from(vec, y, 0); +} + +// multiply bigint by scalar value. 
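+// e.g., with 64-bit limbs, multiplying {0xFFFFFFFFFFFFFFFF, 1} (the value
+// 2^65 - 1) by 2 runs scalar_mul over each limb: the low limb becomes
+// 0xFFFFFFFFFFFFFFFE with a carry of 1, the high limb becomes 1 * 2 + 1 = 3,
+// giving {0xFFFFFFFFFFFFFFFE, 3} = 2^66 - 2.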
+template +inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec &vec, + limb y) noexcept { + limb carry = 0; + for (size_t index = 0; index < vec.len(); index++) { + vec[index] = scalar_mul(vec[index], y, carry); + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add bigint to bigint starting from index. +// used in grade school multiplication +template +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec &x, limb_span y, + size_t start) noexcept { + // the effective x buffer is from `xstart..x.len()`, so exit early + // if we can't get that current range. + if (x.len() < start || y.len() > x.len() - start) { + FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); + } + + bool carry = false; + for (size_t index = 0; index < y.len(); index++) { + limb xi = x[index + start]; + limb yi = y[index]; + bool c1 = false; + bool c2 = false; + xi = scalar_add(xi, yi, c1); + if (carry) { + xi = scalar_add(xi, 1, c2); + } + x[index + start] = xi; + carry = c1 | c2; + } + + // handle overflow + if (carry) { + FASTFLOAT_TRY(small_add_from(x, 1, y.len() + start)); + } + return true; +} + +// add bigint to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +large_add_from(stackvec &x, limb_span y) noexcept { + return large_add_from(x, y, 0); +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec &x, limb_span y) noexcept { + limb_span xs = limb_span(x.data, x.len()); + stackvec z(xs); + limb_span zs = limb_span(z.data, z.len()); + + if (y.len() != 0) { + limb y0 = y[0]; + FASTFLOAT_TRY(small_mul(x, y0)); + for (size_t index = 1; index < y.len(); index++) { + limb yi = y[index]; + stackvec zi; + if (yi != 0) { + // re-use the same buffer throughout + zi.set_len(0); + FASTFLOAT_TRY(zi.try_extend(zs)); + FASTFLOAT_TRY(small_mul(zi, yi)); + limb_span zis = limb_span(zi.data, zi.len()); + FASTFLOAT_TRY(large_add_from(x, zis, index)); + } + } + } + + x.normalize(); + return true; +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec &x, limb_span y) noexcept { + if (y.len() == 1) { + FASTFLOAT_TRY(small_mul(x, y[0])); + } else { + FASTFLOAT_TRY(long_mul(x, y)); + } + return true; +} + +template struct pow5_tables { + static constexpr uint32_t large_step = 135; + static constexpr uint64_t small_power_of_5[] = { + 1UL, + 5UL, + 25UL, + 125UL, + 625UL, + 3125UL, + 15625UL, + 78125UL, + 390625UL, + 1953125UL, + 9765625UL, + 48828125UL, + 244140625UL, + 1220703125UL, + 6103515625UL, + 30517578125UL, + 152587890625UL, + 762939453125UL, + 3814697265625UL, + 19073486328125UL, + 95367431640625UL, + 476837158203125UL, + 2384185791015625UL, + 11920928955078125UL, + 59604644775390625UL, + 298023223876953125UL, + 1490116119384765625UL, + 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint32_t pow5_tables::large_step; + +template constexpr uint64_t pow5_tables::small_power_of_5[]; + +template constexpr limb pow5_tables::large_power_of_5[]; + +#endif + +// big integer type. 
implements a small subset of big integer
+// arithmetic, using simple algorithms since asymptotically
+// faster algorithms are slower for a small number of limbs.
+// all operations assume the big-integer is normalized.
+struct bigint : pow5_tables<> {
+  // storage of the limbs, in little-endian order.
+  stackvec<bigint_limbs> vec;
+
+  FASTFLOAT_CONSTEXPR20 bigint() : vec() {}
+  bigint(const bigint &) = delete;
+  bigint &operator=(const bigint &) = delete;
+  bigint(bigint &&) = delete;
+  bigint &operator=(bigint &&other) = delete;
+
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() {
+#ifdef FASTFLOAT_64BIT_LIMB
+    vec.push_unchecked(value);
+#else
+    vec.push_unchecked(uint32_t(value));
+    vec.push_unchecked(uint32_t(value >> 32));
+#endif
+    vec.normalize();
+  }
+
+  // get the high 64 bits from the vector, and whether bits were truncated.
+  // this is to get the significant digits for the float.
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept {
+#ifdef FASTFLOAT_64BIT_LIMB
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint64_hi64(vec.rindex(0), truncated);
+    } else {
+      uint64_t result = uint64_hi64(vec.rindex(0), vec.rindex(1), truncated);
+      truncated |= vec.nonzero(2);
+      return result;
+    }
+#else
+    if (vec.len() == 0) {
+      return empty_hi64(truncated);
+    } else if (vec.len() == 1) {
+      return uint32_hi64(vec.rindex(0), truncated);
+    } else if (vec.len() == 2) {
+      return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated);
+    } else {
+      uint64_t result =
+          uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
+      truncated |= vec.nonzero(3);
+      return result;
+    }
+#endif
+  }
+
+  // compare two big integers, indicating which is larger.
+  // assumes both are normalized. if the return value is
+  // negative, other is larger; if the return value is
+  // positive, this is larger; otherwise they are equal.
+  // the limbs are stored in little-endian order, so we
+  // must compare the limbs in reverse order.
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept {
+    if (vec.len() > other.vec.len()) {
+      return 1;
+    } else if (vec.len() < other.vec.len()) {
+      return -1;
+    } else {
+      for (size_t index = vec.len(); index > 0; index--) {
+        limb xi = vec[index - 1];
+        limb yi = other.vec[index - 1];
+        if (xi > yi) {
+          return 1;
+        } else if (xi < yi) {
+          return -1;
+        }
+      }
+      return 0;
+    }
+  }
+
+  // shift left each limb n bits, carrying over to the new limb
+  // returns true if we were able to shift all the digits.
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
+    // Internally, for each item, we shift left by n, and add the previous
+    // right shifted limb-bits.
+    // For example, (for u8) a shift left by 2 transforms
+    //   b10100100 b01000010
+    // into
+    //   b10 b10010001 b00001000
+    FASTFLOAT_DEBUG_ASSERT(n != 0);
+    FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8);
+
+    size_t shl = n;
+    size_t shr = limb_bits - shl;
+    limb prev = 0;
+    for (size_t index = 0; index < vec.len(); index++) {
+      limb xi = vec[index];
+      vec[index] = (xi << shl) | (prev >> shr);
+      prev = xi;
+    }
+
+    limb carry = prev >> shr;
+    if (carry != 0) {
+      return vec.try_push(carry);
+    }
+    return true;
+  }
+
+  // move the limbs left by `n` limbs.
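+  // e.g., with 64-bit limbs, shifting {1} left by two limbs yields
+  // {0, 0, 1}, i.e. a multiplication by 2^128; only the length grows, the
+  // limb values themselves are just moved.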
+ FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept { + FASTFLOAT_DEBUG_ASSERT(n != 0); + if (n + vec.len() > vec.capacity()) { + return false; + } else if (!vec.is_empty()) { + // move limbs + limb *dst = vec.data + n; + const limb *src = vec.data; + std::copy_backward(src, src + vec.len(), dst + vec.len()); + // fill in empty limbs + limb *first = vec.data; + limb *last = first + n; + ::std::fill(first, last, 0); + vec.set_len(n + vec.len()); + return true; + } else { + return true; + } + } + + // move the limbs left by `n` bits. + FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept { + size_t rem = n % limb_bits; + size_t div = n / limb_bits; + if (rem != 0) { + FASTFLOAT_TRY(shl_bits(rem)); + } + if (div != 0) { + FASTFLOAT_TRY(shl_limbs(div)); + } + return true; + } + + // get the number of leading zeros in the bigint. + FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept { + if (vec.is_empty()) { + return 0; + } else { +#ifdef FASTFLOAT_64BIT_LIMB + return leading_zeroes(vec.rindex(0)); +#else + // no use defining a specialized leading_zeroes for a 32-bit type. + uint64_t r0 = vec.rindex(0); + return leading_zeroes(r0 << 32); +#endif + } + } + + // get the number of bits in the bigint. + FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept { + int lz = ctlz(); + return int(limb_bits * vec.len()) - lz; + } + + FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); } + + FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); } + + // multiply as if by 2 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); } + + // multiply as if by 5 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept { + // multiply by a power of 5 + size_t large_length = sizeof(large_power_of_5) / sizeof(limb); + limb_span large = limb_span(large_power_of_5, large_length); + while (exp >= large_step) { + FASTFLOAT_TRY(large_mul(vec, large)); + exp -= large_step; + } +#ifdef FASTFLOAT_64BIT_LIMB + uint32_t small_step = 27; + limb max_native = 7450580596923828125UL; +#else + uint32_t small_step = 13; + limb max_native = 1220703125U; +#endif + while (exp >= small_step) { + FASTFLOAT_TRY(small_mul(vec, max_native)); + exp -= small_step; + } + if (exp != 0) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + // This is similar to https://github.com/llvm/llvm-project/issues/47746, + // except the workaround described there don't work here + FASTFLOAT_TRY(small_mul( + vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))); + } + + return true; + } + + // multiply as if by 10 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept { + FASTFLOAT_TRY(pow5(exp)); + return pow2(exp); + } +}; + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_DIGIT_COMPARISON_H +#define FASTFLOAT_DIGIT_COMPARISON_H + +#include +#include +#include +#include + + +namespace fast_float { + +// 1e0 to 1e19 +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; + +// calculate the exponent, in scientific notation, of the number. 
+// this algorithm is not even close to optimized, but it has no practical +// effect on performance: in order to have a faster algorithm, we'd need +// to slow down performance for faster algorithms, and this is still fast. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { + uint64_t mantissa = num.mantissa; + int32_t exponent = int32_t(num.exponent); + while (mantissa >= 10000) { + mantissa /= 10000; + exponent += 4; + } + while (mantissa >= 100) { + mantissa /= 100; + exponent += 2; + } + while (mantissa >= 10) { + mantissa /= 10; + exponent += 1; + } + return exponent; +} + +// this converts a native floating-point number to an extended-precision float. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { + using equiv_uint = typename binary_format::equiv_uint; + constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); + constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); + constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); + + adjusted_mantissa am; + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); + equiv_uint bits; +#if FASTFLOAT_HAS_BIT_CAST + bits = std::bit_cast(value); +#else + ::memcpy(&bits, &value, sizeof(T)); +#endif + if ((bits & exponent_mask) == 0) { + // denormal + am.power2 = 1 - bias; + am.mantissa = bits & mantissa_mask; + } else { + // normal + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); + am.power2 -= bias; + am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; + } + + return am; +} + +// get the extended precision value of the halfway point between b and b+u. +// we are given a native float that represents b, so we need to adjust it +// halfway between b and b+u. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { + adjusted_mantissa am = to_extended(value); + am.mantissa <<= 1; + am.mantissa += 1; + am.power2 -= 1; + return am; +} + +// round an extended-precision float to the nearest machine float. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { + int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; + if (-am.power2 >= mantissa_shift) { + // have a denormal float + int32_t shift = -am.power2 + 1; + cb(am, std::min(shift, 64)); + // check for round-up: if rounding-nearest carried us to the hidden bit. + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 0 + : 1; + return; + } + + // have a normal float, use the default shift. + cb(am, mantissa_shift); + + // check for carry + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); + am.power2++; + } + + // check for infinite: we could have carried to an infinite power + am.mantissa &= ~(uint64_t(1) << binary_format::mantissa_explicit_bits()); + if (am.power2 >= binary_format::infinite_power()) { + am.power2 = binary_format::infinite_power(); + am.mantissa = 0; + } +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 
0 : uint64_t(1) << (shift - 1); + uint64_t truncated_bits = am.mantissa & mask; + bool is_above = truncated_bits > halfway; + bool is_halfway = truncated_bits == halfway; + + // shift digits into position + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; + + bool is_odd = (am.mantissa & 1) == 1; + am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + break; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + break; + } + first++; + } +} + +// determine if any non-zero digits were truncated. +// all characters must be valid digits. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { + // do 8-bit optimizations, can just compare to 8 literal 0s. + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + return true; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + return true; + } + ++first; + } + return false; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { + return is_truncated(s.ptr, s.ptr + s.len()); +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + counter += 8; + count += 8; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 10 + limb(*p - UC('0')); + p++; + counter++; + count++; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { + big.mul(power); + big.add(value); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { + // need to round-up the digits, but need to avoid rounding + // ....9999 to ...10000, which could cause a false halfway point. + add_native(big, 10, 1); + count++; +} + +// parse the significant digits into a big integer +template +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { + // try to minimize the number of big integer and scalar multiplication. + // therefore, try to parse 8 digits at a time, and multiply by the largest + // scalar value (9 or 19 digits) for each step. + size_t counter = 0; + digits = 0; + limb value = 0; +#ifdef FASTFLOAT_64BIT_LIMB + size_t step = 19; +#else + size_t step = 9; +#endif + + // process all integer digits. 
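+  // For example, with 64-bit limbs (step = 19), the 21-digit input
+  // 123456789012345678901 is absorbed as a 19-digit chunk followed by a
+  // 2-digit chunk: first add_native(result, 10^19, 1234567890123456789),
+  // then add_native(result, 10^2, 1).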
+  UC const *p = num.integer.ptr;
+  UC const *pend = p + num.integer.len();
+  skip_zeros(p, pend);
+  // process all digits, in increments of step per loop
+  while (p != pend) {
+    while ((std::distance(p, pend) >= 8) && (step - counter >= 8) &&
+           (max_digits - digits >= 8)) {
+      parse_eight_digits(p, value, counter, digits);
+    }
+    while (counter < step && p != pend && digits < max_digits) {
+      parse_one_digit(p, value, counter, digits);
+    }
+    if (digits == max_digits) {
+      // add the temporary value, then check if we've truncated any digits
+      add_native(result, limb(powers_of_ten_uint64[counter]), value);
+      bool truncated = is_truncated(p, pend);
+      if (num.fraction.ptr != nullptr) {
+        truncated |= is_truncated(num.fraction);
+      }
+      if (truncated) {
+        round_up_bigint(result, digits);
+      }
+      return;
+    } else {
+      add_native(result, limb(powers_of_ten_uint64[counter]), value);
+      counter = 0;
+      value = 0;
+    }
+  }
+
+  // add our fraction digits, if they're available.
+  if (num.fraction.ptr != nullptr) {
+    p = num.fraction.ptr;
+    pend = p + num.fraction.len();
+    if (digits == 0) {
+      skip_zeros(p, pend);
+    }
+    // process all digits, in increments of step per loop
+    while (p != pend) {
+      while ((std::distance(p, pend) >= 8) && (step - counter >= 8) &&
+             (max_digits - digits >= 8)) {
+        parse_eight_digits(p, value, counter, digits);
+      }
+      while (counter < step && p != pend && digits < max_digits) {
+        parse_one_digit(p, value, counter, digits);
+      }
+      if (digits == max_digits) {
+        // add the temporary value, then check if we've truncated any digits
+        add_native(result, limb(powers_of_ten_uint64[counter]), value);
+        bool truncated = is_truncated(p, pend);
+        if (truncated) {
+          round_up_bigint(result, digits);
+        }
+        return;
+      } else {
+        add_native(result, limb(powers_of_ten_uint64[counter]), value);
+        counter = 0;
+        value = 0;
+      }
+    }
+  }
+
+  if (counter != 0) {
+    add_native(result, limb(powers_of_ten_uint64[counter]), value);
+  }
+}
+
+template <typename T>
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept {
+  FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent)));
+  adjusted_mantissa answer;
+  bool truncated;
+  answer.mantissa = bigmant.hi64(truncated);
+  int bias = binary_format<T>::mantissa_explicit_bits() -
+             binary_format<T>::minimum_exponent();
+  answer.power2 = bigmant.bit_length() - 64 + bias;
+
+  round<T>(answer, [truncated](adjusted_mantissa &a, int32_t shift) {
+    round_nearest_tie_even(
+        a, shift,
+        [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool {
+          return is_above || (is_halfway && truncated) ||
+                 (is_odd && is_halfway);
+        });
+  });
+
+  return answer;
+}
+
+// the scaling here is quite simple: we have the real digits `m * 10^e` and
+// the theoretical digits `n * 2^f`. Since `e` is always negative, we can
+// multiply both by the integer `5^-e`: the real digits become `m * 2^e` and
+// the theoretical digits become `(n * 5^-e) * 2^f`. Cancelling the common
+// `2^e` leaves only the factor `2^(f - e)` to apply, after which the two
+// significant digits are of the same magnitude and can be compared exactly.
+template <typename T>
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp(
+    bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept {
+  bigint &real_digits = bigmant;
+  int32_t real_exp = exponent;
+
+  // get the value of `b`, rounded down, and get a bigint representation of b+h
+  adjusted_mantissa am_b = am;
+  // gcc7 bug: use a lambda to remove the noexcept qualifier bug with
+  // -Wnoexcept-type.
+  round<T>(am_b,
+           [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
+  T b;
+  to_float(false, am_b, b);
+  adjusted_mantissa theor = to_extended_halfway(b);
+  bigint theor_digits(theor.mantissa);
+  int32_t theor_exp = theor.power2;
+
+  // scale real digits and theor digits to be same power.
+  int32_t pow2_exp = theor_exp - real_exp;
+  uint32_t pow5_exp = uint32_t(-real_exp);
+  if (pow5_exp != 0) {
+    FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp));
+  }
+  if (pow2_exp > 0) {
+    FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp)));
+  } else if (pow2_exp < 0) {
+    FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp)));
+  }
+
+  // compare digits, and use it to direct rounding
+  int ord = real_digits.compare(theor_digits);
+  adjusted_mantissa answer = am;
+  round<T>(answer, [ord](adjusted_mantissa &a, int32_t shift) {
+    round_nearest_tie_even(
+        a, shift, [ord](bool is_odd, bool _, bool __) -> bool {
+          (void)_;  // not needed, since we've done our comparison
+          (void)__; // not needed, since we've done our comparison
+          if (ord > 0) {
+            return true;
+          } else if (ord < 0) {
+            return false;
+          } else {
+            return is_odd;
+          }
+        });
+  });
+
+  return answer;
+}
+
+// parse the significant digits as a big integer to unambiguously round
+// the significant digits. here, we are trying to determine how to round
+// an extended float representation close to `b+h`, halfway between `b`
+// (the float rounded-down) and `b+u`, the next positive float. this
+// algorithm is always correct, and uses one of two approaches. when
+// the exponent is positive relative to the significant digits (such as
+// 1234), we create a big-integer representation, get the high 64-bits,
+// determine if any lower bits are truncated, and use that to direct
+// rounding. in case of a negative exponent relative to the significant
+// digits (such as 1.2345), we create a theoretical representation of
+// `b` as a big-integer type, scaled to the same binary exponent as
+// the actual digits. we then compare the big integer representations
+// of both, and use that to direct rounding.
+template <typename T, typename UC>
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+digit_comp(parsed_number_string_t<UC> &num, adjusted_mantissa am) noexcept {
+  // remove the invalid exponent bias
+  am.power2 -= invalid_am_bias;
+
+  int32_t sci_exp = scientific_exponent(num);
+  size_t max_digits = binary_format<T>::max_digits();
+  size_t digits = 0;
+  bigint bigmant;
+  parse_mantissa(bigmant, num, max_digits, digits);
+  // can't underflow, since digits is at most max_digits.
+  int32_t exponent = sci_exp + 1 - int32_t(digits);
+  if (exponent >= 0) {
+    return positive_digit_comp<T>(bigmant, exponent);
+  } else {
+    return negative_digit_comp<T>(bigmant, am, exponent);
+  }
+}
+
+} // namespace fast_float
+
+#endif
+
+#ifndef FASTFLOAT_PARSE_NUMBER_H
+#define FASTFLOAT_PARSE_NUMBER_H
+
+
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <system_error>
+namespace fast_float {
+
+namespace detail {
+/**
+ * Special case +inf, -inf, nan, infinity, -infinity.
+ * The case comparisons could be made much faster given that we know that the
+ * strings are null-free and fixed.
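+ * For example, "inf", "-Infinity" and "nan(ind)" are all accepted (the
+ * comparisons are case-insensitive), while for "infinite" only the "inf"
+ * prefix is consumed.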
+ **/
+template <typename T, typename UC>
+from_chars_result_t<UC> FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first,
+                                                           UC const *last,
+                                                           T &value) noexcept {
+  from_chars_result_t<UC> answer{};
+  answer.ptr = first;
+  answer.ec = std::errc(); // be optimistic
+  bool minusSign = false;
+  if (*first ==
+      UC('-')) { // assume first < last, so dereference without checks;
+                 // C++17 20.19.3.(7.1) explicitly forbids '+' here
+    minusSign = true;
+    ++first;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if (*first == UC('+')) {
+    ++first;
+  }
+#endif
+  if (last - first >= 3) {
+    if (fastfloat_strncasecmp(first, str_const_nan<UC>(), 3)) {
+      answer.ptr = (first += 3);
+      value = minusSign ? -std::numeric_limits<T>::quiet_NaN()
+                        : std::numeric_limits<T>::quiet_NaN();
+      // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7,
+      // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan).
+      if (first != last && *first == UC('(')) {
+        for (UC const *ptr = first + 1; ptr != last; ++ptr) {
+          if (*ptr == UC(')')) {
+            answer.ptr = ptr + 1; // valid nan(n-char-seq-opt)
+            break;
+          } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) ||
+                       (UC('A') <= *ptr && *ptr <= UC('Z')) ||
+                       (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_')))
+            break; // forbidden char, not nan(n-char-seq-opt)
+        }
+      }
+      return answer;
+    }
+    if (fastfloat_strncasecmp(first, str_const_inf<UC>(), 3)) {
+      if ((last - first >= 8) &&
+          fastfloat_strncasecmp(first + 3, str_const_inf<UC>() + 3, 5)) {
+        answer.ptr = first + 8;
+      } else {
+        answer.ptr = first + 3;
+      }
+      value = minusSign ? -std::numeric_limits<T>::infinity()
+                        : std::numeric_limits<T>::infinity();
+      return answer;
+    }
+  }
+  answer.ec = std::errc::invalid_argument;
+  return answer;
+}
+
+/**
+ * Returns true if the floating-point rounding mode is to 'nearest'.
+ * It is the default on most systems. This function is meant to be inexpensive.
+ * Credit: @mwalcott3
+ */
+fastfloat_really_inline bool rounds_to_nearest() noexcept {
+  // https://lemire.me/blog/2020/06/26/gcc-not-nearest/
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return false;
+#endif
+  // See
+  // A fast function to check your floating-point rounding mode
+  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
+  //
+  // This function is meant to be equivalent to:
+  // prior: #include <cfenv>
+  //  return fegetround() == FE_TONEAREST;
+  // However, it is expected to be much faster than the fegetround()
+  // function call.
+  //
+  // The volatile keyword prevents the compiler from computing the function
+  // at compile-time.
+  // There might be other ways to prevent compile-time optimizations (e.g.,
+  // asm). The value does not need to be std::numeric_limits<float>::min(), any
+  // small value so that 1 + x should round to 1 would do (after accounting for
+  // excess precision, as in 387 instructions).
+  static volatile float fmin = std::numeric_limits<float>::min();
+  float fmini = fmin; // we copy it so that it gets loaded at most once.
+//
+// Explanation:
+// Only when fegetround() == FE_TONEAREST do we have that
+// fmin + 1.0f == 1.0f - fmin.
+//
+// FE_UPWARD:
+// fmin + 1.0f > 1
+// 1.0f - fmin == 1
+//
+// FE_DOWNWARD or FE_TOWARDZERO:
+// fmin + 1.0f == 1
+// 1.0f - fmin < 1
+//
+// Note: This may fail to be accurate if fast-math has been
+// enabled, as rounding conventions may not apply.
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+// todo: is there a VS warning?
+// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + return (fmini + 1.0f == 1.0f - fmini); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} + +} // namespace detail + +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); +} + +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; + + answer.ec = std::errc(); // be optimistic + answer.ptr = pns.lastmatch; + // The implementation of the Clinger's fast path is convoluted because + // we want round-to-nearest in all cases, irrespective of the rounding mode + // selected on the thread. + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { + // Unfortunately, the conventional Clinger's fast path is only possible + // when the system rounds to the nearest float. + // + // We expect the next branch to almost always be selected. 
+ // We could check it first (before the previous branch), but + // there might be performance advantages at having the check + // be last. + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + // We have that fegetround() == FE_TONEAREST. + // Next is Clinger's fast path. + if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { + value = T(pns.mantissa); + if (pns.exponent < 0) { + value = value / binary_format::exact_power_of_ten(-pns.exponent); + } else { + value = value * binary_format::exact_power_of_ten(pns.exponent); + } + if (pns.negative) { + value = -value; + } + return answer; + } + } else { + // We do not have that fegetround() == FE_TONEAREST. + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's + // proposal + if (pns.exponent >= 0 && + pns.mantissa <= + binary_format::max_mantissa_fast_path(pns.exponent)) { +#if defined(__clang__) || defined(FASTFLOAT_32BIT) + // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if (pns.mantissa == 0) { + value = pns.negative ? T(-0.) : T(0.); + return answer; + } +#endif + value = T(pns.mantissa) * + binary_format::exact_power_of_ten(pns.exponent); + if (pns.negative) { + value = -value; + } + return answer; + } + } + } + adjusted_mantissa am = + compute_float>(pns.exponent, pns.mantissa); + if (pns.too_many_digits && am.power2 >= 0) { + if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { + am = compute_error>(pns.exponent, pns.mantissa); + } + } + // If we called compute_float>(pns.exponent, pns.mantissa) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. + if (am.power2 < 0) { + am = digit_comp(pns, am); + } + to_float(pns.negative, am, value); + // Test for over/underflow. + if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + am.power2 == binary_format::infinite_power()) { + answer.ec = std::errc::result_out_of_range; + } + return answer; +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + parsed_number_string_t pns = + parse_number_string(first, last, options); + if (!pns.valid) { + if (options.format & chars_format::no_infnan) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } else { + return detail::parse_infnan(first, last, value); + } + } + + // call overload that takes parsed_number_string_t directly. 
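+  // e.g., a caller with its own tokenizer can run parse_number_string()
+  // once, inspect the resulting pns (sign, digit spans, exponent), and then
+  // hand the same pns to this overload for the binary conversion.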
+ return from_chars_advanced(pns, value); +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base) noexcept { + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last || base < 2 || base > 36) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + return parse_int_string(first, last, value, base); +} + +} // namespace fast_float + +#endif diff --git a/deps/fast_float_c_interface/Makefile b/deps/fast_float_c_interface/Makefile new file mode 100644 index 0000000000..4db3efe2c3 --- /dev/null +++ b/deps/fast_float_c_interface/Makefile @@ -0,0 +1,37 @@ +CCCOLOR:="\033[34m" +SRCCOLOR:="\033[33m" +ENDCOLOR:="\033[0m" + +CXX?=c++ +# we need = instead of := so that $@ in QUIET_CXX gets evaluated in the rule and is assigned appropriate value. +TEMP:=$(CXX) +QUIET_CXX=@printf ' %b %b\n' $(CCCOLOR)C++$(ENDCOLOR) $(SRCCOLOR)$@$(ENDCOLOR) 1>&2; +CXX=$(QUIET_CXX)$(TEMP) + +WARN=-Wall -W -Wno-missing-field-initializers + +STD=-pedantic -std=c++11 + +OPT?=-O3 +CLANG := $(findstring clang,$(shell sh -c '$(CC) --version | head -1')) +ifeq ($(OPT),-O3) + ifeq (clang,$(CLANG)) + OPT+=-flto + else + OPT+=-flto=auto -ffat-lto-objects + endif +endif + +# 1) Today src/Makefile passes -m32 flag for explicit 32-bit build on 64-bit machine, via CFLAGS. For 32-bit build on +# 32-bit machine and 64-bit on 64-bit machine, CFLAGS are empty. No other flags are set that can conflict with C++, +# therefore let's use CFLAGS without changes for now. +# 2) FASTFLOAT_ALLOWS_LEADING_PLUS allows +inf to be parsed as inf, instead of error. +CXXFLAGS=$(STD) $(OPT) $(WARN) -static -fPIC -fno-exceptions $(CFLAGS) -D FASTFLOAT_ALLOWS_LEADING_PLUS + +.PHONY: all clean + +all: fast_float_strtod.o + +clean: + rm -f *.o || true; + diff --git a/deps/fast_float_c_interface/fast_float_strtod.cpp b/deps/fast_float_c_interface/fast_float_strtod.cpp new file mode 100644 index 0000000000..8e5d19470f --- /dev/null +++ b/deps/fast_float_c_interface/fast_float_strtod.cpp @@ -0,0 +1,24 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "../fast_float/fast_float.h" +#include + +extern "C" +{ + double fast_float_strtod(const char *str, const char** endptr) + { + double temp = 0; + auto answer = fast_float::from_chars(str, str + strlen(str), temp); + if (answer.ec != std::errc()) { + errno = (answer.ec == std::errc::result_out_of_range) ? 
ERANGE : EINVAL; + } + if (endptr) { + *endptr = answer.ptr; + } + return temp; + } +} diff --git a/deps/hiredis/.github/workflows/build.yml b/deps/hiredis/.github/workflows/build.yml index 581800b4f7..048ee51cd4 100644 --- a/deps/hiredis/.github/workflows/build.yml +++ b/deps/hiredis/.github/workflows/build.yml @@ -112,7 +112,7 @@ jobs: run: $GITHUB_WORKSPACE/test.sh freebsd: - runs-on: macos-12 + runs-on: macos-13 name: FreeBSD steps: - uses: actions/checkout@v3 diff --git a/deps/jemalloc/CMakeLists.txt b/deps/jemalloc/CMakeLists.txt index e79e960ec2..0fa99df55e 100644 --- a/deps/jemalloc/CMakeLists.txt +++ b/deps/jemalloc/CMakeLists.txt @@ -12,9 +12,18 @@ if (NOT EXISTS ${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a) COMMAND sh -c "${JEMALLOC_SRC_DIR}/configure --disable-cxx \ --with-version=5.3.0-0-g0 --with-lg-quantum=3 --disable-cache-oblivious --with-jemalloc-prefix=je_ \ --enable-static --disable-shared --prefix=${JEMALLOC_INSTALL_DIR}" - WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} COMMAND_ERROR_IS_FATAL ANY) + WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} RESULTS_VARIABLE CONFIGURE_RESULT) + + if (NOT ${CONFIGURE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc configure failed") + endif () + execute_process(COMMAND make -j${VALKEY_PROCESSOR_COUNT} lib/libjemalloc.a install - WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}") + WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}" RESULTS_VARIABLE MAKE_RESULT) + + if (NOT ${MAKE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc build failed") + endif () endif () # Import the compiled library as a CMake target diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 2cd7e7ce93..b0868b7d61 100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -337,55 +337,4 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) { return fallback_alloc(size); } -JEMALLOC_ALWAYS_INLINE int -iget_defrag_hint(tsdn_t *tsdn, void* ptr) { - int defrag = 0; - emap_alloc_ctx_t alloc_ctx; - emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); - if (likely(alloc_ctx.slab)) { - /* Small allocation. 
*/ - edata_t *slab = emap_edata_lookup(tsdn, &arena_emap_global, ptr); - arena_t *arena = arena_get_from_edata(slab); - szind_t binind = edata_szind_get(slab); - unsigned binshard = edata_binshard_get(slab); - bin_t *bin = arena_get_bin(arena, binind, binshard); - malloc_mutex_lock(tsdn, &bin->lock); - arena_dalloc_bin_locked_info_t info; - arena_dalloc_bin_locked_begin(&info, binind); - /* Don't bother moving allocations from the slab currently used for new allocations */ - if (slab != bin->slabcur) { - int free_in_slab = edata_nfree_get(slab); - if (free_in_slab) { - const bin_info_t *bin_info = &bin_infos[binind]; - /* Find number of non-full slabs and the number of regs in them */ - unsigned long curslabs = 0; - size_t curregs = 0; - /* Run on all bin shards (usually just one) */ - for (uint32_t i=0; i< bin_info->n_shards; i++) { - bin_t *bb = arena_get_bin(arena, binind, i); - curslabs += bb->stats.nonfull_slabs; - /* Deduct the regs in full slabs (they're not part of the game) */ - unsigned long full_slabs = bb->stats.curslabs - bb->stats.nonfull_slabs; - curregs += bb->stats.curregs - full_slabs * bin_info->nregs; - if (bb->slabcur) { - /* Remove slabcur from the overall utilization (not a candidate to nove from) */ - curregs -= bin_info->nregs - edata_nfree_get(bb->slabcur); - curslabs -= 1; - } - } - /* Compare the utilization ratio of the slab in question to the total average - * among non-full slabs. To avoid precision loss in division, we do that by - * extrapolating the usage of the slab as if all slabs have the same usage. - * If this slab is less used than the average, we'll prefer to move the data - * to hopefully more used ones. To avoid stagnation when all slabs have the same - * utilization, we give additional 12.5% weight to the decision to defrag. */ - defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs + curregs / 8; - } - } - arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); - malloc_mutex_unlock(tsdn, &bin->lock); - } - return defrag; -} - #endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in index d04af34d93..ebb3137e6f 100644 --- a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in +++ b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in @@ -147,7 +147,3 @@ #else # define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW #endif - -/* This version of Jemalloc, modified for Redis, has the je_get_defrag_hint() - * function. */ -#define JEMALLOC_FRAG_HINT diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c index 83026093be..ea9232c5d6 100644 --- a/deps/jemalloc/src/jemalloc.c +++ b/deps/jemalloc/src/jemalloc.c @@ -4474,12 +4474,3 @@ jemalloc_postfork_child(void) { } /******************************************************************************/ - -/* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation. - * returns 1 if the allocation should be moved, and 0 if the allocation be kept. - * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. 
*/ -JEMALLOC_EXPORT int JEMALLOC_NOTHROW -get_defrag_hint(void* ptr) { - assert(ptr != NULL); - return iget_defrag_hint(TSDN_NULL, ptr); -} diff --git a/deps/lua/CMakeLists.txt b/deps/lua/CMakeLists.txt index e911de9232..0629d7f978 100644 --- a/deps/lua/CMakeLists.txt +++ b/deps/lua/CMakeLists.txt @@ -1,5 +1,7 @@ project(lualib) +include(CheckFunctionExists) + set(LUA_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/src") set(LUA_SRCS ${LUA_SRC_DIR}/fpconv.c @@ -42,3 +44,10 @@ set(LUA_SRCS add_library(lualib STATIC "${LUA_SRCS}") target_include_directories(lualib PUBLIC "${LUA_SRC_DIR}") target_compile_definitions(lualib PRIVATE ENABLE_CJSON_GLOBAL) + +# Use mkstemp if available +check_function_exists(mkstemp HAVE_MKSTEMP) +if (HAVE_MKSTEMP) + target_compile_definitions(lualib PRIVATE LUA_USE_MKSTEMP) +endif () +unset(HAVE_MKSTEMP CACHE) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b7e328163b..90d7e25cf4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,22 @@ if (VALKEY_RELEASE_BUILD) set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) endif () +if (DEBUG_FORCE_DEFRAG) + message(STATUS "Forcing Active Defrag run on valkey-server") + target_compile_definitions(valkey-server PRIVATE DEBUG_FORCE_DEFRAG) + target_compile_definitions(valkey-server PRIVATE HAVE_DEFRAG) +endif () + +if (BUILD_SANITIZER) + # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input) + # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS' + # are set with the link & compile flags required + message(STATUS "Adding sanitizer flags for target valkey-server") + target_compile_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_CFLAGS}) + target_link_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_LDFLAGS}) +endif () +unset(BUILD_SANITIZER CACHE) + # Target: valkey-cli list(APPEND CLI_LIBS "linenoise") valkey_build_and_install_bin(valkey-cli "${VALKEY_CLI_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${CLI_LIBS}" "redis-cli") @@ -45,7 +61,7 @@ if (BUILD_RDMA_MODULE) set(MODULE_NAME "valkey-rdma") message(STATUS "Building RDMA module") add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") - target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE -DUSE_RDMA=1) + target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE=2 -DUSE_RDMA=1) target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") # remove the "lib" prefix from the module set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") diff --git a/src/Makefile b/src/Makefile index a76356e9d5..e52f4f08d3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -130,6 +130,11 @@ ifdef REDIS_LDFLAGS SERVER_LDFLAGS := $(REDIS_LDFLAGS) endif +# Special case of forcing defrag to run even though we have no Jemlloc support +ifeq ($(DEBUG_FORCE_DEFRAG), yes) + SERVER_CFLAGS +=-DHAVE_DEFRAG -DDEBUG_FORCE_DEFRAG +endif + FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS) FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm @@ -325,26 +330,26 @@ ifeq ($(BUILD_TLS),module) TLS_MODULE_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_MODULE) endif -BUILD_RDMA:=no -RDMA_MODULE= -RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so -RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) -ifeq ($(BUILD_RDMA),module) - FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) - RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) +RDMA_LIBS= +RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) 
ifeq ($(RDMA_PKGCONFIG),0) RDMA_LIBS=$(shell $(PKG_CONFIG) --libs librdmacm libibverbs) else RDMA_LIBS=-lrdmacm -libverbs endif - RDMA_MODULE=$(RDMA_MODULE_NAME) - RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE $(RDMA_LIBS) -else -ifeq ($(BUILD_RDMA),no) - # disable RDMA, do nothing -else - $(error "RDMA is only supported as module (BUILD_RDMA=module), or disabled (BUILD_RDMA=no)") + +ifeq ($(BUILD_RDMA),yes) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE=$(BUILD_NO) + FINAL_LIBS += $(RDMA_LIBS) endif + +RDMA_MODULE= +RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so +RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) +ifeq ($(BUILD_RDMA),module) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) + RDMA_MODULE=$(RDMA_MODULE_NAME) + RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) -DBUILD_RDMA_MODULE=$(BUILD_MODULE) $(RDMA_LIBS) endif ifndef V @@ -411,7 +416,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) @@ -424,6 +429,17 @@ ENGINE_TEST_OBJ:=$(sort $(patsubst unit/%.c,unit/%.o,$(ENGINE_TEST_FILES))) 
ENGINE_UNIT_TESTS:=$(ENGINE_NAME)-unit-tests$(PROG_SUFFIX) ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ))) +USE_FAST_FLOAT?=no +ifeq ($(USE_FAST_FLOAT),yes) + # valkey_strtod.h uses this flag to switch the valkey_strtod function to fast_float_strtod, + # therefore let's pass it to the compiler for preprocessing. + FINAL_CFLAGS += -D USE_FAST_FLOAT + # next, let's build and add the actual library containing the fast_float_strtod function for linking. + DEPENDENCY_TARGETS += fast_float_c_interface + FAST_FLOAT_STRTOD_OBJECT := ../deps/fast_float_c_interface/fast_float_strtod.o + FINAL_LIBS += $(FAST_FLOAT_STRTOD_OBJECT) +endif + all: $(SERVER_NAME) $(ENGINE_SENTINEL_NAME) $(ENGINE_CLI_NAME) $(ENGINE_BENCHMARK_NAME) $(ENGINE_CHECK_RDB_NAME) $(ENGINE_CHECK_AOF_NAME) $(TLS_MODULE) $(RDMA_MODULE) @echo "" @echo "Hint: It's a good idea to run 'make test' ;)" @echo "" @@ -588,7 +604,7 @@ bench: $(ENGINE_BENCHMARK_NAME) 32bit: @echo "" - @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386 and libstdc++-11-dev-i386-cross" @echo "" $(MAKE) all-with-unit-tests CFLAGS="-m32" LDFLAGS="-m32" diff --git a/src/acl.c b/src/acl.c index 688820fd89..d1f970a805 100644 --- a/src/acl.c +++ b/src/acl.c @@ -297,11 +297,6 @@ int ACLListMatchSds(void *a, void *b) { return sdscmp(a, b) == 0; } -/* Method to free list elements from ACL users password/patterns lists. */ -void ACLListFreeSds(void *item) { - sdsfree(item); -} - /* Method to duplicate list elements from ACL users password/patterns lists. */ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +369,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +440,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +484,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user.
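 *
 * [Editor's note, not part of the patch] ACLFreeUserVoid above follows the
 * same convention as sdsfreeVoid and listReleaseVoid elsewhere in this
 * patch: a thin wrapper so generic containers receive a callback of the
 * exact type void (*)(void *), e.g.
 *
 *     raxFreeWithCallback(Users, ACLFreeUserVoid);
 *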
*/ @@ -652,14 +652,15 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int unsigned long id = cmd->id; ACLSetSelectorCommandBit(selector, id, allow); ACLResetFirstArgsForCommand(selector, id); - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = (struct serverCommand *)dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; ACLSetSelectorCommandBit(selector, sub->id, allow); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } } @@ -669,19 +670,20 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int * value. Since the category passed by the user may be non-existent, the * function returns C_ERR if the category was not found, or C_OK if it was * found and the operation was performed. */ -void ACLSetSelectorCommandBitsForCategory(dict *commands, aclSelector *selector, uint64_t cflag, int value) { - dictIterator *di = dictGetIterator(commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void ACLSetSelectorCommandBitsForCategory(hashtable *commands, aclSelector *selector, uint64_t cflag, int value) { + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { ACLChangeSelectorPerm(selector, cmd, value); } - if (cmd->subcommands_dict) { - ACLSetSelectorCommandBitsForCategory(cmd->subcommands_dict, selector, cflag, value); + if (cmd->subcommands_ht) { + ACLSetSelectorCommandBitsForCategory(cmd->subcommands_ht, selector, cflag, value); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* This function is responsible for recomputing the command bits for all selectors of the existing users.
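 *
 * [Editor's aside, not part of the patch] The dict -> hashtable migration
 * above is mechanical; isolated, the new iteration pattern is:
 *
 *     hashtableIterator iter;
 *     hashtableInitIterator(&iter, commands);
 *     void *next;
 *     while (hashtableNext(&iter, &next)) {
 *         struct serverCommand *cmd = next;
 *         // ... use cmd ...
 *     }
 *     hashtableResetIterator(&iter);
 *
 * The iterator lives on the stack, hashtableNext() yields the element
 * itself (no dictEntry indirection), and hashtableResetIterator() replaces
 * dictReleaseIterator(). Safe iteration uses hashtableInitSafeIterator()
 * instead, as in ACLChangeSelectorPerm above.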
@@ -732,26 +734,27 @@ int ACLSetSelectorCategory(aclSelector *selector, const char *category, int allo return C_OK; } -void ACLCountCategoryBitsForCommands(dict *commands, +void ACLCountCategoryBitsForCommands(hashtable *commands, aclSelector *selector, unsigned long *on, unsigned long *off, uint64_t cflag) { - dictIterator *di = dictGetIterator(commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { if (ACLGetSelectorCommandBit(selector, cmd->id)) (*on)++; else (*off)++; } - if (cmd->subcommands_dict) { - ACLCountCategoryBitsForCommands(cmd->subcommands_dict, selector, on, off, cflag); + if (cmd->subcommands_ht) { + ACLCountCategoryBitsForCommands(cmd->subcommands_ht, selector, on, off, cflag); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Return the number of commands allowed (on) and denied (off) for the user 'u' @@ -1163,7 +1166,7 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { return C_ERR; } - if (cmd->subcommands_dict) { + if (cmd->subcommands_ht) { /* If user is trying to allow a valid subcommand we can just add its unique ID */ cmd = ACLLookupCommand(op + 1); if (cmd == NULL) { @@ -2442,12 +2445,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); @@ -2754,22 +2757,22 @@ sds getAclErrorMessage(int acl_res, user *user, struct serverCommand *cmd, sds e * ==========================================================================*/ /* ACL CAT category */ -void aclCatWithFlags(client *c, dict *commands, uint64_t cflag, int *arraylen) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void aclCatWithFlags(client *c, hashtable *commands, uint64_t cflag, int *arraylen) { + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*arraylen)++; } - if (cmd->subcommands_dict) { - aclCatWithFlags(c, cmd->subcommands_dict, cflag, arraylen); + if (cmd->subcommands_ht) { + aclCatWithFlags(c, cmd->subcommands_ht, cflag, arraylen); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Add the formatted response from a single selector to the ACL GETUSER diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. 
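 *
 * [Editor's example, not part of the patch] Minimal usage sketch with a
 * hypothetical list of lists; casting listRelease directly to
 * void (*)(void *) would be undefined behavior, hence the wrapper:
 *
 *     list *outer = listCreate();
 *     listSetFreeMethod(outer, listReleaseVoid);
 *     listAddNodeTail(outer, listCreate());
 *     listRelease(outer);  // inner list freed via listReleaseVoid
 *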
*/ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/ae.c b/src/ae.c index 9bf8619902..643ff17070 100644 --- a/src/ae.c +++ b/src/ae.c @@ -85,7 +85,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; - eventLoop->timeEventNextId = 0; + eventLoop->timeEventNextId = 1; eventLoop->stop = 0; eventLoop->maxfd = -1; eventLoop->beforesleep = NULL; diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c new file mode 100644 index 0000000000..5e805b3044 --- /dev/null +++ b/src/allocator_defrag.c @@ -0,0 +1,477 @@ +/* Copyright 2024- Valkey contributors + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/* + * This file implements allocator-specific defragmentation logic used + * within the Valkey engine. Below is the relationship between various + * components involved in allocation and defragmentation: + * + * Application code + * / \ + * allocation / \ defrag + * / \ + * zmalloc allocator_defrag + * / | \ / \ + * / | \ / \ + * / | \ / \ + * libc tcmalloc jemalloc other + * + * Explanation: + * - **Application code**: High-level application logic that uses memory + * allocation and may trigger defragmentation. + * - **zmalloc**: An abstraction layer over the memory allocator, providing + * a uniform allocation interface to the application code. It can delegate + * to various underlying allocators (e.g., libc, tcmalloc, jemalloc, or others). + * It is not dependent on the defrag implementation logic, and it is possible to use a jemalloc + * version that does not support defrag. + * - **allocator_defrag**: This file contains allocator-specific logic for + * defragmentation, invoked from `defrag.c` when memory defragmentation is needed. + * Currently, jemalloc is the only allocator with implemented defrag logic. It is possible that + * future implementations will include non-allocator defragmentation (think of data-structure + * compaction, for example). + * - **Underlying allocators**: These are the actual memory allocators, such as + * libc, tcmalloc, jemalloc, or other custom allocators. The defragmentation + * logic in `allocator_defrag` interacts with these allocators to reorganize + * memory and reduce fragmentation. + * + * The `defrag.c` file acts as the central entry point for defragmentation, + * invoking allocator-specific implementations provided here in `allocator_defrag.c`. + * + * Note: Developers working on `zmalloc` or `allocator_defrag` should refer to + * the other component to ensure both are using the same allocator configuration.
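+ *
+ * [Editor's sketch, not part of the patch] How defrag code is expected to
+ * drive this API, using only the functions declared in allocator_defrag.h
+ * (the surrounding logic and memcpy are illustrative, not the actual
+ * defrag.c implementation):
+ *
+ *     if (allocatorDefragInit() == 0 && allocatorShouldDefrag(ptr)) {
+ *         void *moved = allocatorDefragAlloc(size);   // bypasses tcache
+ *         memcpy(moved, ptr, size);
+ *         allocatorDefragFree(ptr, size);             // returns old region
+ *         ptr = moved;
+ *     }
+ *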
+ */ + +#include "server.h" +#include "serverassert.h" +#include "allocator_defrag.h" + +#if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC) + +#define STRINGIFY_(x) #x +#define STRINGIFY(x) STRINGIFY_(x) + +#define BATCH_QUERY_ARGS_OUT 3 +#define SLAB_NFREE(out, i) out[(i) * BATCH_QUERY_ARGS_OUT] +#define SLAB_LEN(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 2] +#define SLAB_NUM_REGS(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 1] + +#define UTILIZATION_THRESHOLD_FACTOR_MILI (125) // 12.5% additional utilization + +/* + * Represents a precomputed key for querying jemalloc statistics. + * + * The `jeMallctlKey` structure stores a key corresponding to a specific jemalloc + * statistics field name. This key is used with the `je_mallctlbymib` interface + * to query statistics more efficiently, bypassing the need for runtime string + * lookup and translation performed by `je_mallctl`. + * + * - `je_mallctlnametomib` is called once for each statistics field to precompute + * and store the key corresponding to the field name. + * - Subsequent queries use `je_mallctlbymib` with the stored key, avoiding the + * overhead of repeated string-based lookups. + * + */ +typedef struct jeMallctlKey { + size_t key[6]; /* The precomputed key used to query jemalloc statistics. */ + size_t keylen; /* The length of the key array. */ +} jeMallctlKey; + +/* Stores MIB (Management Information Base) keys for jemalloc bin queries. + * + * This struct holds precomputed `jeMallctlKey` values for querying various + * jemalloc bin-related statistics efficiently. + */ +typedef struct jeBinInfoKeys { + jeMallctlKey curr_slabs; /* Key to query the current number of slabs in the bin. */ + jeMallctlKey nonfull_slabs; /* Key to query the number of non-full slabs in the bin. */ + jeMallctlKey curr_regs; /* Key to query the current number of regions in the bin. */ +} jeBinInfoKeys; + +/* Represents detailed information about a jemalloc bin. + * + * This struct provides metadata about a jemalloc bin, including the size of + * its regions, total number of regions, and related MIB keys for efficient + * queries. + */ +typedef struct jeBinInfo { + size_t reg_size; /* Size of each region in the bin. */ + uint32_t nregs; /* Total number of regions in the bin. */ + jeBinInfoKeys info_keys; /* Precomputed MIB keys for querying bin statistics. */ +} jeBinInfo; + +/* Represents the configuration for jemalloc bins. + * + * This struct contains information about the number of bins and metadata for + * each bin, as well as precomputed keys for batch utility queries and epoch updates. + */ +typedef struct jemallocCB { + unsigned nbins; /* Number of bins in the jemalloc configuration. */ + jeBinInfo *bin_info; /* Array of `jeBinInfo` structs, one for each bin. */ + jeMallctlKey util_batch_query; /* Key to query batch utilization information. */ + jeMallctlKey epoch; /* Key to trigger statistics sync between threads. */ +} jemallocCB; + +/* Represents the latest usage statistics for a jemalloc bin. + * + * This struct tracks the current usage of a bin, including the number of slabs + * and regions, and calculates the number of full slabs from other fields. + */ +typedef struct jemallocBinUsageData { + size_t curr_slabs; /* Current number of slabs in the bin. */ + size_t curr_nonfull_slabs; /* Current number of non-full slabs in the bin. */ + size_t curr_regs; /* Current number of regions in the bin. 
*/ +} jemallocBinUsageData; + + +static int defrag_supported = 0; +/* Control block holding information about bins and query helper - + * this structure is initialized once when calling allocatorDefragInit. It does not change afterwards. */ +static jemallocCB je_cb = {0, NULL, {{0}, 0}, {{0}, 0}}; +/* Holds the latest usage statistics for each bin. This structure is updated when calling + * allocatorDefragGetFragSmallbins and later is used to make a defrag decision for a memory pointer. */ +static jemallocBinUsageData *je_usage_info = NULL; + + +/* ----------------------------------------------------------------------------- + * Alloc/free APIs that are cooperative with defrag + * -------------------------------------------------------------------------- */ + +/* Allocation and free functions that bypass the thread cache + * and go straight to the allocator arena bins. + * Currently implemented only for jemalloc. Used for online defragmentation. + */ +void *allocatorDefragAlloc(size_t size) { + void *ptr = je_mallocx(size, MALLOCX_TCACHE_NONE); + return ptr; +} +void allocatorDefragFree(void *ptr, size_t size) { + if (ptr == NULL) return; + je_sdallocx(ptr, size, MALLOCX_TCACHE_NONE); +} + +/* ----------------------------------------------------------------------------- + * Helper functions for jemalloc translation between size and index + * -------------------------------------------------------------------------- */ + +/* Get the bin index in the bin array from the reg_size. + * + * This is a reverse-engineered mapping of reg_size -> binind. We need this information because the utilization query + * returns the size of the buffer and not the bin index, and we need the bin index to access its usage information. + * + * Note: in case a future PR returns the binind (a better API anyway), we can get rid of + * these conversion functions. + */ +static inline unsigned jeSize2BinIndexLgQ3(size_t sz) { + /* Number of bins in each power-of-2 size class group */ + const size_t size_class_group_size = 4; + /* lg of the smallest power-of-2 quantum for binning (2^3 = 8 bytes) */ + const size_t lg_quantum_3_first_pow2 = 3; + /* Offset for exponential bins */ + const size_t lg_quantum_3_offset = ((64 >> lg_quantum_3_first_pow2) - 1); + /* Small sizes (8-64 bytes) use linear binning */ + if (sz <= 64) { // 64 = 1 << (lg_quantum_3_first_pow2 + 3) + return (sz >> 3) - 1; // Divide by 8 and subtract 1 + } + + /* For larger sizes, use exponential binning */ + + /* Calculate leading zeros of (sz - 1) to properly handle power-of-2 sizes */ + unsigned leading_zeros = __builtin_clzll(sz - 1); + unsigned exp = 64 - leading_zeros; // Effective log2(sz) + + /* Calculate the size's position within its group */ + unsigned within_group_offset = size_class_group_size - + (((1ULL << exp) - sz) >> (exp - lg_quantum_3_first_pow2)); + + /* Calculate the final bin index */ + return within_group_offset + + ((exp - (lg_quantum_3_first_pow2 + 3)) - 1) * size_class_group_size + + lg_quantum_3_offset; +} +/* ----------------------------------------------------------------------------- + * Interface functions to get fragmentation info from jemalloc + * -------------------------------------------------------------------------- */ +#define ARENA_TO_QUERY MALLCTL_ARENAS_ALL + +static inline void jeRefreshStats(const jemallocCB *je_cb) { + uint64_t epoch = 1; // Value doesn't matter + size_t sz = sizeof(epoch); + /* Refresh stats */ + je_mallctlbymib(je_cb->epoch.key, je_cb->epoch.keylen, &epoch, &sz, &epoch, sz); +} + +/* Extract the key that corresponds to the
given name for fast query. This should be called once for each key_name */ +static inline int jeQueryKeyInit(const char *key_name, jeMallctlKey *key_info) { + key_info->keylen = sizeof(key_info->key) / sizeof(key_info->key[0]); + int res = je_mallctlnametomib(key_name, key_info->key, &key_info->keylen); + /* sanity check that the returned value is not larger than provided */ + assert(key_info->keylen <= sizeof(key_info->key) / sizeof(key_info->key[0])); + return res; +} + +/* Query the jemalloc control interface using a previously extracted key (with jeQueryKeyInit) instead of a name string. + * This interface (named MIB in jemalloc) is faster as it avoids the string dict lookup at run-time. */ +static inline int jeQueryCtlInterface(const jeMallctlKey *key_info, void *value) { + size_t sz = sizeof(size_t); + return je_mallctlbymib(key_info->key, key_info->keylen, value, &sz, NULL, 0); +} + +static inline int binQueryHelperInitialization(jeBinInfoKeys *helper, unsigned bin_index) { + char mallctl_name[128]; + + /* MIB to fetch the number of used regions in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curregs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_regs) != 0) return -1; + /* MIB to fetch the number of current slabs in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curslabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_slabs) != 0) return -1; + /* MIB to fetch the number of non-full slabs */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.nonfull_slabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->nonfull_slabs) != 0) return -1; + + return 0; +} + +/* Initializes the defragmentation system for the jemalloc memory allocator. + * + * This function performs the necessary setup and initialization steps for the defragmentation system. + * It retrieves the configuration information for the jemalloc arenas and bins, and initializes the usage + * statistics data structure. + * + * return 0 on success, or a non-zero error code on failure. + * + * The initialization process involves the following steps: + * 1. Check if defragmentation is supported by the current jemalloc version. + * 2. Retrieve the arena bin configuration information using the `je_mallctlbymib` function. + * 3. Initialize the `usage_latest` structure with the bin usage statistics and configuration data. + * 4. Set the `defrag_supported` flag to indicate that defragmentation is enabled. + * + * Note: This function must be called before using any other defragmentation-related functionality. + * It should be called during the initialization phase of the code that uses the + * defragmentation feature.
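+ *
+ * [Editor's sketch, not part of the patch] The underlying jemalloc MIB
+ * pattern that jeQueryKeyInit/jeQueryCtlInterface wrap, shown raw
+ * (standard je_mallctl API, error handling elided):
+ *
+ *     size_t mib[6], miblen = 6;
+ *     je_mallctlnametomib("arenas.nbins", mib, &miblen);  // once
+ *     unsigned nbins;
+ *     size_t sz = sizeof(nbins);
+ *     je_mallctlbymib(mib, miblen, &nbins, &sz, NULL, 0); // per query
+ *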
+ */ +int allocatorDefragInit(void) { + char mallctl_name[100]; + jeBinInfo *bin_info; + size_t sz; + int je_res; + + /* the init should be called only once; fail on an unexpected call */ + assert(!defrag_supported); + + /* Get the MIB of the per-memory-pointer utilization query command that is used during the defrag scan over memory */ + if (jeQueryKeyInit("experimental.utilization.batch_query", &je_cb.util_batch_query) != 0) return -1; + + je_res = jeQueryKeyInit("epoch", &je_cb.epoch); + assert(je_res == 0); + jeRefreshStats(&je_cb); + + /* get the quantum for verification only; the current code assumes lg-quantum should be 3 */ + size_t jemalloc_quantum; + sz = sizeof(jemalloc_quantum); + je_mallctl("arenas.quantum", &jemalloc_quantum, &sz, NULL, 0); + /* lg-quantum should be 3 so jemalloc_quantum should be 1<<3 */ + assert(jemalloc_quantum == 8); + + sz = sizeof(je_cb.nbins); + je_res = je_mallctl("arenas.nbins", &je_cb.nbins, &sz, NULL, 0); + assert(je_res == 0 && je_cb.nbins != 0); + + je_cb.bin_info = je_calloc(je_cb.nbins, sizeof(jeBinInfo)); + assert(je_cb.bin_info != NULL); + je_usage_info = je_calloc(je_cb.nbins, sizeof(jemallocBinUsageData)); + assert(je_usage_info != NULL); + + for (unsigned j = 0; j < je_cb.nbins; j++) { + bin_info = &je_cb.bin_info[j]; + /* The size of the current bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.size", j); + sz = sizeof(bin_info->reg_size); + je_res = je_mallctl(mallctl_name, &bin_info->reg_size, &sz, NULL, 0); + assert(je_res == 0); + /* Number of regions per slab */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.nregs", j); + sz = sizeof(bin_info->nregs); + je_res = je_mallctl(mallctl_name, &bin_info->nregs, &sz, NULL, 0); + assert(je_res == 0); + + /* init bin-specific fast query keys */ + je_res = binQueryHelperInitialization(&bin_info->info_keys, j); + assert(je_res == 0); + + /* verify the reverse map of reg_size to bin index */ + assert(jeSize2BinIndexLgQ3(bin_info->reg_size) == j); + } + + /* defrag is supported; mark it to enable defrag queries */ + defrag_supported = 1; + return 0; +} + +/* Total size of memory consumed by unused regs in small bins (AKA external fragmentation). + * The function will refresh the epoch. + * + * return total fragmentation bytes + */ +unsigned long allocatorDefragGetFragSmallbins(void) { + assert(defrag_supported); + unsigned long frag = 0; + jeRefreshStats(&je_cb); + for (unsigned j = 0; j < je_cb.nbins; j++) { + jeBinInfo *bin_info = &je_cb.bin_info[j]; + jemallocBinUsageData *bin_usage = &je_usage_info[j]; + + /* Number of current regions in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_regs, &bin_usage->curr_regs); + /* Number of current slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_slabs, &bin_usage->curr_slabs); + /* Number of non-full slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.nonfull_slabs, &bin_usage->curr_nonfull_slabs); + + /* Calculate the fragmentation bytes for the current bin and add it to the total. */ + frag += ((bin_info->nregs * bin_usage->curr_slabs) - bin_usage->curr_regs) * bin_info->reg_size; + } + return frag; +} + +/* Determines whether defragmentation should be performed on a pointer based on jemalloc information. + * + * bin_info Pointer to the bin information structure. + * bin_usage Pointer to the bin usage structure. + * nalloced Number of allocated regions in the bin. + * + * return 1 if defragmentation should be performed, 0 otherwise.
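+ *
+ * [Editor's worked example, not part of the patch; made-up numbers applying
+ * the conditions listed below] For a bin with nregs=128, curr_slabs=10,
+ * curr_nonfull_slabs=4 and curr_regs=1000: full slabs = 10 - 4 = 6, so
+ * allocated_nonfull = 1000 - 6*128 = 232, and the average non-full-slab
+ * utilization is 232/(4*128) = 45.3%. A slab with nalloced=40 (31.3%) is
+ * below 45.3% * 1.125 = 51%, so it is defragmented; a slab with nalloced=80
+ * (62.5%) is not.
+ *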
+ * + * This function checks the following conditions to determine if defragmentation should be performed: + * 1. If the number of allocated regions (nalloced) is equal to the total number of regions (bin_info->nregs), + * defragmentation is not necessary as moving regions is guaranteed not to change the fragmentation ratio. + * 2. If the number of non-full slabs (bin_usage->curr_nonfull_slabs) is less than 2, defragmentation is not performed + * because there is no other slab to move regions to. + * 3. If slab utilization < 'avg utilization'*1.125 [code 1.125 == (1000+UTILIZATION_THRESHOLD_FACTOR_MILI)/1000] + * then we should defrag. This is aligned with the previous je_defrag_hint implementation. + */ +static inline int makeDefragDecision(jeBinInfo *bin_info, jemallocBinUsageData *bin_usage, unsigned long nalloced) { + unsigned long curr_full_slabs = bin_usage->curr_slabs - bin_usage->curr_nonfull_slabs; + size_t allocated_nonfull = bin_usage->curr_regs - curr_full_slabs * bin_info->nregs; + if (bin_info->nregs == nalloced || bin_usage->curr_nonfull_slabs < 2 || + 1000 * nalloced * bin_usage->curr_nonfull_slabs > (1000 + UTILIZATION_THRESHOLD_FACTOR_MILI) * allocated_nonfull) { + return 0; + } + return 1; +} + +/* + * Performs defragmentation analysis for a given ptr. + * + * ptr - ptr to the memory region to be analyzed. + * + * return - the function returns 1 if defrag should be performed, 0 otherwise. + */ +int allocatorShouldDefrag(void *ptr) { + assert(defrag_supported); + size_t out[BATCH_QUERY_ARGS_OUT]; + size_t out_sz = sizeof(out); + size_t in_sz = sizeof(ptr); + for (unsigned j = 0; j < BATCH_QUERY_ARGS_OUT; j++) { + out[j] = -1; + } + je_mallctlbymib(je_cb.util_batch_query.key, + je_cb.util_batch_query.keylen, + out, &out_sz, + &ptr, in_sz); + /* handle results with the appropriate quantum value */ + assert(SLAB_NUM_REGS(out, 0) > 0); + assert(SLAB_LEN(out, 0) > 0); + assert(SLAB_NFREE(out, 0) != (size_t)-1); + unsigned region_size = SLAB_LEN(out, 0) / SLAB_NUM_REGS(out, 0); + /* check that the allocation size is in range of the small bins */ + if (region_size > je_cb.bin_info[je_cb.nbins - 1].reg_size) { + return 0; + } + /* get the index based on the quantum used */ + unsigned binind = jeSize2BinIndexLgQ3(region_size); + /* make sure binind is in range and the reverse map is correct */ + assert(binind < je_cb.nbins && region_size == je_cb.bin_info[binind].reg_size); + + return makeDefragDecision(&je_cb.bin_info[binind], + &je_usage_info[binind], + je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0)); +} + +/* Utility function to get the fragmentation ratio from jemalloc. + * It is critical to do that by comparing only heap maps that belong to + * jemalloc, and skip the ones jemalloc keeps as spare. Since we use this + * fragmentation ratio in order to decide if a defrag action should be taken + * or not, a false detection can cause the defragmenter to waste a lot of CPU + * without the possibility of getting any results. */ +float getAllocatorFragmentation(size_t *out_frag_bytes) { + size_t resident, active, allocated, frag_smallbins_bytes; + zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); + frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); + /* Calculate the fragmentation ratio as the proportion of wasted memory in small + * bins (which are defraggable) relative to the total allocated memory (including large bins).
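+ * In symbols (editor's restatement of the computation below):
+ *
+ *     frag_pct = 100 * frag_smallbins_bytes / allocated
+ *     rss_pct  = 100 * (resident - allocated) / allocated
+ *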
+ * This is because otherwise, if most of the memory usage is large bins, we may show a high percentage, + * despite the fact that it's not a lot of memory for the user. */ + float frag_pct = (float)frag_smallbins_bytes / allocated * 100; + float rss_pct = ((float)resident / allocated) * 100 - 100; + size_t rss_bytes = resident - allocated; + if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; + serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", + allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); + return frag_pct; +} + +#elif defined(DEBUG_FORCE_DEFRAG) +int allocatorDefragInit(void) { + return 0; +} +void allocatorDefragFree(void *ptr, size_t size) { + UNUSED(size); + zfree(ptr); +} +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) { + return zmalloc(size); +} +unsigned long allocatorDefragGetFragSmallbins(void) { + return 0; +} + +int allocatorShouldDefrag(void *ptr) { + UNUSED(ptr); + return 1; +} + +float getAllocatorFragmentation(size_t *out_frag_bytes) { + *out_frag_bytes = server.active_defrag_ignore_bytes + 1; + return server.active_defrag_threshold_upper; +} + +#else +int allocatorDefragInit(void) { + return -1; +} +void allocatorDefragFree(void *ptr, size_t size) { + UNUSED(ptr); + UNUSED(size); +} +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) { + UNUSED(size); + return NULL; +} +unsigned long allocatorDefragGetFragSmallbins(void) { + return 0; +} + +int allocatorShouldDefrag(void *ptr) { + UNUSED(ptr); + return 0; +} + +float getAllocatorFragmentation(size_t *out_frag_bytes) { + UNUSED(out_frag_bytes); + return 0; +} +#endif diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h new file mode 100644 index 0000000000..7947bef72c --- /dev/null +++ b/src/allocator_defrag.h @@ -0,0 +1,24 @@ +#ifndef __ALLOCATOR_DEFRAG_H +#define __ALLOCATOR_DEFRAG_H + +#if defined(USE_JEMALLOC) +#include <jemalloc/jemalloc.h> +/* We can enable the server defrag capabilities only if we are using Jemalloc + * and the version that has the experimental.utilization namespace in mallctl. */ +#if (defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))) || \ + defined(DEBUG_FORCE_DEFRAG) +#define HAVE_DEFRAG +#endif +#endif + +int allocatorDefragInit(void); +void allocatorDefragFree(void *ptr, size_t size); +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size); +unsigned long allocatorDefragGetFragSmallbins(void); +int allocatorShouldDefrag(void *ptr); +float getAllocatorFragmentation(size_t *out_frag_bytes); + +#endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/anet.c b/src/anet.c index d4ac698982..8dc06ca62e 100644 --- a/src/anet.c +++ b/src/anet.c @@ -70,17 +70,24 @@ int anetGetError(int fd) { return sockerr; } -int anetSetBlock(char *err, int fd, int non_block) { +static int anetGetSocketFlags(char *err, int fd) { int flags; - /* Set the socket blocking (if non_block is zero) or non-blocking. - * Note that fcntl(2) for F_GETFL and F_SETFL can't be - * interrupted by a signal.
*/ if ((flags = fcntl(fd, F_GETFL)) == -1) { anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno)); return ANET_ERR; } + return flags; +} + +int anetSetBlock(char *err, int fd, int non_block) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + /* Check if this flag has been set or unset, if so, * then there is no need to call fcntl to set/unset it again. */ if (!!(flags & O_NONBLOCK) == !!non_block) return ANET_OK; @@ -105,6 +112,21 @@ int anetBlock(char *err, int fd) { return anetSetBlock(err, fd, 0); } +int anetIsBlock(char *err, int fd) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + + /* Check if the O_NONBLOCK flag is set */ + if (flags & O_NONBLOCK) { + return 0; /* Socket is non-blocking */ + } else { + return 1; /* Socket is blocking */ + } +} + /* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. * This function should be invoked for fd's on specific places * where fork + execve system calls are called. */ diff --git a/src/anet.h b/src/anet.h index ab32f72e4b..b14b4bdaad 100644 --- a/src/anet.h +++ b/src/anet.h @@ -61,6 +61,7 @@ int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port) int anetUnixAccept(char *err, int serversock); int anetNonBlock(char *err, int fd); int anetBlock(char *err, int fd); +int anetIsBlock(char *err, int fd); int anetCloexec(int fd); int anetEnableTcpNoDelay(char *err, int fd); int anetDisableTcpNoDelay(char *err, int fd); diff --git a/src/aof.c b/src/aof.c index e0ca6fbb61..0fd3cf5c26 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2190,7 +2190,6 @@ static int rewriteFunctions(rio *aof) { } int rewriteAppendOnlyFileRio(rio *aof) { - dictEntry *de; int j; long key_count = 0; long long updated_time = 0; @@ -2219,17 +2218,18 @@ int rewriteAppendOnlyFileRio(rio *aof) { kvs_it = kvstoreIteratorInit(db->keys); /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; sds keystr; - robj key, *o; + robj key; long long expiretime; size_t aof_bytes_before_key = aof->processed_bytes; - keystr = dictGetKey(de); - o = dictGetVal(de); + keystr = objectGetKey(o); initStaticStringObject(key, keystr); - expiretime = getExpire(db, &key); + expiretime = objectGetExpire(o); /* Save the key and associated value */ if (o->type == OBJ_STRING) { diff --git a/src/bitops.c b/src/bitops.c index 10c383b270..1457cd5322 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -486,7 +486,7 @@ robj *lookupStringForBitCommand(client *c, uint64_t maxbit, int *dirty) { if (o == NULL) { o = createObject(OBJ_STRING, sdsnewlen(NULL, byte + 1)); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); if (dirty) *dirty = 1; } else { o = dbUnshareStringValue(c->db, c->argv[1], o); @@ -772,9 +772,8 @@ void bitopCommand(client *c) { /* Store the computed value into the target key */ if (maxlen) { o = createObject(OBJ_STRING, res); - setKey(c, c->db, targetkey, o, 0); + setKey(c, c->db, targetkey, &o, 0); notifyKeyspaceEvent(NOTIFY_STRING, "set", targetkey, c->db->id); - decrRefCount(o); server.dirty++; } else if (dbDelete(c->db, targetkey)) { signalModifiedKey(c, c->db, targetkey); diff --git a/src/blocked.c b/src/blocked.c index 8e1974a703..aeec560b3f 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -206,7 +206,6 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to 
process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { - freeClientOriginalArgv(c); /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/cluster.c b/src/cluster.c index 9154ac3207..df6bb86454 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -276,9 +276,9 @@ void restoreCommand(client *c) { } /* Create the key and set the TTL if any */ - dbAdd(c->db, key, obj); + dbAdd(c->db, key, &obj); if (ttl) { - setExpire(c, c->db, key, ttl); + obj = setExpire(c, c->db, key, ttl); if (!absttl) { /* Propagate TTL as absolute timestamp */ robj *ttl_obj = createStringObjectFromLongLong(ttl); @@ -811,7 +811,7 @@ static int shouldReturnTlsInfo(void) { } unsigned int countKeysInSlot(unsigned int slot) { - return kvstoreDictSize(server.db->keys, slot); + return kvstoreHashtableSize(server.db->keys, slot); } void clusterCommandHelp(client *c) { @@ -908,16 +908,16 @@ void clusterCommand(client *c) { unsigned int keys_in_slot = countKeysInSlot(slot); unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys; addReplyArrayLen(c, numkeys); - kvstoreDictIterator *kvs_di = NULL; - dictEntry *de = NULL; - kvs_di = kvstoreGetDictIterator(server.db->keys, slot); + kvstoreHashtableIterator *kvs_di = NULL; + kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot); for (unsigned int i = 0; i < numkeys; i++) { - de = kvstoreDictIteratorNext(kvs_di); - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); + void *next; + serverAssert(kvstoreHashtableIteratorNext(kvs_di, &next)); + robj *valkey = next; + sds sdskey = objectGetKey(valkey); addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } else if ((!strcasecmp(c->argv[1]->ptr, "slaves") || !strcasecmp(c->argv[1]->ptr, "replicas")) && c->argc == 3) { /* CLUSTER REPLICAS */ clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); diff --git a/src/cluster.h b/src/cluster.h index 65eadf4c65..142f2d70b3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -12,6 +12,12 @@ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ +/* Reason why the cluster state changes to fail. When adding new reasons, + * make sure to update clusterLogFailReason. */ +#define CLUSTER_FAIL_NONE 0 +#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1 +#define CLUSTER_FAIL_MINORITY_PARTITION 2 + /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. 
*/ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6ed8cacb0a..0b29fc1abf 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -424,9 +424,19 @@ typedef struct { union { clusterMsg msg; clusterMsgLight msg_light; - }; + } data[]; } clusterMsgSendBlock; +/* Helper function to extract a light message from a send block. */ +static clusterMsgLight *getLightMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg_light; +} + +/* Helper function to extract a normal message from a send block. */ +static clusterMsg *getMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg; +} + /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ @@ -1083,6 +1093,7 @@ void clusterInit(void) { server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; + server.cluster->fail_reason = CLUSTER_FAIL_NONE; server.cluster->size = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); @@ -1288,15 +1299,15 @@ void clusterReset(int hard) { * CLUSTER communication link * -------------------------------------------------------------------------- */ clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, msg); + uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, data); clusterMsgSendBlock *msgblock = zcalloc(blocklen); msgblock->refcount = 1; msgblock->totlen = blocklen; server.stat_cluster_links_memory += blocklen; if (IS_LIGHT_MESSAGE(type)) { - clusterBuildMessageHdrLight(&msgblock->msg_light, type, msglen); + clusterBuildMessageHdrLight(getLightMessageFromSendBlock(msgblock), type, msglen); } else { - clusterBuildMessageHdr(&msgblock->msg, type, msglen); + clusterBuildMessageHdr(getMessageFromSendBlock(msgblock), type, msglen); } return msgblock; } @@ -1336,6 +1347,10 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { serverAssert(link != NULL); + serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound"); + if (link->conn) { connClose(link->conn); link->conn = NULL; @@ -1351,6 +1366,7 @@ void freeClusterLink(clusterLink *link) { } else if (link->node->inbound_link == link) { serverAssert(link->inbound); link->node->inbound_link = NULL; + link->node->inbound_link_freed_time = mstime(); } } zfree(link); @@ -1490,6 +1506,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->fail_time = 0; node->link = NULL; node->inbound_link = NULL; + node->inbound_link_freed_time = node->ctime; memset(node->ip, 0, sizeof(node->ip)); node->announce_client_ipv4 = sdsempty(); node->announce_client_ipv6 = sdsempty(); @@ -1499,7 +1516,6 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->cport = 0; node->tls_port = 0; node->fail_reports = listCreate(); - node->voted_time = 0; node->orphaned_time = 0; node->repl_offset_time = 0; node->repl_offset = 0; @@ -1547,9 +1563,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) { * older than the global node timeout.
Note that anyway for a node to be * flagged as FAIL we need to have a local PFAIL state that is at least * older than the global node timeout, so we don't just trust the number - * of failure reports from other nodes. */ + * of failure reports from other nodes. + * + * If the reporting node loses its voting right during this time, we will + * also clear its report. */ void clusterNodeCleanupFailureReports(clusterNode *node) { list *l = node->fail_reports; + if (!listLength(l)) return; + listNode *ln; listIter li; clusterNodeFailReport *fr; @@ -1559,7 +1580,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) { listRewind(l, &li); while ((ln = listNext(&li)) != NULL) { fr = ln->value; - if (now - fr->time > maxtime) listDelNode(l, ln); + if (now - fr->time > maxtime) { + listDelNode(l, ln); + } else if (!clusterNodeIsVotingPrimary(fr->node)) { + listDelNode(l, ln); + } } } @@ -1576,6 +1601,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) { * Otherwise 0 is returned. */ int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) { list *l = node->fail_reports; + if (!listLength(l)) return 0; + listNode *ln; listIter li; clusterNodeFailReport *fr; @@ -1696,6 +1723,9 @@ void clusterAddNode(clusterNode *node) { * it is a replica node. */ void clusterDelNode(clusterNode *delnode) { + serverAssert(delnode != NULL); + serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name); + int j; dictIterator *di; dictEntry *de; @@ -2078,7 +2108,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return 1 if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { +static int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -2100,7 +2130,7 @@ int clusterHandshakeInProgress(char *ip, int port, int cport) { * * EAGAIN - There is already a handshake in progress for this address. * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { +static int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -2246,10 +2276,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* Ignore gossips about self. */ if (node && node != myself) { /* We already know this node. - Handle failure reports, only when the sender is a voting primary. */ - if (sender && clusterNodeIsVotingPrimary(sender)) { + * Handle failure reports: a report is added only if the sender is a voting primary, + * while deletion of a failure report is not restricted. */ + if (sender) { if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) { - if (clusterNodeAddFailureReport(node, sender)) { + if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) { serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", sender->name, sender->human_nodename, node->name, node->human_nodename); } @@ -2670,7 +2701,8 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * If the sender and myself are in the same shard, try psync.
*/ clusterSetPrimary(sender, !are_in_same_shard, !are_in_same_shard); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG | + CLUSTER_TODO_BROADCAST_ALL); } else if (nodeIsPrimary(myself) && (sender_slots >= migrated_our_slots) && !are_in_same_shard) { /* When all our slots are lost to the sender and the sender belongs to * a different shard, this is likely due to a client triggered slot @@ -2994,7 +3026,8 @@ int clusterIsValidPacket(clusterLink *link) { } if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) { - serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + serverLog(LL_WARNING, "Dropping packet of type %s that matches debug drop filter", + clusterGetMessageTypeString(type)); return 0; } @@ -3085,7 +3118,7 @@ int clusterProcessPacket(clusterLink *link) { if (server.debug_cluster_close_link_on_packet_drop && (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) { freeClusterLink(link); - serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); + serverLog(LL_WARNING, "Closing link for matching packet type %s", clusterGetMessageTypeString(type)); return 0; } return 1; @@ -3101,8 +3134,8 @@ int clusterProcessPacket(clusterLink *link) { freeClusterLink(link); serverLog( LL_NOTICE, - "Closing link for node that sent a lightweight message of type %hu as its first message on the link", - type); + "Closing link for node that sent a lightweight message of type %s as its first message on the link", + clusterGetMessageTypeString(type)); return 0; } clusterNode *sender = link->node; @@ -3111,6 +3144,27 @@ int clusterProcessPacket(clusterLink *link) { return 1; } + if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) { + /* If the link is bound to a node and the node is in the handshake state, and we receive + * a MEET packet, it may be that the sender sent multiple MEET packets, so here we drop + * the MEET to avoid the assert in setClusterNodeToInboundClusterLink. The assert would + * happen if the other node sends a MEET packet: because it detects that there is no inbound + * link, this node creates a new node in HANDSHAKE state (with a random node name) and + * responds with a PONG. The other node receives the PONG and removes the CLUSTER_NODE_MEET + * flag. This node is supposed to open an outbound connection to the other node in the next + * cron cycle, but before this happens, the other node re-sends a MEET on the same link + * because it still detects no inbound connection. We improved the re-send logic of MEET in + * #1441; now we only re-send a MEET packet once every handshake timeout period. + * + * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name + * and is not truly "known", so we don't know the sender. Dropping the MEET packet prevents + * us from creating a random node, avoids incorrect link binding, and prevents a duplicate + * MEET packet from eliminating the handshake state.
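+ *
+ * [Editor's summary, not part of the patch] The race as a timeline
+ * (A = other node, B = this node):
+ *
+ *     A: MEET -> B        B creates a random-name HANDSHAKE node, replies PONG
+ *     A: gets PONG        A clears CLUSTER_NODE_MEET
+ *     A: MEET -> B again  A still sees no inbound link from B
+ *     B: without this check, setClusterNodeToInboundClusterLink asserts;
+ *        with it, B simply drops the duplicate MEET.
+ *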
*/ serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state", + link->node->name); + return 1; + } + uint16_t flags = ntohs(hdr->flags); uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0; clusterNode *sender = getNodeFromLinkAndMsg(link, hdr); @@ -3148,7 +3202,8 @@ int clusterProcessPacket(clusterLink *link) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); - if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + sender->configEpoch >= server.cluster->failover_auth_epoch) { /* Another node has claimed an epoch greater than or equal to ours. * If we have an ongoing election, reset it because we cannot win * with an epoch smaller than or equal to the incoming claim. This @@ -3205,33 +3260,48 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (type == CLUSTERMSG_TYPE_MEET) { + if (!sender) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as these details will be + * resolved when we receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node; + + node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + clusterProcessGossipSection(hdr, link); + } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not + * know it. + * Free my outbound link to that node to force a reconnect and send a PING. + * Once that node receives our PING, it should recognize the new connection as an inbound link from me.
+ * We should only free the outbound link if the node has been known for longer than the handshake timeout, + * since during this time, the other side might still be trying to complete the handshake. */ + + /* We should always receive a MEET packet on an inbound link. */ + serverAssert(link != sender->link); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", + sender->name); + freeClusterLink(sender->link); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) clusterProcessGossipSection(hdr, link); - /* Anyway reply with a PONG */ clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } @@ -3241,7 +3311,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); - if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ sender->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, @@ -3623,7 +3693,7 @@ void clusterWriteHandler(connection *conn) { while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { listNode *head = listFirst(link->send_msg_queue); clusterMsgSendBlock *msgblock = (clusterMsgSendBlock *)head->value; - clusterMsg *msg = &msgblock->msg; + clusterMsg *msg = getMessageFromSendBlock(msgblock); size_t msg_offset = link->head_msg_send_offset; size_t msg_len = ntohl(msg->totlen); @@ -3680,7 +3750,7 @@ void clusterLinkConnectHandler(connection *conn) { * of a PING one, to force the receiver to add us in its node * table. */ mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + clusterSendPing(link, nodeInMeetState(node) ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); if (old_ping_sent) { /* If there was an active ping before the link was * disconnected, we want to restore the ping time, otherwise @@ -3759,7 +3829,9 @@ void clusterReadHandler(connection *conn) { if (nread <= 0) { /* I/O error... */ - serverLog(LL_DEBUG, "I/O error reading from node link: %s", + serverLog(LL_DEBUG, "I/O error reading from node link (%.40s:%s): %s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound", (nread == 0) ? "connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; @@ -3806,7 +3878,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { if (!link) { return; } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + if (listLength(link->send_msg_queue) == 0 && getMessageFromSendBlock(msgblock)->totlen != 0) connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); listAddNodeTail(link->send_msg_queue, msgblock); @@ -3817,7 +3889,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { server.stat_cluster_links_memory += sizeof(listNode); /* Populate sent messages stats.
*/ - uint16_t type = ntohs(msgblock->msg.type); + uint16_t type = ntohs(getMessageFromSendBlock(msgblock)->type); if (type < CLUSTERMSG_TYPE_COUNT) server.cluster->stats_bus_messages_sent[type]++; } @@ -3940,6 +4012,12 @@ void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip information. */ void clusterSendPing(clusterLink *link, int type) { + serverLog(LL_DEBUG, "Sending %s packet to node %.40s (%s) on %s link", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "", + link->node ? link->node->human_nodename : "", + link->inbound ? "inbound" : "outbound"); + static unsigned long long cluster_pings_sent = 0; cluster_pings_sent++; int gossipcount = 0; /* Number of gossip sections added so far. */ @@ -3997,7 +4075,7 @@ void clusterSendPing(clusterLink *link, int type) { * sizeof(clusterMsg) or more. */ if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); @@ -4142,10 +4220,10 @@ clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); clusterMsgDataPublish *hdr_data_msg; if (is_light) { - clusterMsgLight *hdr_light = &msgblock->msg_light; + clusterMsgLight *hdr_light = getLightMessageFromSendBlock(msgblock); hdr_data_msg = &hdr_light->data.publish.msg; } else { - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr_data_msg = &hdr->data.publish.msg; } hdr_data_msg->channel_len = htonl(channel_len); @@ -4168,7 +4246,7 @@ void clusterSendFail(char *nodename) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFail); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.fail.about.nodename, nodename, CLUSTER_NAMELEN); clusterBroadcastMessage(msgblock); @@ -4184,7 +4262,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataUpdate); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.update.nodecfg.nodename, node->name, CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); memcpy(hdr->data.update.nodecfg.slots, node->slots, sizeof(node->slots)); @@ -4206,7 +4284,7 @@ void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, cons msglen += sizeof(clusterMsgModule) - 3 + len; clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. 
*/ hdr->data.module.msg.type = type; hdr->data.module.msg.len = htonl(len); @@ -4295,11 +4373,10 @@ void clusterRequestFailoverAuth(void) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - clusterMsg *hdr = &msgblock->msg; /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. */ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; clusterBroadcastMessage(msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -4376,18 +4453,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { return; } - /* We did not voted for a replica about this primary for two - * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. */ - if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this primary before %lld milliseconds", - node->name, node->human_nodename, - (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); - return; - } - /* The replica requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the primaries currently serving the same * slots in the current configuration. */ @@ -4401,7 +4466,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * by the replica requesting our vote. Refuse to vote for this replica. */ serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", + "slot %d epoch (%llu) > reqConfigEpoch (%llu)", node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch, (unsigned long long)requestConfigEpoch); return; @@ -4409,7 +4474,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, @@ -4502,7 +4566,7 @@ void clusterLogCantFailover(int reason) { case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break; case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break; case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break; - default: msg = "Unknown reason code."; break; + default: serverPanic("Unknown cant failover reason code."); } lastlog_time = time(NULL); serverLog(LL_NOTICE, "Currently unable to failover: %s", msg); @@ -4548,7 +4612,7 @@ void clusterFailoverReplaceYourPrimary(void) { /* 4) Pong all the other nodes so that they can update the state * accordingly and detect that we switched to primary role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); /* 5) If there was a manual failover in progress, clear the state. 
*/ resetManualFailover(); @@ -4693,8 +4757,8 @@ void clusterHandleReplicaFailover(void) { if (server.cluster->failover_auth_sent == 0) { server.cluster->currentEpoch++; server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.", - (unsigned long long)server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu", + (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself)); clusterRequestFailoverAuth(); server.cluster->failover_auth_sent = 1; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); @@ -4858,6 +4922,27 @@ void clusterHandleReplicaMigration(int max_replicas) { * data loss due to the asynchronous primary-replica replication. * -------------------------------------------------------------------------- */ +void manualFailoverCanStart(void) { + serverAssert(server.cluster->mf_can_start == 0); + + if (server.cluster->failover_auth_time) { + /* There is another manual failover requested by the user. + * If we have an ongoing election, reset it because the user may initiate + * manual failover again when the previous manual failover timed out. + * Otherwise, if the previous election timed out (see auth_timeout) and + * before the next retry (see auth_retry_time), the new manual failover + * will pause the primary and replica can not do anything to advance the + * manual failover, and then the manual failover eventually times out. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a new manual failover. " + "Resetting the election.", + (unsigned long long)server.cluster->failover_auth_epoch); + } + + server.cluster->mf_can_start = 1; +} + /* Reset the manual failover state. This works for both primaries and replicas * as all the state about manual failover is cleared. * @@ -4898,7 +4983,7 @@ void clusterHandleManualFailover(void) { if (server.cluster->mf_primary_offset == replicationGetReplicaOffset()) { /* Our replication offset matches the primary replication offset * announced after clients were paused. We can start the failover. */ - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); serverLog(LL_NOTICE, "All primary replication stream processed, " "manual failover can start."); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); @@ -4929,6 +5014,15 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ clusterDelNode(node); return 1; } + if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && + now - node->inbound_link_freed_time > handshake_timeout) { + /* Node has an outbound link, but no inbound link for more than the handshake timeout. + * This probably means this node does not know us yet, whereas we know it. + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. */ + node->flags |= CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); + } if (node->link == NULL) { clusterLink *link = createClusterLink(node); @@ -5018,7 +5112,7 @@ void clusterCron(void) { /* Ping some random node 1 time every 10 iterations, so that we usually ping * one random node every second. 
*/ - if (!(iteration % 10)) { + if (!server.debug_cluster_disable_random_ping && !(iteration % 10)) { int j; /* Check a few random nodes and ping the one with the oldest @@ -5195,6 +5289,13 @@ void clusterBeforeSleep(void) { int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; clusterSaveConfigOrDie(fsync); } + + if (flags & CLUSTER_TODO_BROADCAST_ALL) { + /* Broadcast a pong to all known nodes. This is useful when something changes + * in the configuration and we want to make the cluster aware of it before the + * regular ping. */ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } } void clusterDoBeforeSleep(int flags) { @@ -5343,6 +5444,23 @@ void clusterCloseAllSlots(void) { * Cluster state evaluation function * -------------------------------------------------------------------------- */ +void clusterLogFailReason(int reason) { + if (reason == CLUSTER_FAIL_NONE) return; + + char *msg; + switch (reason) { + case CLUSTER_FAIL_NOT_FULL_COVERAGE: + msg = "At least one hash slot is not served by any available node. " + "Please check the 'cluster-require-full-coverage' configuration."; + break; + case CLUSTER_FAIL_MINORITY_PARTITION: + msg = "I am part of a minority partition."; + break; + default: serverPanic("Unknown fail reason code."); + } + serverLog(LL_WARNING, "Cluster is currently down: %s", msg); +} + /* The following are defines that are only used in the evaluation function * and are based on heuristics. Actually the main point about the rejoin and * writable delay is that they should be a few orders of magnitude larger @@ -5352,7 +5470,7 @@ void clusterCloseAllSlots(void) { #define CLUSTER_WRITABLE_DELAY 2000 void clusterUpdateState(void) { - int j, new_state; + int j, new_state, new_reason; int reachable_primaries = 0; static mstime_t among_minority_time; static mstime_t first_call_time = 0; @@ -5373,12 +5491,14 @@ void clusterUpdateState(void) { /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ new_state = CLUSTER_OK; + new_reason = CLUSTER_FAIL_NONE; /* Check if all the slots are covered. */ if (server.cluster_require_full_coverage) { for (j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE; break; } } @@ -5413,6 +5533,7 @@ void clusterUpdateState(void) { if (reachable_primaries < needed_quorum) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_MINORITY_PARTITION; among_minority_time = mstime(); } } @@ -5436,7 +5557,21 @@ void clusterUpdateState(void) { serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s", new_state == CLUSTER_OK ? "ok" : "fail"); server.cluster->state = new_state; + + /* The cluster state changed from ok to fail; log the reason. */ + if (new_state == CLUSTER_FAIL) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } } + + /* The cluster state is still fail, but the reason has changed; log the new reason. */ + if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } + + if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE; } /* This function is called after the node startup in order to verify that data @@ -5598,12 +5733,12 @@ sds representClusterNodeFlags(sds ci, uint16_t flags) { * else each slot is added separately. 
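Several call sites in this patch (clusterFailoverReplaceYourPrimary(), CLUSTER SETSLOT, and CLUSTER REPLICATE) stop calling clusterBroadcastPong() inline and instead set CLUSTER_TODO_BROADCAST_ALL, which the clusterBeforeSleep() hunk above drains. A minimal sketch of the coalescing pattern, assuming a todo_before_sleep bit accumulator consistent with the clusterDoBeforeSleep() calls in this diff:

/* Requests are OR-ed into an accumulator... */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep |= flags;
}

/* ...and drained once before the event loop sleeps, so any number of
 * CLUSTER_TODO_BROADCAST_ALL requests in one iteration collapse into a
 * single PONG broadcast instead of one per caller. */
void clusterBeforeSleepSketch(void) {
    int flags = server.cluster->todo_before_sleep;
    server.cluster->todo_before_sleep = 0;
    if (flags & CLUSTER_TODO_BROADCAST_ALL) clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
}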
*/ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { for (int i = 0; i < slot_info_pairs_count; i += 2) { - unsigned long start = slot_info_pairs[i]; - unsigned long end = slot_info_pairs[i + 1]; + unsigned int start = slot_info_pairs[i]; + unsigned int end = slot_info_pairs[i + 1]; if (start == end) { - ci = sdscatfmt(ci, " %i", start); + ci = sdscatfmt(ci, " %u", start); } else { - ci = sdscatfmt(ci, " %i-%i", start, end); + ci = sdscatfmt(ci, " %u-%u", start, end); } } return ci; @@ -6104,12 +6239,13 @@ unsigned int delKeysInSlot(unsigned int hashslot) { server.server_del_keys_in_slot = 1; unsigned int j = 0; - kvstoreDictIterator *kvs_di = NULL; - dictEntry *de = NULL; - kvs_di = kvstoreGetDictSafeIterator(server.db->keys, hashslot); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + kvstoreHashtableIterator *kvs_di = NULL; + void *next; + kvs_di = kvstoreGetHashtableSafeIterator(server.db->keys, hashslot); + while (kvstoreHashtableIteratorNext(kvs_di, &next)) { + robj *valkey = next; enterExecutionUnit(1, 0); - sds sdskey = dictGetKey(de); + sds sdskey = objectGetKey(valkey); robj *key = createStringObject(sdskey, sdslen(sdskey)); dbDelete(&server.db[0], key); propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); @@ -6124,7 +6260,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { j++; server.dirty++; } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); server.server_del_keys_in_slot = 0; serverAssert(server.execution_nesting == 0); @@ -6133,7 +6269,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { /* Get the count of the channels for a given slot. */ unsigned int countChannelsInSlot(unsigned int hashslot) { - return kvstoreDictSize(server.pubsubshard_channels, hashslot); + return kvstoreHashtableSize(server.pubsubshard_channels, hashslot); } clusterNode *getMyClusterNode(void) { @@ -6545,7 +6681,7 @@ void clusterCommandSetSlot(client *c) { } /* After importing this slot, let the other nodes know as * soon as possible. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); } } } @@ -6577,6 +6713,10 @@ int clusterCommandSpecial(client *c) { addReplyErrorFormat(c, "Invalid node address specified: %s:%s", (char *)c->argv[2]->ptr, (char *)c->argv[3]->ptr); } else { + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster meet %s:%lld (user request from '%s').", (char *)c->argv[2]->ptr, port, + client); + sdsfree(client); addReply(c, shared.ok); } } else if (!strcasecmp(c->argv[1]->ptr, "flushslots") && c->argc == 2) { @@ -6691,6 +6831,9 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "Can't forget my master!"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster forget %s (user request from '%s').", (char *)c->argv[2]->ptr, client); + sdsfree(client); clusterBlacklistAddNode(n); clusterDelNode(n); clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); @@ -6737,8 +6880,7 @@ int clusterCommandSpecial(client *c) { * If the instance is a replica, it had a totally different replication history. * In these both cases, myself as a replica has to do a full sync. 
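The representSlotInfo() hunk above narrows the slot bounds from unsigned long to unsigned int and switches the format from %i to %u. The distinction matters because sdscatfmt() is not printf(): its %i specifier reads an int and %u an unsigned int from the varargs, with no length modifiers such as %lu, so the old code passed unsigned long arguments where int-sized ones were consumed. A tiny usage sketch:

sds ci = sdsempty();
unsigned int start = 0, end = 5460;
/* sdscatfmt consumes exactly an unsigned int for each %u. */
ci = sdscatfmt(ci, " %u-%u", start, end); /* ci is now " 0-5460" */
sdsfree(ci);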
*/ clusterSetPrimary(n, 1, 1); - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_BROADCAST_ALL); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "count-failure-reports") && c->argc == 3) { /* CLUSTER COUNT-FAILURE-REPORTS */ @@ -6780,7 +6922,7 @@ int clusterCommandSpecial(client *c) { } resetManualFailover(); server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); if (takeover) { /* A takeover does not perform any initial check. It just @@ -6795,7 +6937,7 @@ int clusterCommandSpecial(client *c) { * primary to agree about the offset. We just failover taking over * it without coordination. */ serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); /* We can start a manual failover as soon as possible, setting a flag * here so that we don't need to waiting for the cron to kick in. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); @@ -6859,6 +7001,9 @@ int clusterCommandSpecial(client *c) { "master nodes containing keys"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster reset (user request from '%s').", client); + sdsfree(client); clusterReset(hard); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5280644e6e..d3e1c3459e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -25,6 +25,7 @@ #define CLUSTER_TODO_SAVE_CONFIG (1 << 2) #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4) +#define CLUSTER_TODO_BROADCAST_ALL (1 << 5) /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { @@ -60,12 +61,14 @@ typedef struct clusterLink { #define nodeIsPrimary(n) ((n)->flags & CLUSTER_NODE_PRIMARY) #define nodeIsReplica(n) ((n)->flags & CLUSTER_NODE_REPLICA) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeInMeetState(n) ((n)->flags & CLUSTER_NODE_MEET) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) #define nodeSupportsExtensions(n) ((n)->flags & CLUSTER_NODE_EXTENSIONS_SUPPORTED) #define nodeSupportsLightMsgHdr(n) ((n)->flags & CLUSTER_NODE_LIGHT_HDR_SUPPORTED) +#define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) /* This structure represent elements of node->fail_reports. 
*/ typedef struct clusterNodeFailReport { @@ -338,9 +341,10 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ + mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. + If it was never freed, it is the same as ctime */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds announce_client_ipv4; /* IPv4 for clients only. */ @@ -368,6 +372,7 @@ struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int fail_reason; /* Why the cluster state changes to fail. */ int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ diff --git a/src/commands.def b/src/commands.def index 791b30d540..f03e44db9f 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1230,6 +1230,34 @@ struct COMMAND_ARG CLIENT_CAPA_Args[] = { #define CLIENT_ID_Keyspecs NULL #endif +/********** CLIENT IMPORT_SOURCE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLIENT IMPORT_SOURCE history */ +#define CLIENT_IMPORT_SOURCE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLIENT IMPORT_SOURCE tips */ +#define CLIENT_IMPORT_SOURCE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLIENT IMPORT_SOURCE key specs */ +#define CLIENT_IMPORT_SOURCE_Keyspecs NULL +#endif + +/* CLIENT IMPORT_SOURCE enabled argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_enabled_Subargs[] = { +{MAKE_ARG("on",ARG_TYPE_PURE_TOKEN,-1,"ON",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("off",ARG_TYPE_PURE_TOKEN,-1,"OFF",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* CLIENT IMPORT_SOURCE argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_Args[] = { +{MAKE_ARG("enabled",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=CLIENT_IMPORT_SOURCE_enabled_Subargs}, +}; + /********** CLIENT INFO ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1630,6 +1658,7 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import-source","Mark this client as an import source 
when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, {MAKE_CMD("info","Returns information about the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,7,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, {MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,7,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, @@ -7291,8 +7320,8 @@ struct COMMAND_ARG MEMORY_USAGE_Args[] = { struct COMMAND_STRUCT MEMORY_Subcommands[] = { {MAKE_CMD("doctor","Outputs a memory problems report.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_DOCTOR_History,0,MEMORY_DOCTOR_Tips,3,memoryCommand,2,0,0,MEMORY_DOCTOR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_HELP_History,0,MEMORY_HELP_Tips,0,memoryCommand,2,CMD_LOADING|CMD_STALE,0,MEMORY_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, -{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,0,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,CMD_LOADING,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,CMD_LOADING,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, {MAKE_CMD("stats","Returns details about memory usage.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_STATS_History,0,MEMORY_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_STATS_Keyspecs,0,NULL,0)}, {MAKE_CMD("usage","Estimates the memory usage of a key.","O(N) where N is the number of 
samples.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_USAGE_History,0,MEMORY_USAGE_Tips,0,memoryCommand,-3,CMD_READONLY,0,MEMORY_USAGE_Keyspecs,1,NULL,2),.args=MEMORY_USAGE_Args}, {0} @@ -10603,6 +10632,7 @@ commandHistory SET_History[] = { {"6.0.0","Added the `KEEPTTL` option."}, {"6.2.0","Added the `GET`, `EXAT` and `PXAT` option."}, {"7.0.0","Allowed the `NX` and `GET` options to be used together."}, +{"8.1.0","Added the `IFEQ` option."}, }; #endif @@ -10620,8 +10650,9 @@ keySpec SET_Keyspecs[1] = { /* SET condition argument table */ struct COMMAND_ARG SET_condition_Subargs[] = { -{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,"2.6.12",CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,"2.6.12",CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("comparison-value",ARG_TYPE_STRING,-1,"IFEQ","Sets the key's value only if the current value matches the specified comparison value.","8.1.0",CMD_ARG_NONE,0,NULL)}, }; /* SET expiration argument table */ @@ -10637,7 +10668,7 @@ struct COMMAND_ARG SET_expiration_Subargs[] = { struct COMMAND_ARG SET_Args[] = { {MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,"2.6.12",CMD_ARG_OPTIONAL,2,NULL),.subargs=SET_condition_Subargs}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,3,NULL),.subargs=SET_condition_Subargs}, {MAKE_ARG("get",ARG_TYPE_PURE_TOKEN,-1,"GET",NULL,"6.2.0",CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("expiration",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,5,NULL),.subargs=SET_expiration_Subargs}, }; @@ -11110,7 +11141,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("mset","Atomically creates or modifies the string values of one or more keys.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSET_History,0,MSET_Tips,2,msetCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSET_Keyspecs,1,NULL,1),.args=MSET_Args}, {MAKE_CMD("msetnx","Atomically modifies the string values of one or more keys only when all keys don't exist.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSETNX_History,0,MSETNX_Tips,0,msetnxCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSETNX_Keyspecs,1,NULL,1),.args=MSETNX_Args}, {MAKE_CMD("psetex","Sets both string value and expiration time in milliseconds of a key. The key is created if it doesn't exist.","O(1)","2.6.0",CMD_DOC_DEPRECATED,"`SET` with the `PX` argument","2.6.12","string",COMMAND_GROUP_STRING,PSETEX_History,0,PSETEX_Tips,0,psetexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,PSETEX_Keyspecs,1,NULL,3),.args=PSETEX_Args}, -{MAKE_CMD("set","Sets the string value of a key, ignoring its type. The key is created if it doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SET_History,4,SET_Tips,0,setCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SET_Keyspecs,1,setGetKeys,5),.args=SET_Args}, +{MAKE_CMD("set","Sets the string value of a key, ignoring its type. 
The key is created if it doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SET_History,5,SET_Tips,0,setCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SET_Keyspecs,1,setGetKeys,5),.args=SET_Args}, {MAKE_CMD("setex","Sets the string value and expiration time of a key. Creates the key if it doesn't exist.","O(1)","2.0.0",CMD_DOC_DEPRECATED,"`SET` with the `EX` argument","2.6.12","string",COMMAND_GROUP_STRING,SETEX_History,0,SETEX_Tips,0,setexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SETEX_Keyspecs,1,NULL,3),.args=SETEX_Args}, {MAKE_CMD("setnx","Set the string value of a key only when the key doesn't exist.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`SET` with the `NX` argument","2.6.12","string",COMMAND_GROUP_STRING,SETNX_History,0,SETNX_Tips,0,setnxCommand,3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,SETNX_Keyspecs,1,NULL,2),.args=SETNX_Args}, {MAKE_CMD("setrange","Overwrites a part of a string value with another by an offset. Creates the key if it doesn't exist.","O(1), not counting the time taken to copy the new string in place. Usually, this string is very small so the amortized complexity is O(1). Otherwise, complexity is O(M) with M being the length of the value argument.","2.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SETRANGE_History,0,SETRANGE_Tips,0,setrangeCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SETRANGE_Keyspecs,1,NULL,3),.args=SETRANGE_Args}, diff --git a/src/commands/client-import-source.json b/src/commands/client-import-source.json new file mode 100644 index 0000000000..113c07d70a --- /dev/null +++ b/src/commands/client-import-source.json @@ -0,0 +1,40 @@ +{ + "IMPORT-SOURCE": { + "summary": "Mark this client as an import source when server is in import mode.", + "complexity": "O(1)", + "group": "connection", + "since": "8.1.0", + "arity": 3, + "container": "CLIENT", + "function": "clientCommand", + "command_flags": [ + "NOSCRIPT", + "LOADING", + "STALE" + ], + "acl_categories": [ + "CONNECTION" + ], + "reply_schema": { + "const": "OK" + }, + "arguments": [ + { + "name": "enabled", + "type": "oneof", + "arguments": [ + { + "name": "on", + "type": "pure-token", + "token": "ON" + }, + { + "name": "off", + "type": "pure-token", + "token": "OFF" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/commands/memory-malloc-stats.json b/src/commands/memory-malloc-stats.json index 5ef6a31c40..af5d439744 100644 --- a/src/commands/memory-malloc-stats.json +++ b/src/commands/memory-malloc-stats.json @@ -12,6 +12,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:SPECIAL" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "type": "string", "description": "The memory allocator's internal statistics report." diff --git a/src/commands/memory-purge.json b/src/commands/memory-purge.json index 77ed61dc5b..aea3e2d24a 100644 --- a/src/commands/memory-purge.json +++ b/src/commands/memory-purge.json @@ -11,6 +11,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:ALL_SUCCEEDED" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "const": "OK" } diff --git a/src/commands/set.json b/src/commands/set.json index 8236bc7bb9..3d3800f11d 100644 --- a/src/commands/set.json +++ b/src/commands/set.json @@ -23,6 +23,10 @@ [ "7.0.0", "Allowed the `NX` and `GET` options to be used together." + ], + [ + "8.1.0", + "Added the `IFEQ` option." 
] ], "command_flags": [ @@ -89,17 +93,32 @@ "name": "condition", "type": "oneof", "optional": true, - "since": "2.6.12", "arguments": [ { "name": "nx", "type": "pure-token", - "token": "NX" + "token": "NX", + "since": "2.6.12" }, { "name": "xx", "type": "pure-token", - "token": "XX" + "token": "XX", + "since": "2.6.12" + }, + { + "name": "comparison-value", + "type": "string", + "token": "IFEQ", + "since": "8.1.0", + "summary": "Sets the key's value only if the current value matches the specified comparison value.", + "arguments": [ + { + "name": "comparison-value", + "type": "string", + "summary": "The value to compare with the current key's value before setting." + } + ] } ] }, diff --git a/src/config.c b/src/config.c index 15fec15276..e1cee3f95b 100644 --- a/src/config.c +++ b/src/config.c @@ -539,7 +539,6 @@ void loadServerConfigFromString(char *config) { loadServerConfig(argv[1], 0, NULL); } else if (!strcasecmp(argv[0], "rename-command") && argc == 3) { struct serverCommand *cmd = lookupCommandBySds(argv[1]); - int retval; if (!cmd) { err = "No such command in rename-command"; @@ -548,16 +547,13 @@ void loadServerConfigFromString(char *config) { /* If the target command name is the empty string we just * remove it from the command table. */ - retval = dictDelete(server.commands, argv[1]); - serverAssert(retval == DICT_OK); + serverAssert(hashtableDelete(server.commands, argv[1])); /* Otherwise we re-add the command under a different name. */ if (sdslen(argv[2]) != 0) { - sds copy = sdsdup(argv[2]); - - retval = dictAdd(server.commands, copy, cmd); - if (retval != DICT_OK) { - sdsfree(copy); + sdsfree(cmd->fullname); + cmd->fullname = sdsdup(argv[2]); + if (!hashtableAdd(server.commands, cmd)) { err = "Target command name already exists"; goto loaderr; } @@ -1536,10 +1532,27 @@ void rewriteConfigOOMScoreAdjValuesOption(standardConfig *config, const char *na } /* Rewrite the bind option. */ -void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { +static void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state, char **bindaddr, int bindaddr_count) { UNUSED(config); int force = 1; sds line, addresses; + + /* Rewrite as bind ... */ + if (bindaddr_count > 0) + addresses = sdsjoin(bindaddr, bindaddr_count, " "); + else + addresses = sdsnew("\"\""); + line = sdsnew(name); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, addresses); + sdsfree(addresses); + + rewriteConfigRewriteLine(state, name, line, force); +} + +/* Rewrite the bind option. */ +static void rewriteConfigSocketBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); int is_default = 0; /* Compare server.bindaddr with CONFIG_DEFAULT_BINDADDR */ @@ -1559,17 +1572,7 @@ void rewriteConfigBindOption(standardConfig *config, const char *name, struct re return; } - /* Rewrite as bind ... */ - if (server.bindaddr_count > 0) - addresses = sdsjoin(server.bindaddr, server.bindaddr_count, " "); - else - addresses = sdsnew("\"\""); - line = sdsnew(name); - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, addresses); - sdsfree(addresses); - - rewriteConfigRewriteLine(state, name, line, force); + rewriteConfigBindOption(config, name, state, server.bindaddr, server.bindaddr_count); } /* Rewrite the loadmodule option. 
*/ @@ -2637,7 +2640,7 @@ static int applyBind(const char **err) { tcp_listener->ct = connectionByType(CONN_TYPE_SOCKET); if (changeListener(tcp_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - if (tls_listener) closeListener(tls_listener); /* failed with TLS together */ + if (tls_listener) connCloseListener(tls_listener); /* failed with TLS together */ return 0; } @@ -2649,7 +2652,7 @@ static int applyBind(const char **err) { tls_listener->ct = connectionByType(CONN_TYPE_TLS); if (changeListener(tls_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - closeListener(tcp_listener); /* failed with TCP together */ + connCloseListener(tcp_listener); /* failed with TCP together */ return 0; } } @@ -2922,8 +2925,9 @@ static sds getConfigNotifyKeyspaceEventsOption(standardConfig *config) { return keyspaceEventsFlagsToString(server.notify_keyspace_events); } -static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err) { +static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err, char **bindaddr, int *bindaddr_count) { UNUSED(config); + int orig_bindaddr_count = *bindaddr_count; int j; if (argc > CONFIG_BINDADDR_MAX) { @@ -2935,11 +2939,73 @@ static int setConfigBindOption(standardConfig *config, sds *argv, int argc, cons if (argc == 1 && sdslen(argv[0]) == 0) argc = 0; /* Free old bind addresses */ - for (j = 0; j < server.bindaddr_count; j++) { - zfree(server.bindaddr[j]); + for (j = 0; j < orig_bindaddr_count; j++) zfree(bindaddr[j]); + for (j = 0; j < argc; j++) bindaddr[j] = zstrdup(argv[j]); + *bindaddr_count = argc; + + return 1; +} + +static int setConfigSocketBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.bindaddr, &server.bindaddr_count); +} + +static int setConfigRdmaBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.rdma_ctx_config.bindaddr, &server.rdma_ctx_config.bindaddr_count); +} + +static sds getConfigRdmaBindOption(standardConfig *config) { + UNUSED(config); + return sdsjoin(server.rdma_ctx_config.bindaddr, server.rdma_ctx_config.bindaddr_count, " "); +} + +static void rewriteConfigRdmaBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); + + if (server.rdma_ctx_config.bindaddr_count) { + rewriteConfigBindOption(config, name, state, server.rdma_ctx_config.bindaddr, + server.rdma_ctx_config.bindaddr_count); + } +} + +static int applyRdmaBind(const char **err) { + connListener *rdma_listener = listenerByType(CONN_TYPE_RDMA); + + if (!rdma_listener) { + *err = "No RDMA building support."; + return 0; + } + + rdma_listener->bindaddr = server.rdma_ctx_config.bindaddr; + rdma_listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + rdma_listener->port = server.rdma_ctx_config.port; + rdma_listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(rdma_listener) == C_ERR) { + *err = "Failed to bind to specified addresses for RDMA."; + return 0; + } + + return 1; +} + +static int updateRdmaPort(const char **err) { + connListener *listener = listenerByType(CONN_TYPE_RDMA); + + if (listener == NULL) { + *err = "No RDMA building support."; + return 0; + } + + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = 
server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(listener) == C_ERR) { + *err = "Unable to listen on this port for RDMA. Check server logs."; + return 0; } - for (j = 0; j < argc; j++) server.bindaddr[j] = zstrdup(argv[j]); - server.bindaddr_count = argc; return 1; } @@ -3120,7 +3186,7 @@ standardConfig static_configs[] = { createBoolConfig("replica-read-only", "slave-read-only", DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_replica_ro, 1, NULL, NULL), createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, server.repl_replica_ignore_maxmemory, 1, NULL, NULL), createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, server.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread), - createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, 0, isValidActiveDefrag, NULL), + createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, CONFIG_ACTIVE_DEFRAG_DEFAULT, isValidActiveDefrag, NULL), createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, server.syslog_enabled, 0, NULL, NULL), createBoolConfig("cluster-enabled", NULL, IMMUTABLE_CONFIG, server.cluster_enabled, 0, NULL, NULL), createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG | DENY_LOADING_CONFIG, server.aof_enabled, 0, NULL, updateAppendonly), @@ -3139,6 +3205,7 @@ standardConfig static_configs[] = { createBoolConfig("enable-debug-assert", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, server.enable_debug_assert, 0, NULL, NULL), createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL), createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), + createBoolConfig("import-mode", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), @@ -3200,17 +3267,18 @@ standardConfig static_configs[] = { createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL), createIntConfig("port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.port, 6379, INTEGER_CONFIG, NULL, updatePort), /* TCP port. */ createIntConfig("io-threads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, 1, IO_THREADS_MAX_NUM, server.io_threads_num, 1, INTEGER_CONFIG, NULL, NULL), /* Single threaded by default */ - createIntConfig("events-per-io-thread", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.events_per_io_thread, 2, INTEGER_CONFIG, NULL, NULL), + createIntConfig("events-per-io-thread", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.events_per_io_thread, 2, INTEGER_CONFIG, NULL, NULL), createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG, 0, 128, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL), createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_replica_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* replica max data age factor. 
*/ createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), createIntConfig("tcp-keepalive", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tcpkeepalive, 300, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-migration-barrier", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_migration_barrier, 1, INTEGER_CONFIG, NULL, NULL), - createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ - createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ + createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ + createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ createIntConfig("active-defrag-threshold-lower", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_lower, 10, INTEGER_CONFIG, NULL, NULL), /* Default: don't defrag when fragmentation is below 10% */ createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: maximum defrag force at 100% fragmentation */ + createIntConfig("active-defrag-cycle-us", NULL, MODIFIABLE_CONFIG, 0, 100000, server.active_defrag_cycle_us, 500, INTEGER_CONFIG, NULL, updateDefragConfiguration), createIntConfig("lfu-log-factor", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_log_factor, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("lfu-decay-time", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_decay_time, 1, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-priority", "slave-priority", MODIFIABLE_CONFIG, 0, INT_MAX, server.replica_priority, 100, INTEGER_CONFIG, NULL, NULL), @@ -3236,6 +3304,9 @@ standardConfig static_configs[] = { createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod), createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.rdma_ctx_config.port, 0, INTEGER_CONFIG, NULL, updateRdmaPort), + createIntConfig("rdma-rx-size", NULL, IMMUTABLE_CONFIG, 64 * 1024, 16 * 1024 * 1024, server.rdma_ctx_config.rx_size, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-completion-vector", NULL, IMMUTABLE_CONFIG, -1, 1024, server.rdma_ctx_config.completion_vector, -1, INTEGER_CONFIG, NULL, NULL), /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), @@ -3315,7 +3386,8 @@ standardConfig static_configs[] = { createSpecialConfig("client-output-buffer-limit", NULL, MODIFIABLE_CONFIG | 
MULTI_ARG_CONFIG, setConfigClientOutputBufferLimitOption, getConfigClientOutputBufferLimitOption, rewriteConfigClientOutputBufferLimitOption, NULL), createSpecialConfig("oom-score-adj-values", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigOOMScoreAdjValuesOption, getConfigOOMScoreAdjValuesOption, rewriteConfigOOMScoreAdjValuesOption, updateOOMScoreAdj), createSpecialConfig("notify-keyspace-events", NULL, MODIFIABLE_CONFIG, setConfigNotifyKeyspaceEventsOption, getConfigNotifyKeyspaceEventsOption, rewriteConfigNotifyKeyspaceEventsOption, NULL), - createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigBindOption, getConfigBindOption, rewriteConfigBindOption, applyBind), + createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigSocketBindOption, getConfigBindOption, rewriteConfigSocketBindOption, applyBind), + createSpecialConfig("rdma-bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigRdmaBindOption, getConfigRdmaBindOption, rewriteConfigRdmaBindOption, applyRdmaBind), createSpecialConfig("replicaof", "slaveof", IMMUTABLE_CONFIG | MULTI_ARG_CONFIG, setConfigReplicaOfOption, getConfigReplicaOfOption, rewriteConfigReplicaOfOption, NULL), createSpecialConfig("latency-tracking-info-percentiles", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigLatencyTrackingInfoPercentilesOutputOption, getConfigLatencyTrackingInfoPercentilesOutputOption, rewriteConfigLatencyTrackingInfoPercentilesOutputOption, NULL), diff --git a/src/config.h b/src/config.h index 3b79c5c681..a2e9f353dc 100644 --- a/src/config.h +++ b/src/config.h @@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist); #define valkey_prefetch(addr) ((void)(addr)) #endif +/* Check if we can compile AVX2 code */ +#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined(HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/connection.c b/src/connection.c index f0c1c2d364..8807541d77 100644 --- a/src/connection.c +++ b/src/connection.c @@ -66,6 +66,9 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); + /* may fail if without BUILD_RDMA=yes */ + RegisterConnectionTypeRdma(); + return C_OK; } diff --git a/src/connection.h b/src/connection.h index 0762441732..fd7e0910cf 100644 --- a/src/connection.h +++ b/src/connection.h @@ -54,12 +54,14 @@ typedef enum { CONN_STATE_ERROR } ConnectionState; -#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ -#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ +#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_ALLOW_ACCEPT_OFFLOAD (1 << 2) /* Connection accept can be offloaded to IO threads. 
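Two of the additions above are easiest to see from the usage side. First, the rdma-* options register an RDMA listener alongside TCP and TLS; hypothetical valkey.conf lines wiring it up (option names and defaults from this diff, the address purely illustrative):

rdma-bind 192.168.1.10
rdma-port 6379
rdma-rx-size 1048576
rdma-completion-vector -1

Second, ATTRIBUTE_TARGET_AVX2 from the config.h hunk lets a single translation unit compile one function with AVX2 code generation while the rest stays generic, with the variant chosen at run time. A minimal sketch under those macros (function names invented for this example):

#ifdef HAVE_AVX2
#include <immintrin.h>
ATTRIBUTE_TARGET_AVX2
static void add8FloatsAvx2(const float *a, const float *b, float *out) {
    /* One 256-bit lane: load 8 floats from each input, add, store. */
    _mm256_storeu_ps(out, _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b)));
}
#endif

static void add8Floats(const float *a, const float *b, float *out) {
#ifdef HAVE_AVX2
    if (__builtin_cpu_supports("avx2")) {
        add8FloatsAvx2(a, b, out);
        return;
    }
#endif
    for (int i = 0; i < 8; i++) out[i] = a[i] + b[i]; /* portable fallback */
}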
*/ #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" +#define CONN_TYPE_RDMA "rdma" #define CONN_TYPE_MAX 8 /* 8 is enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -79,6 +81,7 @@ typedef struct ConnectionType { int (*addr)(connection *conn, char *ip, size_t ip_len, int *port, int remote); int (*is_local)(connection *conn); int (*listen)(connListener *listener); + void (*closeListener)(connListener *listener); /* create/shutdown/close connection */ connection *(*conn_create)(void); @@ -442,6 +445,13 @@ static inline int connListen(connListener *listener) { return listener->ct->listen(listener); } +/* Close a listened listener */ +static inline void connCloseListener(connListener *listener) { + if (listener->count) { + listener->ct->closeListener(listener); + } +} + /* Get accept_handler of a connection type */ static inline aeFileProc *connAcceptHandler(ConnectionType *ct) { if (ct) return ct->accept_handler; @@ -454,6 +464,7 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); +int RegisterConnectionTypeRdma(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/db.c b/src/db.c index 3e0e5a2e63..e31d7e7f7f 100644 --- a/src/db.c +++ b/src/db.c @@ -52,13 +52,14 @@ typedef enum { KEY_DELETED /* The key was deleted now. */ } keyStatus; -keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index); -keyStatus expireIfNeeded(serverDb *db, robj *key, int flags); -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index); -int keyIsExpired(serverDb *db, robj *key); -static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de); +static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, int flags, int dict_index); +static keyStatus expireIfNeeded(serverDb *db, robj *key, robj *val, int flags); +static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index); +static int objectIsExpired(robj *val); +static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, void **oldref); static int getKVStoreIndexForKey(sds key); -dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index); +static robj *dbFindWithDictIndex(serverDb *db, sds key, int dict_index); +static robj *dbFindExpiresWithDictIndex(serverDb *db, sds key, int dict_index); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -97,10 +98,9 @@ void updateLFU(robj *val) { * expired on replicas even if the primary is lagging expiring our key via DELs * in the replication link. */ robj *lookupKey(serverDb *db, robj *key, int flags) { - dictEntry *de = dbFind(db, key->ptr); - robj *val = NULL; - if (de) { - val = dictGetVal(de); + int dict_index = getKVStoreIndexForKey(key->ptr); + robj *val = dbFindWithDictIndex(db, key->ptr, dict_index); + if (val) { /* Forcing deletion of expired keys on a replica makes the replica * inconsistent with the primary. 
We forbid it on readonly replicas, but * we have to allow it on writable replicas to make write commands @@ -113,7 +113,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; - if (expireIfNeeded(db, key, expire_flags) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, key, val, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. */ val = NULL; } @@ -127,10 +127,8 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { server.current_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { - if (!canUseSharedObject() && val->refcount == OBJ_SHARED_REFCOUNT) { - val = dupStringObject(val); - kvstoreDictSetVal(db->keys, getKVStoreIndexForKey(key->ptr), de, val); - } + /* Shared objects can't be stored in the database. */ + serverAssert(val->refcount != OBJ_SHARED_REFCOUNT); if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { updateLFU(val); } else { @@ -195,32 +193,47 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { return o; } -/* Add the key to the DB. +/* Add a key-value entry to the DB. + * + * A copy of 'key' is stored in the database. The caller must ensure the + * `key` is properly freed by calling decrRefcount(key). * - * In this case a copy of `key` is copied in kvstore, the caller must ensure the `key` is properly freed. + * The value may (if its reference counter == 1) be reallocated and become + * invalid after a call to this function. The (possibly reallocated) value is + * stored in the database and the 'valref' pointer is updated to point to the + * new allocation. * - * It's up to the caller to increment the reference - * counter of the value if needed. + * The reference counter of the value pointed to by valref is not incremented, + * so the caller should not free the value using decrRefcount after calling this + * function. * * If the update_if_existing argument is false, the program is aborted * if the key already exists, otherwise, it can fall back to dbOverwrite. */ -static void dbAddInternal(serverDb *db, robj *key, robj *val, int update_if_existing) { - dictEntry *existing; +static void dbAddInternal(serverDb *db, robj *key, robj **valref, int update_if_existing) { int dict_index = getKVStoreIndexForKey(key->ptr); - dictEntry *de = kvstoreDictAddRaw(db->keys, dict_index, key->ptr, &existing); - if (update_if_existing && existing) { - dbSetValue(db, key, val, 1, existing); - return; + void **oldref = NULL; + if (update_if_existing) { + oldref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr); + if (oldref != NULL) { + dbSetValue(db, key, valref, 1, oldref); + return; + } + } else { + debugServerAssertWithInfo(NULL, key, kvstoreHashtableFindRef(db->keys, dict_index, key->ptr) == NULL); } - serverAssertWithInfo(NULL, key, de != NULL); + + /* Not existing. Convert val to valkey object and insert. 
*/ + robj *val = *valref; + val = objectSetKeyAndExpire(val, key->ptr, -1); initObjectLRUOrLFU(val); - kvstoreDictSetVal(db->keys, dict_index, de, val); + kvstoreHashtableAdd(db->keys, dict_index, val); signalKeyAsReady(db, key, val->type); notifyKeyspaceEvent(NOTIFY_NEW, "new", key, db->id); + *valref = val; } -void dbAdd(serverDb *db, robj *key, robj *val) { - dbAddInternal(db, key, val, 0); +void dbAdd(serverDb *db, robj *key, robj **valref) { + dbAddInternal(db, key, valref, 0); } /* Returns which dict index should be used with kvstore for a given key. */ @@ -268,20 +281,32 @@ int getKeySlot(sds key) { * since it is not useful in this context. * * The function returns 1 if the key was added to the database, otherwise 0 is returned. - * - * In this case a copy of `key` is copied in kvstore, the caller must ensure the `key` is properly freed. */ -int dbAddRDBLoad(serverDb *db, sds key, robj *val) { +int dbAddRDBLoad(serverDb *db, sds key, robj **valref) { int dict_index = getKVStoreIndexForKey(key); - dictEntry *de = kvstoreDictAddRaw(db->keys, dict_index, key, NULL); - if (de == NULL) return 0; + hashtablePosition pos; + if (!kvstoreHashtableFindPositionForInsert(db->keys, dict_index, key, &pos, NULL)) { + return 0; + } + robj *val = *valref; + val = objectSetKeyAndExpire(val, key, -1); + kvstoreHashtableInsertAtPosition(db->keys, dict_index, val, &pos); initObjectLRUOrLFU(val); - kvstoreDictSetVal(db->keys, dict_index, de, val); + *valref = val; return 1; } -/* Overwrite an existing key with a new value. Incrementing the reference - * count of the new value is up to the caller. +/* Overwrite an existing key with a new value. + * + * The value may (if its reference counter == 1) be reallocated and become + * invalid after a call to this function. The (possibly reallocated) value is + * stored in the database and the 'valref' pointer is updated to point to the + * new allocation. + * + * The reference counter of the value pointed to by valref is not incremented, + * so the caller should not free the value using decrRefcount after calling this + * function. + * * This function does not modify the expire time of the existing key. * * The 'overwrite' flag is an indication whether this is done as part of a @@ -289,19 +314,23 @@ int dbAddRDBLoad(serverDb *db, sds key, robj *val) { * replacement (in which case we need to emit deletion signals), or just an * update of a value of an existing key (when false). * - * The dictEntry input is optional, can be used if we already have one. + * The 'oldref' argument is optional. If provided, it is a pointer to the + * location within the hash table where the old value is stored and the new + * value should be stored. * * The program is aborted if the key was not already present. 
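The comments above define the new pass-by-reference ownership contract, which is easiest to see from a caller's side. A minimal sketch (the helper is invented for this example):

/* dbAdd() may reallocate the value, because the key and expire are embedded
 * into the object via objectSetKeyAndExpire(), and it stores the surviving
 * pointer back through valref. */
static void exampleInsert(serverDb *db, robj *key) {
    robj *val = createStringObject("example-value", 13);
    dbAdd(db, key, &val);
    /* 'val' may now point to a different allocation; the database owns it,
     * so the caller must not decrRefCount() it. */
}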
*/ -static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de) { - int dict_index = getKVStoreIndexForKey(key->ptr); - if (!de) de = kvstoreDictFind(db->keys, dict_index, key->ptr); - serverAssertWithInfo(NULL, key, de != NULL); - robj *old = dictGetVal(de); - - val->lru = old->lru; +static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, void **oldref) { + robj *val = *valref; + if (oldref == NULL) { + int dict_index = getKVStoreIndexForKey(key->ptr); + oldref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr); + } + serverAssertWithInfo(NULL, key, oldref != NULL); + robj *old = *oldref; + robj *new; if (overwrite) { - /* RM_StringDMA may call dbUnshareStringValue which may free val, so we + /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain old */ incrRefCount(old); /* Although the key is not really deleted from the database, we regard @@ -311,10 +340,40 @@ static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEn /* We want to try to unblock any module clients or clients using a blocking XREADGROUP */ signalDeletedKeyAsReady(db, key, old->type); decrRefCount(old); - /* Because of RM_StringDMA, old may be changed, so we need get old again */ - old = dictGetVal(de); + /* Because of VM_StringDMA, old may be changed, so we need to get old again */ + old = *oldref; + } + + if ((old->refcount == 1 && old->encoding != OBJ_ENCODING_EMBSTR) && + (val->refcount == 1 && val->encoding != OBJ_ENCODING_EMBSTR)) { + /* Keep the old object in the database. Just swap its ptr, type and + * encoding with the content of val. */ + int tmp_type = old->type; + int tmp_encoding = old->encoding; + void *tmp_ptr = old->ptr; + old->type = val->type; + old->encoding = val->encoding; + old->ptr = val->ptr; + val->type = tmp_type; + val->encoding = tmp_encoding; + val->ptr = tmp_ptr; + /* Set new to old to keep the old object. Set old to val to be freed below. */ + new = old; + old = val; + } else { + /* Replace the old value at its location in the key space. */ + val->lru = old->lru; + long long expire = objectGetExpire(old); + new = objectSetKeyAndExpire(val, key->ptr, expire); + *oldref = new; + /* Replace the old value at its location in the expire space. */ + if (expire >= 0) { + int dict_index = getKVStoreIndexForKey(key->ptr); + void **expireref = kvstoreHashtableFindRef(db->expires, dict_index, key->ptr); + serverAssert(expireref != NULL); + *expireref = new; + } } - kvstoreDictSetVal(db->keys, dict_index, de, val); /* For efficiency, let the I/O thread that allocated an object also deallocate it. */ if (tryOffloadFreeObjToIOThreads(old) == C_OK) { /* OK */ @@ -323,18 +382,21 @@ static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEn } else { decrRefCount(old); } + *valref = new; } /* Replace an existing key with a new value, we just replace value and don't * emit any events */ -void dbReplaceValue(serverDb *db, robj *key, robj *val) { - dbSetValue(db, key, val, 0, NULL); +void dbReplaceValue(serverDb *db, robj *key, robj **valref) { + dbSetValue(db, key, valref, 0, NULL); } /* High level Set operation. This function can be used in order to set * a key, whatever it was existing or not, to a new object. * - * 1) The ref count of the value object is incremented. + * 1) The value may be reallocated when adding it to the database. The value + * pointer 'valref' is updated to point to the reallocated object. 
The + * reference count of the value object is *not* incremented. * 2) clients WATCHing for the destination key notified. * 3) The expire time of the key is reset (the key is made persistent), * unless 'SETKEY_KEEPTTL' is enabled in flags. @@ -344,7 +406,7 @@ void dbReplaceValue(serverDb *db, robj *key, robj *val) { * All the new keys in the database should be created via this interface. * The client 'c' argument may be set to NULL if the operation is performed * in a context where there is no clear client performing the operation. */ -void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { +void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { int keyfound = 0; if (flags & SETKEY_ALREADY_EXIST) @@ -355,13 +417,12 @@ void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { keyfound = (lookupKeyWrite(db, key) != NULL); if (!keyfound) { - dbAdd(db, key, val); + dbAdd(db, key, valref); } else if (keyfound < 0) { - dbAddInternal(db, key, val, 1); + dbAddInternal(db, key, valref, 1); } else { - dbSetValue(db, key, val, 1, NULL); + dbSetValue(db, key, valref, 1, NULL); } - incrRefCount(val); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -371,24 +432,22 @@ void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { * * The function makes sure to return keys not already expired. */ robj *dbRandomKey(serverDb *db) { - dictEntry *de; int maxtries = 100; int allvolatile = kvstoreSize(db->keys) == kvstoreSize(db->expires); while (1) { - sds key; - robj *keyobj; - int randomDictIndex = kvstoreGetFairRandomDictIndex(db->keys); - de = kvstoreDictGetFairRandomKey(db->keys, randomDictIndex); - if (de == NULL) return NULL; - - key = dictGetKey(de); - keyobj = createStringObject(key, sdslen(key)); - if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) { - if (allvolatile && server.primary_host && --maxtries == 0) { + void *entry; + int randomDictIndex = kvstoreGetFairRandomHashtableIndex(db->keys); + int ok = kvstoreHashtableFairRandomEntry(db->keys, randomDictIndex, &entry); + if (!ok) return NULL; + robj *valkey = entry; + sds key = objectGetKey(valkey); + robj *keyobj = createStringObject(key, sdslen(key)); + if (objectIsExpired(valkey)) { + if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically - * expired in the repilca, so the function cannot stop because + * expired in the replica, so the function cannot stop because * expireIfNeeded() is false, nor it can stop because * dictGetFairRandomKey() returns NULL (there are keys to return). * To prevent the infinite loop we do some tries, but if there @@ -396,7 +455,7 @@ robj *dbRandomKey(serverDb *db) { * return a key name that may be already expired. */ return keyobj; } - if (expireIfNeededWithDictIndex(db, keyobj, 0, randomDictIndex) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, keyobj, valkey, 0, randomDictIndex) != KEY_VALID) { decrRefCount(keyobj); continue; /* search for another key. This expired. 
*/ } @@ -406,31 +465,38 @@ robj *dbRandomKey(serverDb *db) { } int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, int dict_index) { - dictEntry **plink; - int table; - dictEntry *de = kvstoreDictTwoPhaseUnlinkFind(db->keys, dict_index, key->ptr, &plink, &table); - if (de) { - robj *val = dictGetVal(de); - /* RM_StringDMA may call dbUnshareStringValue which may free val, so we + hashtablePosition pos; + void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, key->ptr, &pos); + if (ref != NULL) { + robj *val = *ref; + /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ incrRefCount(val); /* Tells the module that the key has been unlinked from the database. */ moduleNotifyKeyUnlink(key, val, db->id, flags); /* We want to try to unblock any module clients or clients using a blocking XREADGROUP */ signalDeletedKeyAsReady(db, key, val->type); - /* We should call decr before freeObjAsync. If not, the refcount may be - * greater than 1, so freeObjAsync doesn't work */ + /* Match the incrRefCount above. */ decrRefCount(val); + /* Because of dbUnshareStringValue, the val in de may change. */ + val = *ref; + + /* Delete from keys and expires tables. This will not free the object. + * (The expires table has no destructor callback.) */ + kvstoreHashtableTwoPhasePopDelete(db->keys, dict_index, &pos); + if (objectGetExpire(val) != -1) { + int deleted = kvstoreHashtableDelete(db->expires, dict_index, key->ptr); + serverAssert(deleted); + } else { + debugServerAssert(0 == kvstoreHashtableDelete(db->expires, dict_index, key->ptr)); + } + if (async) { - /* Because of dbUnshareStringValue, the val in de may change. */ - freeObjAsync(key, dictGetVal(de), db->id); - kvstoreDictSetVal(db->keys, dict_index, de, NULL); + freeObjAsync(key, val, db->id); + } else { + decrRefCount(val); } - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. */ - kvstoreDictDelete(db->expires, dict_index, key->ptr); - kvstoreDictTwoPhaseUnlinkFree(db->keys, dict_index, de, plink, table); return 1; } else { return 0; @@ -493,7 +559,7 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o) { robj *decoded = getDecodedObject(o); o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); decrRefCount(decoded); - dbReplaceValue(db, key, o); + dbReplaceValue(db, key, &o); } return o; } @@ -504,7 +570,7 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o) { * The dbnum can be -1 if all the DBs should be emptied, or the specified * DB index if we want to empty only a single database. * The function returns the number of keys removed from the database(s). */ -long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(dict *)) { +long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(hashtable *)) { long long removed = 0; int startdb, enddb; @@ -546,7 +612,7 @@ long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callbac * On success the function returns the number of keys removed from the * database(s). Otherwise -1 is returned in the specific case the * DB number is out of range, and errno is set to EINVAL. 
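+ *
+ * Illustrative call (a sketch; the flag names are the ones tested in the
+ * function body below):
+ *
+ *     long long removed = emptyData(-1, EMPTYDB_ASYNC | EMPTYDB_NOFUNCTIONS, NULL);
+ *     if (removed == -1) serverLog(LL_WARNING, "flush failed: %s", strerror(errno));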
*/ -long long emptyData(int dbnum, int flags, void(callback)(dict *)) { +long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) { int async = (flags & EMPTYDB_ASYNC); int with_functions = !(flags & EMPTYDB_NOFUNCTIONS); ValkeyModuleFlushInfoV1 fi = {VALKEYMODULE_FLUSHINFO_VERSION, !async, dbnum}; @@ -572,7 +638,8 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { if (with_functions) { serverAssert(dbnum == -1); - functionsLibCtxClearCurrent(async); + /* TODO: fix this callback incompatibility. The arg is not used. */ + functionsLibCtxClearCurrent(async, (void (*)(dict *))callback); } /* Also fire the end event. Note that this event will fire almost @@ -585,27 +652,25 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { /* Initialize temporary db on replica for use during diskless replication. */ serverDb *initTempDb(void) { int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } serverDb *tempDb = zcalloc(sizeof(serverDb) * server.dbnum); for (int i = 0; i < server.dbnum; i++) { tempDb[i].id = i; - tempDb[i].keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - tempDb[i].expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + tempDb[i].keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + tempDb[i].expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); } return tempDb; } -/* Discard tempDb, this can be slow (similar to FLUSHALL), but it's always async. */ -void discardTempDb(serverDb *tempDb, void(callback)(dict *)) { - int async = 1; - +/* Discard tempDb, it's always async. */ +void discardTempDb(serverDb *tempDb) { /* Release temp DBs. */ - emptyDbStructure(tempDb, -1, async, callback); + emptyDbStructure(tempDb, -1, 1, NULL); for (int i = 0; i < server.dbnum; i++) { kvstoreRelease(tempDb[i].keys); kvstoreRelease(tempDb[i].expires); @@ -757,7 +822,7 @@ void delGenericCommand(client *c, int lazy) { int numdel = 0, j; for (j = 1; j < c->argc; j++) { - if (expireIfNeeded(c->db, c->argv[j], 0) == KEY_DELETED) continue; + if (expireIfNeeded(c->db, c->argv[j], NULL, 0) == KEY_DELETED) continue; int deleted = lazy ? dbAsyncDelete(c->db, c->argv[j]) : dbSyncDelete(c->db, c->argv[j]); if (deleted) { signalModifiedKey(c, c->db, c->argv[j]); @@ -818,7 +883,6 @@ void randomkeyCommand(client *c) { } void keysCommand(client *c) { - dictEntry *de; sds pattern = c->argv[1]->ptr; int plen = sdslen(pattern), allkeys, pslot = -1; unsigned long numkeys = 0; @@ -827,27 +891,26 @@ void keysCommand(client *c) { if (server.cluster_enabled && !allkeys) { pslot = patternHashSlot(pattern, plen); } - kvstoreDictIterator *kvs_di = NULL; + kvstoreHashtableIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - kvs_di = kvstoreGetDictSafeIterator(c->db->keys, pslot); + kvs_di = kvstoreGetHashtableSafeIterator(c->db->keys, pslot); } else { kvs_it = kvstoreIteratorInit(c->db->keys); } - robj keyobj; - while ((de = kvs_di ? kvstoreDictIteratorNext(kvs_di) : kvstoreIteratorNext(kvs_it)) != NULL) { - sds key = dictGetKey(de); - + void *next; + while (kvs_di ? 
kvstoreHashtableIteratorNext(kvs_di, &next) : kvstoreIteratorNext(kvs_it, &next)) { + robj *val = next; + sds key = objectGetKey(val); if (allkeys || stringmatchlen(pattern, plen, key, sdslen(key), 0)) { - initStaticStringObject(keyobj, key); - if (!keyIsExpired(c->db, &keyobj)) { + if (!objectIsExpired(val)) { addReplyBulkCBuffer(c, key, sdslen(key)); numkeys++; } } if (c->flag.close_asap) break; } - if (kvs_di) kvstoreReleaseDictIterator(kvs_di); + if (kvs_di) kvstoreReleaseHashtableIterator(kvs_di); if (kvs_it) kvstoreIteratorRelease(kvs_it); setDeferredArrayLen(c, replylen, numkeys); } @@ -856,6 +919,7 @@ void keysCommand(client *c) { typedef struct { list *keys; /* elements that collect from dict */ robj *o; /* o must be a hash/set/zset object, NULL means current db */ + serverDb *db; /* database currently being scanned */ long long type; /* the particular type when scan the db */ sds pattern; /* pattern string, NULL means no pattern */ long sampled; /* cumulative number of keys sampled */ @@ -877,9 +941,44 @@ int objectTypeCompare(robj *o, long long target) { else return 1; } + +/* Hashtable scan callback used by scanCallback when scanning the keyspace. */ +void keysScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *obj = entry; + data->sampled++; + + /* Filter an object if it isn't the type we want. */ + if (data->type != LLONG_MAX) { + if (!objectTypeCompare(obj, data->type)) return; + } + + sds key = objectGetKey(obj); + + /* Filter object if its key does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + /* Handle and skip expired key. */ + if (objectIsExpired(obj)) { + robj kobj; + initStaticStringObject(kobj, key); + if (expireIfNeeded(data->db, &kobj, obj, 0) != KEY_VALID) { + return; + } + } + + /* Keep this key. */ + list *keys = data->keys; + listAddNodeTail(keys, key); +} + /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ -void scanCallback(void *privdata, const dictEntry *de) { +void dictScanCallback(void *privdata, const dictEntry *de) { scanData *data = (scanData *)privdata; list *keys = data->keys; robj *o = data->o; @@ -887,14 +986,9 @@ void scanCallback(void *privdata, const dictEntry *de) { sds key = NULL; data->sampled++; - /* o and typename can not have values at the same time. */ - serverAssert(!((data->type != LLONG_MAX) && o)); - - /* Filter an element if it isn't the type we want. */ - if (!o && data->type != LLONG_MAX) { - robj *rval = dictGetVal(de); - if (!objectTypeCompare(rval, data->type)) return; - } + /* This callback is only used for scanning elements within a key (hash + * fields, set elements, etc.) so o must be set here. */ + serverAssert(o != NULL); /* Filter element if it does not match the pattern. 
*/ sds keysds = dictGetKey(de); @@ -904,11 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } } - if (o == NULL) { - key = keysds; - } else if (o->type == OBJ_SET) { - key = keysds; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { key = keysds; if (!data->only_keys) { val = dictGetVal(de); @@ -921,13 +1011,33 @@ void scanCallback(void *privdata, const dictEntry *de) { val = sdsnewlen(buf, len); } } else { - serverPanic("Type not handled in SCAN callback."); + serverPanic("Type not handled in dict SCAN callback."); } listAddNodeTail(keys, key); if (val) listAddNodeTail(keys, val); } +void hashtableScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *o = data->o; + list *keys = data->keys; + data->sampled++; + + /* currently only implemented for SET scan */ + serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE); + sds key = (sds)entry; /* Specific for OBJ_SET */ + + /* Filter element if it does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + listAddNodeTail(keys, key); +} + /* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns C_OK. Otherwise return C_ERR and send an error to the @@ -991,7 +1101,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { sds typename = NULL; long long type = LLONG_MAX; int patlen = 0, use_pattern = 0, only_keys = 0; - dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object * must be Set, Sorted Set, or Hash. */ @@ -1060,34 +1169,35 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of a hash table. */ - ht = NULL; + /* Handle the case of kvstore, dict or hashtable. */ + dict *dict_table = NULL; + hashtable *hashtable_table = NULL; + int shallow_copied_list_items = 0; if (o == NULL) { - ht = NULL; - } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + shallow_copied_list_items = 1; + } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + dict_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - ht = zs->dict; + dict_table = zs->dict; + /* scanning ZSET allocates temporary strings even though it's a dict */ + shallow_copied_list_items = 0; } list *keys = listCreate(); - /* Set a free callback for the contents of the collected keys list. - * For the main keyspace dict, and when we scan a key that's dict encoded - * (we have 'ht'), we don't need to define free method because the strings - * in the list are just a shallow copy from the pointer in the dictEntry. - * When scanning a key with other encodings (e.g. listpack), we need to - * free the temporary strings we add to that list. - * The exception to the above is ZSET, where we do allocate temporary - * strings even when scanning a dict. */ - if (o && (!ht || o->type == OBJ_ZSET)) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); - } - - /* For main dictionary scan or data structure using hashtable. 
*/ - if (!o || ht) { + /* Set a free callback for the contents of the collected keys list if they + * are deep copied temporary strings. We must not free them if they are just + * a shallow copy - a pointer to the actual data in the data structure */ + if (!shallow_copied_list_items) { + listSetFreeMethod(keys, sdsfreeVoid); + } + + /* For main hash table scan or scannable data structure. */ + if (!o || dict_table || hashtable_table) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1096,7 +1206,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* We pass scanData which have three pointers to the callback: * 1. data.keys: the list to which it will add new elements; - * 2. data.o: the object containing the dictionary so that + * 2. data.o: the object containing the hash table so that * it is possible to fetch more data in a type-dependent way; * 3. data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; @@ -1109,6 +1219,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * only keys are returned. */ scanData data = { .keys = keys, + .db = c->db, .o = o, .type = type, .pattern = use_pattern ? pat : NULL, @@ -1125,9 +1236,11 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* In cluster mode there is a separate dictionary for each slot. * If cursor is empty, we should try exploring next non-empty slot. */ if (o == NULL) { - cursor = kvstoreScan(c->db->keys, cursor, onlydidx, scanCallback, NULL, &data); + cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); + } else if (dict_table) { + cursor = dictScan(dict_table, cursor, dictScanCallback, &data); } else { - cursor = dictScan(ht, cursor, scanCallback, &data); + cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data); } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { @@ -1177,22 +1290,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { serverPanic("Not handled encoding in SCAN."); } - /* Step 3: Filter the expired keys */ - if (o == NULL && listLength(keys)) { - robj kobj; - listIter li; - listNode *ln; - listRewind(keys, &li); - while ((ln = listNext(&li))) { - sds key = listNodeValue(ln); - initStaticStringObject(kobj, key); - if (expireIfNeeded(c->db, &kobj, 0) != KEY_VALID) { - listDelNode(keys, ln); - } - } - } - - /* Step 4: Reply to the client. */ + /* Step 3: Reply to the client. */ addReplyArrayLen(c, 2); addReplyBulkLongLong(c, cursor); @@ -1315,9 +1413,9 @@ void renameGenericCommand(client *c, int nx) { * with the same name. */ dbDelete(c->db, c->argv[2]); } - dbAdd(c->db, c->argv[2], o); - if (expire != -1) setExpire(c, c->db, c->argv[2], expire); dbDelete(c->db, c->argv[1]); + dbAdd(c->db, c->argv[2], &o); + if (expire != -1) o = setExpire(c, c->db, c->argv[2], expire); signalModifiedKey(c, c->db, c->argv[1]); signalModifiedKey(c, c->db, c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC, "rename_from", c->argv[1], c->db->id); @@ -1378,12 +1476,14 @@ void moveCommand(client *c) { addReply(c, shared.czero); return; } - dbAdd(dst, c->argv[1], o); - if (expire != -1) setExpire(c, dst, c->argv[1], expire); - incrRefCount(o); - /* OK! 
key moved, free the entry in the source DB */
-    dbDelete(src, c->argv[1]);
+    incrRefCount(o);           /* ref counter = 2 */
+    dbDelete(src, c->argv[1]); /* ref counter = 1 */
+
+    dbAdd(dst, c->argv[1], &o);
+    if (expire != -1) o = setExpire(c, dst, c->argv[1], expire);
+
+    /* OK! key moved */
     signalModifiedKey(c, src, c->argv[1]);
     signalModifiedKey(c, dst, c->argv[1]);
     notifyKeyspaceEvent(NOTIFY_GENERIC, "move_from", c->argv[1], src->id);
@@ -1481,8 +1581,8 @@ void copyCommand(client *c) {
         dbDelete(dst, newkey);
     }
 
-    dbAdd(dst, newkey, newobj);
-    if (expire != -1) setExpire(c, dst, newkey, expire);
+    dbAdd(dst, newkey, &newobj);
+    if (expire != -1) newobj = setExpire(c, dst, newkey, expire);
 
     /* OK! key copied */
     signalModifiedKey(c, dst, c->argv[2]);
@@ -1501,9 +1601,8 @@ void scanDatabaseForReadyKeys(serverDb *db) {
     dictIterator *di = dictGetSafeIterator(db->blocking_keys);
     while ((de = dictNext(di)) != NULL) {
         robj *key = dictGetKey(de);
-        dictEntry *kde = dbFind(db, key->ptr);
-        if (kde) {
-            robj *value = dictGetVal(kde);
+        robj *value = dbFind(db, key->ptr);
+        if (value) {
             signalKeyAsReady(db, key, value->type);
         }
     }
@@ -1521,17 +1620,15 @@ void scanDatabaseForDeletedKeys(serverDb *emptied, serverDb *replaced_with) {
         int existed = 0, exists = 0;
         int original_type = -1, curr_type = -1;
 
-        dictEntry *kde = dbFind(emptied, key->ptr);
-        if (kde) {
-            robj *value = dictGetVal(kde);
+        robj *value = dbFind(emptied, key->ptr);
+        if (value) {
             original_type = value->type;
             existed = 1;
         }
 
         if (replaced_with) {
-            kde = dbFind(replaced_with, key->ptr);
-            if (kde) {
-                robj *value = dictGetVal(kde);
+            value = dbFind(replaced_with, key->ptr);
+            if (value) {
                 curr_type = value->type;
                 exists = 1;
             }
@@ -1668,39 +1765,63 @@ void swapdbCommand(client *c) {
 *----------------------------------------------------------------------------*/
 
 int removeExpire(serverDb *db, robj *key) {
-    return kvstoreDictDelete(db->expires, getKVStoreIndexForKey(key->ptr), key->ptr) == DICT_OK;
+    int dict_index = getKVStoreIndexForKey(key->ptr);
+    void *popped;
+    if (kvstoreHashtablePop(db->expires, dict_index, key->ptr, &popped)) {
+        robj *val = popped;
+        robj *newval = objectSetExpire(val, -1);
+        serverAssert(newval == val);
+        debugServerAssert(getExpire(db, key) == -1);
+        return 1;
+    }
+    return 0;
 }
 
 /* Set an expire to the specified key. If the expire is set in the context
 * of a user calling a command, 'c' is the client, otherwise 'c' is set
 * to NULL. The 'when' parameter is the absolute unix time in milliseconds
 * after which the key will no longer be considered valid. */
-void setExpire(client *c, serverDb *db, robj *key, long long when) {
-    dictEntry *kde, *de, *existing;
+robj *setExpire(client *c, serverDb *db, robj *key, long long when) {
+    /* TODO: Add val as a parameter to this function, to avoid looking it up. */
+    robj *val;
 
-    /* Reuse the sds from the main dict in the expire dict */
+    /* Reuse the object from the main dict in the expire dict. When setting
+     * expire in an robj, it's potentially reallocated. We need to update the
+     * pointer(s) to it.
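+     * Callers must therefore capture the returned pointer, as the call
+     * sites updated in this patch do:
+     *
+     *     dbAdd(db, key, &val);
+     *     if (when != -1) val = setExpire(c, db, key, when);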
*/ int dict_index = getKVStoreIndexForKey(key->ptr); - kde = kvstoreDictFind(db->keys, dict_index, key->ptr); - serverAssertWithInfo(NULL, key, kde != NULL); - de = kvstoreDictAddRaw(db->expires, dict_index, dictGetKey(kde), &existing); - if (existing) { - dictSetSignedIntegerVal(existing, when); + void **valref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr); + serverAssertWithInfo(NULL, key, valref != NULL); + val = *valref; + long long old_when = objectGetExpire(val); + robj *newval = objectSetExpire(val, when); + if (old_when != -1) { + /* Val already had an expire field, so it was not reallocated. */ + serverAssert(newval == val); + /* It already exists in set of keys with expire. */ + debugServerAssert(!kvstoreHashtableAdd(db->expires, dict_index, newval)); } else { - dictSetSignedIntegerVal(de, when); + /* No old expire. Update the pointer in the keys hashtable, if needed, + * and add it to the expires hashtable. */ + if (newval != val) { + val = *valref = newval; + } + int added = kvstoreHashtableAdd(db->expires, dict_index, newval); + serverAssert(added); } int writable_replica = server.primary_host && server.repl_replica_ro == 0; if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); + return val; } /* Return the expire time of the specified key, or -1 if no expire * is associated with this key (i.e. the key is non volatile) */ long long getExpireWithDictIndex(serverDb *db, robj *key, int dict_index) { - dictEntry *de; + robj *val; - if ((de = dbFindExpiresWithDictIndex(db, key->ptr, dict_index)) == NULL) return -1; + if ((val = dbFindExpiresWithDictIndex(db, key->ptr, dict_index)) == NULL) return -1; - return dictGetSignedIntegerVal(de); + return objectGetExpire(val); } /* Return the expire time of the specified key, or -1 if no expire @@ -1779,20 +1900,43 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { +/* Returns 1 if the expire value is expired, 0 otherwise. */ +static int timestampIsExpired(mstime_t when) { + if (when < 0) return 0; /* no expire */ + mstime_t now = commandTimeSnapshot(); + + /* The key expired if the current (virtual or real) time is greater + * than the expire time of the key. */ + return now > when; +} + +/* Use this instead of keyIsExpired if you already have the value object. */ +static int objectIsExpired(robj *val) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; + if (!timestampIsExpired(objectGetExpire(val))) return 0; + if (server.primary_host == NULL && server.import_mode) { + if (server.current_client && server.current_client->flag.import_source) return 0; + } + return 1; +} +static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) { + /* Don't expire anything while loading. It will be done later. */ + if (server.loading) return 0; mstime_t when = getExpireWithDictIndex(db, key, dict_index); - mstime_t now; - - if (when < 0) return 0; /* No expire for this key */ + return timestampIsExpired(when); +} - now = commandTimeSnapshot(); +/* Check if the key is expired. */ +static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; - /* The key expired if the current (virtual or real) time is greater - * than the expire time of the key. */ - return now > when; + /* See expireIfNeededWithDictIndex for more details. 
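+     * In short: on a primary running in import mode, keys written by the
+     * import source are never reported as expired here, mirroring the
+     * objectIsExpired() check above.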
*/ + if (server.primary_host == NULL && server.import_mode) { + if (server.current_client && server.current_client->flag.import_source) return 0; + } + return 1; } /* Check if the key is expired. */ @@ -1801,9 +1945,14 @@ int keyIsExpired(serverDb *db, robj *key) { return keyIsExpiredWithDictIndex(db, key, dict_index); } -keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { +/* val is optional. Pass NULL if val is not yet fetched from the database. */ +static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; - if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return KEY_VALID; + if (val != NULL) { + if (!objectIsExpired(val)) return KEY_VALID; + } else { + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID; + } /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: @@ -1821,6 +1970,25 @@ keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int di if (server.primary_host != NULL) { if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; + } else if (server.import_mode) { + /* If we are running in the import mode on a primary, instead of + * evicting the expired key from the database, we return ASAP: + * the key expiration is controlled by the import source that will + * send us synthesized DEL operations for expired keys. The + * exception is when write operations are performed on this server + * because it's a primary. + * + * Notice: other clients, apart from the import source, should not access + * the data imported by import source. + * + * Still we try to return the right information to the caller, + * that is, KEY_VALID if we think the key should still be valid, + * KEY_EXPIRED if we think the key is expired but don't want to delete it at this time. + * + * When receiving commands from the import source, keys are never considered + * expired. */ + if (server.current_client && (server.current_client->flag.import_source)) return KEY_VALID; + if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } /* In some cases we're explicitly instructed to return an indication of a @@ -1872,19 +2040,24 @@ keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int di * the actual key deletion and propagation of the deletion, use the * EXPIRE_AVOID_DELETE_EXPIRED flag. * + * Passing the value 'val' to this function is optional, as an optimization to + * avoid looking up the key. Pass NULL if it's not already fetched from the + * database. + * * The return value of the function is KEY_VALID if the key is still valid. * The function returns KEY_EXPIRED if the key is expired BUT not deleted, * or returns KEY_DELETED if the key is expired and deleted. */ -keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { +static keyStatus expireIfNeeded(serverDb *db, robj *key, robj *val, int flags) { + if (val != NULL && !objectIsExpired(val)) return KEY_VALID; /* shortcut */ int dict_index = getKVStoreIndexForKey(key->ptr); - return expireIfNeededWithDictIndex(db, key, flags, dict_index); + return expireIfNeededWithDictIndex(db, key, val, flags, dict_index); } /* CB passed to kvstoreExpand. 
* The purpose is to skip expansion of unused dicts in cluster mode (all * dicts not mapped to *my* slots) */ static int dbExpandSkipSlot(int slot) { - return !clusterNodeCoversSlot(getMyClusterNode(), slot); + return !clusterNodeCoversSlot(clusterNodeGetPrimary(getMyClusterNode()), slot); } /* @@ -1892,10 +2065,11 @@ static int dbExpandSkipSlot(int slot) { * In cluster mode resizes all individual dictionaries for slots that this node owns. * * Based on the parameter `try_expand`, appropriate dict expand API is invoked. - * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. - * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). - * `DICT_OK` response is for successful expansion. However ,`DICT_ERR` response signifies failure in allocation in - * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. + * if try_expand is non-zero, `hashtableTryExpand` is used else `hashtableExpand`. + * + * Returns C_OK or C_ERR. C_OK response is for successful expansion. C_ERR + * signifies failure in allocation if try_expand is non-zero. Otherwise it + * signifies that no expansion was performed. */ static int dbExpandGeneric(kvstore *kvs, uint64_t db_size, int try_expand) { int ret; @@ -1921,20 +2095,24 @@ int dbExpandExpires(serverDb *db, uint64_t db_size, int try_expand) { return dbExpandGeneric(db->expires, db_size, try_expand); } -dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index) { - return kvstoreDictFind(db->keys, dict_index, key); +static robj *dbFindWithDictIndex(serverDb *db, sds key, int dict_index) { + void *existing = NULL; + kvstoreHashtableFind(db->keys, dict_index, key, &existing); + return existing; } -dictEntry *dbFind(serverDb *db, void *key) { +robj *dbFind(serverDb *db, sds key) { int dict_index = getKVStoreIndexForKey(key); return dbFindWithDictIndex(db, key, dict_index); } -dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index) { - return kvstoreDictFind(db->expires, dict_index, key); +static robj *dbFindExpiresWithDictIndex(serverDb *db, sds key, int dict_index) { + void *existing = NULL; + kvstoreHashtableFind(db->expires, dict_index, key, &existing); + return existing; } -dictEntry *dbFindExpires(serverDb *db, void *key) { +robj *dbFindExpires(serverDb *db, sds key) { int dict_index = getKVStoreIndexForKey(key); return dbFindExpiresWithDictIndex(db, key, dict_index); } @@ -1943,7 +2121,7 @@ unsigned long long dbSize(serverDb *db) { return kvstoreSize(db->keys); } -unsigned long long dbScan(serverDb *db, unsigned long long cursor, dictScanFunction *scan_cb, void *privdata) { +unsigned long long dbScan(serverDb *db, unsigned long long cursor, hashtableScanFunction scan_cb, void *privdata) { return kvstoreScan(db->keys, cursor, -1, scan_cb, NULL, privdata); } diff --git a/src/debug.c b/src/debug.c index 13da7bcc93..4efe12e237 100644 --- a/src/debug.c +++ b/src/debug.c @@ -46,6 +46,8 @@ #include #include +#include "valkey_strtod.h" + #ifdef HAVE_BACKTRACE #include #ifndef __OpenBSD__ @@ -281,7 +283,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) * a different digest. 
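+ * (This is the digest reported by DEBUG DIGEST, so two instances holding
+ * the same dataset can be compared regardless of iteration order.)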
*/ void computeDatasetDigest(unsigned char *final) { unsigned char digest[20]; - dictEntry *de; + robj *o; int j; uint32_t aux; @@ -297,17 +299,16 @@ void computeDatasetDigest(unsigned char *final) { mixDigest(final, &aux, sizeof(aux)); /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { + while (kvstoreIteratorNext(kvs_it, (void **)&o)) { sds key; - robj *keyobj, *o; + robj *keyobj; memset(digest, 0, 20); /* This key-val digest */ - key = dictGetKey(de); + key = objectGetKey(o); keyobj = createStringObject(key, sdslen(key)); mixDigest(digest, key, sdslen(key)); - o = dictGetVal(de); xorObjectDigest(db, keyobj, digest, o); /* We can finally xor the key-val digest to the final digest */ @@ -436,6 +437,8 @@ void debugCommand(client *c) { "CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", " This is valid only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", " When set to 1, the cluster link is closed after dropping a packet based on the filter.", + "DISABLE-CLUSTER-RANDOM-PING <0|1>", + " Disable sending cluster ping to a random node every second.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -607,19 +610,20 @@ void debugCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "close-cluster-link-on-packet-drop") && c->argc == 3) { server.debug_cluster_close_link_on_packet_drop = atoi(c->argv[2]->ptr); addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "disable-cluster-random-ping") && c->argc == 3) { + server.debug_cluster_disable_random_ping = atoi(c->argv[2]->ptr); + addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "object") && (c->argc == 3 || c->argc == 4)) { - dictEntry *de; robj *val; char *strenc; int fast = 0; if (c->argc == 4 && !strcasecmp(c->argv[3]->ptr, "fast")) fast = 1; - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + if ((val = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c, shared.nokeyerr); return; } - val = dictGetVal(de); strenc = strEncoding(val->encoding); char extra[138] = {0}; @@ -667,16 +671,14 @@ void debugCommand(client *c) { addReplyStatusLength(c, s, sdslen(s)); sdsfree(s); } else if (!strcasecmp(c->argv[1]->ptr, "sdslen") && c->argc == 3) { - dictEntry *de; robj *val; sds key; - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + if ((val = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c, shared.nokeyerr); return; } - val = dictGetVal(de); - key = dictGetKey(de); + key = objectGetKey(val); if (val->type != OBJ_STRING || !sdsEncodedObject(val)) { addReplyError(c, "Not an sds encoded string."); @@ -746,7 +748,7 @@ void debugCommand(client *c) { val = createStringObject(NULL, valsize); memcpy(val->ptr, buf, valsize <= buflen ? valsize : buflen); } - dbAdd(c->db, key, val); + dbAdd(c->db, key, &val); signalModifiedKey(c, c->db, key); decrRefCount(key); } @@ -769,8 +771,7 @@ void debugCommand(client *c) { /* We don't use lookupKey because a debug command should * work on logically expired keys */ - dictEntry *de; - robj *o = ((de = dbFind(c->db, c->argv[j]->ptr)) == NULL) ? 
NULL : dictGetVal(de);
+        robj *o = dbFind(c->db, c->argv[j]->ptr);
         if (o) xorObjectDigest(c->db, c->argv[j], digest, o);
 
         sds d = sdsempty();
@@ -841,7 +842,7 @@ void debugCommand(client *c) {
                              "string|integer|double|bignum|null|array|set|map|attrib|push|verbatim|true|false");
         }
     } else if (!strcasecmp(c->argv[1]->ptr, "sleep") && c->argc == 3) {
-        double dtime = strtod(c->argv[2]->ptr, NULL);
+        double dtime = valkey_strtod(c->argv[2]->ptr, NULL);
         long long utime = dtime * 1000000;
         struct timespec tv;
 
@@ -915,30 +916,35 @@ void debugCommand(client *c) {
         addReplyVerbatim(c, stats, sdslen(stats), "txt");
         sdsfree(stats);
     } else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) {
-        robj *o;
-        dict *ht = NULL;
         int full = 0;
-
         if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1;
-        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return;
+        robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr);
+        if (o == NULL) return;
 
-        /* Get the hash table reference from the object, if possible. */
+        /* Get the dict reference from the object, if possible. */
+        dict *d = NULL;
+        hashtable *ht = NULL;
         switch (o->encoding) {
         case OBJ_ENCODING_SKIPLIST: {
            zset *zs = o->ptr;
-            ht = zs->dict;
+            d = zs->dict;
        } break;
-        case OBJ_ENCODING_HT: ht = o->ptr; break;
+        case OBJ_ENCODING_HT: d = o->ptr; break;
+        case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break;
        }
 
-        if (ht == NULL) {
-            addReplyError(c, "The value stored at the specified key is not "
-                             "represented using an hash table");
-        } else {
+        if (d != NULL) {
            char buf[4096];
-            dictGetStats(buf, sizeof(buf), ht, full);
+            dictGetStats(buf, sizeof(buf), d, full);
            addReplyVerbatim(c, buf, strlen(buf), "txt");
+        } else if (ht != NULL) {
+            char buf[4096];
+            hashtableGetStats(buf, sizeof(buf), ht, full);
+            addReplyVerbatim(c, buf, strlen(buf), "txt");
+        } else {
+            addReplyError(c, "The value stored at the specified key is not "
+                             "represented using a hash table");
        }
    } else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) {
        serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id");
@@ -1042,6 +1048,14 @@ __attribute__((noinline, weak)) void _serverAssert(const char *estr, const char
    bugReportEnd(0, 0);
 }
 
+/* Returns the argv argument in binary representation, limited to length 128. */
+sds getArgvReprString(robj *argv) {
+    robj *decoded = getDecodedObject(argv);
+    sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128));
+    decrRefCount(decoded);
+    return repr;
+}
+
 /* Checks if the argument at the given index should be redacted from logs.
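+ * Redacted arguments are logged by their length only, never by content;
+ * note that the callers below additionally stop dumping arguments once an
+ * "auth"/"auth2" token is seen.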
 */
 int shouldRedactArg(const client *c, int idx) {
     serverAssert(idx < c->argc);
@@ -1066,16 +1080,12 @@ void _serverAssertPrintClientInfo(const client *c) {
             serverLog(LL_WARNING, "client->argv[%d]: %zu bytes", j, sdslen((sds)c->argv[j]->ptr));
             continue;
         }
-        char buf[128];
-        char *arg;
-
-        if (c->argv[j]->type == OBJ_STRING && sdsEncodedObject(c->argv[j])) {
-            arg = (char *)c->argv[j]->ptr;
-        } else {
-            snprintf(buf, sizeof(buf), "Object type: %u, encoding: %u", c->argv[j]->type, c->argv[j]->encoding);
-            arg = buf;
+        sds repr = getArgvReprString(c->argv[j]);
+        serverLog(LL_WARNING, "client->argv[%d] = %s (refcount: %d)", j, repr, c->argv[j]->refcount);
+        sdsfree(repr);
+        if (!strcasecmp(c->argv[j]->ptr, "auth") || !strcasecmp(c->argv[j]->ptr, "auth2")) {
+            break;
         }
-        serverLog(LL_WARNING, "client->argv[%d] = \"%s\" (refcount: %d)", j, arg, c->argv[j]->refcount);
     }
 }
 
@@ -1883,34 +1893,27 @@ void logCurrentClient(client *cc, const char *title) {
     client = catClientInfoString(sdsempty(), cc, server.hide_user_data_from_log);
     serverLog(LL_WARNING | LL_RAW, "%s\n", client);
     sdsfree(client);
-    serverLog(LL_WARNING | LL_RAW, "argc: '%d'\n", cc->argc);
+    serverLog(LL_WARNING | LL_RAW, "argc: %d\n", cc->argc);
     for (j = 0; j < cc->argc; j++) {
         if (shouldRedactArg(cc, j)) {
             serverLog(LL_WARNING | LL_RAW, "argv[%d]: %zu bytes\n", j, sdslen((sds)cc->argv[j]->ptr));
             continue;
         }
-        robj *decoded;
-        decoded = getDecodedObject(cc->argv[j]);
-        sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128));
-        serverLog(LL_WARNING | LL_RAW, "argv[%d]: '%s'\n", j, (char *)repr);
-        if (!strcasecmp(decoded->ptr, "auth") || !strcasecmp(decoded->ptr, "auth2")) {
-            sdsfree(repr);
-            decrRefCount(decoded);
+        sds repr = getArgvReprString(cc->argv[j]);
+        serverLog(LL_WARNING | LL_RAW, "argv[%d]: %s\n", j, repr);
+        sdsfree(repr);
+        if (!strcasecmp(cc->argv[j]->ptr, "auth") || !strcasecmp(cc->argv[j]->ptr, "auth2")) {
             break;
         }
-        sdsfree(repr);
-        decrRefCount(decoded);
     }
 
     /* Check if the first argument, usually a key, is found inside the
      * selected DB, and if so print info about the associated object. */
     if (cc->argc > 1) {
         robj *val, *key;
-        dictEntry *de;
 
         key = getDecodedObject(cc->argv[1]);
-        de = dbFind(cc->db, key->ptr);
-        if (de) {
-            val = dictGetVal(de);
+        val = dbFind(cc->db, key->ptr);
+        if (val) {
             serverLog(LL_WARNING, "key '%s' found in DB containing the following object:", (char *)key->ptr);
             serverLogObjectDebugInfo(val);
         }
diff --git a/src/defrag.c b/src/defrag.c
index 4d34009f8b..e9f40d4fab 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -34,34 +34,129 @@
 */
 
 #include "server.h"
+#include "hashtable.h"
+#include "script.h"
 #include
 
 #ifdef HAVE_DEFRAG
 
-typedef struct defragCtx {
-    void *privdata;
+typedef enum { DEFRAG_NOT_DONE = 0,
+               DEFRAG_DONE = 1 } doneStatus;
+
+
+/*
+ * Defragmentation is performed in stages. Each stage is serviced by a stage function
+ * (defragStageFn). The stage function is passed a target (void*) to defrag. The contents of that
+ * target are unique to the particular stage - and may even be NULL for some stage functions. The
+ * same stage function can be used multiple times (for different stages) each having a different
+ * target.
+ *
+ * The stage function is required to maintain an internal static state. This allows the stage
+ * function to continue when invoked in an iterative manner. When invoked with a 0 endtime, the
+ * stage function is required to clear its internal state and prepare to begin a new stage. It
+ * should return false (more work to do) as it should NOT perform any real "work" during init.
+ *
+ * Parameters:
+ *  endtime     - This is the monotonic time that the function should end and return. This ensures
+ *                a bounded latency due to defrag. When endtime is 0, the internal state should be
+ *                cleared, preparing to begin the stage with a new target.
+ *  target      - This is the "thing" that should be defragged. Its type is dependent on the
+ *                type of the stage function. This might be a dict, a kvstore, a DB, or other.
+ *  privdata    - A pointer to arbitrary private data which is unique to the stage function.
+ *
+ * Returns:
+ *  - DEFRAG_DONE if the stage is complete
+ *  - DEFRAG_NOT_DONE if there is more work to do
+ */
+typedef doneStatus (*defragStageFn)(monotime endtime, void *target, void *privdata);
+
+typedef struct {
+    defragStageFn stage_fn; // The function to be invoked for the stage
+    void *target;           // The target that the function will defrag
+    void *privdata;         // Private data, unique to the stage function
+} StageDescriptor;
+
+/* Globals needed for the main defrag processing logic.
+ * Doesn't include variables specific to a stage or type of data. */
+struct DefragContext {
+    monotime start_cycle;           // Time of beginning of defrag cycle
+    long long start_defrag_hits;    // server.stat_active_defrag_hits captured at beginning of cycle
+    list *remaining_stages;         // List of stages which remain to be processed
+    StageDescriptor *current_stage; // The stage that's currently being processed
+
+    long long timeproc_id;      // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID)
+    monotime timeproc_end_time; // Ending time of previous timerproc execution
+    long timeproc_overage_us;   // A correction value if over target CPU percent
+};
+static struct DefragContext defrag;
+
+
+/* There are a number of stages which process a kvstore. To simplify this, a stage helper function
+ * `defragStageKvstoreHelper()` is defined. This function aids in iterating over the kvstore. It
+ * uses these definitions.
+ */
+/* State of the kvstore helper. The private data (privdata) passed to the kvstore helper MUST BEGIN
+ * with a kvstoreIterState (or be passed as NULL). */
+#define KVS_SLOT_DEFRAG_LUT -2
+#define KVS_SLOT_UNASSIGNED -1
+typedef struct {
+    kvstore *kvs;
     int slot;
-    void *aux;
-} defragCtx;
+    unsigned long cursor;
+} kvstoreIterState;
+/* The kvstore helper uses this function to perform tasks before continuing the iteration. For the
+ * main hash table, large items are set aside and processed by this function before continuing with
+ * iteration over the kvstore.
+ *  endtime     - This is the monotonic time that the function should end and return.
+ *  privdata    - Private data for functions invoked by the helper. If provided in the call to
+ *                `defragStageKvstoreHelper()`, the `kvstoreIterState` portion (at the beginning)
+ *                will be updated with the current kvstore iteration status.
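+ *
+ * A minimal conforming sketch (hypothetical helper names, shown only to
+ * illustrate the time-bounded contract):
+ *
+ *     static doneStatus examplePreContinue(monotime endtime, void *privdata) {
+ *         kvstoreIterState *state = privdata;  // privdata begins with this state
+ *         while (moreDeferredWork(state)) {    // hypothetical
+ *             processOneDeferredItem(state);   // hypothetical
+ *             if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE;
+ *         }
+ *         return DEFRAG_DONE;
+ *     }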
+ * + * Returns: + * - DEFRAG_DONE if the pre-continue work is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdata); -typedef struct defragPubSubCtx { - kvstore *pubsub_channels; - dict *(*clientPubSubChannels)(client *); + +// Private data for main dictionary keys +typedef struct { + kvstoreIterState kvstate; + int dbid; +} defragKeysCtx; +static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); + +// Private data for pubsub kvstores +typedef dict *(*getClientChannelsFn)(client *); +typedef struct { + getClientChannelsFn fn; +} getClientChannelsFnWrapper; + +typedef struct { + kvstoreIterState kvstate; + getClientChannelsFn getPubSubChannels; } defragPubSubCtx; +static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); -/* this method was added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); -/* Defrag helper for generic allocations. - * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released - * and should NOT be accessed. */ -void *activeDefragAlloc(void *ptr) { +/* When scanning a main kvstore, large elements are queued for later handling rather than + * causing a large latency spike while processing a hash table bucket. This list is only used + * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being + * defragged. + * Note that this is a list of key names. It's possible that the key may be deleted or modified + * before "later" and we will search by key name to find the entry when we defrag the item later. + */ +static list *defrag_later; +static unsigned long defrag_later_cursor; + +/* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. + * It is the responsibility of the caller to free the old block if a non-NULL value (new block) + * is returned. (Returns NULL if no relocation was needed.) + */ +static void *activeDefragAllocWithoutFree(void *ptr, size_t *allocation_size) { size_t size; void *newptr; - if (!je_get_defrag_hint(ptr)) { + if (!allocatorShouldDefrag(ptr)) { server.stat_active_defrag_misses++; return NULL; } @@ -69,49 +164,32 @@ void *activeDefragAlloc(void *ptr) { * make sure not to use the thread cache. so that we don't get back the same * pointers we try to free */ size = zmalloc_size(ptr); - newptr = zmalloc_no_tcache(size); + newptr = allocatorDefragAlloc(size); memcpy(newptr, ptr, size); - zfree_no_tcache(ptr); + if (allocation_size) *allocation_size = size; + server.stat_active_defrag_hits++; return newptr; } -/* This method captures the expiry db dict entry which refers to data stored in keys db dict entry. */ -void defragEntryStartCbForKeys(void *ctx, void *oldptr) { - defragCtx *defragctx = (defragCtx *)ctx; - serverDb *db = defragctx->privdata; - sds oldsds = (sds)dictGetKey((dictEntry *)oldptr); - int slot = defragctx->slot; - if (kvstoreDictSize(db->expires, slot)) { - dictEntry *expire_de = kvstoreDictFind(db->expires, slot, oldsds); - defragctx->aux = expire_de; - } -} - -/* This method updates the key of expiry db dict entry. The key might be no longer valid - * as it could have been cleaned up during the defrag-realloc of the main dictionary. 
*/ -void defragEntryFinishCbForKeys(void *ctx, void *newptr) { - defragCtx *defragctx = (defragCtx *)ctx; - dictEntry *expire_de = (dictEntry *)defragctx->aux; - /* Item doesn't have TTL associated to it. */ - if (!expire_de) return; - /* No reallocation happened. */ - if (!newptr) { - expire_de = NULL; - return; - } - serverDb *db = defragctx->privdata; - sds newsds = (sds)dictGetKey((dictEntry *)newptr); - int slot = defragctx->slot; - kvstoreDictSetKey(db->expires, slot, expire_de, newsds); +/* Defrag helper for generic allocations. + * + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released + * and should NOT be accessed. */ +void *activeDefragAlloc(void *ptr) { + size_t allocation_size; + void *newptr = activeDefragAllocWithoutFree(ptr, &allocation_size); + if (newptr) allocatorDefragFree(ptr, allocation_size); + return newptr; } -/*Defrag helper for sds strings +/* Defrag helper for sds strings * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -sds activeDefragSds(sds sdsptr) { +static sds activeDefragSds(sds sdsptr) { void *ptr = sdsAllocPtr(sdsptr); void *newptr = activeDefragAlloc(ptr); if (newptr) { @@ -122,60 +200,48 @@ sds activeDefragSds(sds sdsptr) { return NULL; } -/* Defrag helper for robj and/or string objects with expected refcount. - * - * Like activeDefragStringOb, but it requires the caller to pass in the expected - * reference count. In some cases, the caller needs to update a robj whose - * reference count is not 1, in these cases, the caller must explicitly pass - * in the reference count, otherwise defragmentation will not be performed. - * Note that the caller is responsible for updating any other references to the robj. */ -robj *activeDefragStringObEx(robj *ob, int expected_refcount) { - robj *ret = NULL; - if (ob->refcount != expected_refcount) return NULL; - - /* try to defrag robj (only if not an EMBSTR type (handled below). */ - if (ob->type != OBJ_STRING || ob->encoding != OBJ_ENCODING_EMBSTR) { - if ((ret = activeDefragAlloc(ob))) { - ob = ret; - } +/* Performs defrag on a string-type (or generic) robj, but does not free the old robj. This is the + * caller's responsibility. This is necessary for string objects with multiple references. In this + * case the caller can fix the references before freeing the original object. + */ +static robj *activeDefragStringObWithoutFree(robj *ob, size_t *allocation_size) { + if (ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_RAW) { + // Try to defrag the linked sds, regardless of if robj will be moved + sds newsds = activeDefragSds((sds)ob->ptr); + if (newsds) ob->ptr = newsds; } - /* try to defrag string object */ - if (ob->type == OBJ_STRING) { - if (ob->encoding == OBJ_ENCODING_RAW) { - sds newsds = activeDefragSds((sds)ob->ptr); - if (newsds) { - ob->ptr = newsds; - } - } else if (ob->encoding == OBJ_ENCODING_EMBSTR) { - /* The sds is embedded in the object allocation, calculate the - * offset and update the pointer in the new allocation. 
*/ - long ofs = (intptr_t)ob->ptr - (intptr_t)ob; - if ((ret = activeDefragAlloc(ob))) { - ret->ptr = (void *)((intptr_t)ret + ofs); - } - } else if (ob->encoding != OBJ_ENCODING_INT) { - serverPanic("Unknown string encoding"); - } + robj *new_robj = activeDefragAllocWithoutFree(ob, allocation_size); + + if (new_robj && ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_EMBSTR) { + // If the robj is moved, correct the internal pointer + long embstr_offset = (intptr_t)ob->ptr - (intptr_t)ob; + new_robj->ptr = (void *)((intptr_t)new_robj + embstr_offset); } - return ret; + return new_robj; } + /* Defrag helper for robj and/or string objects * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ robj *activeDefragStringOb(robj *ob) { - return activeDefragStringObEx(ob, 1); + size_t allocation_size; + if (ob->refcount != 1) return NULL; // Unsafe to defrag if multiple refs + robj *new_robj = activeDefragStringObWithoutFree(ob, &allocation_size); + if (new_robj) allocatorDefragFree(ob, allocation_size); + return new_robj; } + /* Defrag helper for lua scripts * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -luaScript *activeDefragLuaScript(luaScript *script) { +static luaScript *activeDefragLuaScript(luaScript *script) { luaScript *ret = NULL; /* try to defrag script struct */ @@ -197,7 +263,7 @@ luaScript *activeDefragLuaScript(luaScript *script) { * Returns NULL in case the allocation wasn't moved. * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -dict *dictDefragTables(dict *d) { +static dict *dictDefragTables(dict *d) { dict *ret = NULL; dictEntry **newtable; /* handle the dict struct */ @@ -215,7 +281,7 @@ dict *dictDefragTables(dict *d) { } /* Internal function used by zslDefrag */ -void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { +static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == oldnode) update[i]->level[i].forward = newnode; @@ -237,7 +303,7 @@ void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnod * only need to defrag the skiplist, but not update the obj pointer. * When return value is non-NULL, it is the score reference that must be updated * in the dict record. */ -double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { +static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; int i; sds ele = newele ? newele : oldele; @@ -271,7 +337,7 @@ double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { /* Defrag helper for sorted set. 
* Defrag a single dict entry key name, and corresponding skiplist struct */ -void activeDefragZsetEntry(zset *zs, dictEntry *de) { +static void activeDefragZsetEntry(zset *zs, dictEntry *de) { sds newsds; double *newscore; sds sdsele = dictGetKey(de); @@ -288,13 +354,13 @@ void activeDefragZsetEntry(zset *zs, dictEntry *de) { #define DEFRAG_SDS_DICT_VAL_VOID_PTR 3 #define DEFRAG_SDS_DICT_VAL_LUA_SCRIPT 4 -void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { +static void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); } /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ -void activeDefragSdsDict(dict *d, int val_type) { +static void activeDefragSdsDict(dict *d, int val_type) { unsigned long cursor = 0; dictDefragFunctions defragfns = { .defragAlloc = activeDefragAlloc, @@ -309,35 +375,22 @@ void activeDefragSdsDict(dict *d, int val_type) { } while (cursor != 0); } -/* Defrag a list of ptr, sds or robj string values */ -void activeDefragList(list *l, int val_type) { - listNode *ln, *newln; - for (ln = l->head; ln; ln = ln->next) { - if ((newln = activeDefragAlloc(ln))) { - if (newln->prev) - newln->prev->next = newln; - else - l->head = newln; - if (newln->next) - newln->next->prev = newln; - else - l->tail = newln; - ln = newln; - } - if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) { - sds newsds, sdsele = ln->value; - if ((newsds = activeDefragSds(sdsele))) ln->value = newsds; - } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) { - robj *newele, *ele = ln->value; - if ((newele = activeDefragStringOb(ele))) ln->value = newele; - } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) { - void *newptr, *ptr = ln->value; - if ((newptr = activeDefragAlloc(ptr))) ln->value = newptr; - } - } +void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) { + UNUSED(privdata); + sds *sds_ref = (sds *)entry_ref; + sds new_sds = activeDefragSds(*sds_ref); + if (new_sds != NULL) *sds_ref = new_sds; +} + +void activeDefragSdsHashtable(hashtable *ht) { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); } -void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { +/* Defrag a list of ptr, sds or robj string values */ +static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; unsigned char *newzl; if ((newnode = activeDefragAlloc(node))) { @@ -354,7 +407,7 @@ void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { if ((newzl = activeDefragAlloc(node->entry))) node->entry = newzl; } -void activeDefragQuickListNodes(quicklist *ql) { +static void activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head; while (node) { activeDefragQuickListNode(ql, &node); @@ -365,13 +418,18 @@ void activeDefragQuickListNodes(quicklist *ql) { /* when the value has lots of elements, we want to handle it later and not as * part of the main dictionary scan. 
this is needed in order to prevent latency * spikes when handling large items */ -void defragLater(serverDb *db, dictEntry *kde) { - sds key = sdsdup(dictGetKey(kde)); - listAddNodeTail(db->defrag_later, key); +static void defragLater(robj *obj) { + if (!defrag_later) { + defrag_later = listCreate(); + listSetFreeMethod(defrag_later, sdsfreeVoid); + defrag_later_cursor = 0; + } + sds key = sdsdup(objectGetKey(obj)); + listAddNodeTail(defrag_later, key); } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { +static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { quicklist *ql = ob->ptr; quicklistNode *node; long iterations = 0; @@ -396,7 +454,7 @@ long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { activeDefragQuickListNode(ql, &node); server.stat_active_defrag_scanned++; if (++iterations > 128 && !bookmark_failed) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { if (!quicklistBookmarkCreate(&ql, "_AD", node)) { bookmark_failed = 1; } else { @@ -417,14 +475,14 @@ typedef struct { zset *zs; } scanLaterZsetData; -void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { +static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { dictEntry *de = (dictEntry *)_de; scanLaterZsetData *data = privdata; activeDefragZsetEntry(data->zs, de); server.stat_active_defrag_scanned++; } -void scanLaterZset(robj *ob, unsigned long *cursor) { +static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; dict *d = zs->dict; @@ -433,22 +491,28 @@ void scanLaterZset(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, &defragfns, &data); } -/* Used as scan callback when all the work is done in the dictDefragFunctions. */ -void scanCallbackCountScanned(void *privdata, const dictEntry *de) { +/* Used as hashtable scan callback when all we need is to defrag the hashtable + * internals (the allocated buckets) and not the elements. */ +static void scanHashtableCallbackCountScanned(void *privdata, void *elemref) { + UNUSED(privdata); + UNUSED(elemref); + server.stat_active_defrag_scanned++; +} + +/* Used as dict scan callback when all the work is done in the dictDefragFunctions. 
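/* [Editor's illustration, not part of the patch] scanLaterList above keeps
 * clock reads off the hot path by consulting the monotonic timer only once
 * per 128 processed nodes. A rough, self-contained sketch of that pattern;
 * get_monotonic_us() and the per-item work are assumed placeholders. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t monotime_us;
extern monotime_us get_monotonic_us(void); /* assumed monotonic clock source */

/* Returns true if it stopped early because the deadline passed; *cursor then
 * records where to resume. Returns false (and zeroes *cursor) when done. */
static bool scan_some(size_t *cursor, size_t count, monotime_us endtime) {
    long iterations = 0;
    while (*cursor < count) {
        /* ... defrag the item at *cursor here ... */
        (*cursor)++;
        if (++iterations > 128) {
            if (get_monotonic_us() > endtime) return true;
            iterations = 0;
        }
    }
    *cursor = 0; /* a zero cursor means "start fresh next time" */
    return false;
}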
*/ +static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); server.stat_active_defrag_scanned++; } -void scanLaterSet(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); +static void scanLaterSet(robj *ob, unsigned long *cursor) { + if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } -void scanLaterHash(robj *ob, unsigned long *cursor) { +static void scanLaterHash(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -457,19 +521,17 @@ void scanLaterHash(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void defragQuicklist(serverDb *db, dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragQuicklist(robj *ob) { quicklist *ql = ob->ptr, *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql; if (ql->len > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(ob); else activeDefragQuickListNodes(ql); } -void defragZsetSkiplist(serverDb *db, dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragZsetSkiplist(robj *ob) { zset *zs = (zset *)ob->ptr; zset *newzs; zskiplist *newzsl; @@ -481,7 +543,7 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(ob); else { dictIterator *di = dictGetIterator(zs->dict); while ((de = dictNext(di)) != NULL) { @@ -493,35 +555,34 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } -void defragHash(serverDb *db, dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragHash(robj *ob) { dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(ob); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* defrag the dict struct and tables */ if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; } -void defragSet(serverDb *db, dictEntry *kde) { - robj *ob = dictGetVal(kde); - dict *d, *newd; - serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; +static void defragSet(robj *ob) { + serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { + defragLater(ob); + } else { + 
activeDefragSdsHashtable(ht); + } + /* defrag the hashtable struct and tables */ + hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (newHashtable) ob->ptr = newHashtable; } /* Defrag callback for radix tree iterator, called for each node, * used in order to defrag the nodes allocations. */ -int defragRaxNode(raxNode **noderef) { +static int defragRaxNode(raxNode **noderef) { raxNode *newnode = activeDefragAlloc(*noderef); if (newnode) { *noderef = newnode; @@ -531,7 +592,7 @@ int defragRaxNode(raxNode **noderef) { } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) { +static int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, monotime endtime) { static unsigned char last[sizeof(streamID)]; raxIterator ri; long iterations = 0; @@ -567,7 +628,7 @@ int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) if (newdata) raxSetData(ri.node, ri.data = newdata); server.stat_active_defrag_scanned++; if (++iterations > 128) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { serverAssert(ri.key_len == sizeof(last)); memcpy(last, ri.key, ri.key_len); raxStop(&ri); @@ -589,7 +650,7 @@ typedef void *(raxDefragFunction)(raxIterator *ri, void *privdata); * 2) rax nodes * 3) rax entry data (only if defrag_data is specified) * 4) call a callback per element, and allow the callback to return a new pointer for the element */ -void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { +static void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { raxIterator ri; rax *rax; if ((rax = activeDefragAlloc(*raxref))) *raxref = rax; @@ -612,7 +673,7 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; streamNACK *nack = ri->data, *newnack; nack->consumer = ctx->c; /* update nack pointer to consumer */ @@ -626,7 +687,7 @@ void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { return newnack; } -void *defragStreamConsumer(raxIterator *ri, void *privdata) { +static void *defragStreamConsumer(raxIterator *ri, void *privdata) { streamConsumer *c = ri->data; streamCG *cg = privdata; void *newc = activeDefragAlloc(c); @@ -642,7 +703,7 @@ void *defragStreamConsumer(raxIterator *ri, void *privdata) { return newc; /* returns NULL if c was not defragged */ } -void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { streamCG *cg = ri->data; UNUSED(privdata); if (cg->consumers) defragRadixTree(&cg->consumers, 0, defragStreamConsumer, cg); @@ -650,8 +711,7 @@ void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { return NULL; } -void defragStream(serverDb *db, dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragStream(robj *ob) { serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); stream *s = ob->ptr, *news; @@ -661,7 +721,7 @@ void defragStream(serverDb *db, dictEntry *kde) { if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { rax *newrax = activeDefragAlloc(s->rax); if (newrax) s->rax = newrax; - defragLater(db, kde); + defragLater(ob); } else 
defragRadixTree(&s->rax, 1, NULL, NULL); @@ -671,25 +731,36 @@ void defragStream(serverDb *db, dictEntry *kde) { /* Defrag a module key. This is either done immediately or scheduled * for later. Returns then number of pointers defragged. */ -void defragModule(serverDb *db, dictEntry *kde) { - robj *obj = dictGetVal(kde); +static void defragModule(serverDb *db, robj *obj) { serverAssert(obj->type == OBJ_MODULE); - - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(db, kde); + /* Fun fact (and a bug since forever): The key is passed to + * moduleDefragValue as an sds string, but the parameter is declared to be + * an robj and it's passed as such to the module type defrag callbacks. + * Nobody can ever have used this, i.e. accessed the key name in the defrag + * or free_effort module type callbacks. */ + void *sds_key_passed_as_robj = objectGetKey(obj); + if (!moduleDefragValue(sds_key_passed_as_robj, obj, db->id)) defragLater(obj); } /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ -void defragKey(defragCtx *ctx, dictEntry *de) { - serverDb *db = ctx->privdata; - int slot = ctx->slot; +static void defragKey(defragKeysCtx *ctx, robj **elemref) { + serverDb *db = &server.db[ctx->dbid]; + int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; + ob = *elemref; - /* Try to defrag robj and / or string value. */ - ob = dictGetVal(de); + /* Try to defrag robj and/or string value. */ if ((newob = activeDefragStringOb(ob))) { - kvstoreDictSetVal(db->keys, slot, de, newob); + *elemref = newob; + if (objectGetExpire(newob) >= 0) { + /* Replace the pointer in the expire table without accessing the old + * pointer. */ + hashtable *expires_ht = kvstoreGetHashtable(db->expires, slot); + int replaced = hashtableReplaceReallocatedEntry(expires_ht, ob, newob); + serverAssert(replaced); + } ob = newob; } @@ -697,15 +768,15 @@ void defragKey(defragCtx *ctx, dictEntry *de) { /* Already handled in activeDefragStringOb. 
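/* [Editor's illustration, not part of the patch] The expires-table fixup
 * above swaps a stale pointer by matching on its address only; the old
 * allocation has already been freed, so it must never be dereferenced. A toy
 * flat-array index standing in for the real hashtable: */
#include <stdbool.h>
#include <stddef.h>

typedef struct toy_index {
    void *slots[64];
    size_t used;
} toy_index;

/* Mirror a reallocation into a secondary index, comparing by address only. */
static bool index_replace_ptr(toy_index *idx, void *old_entry, void *new_entry) {
    for (size_t i = 0; i < idx->used; i++) {
        if (idx->slots[i] == old_entry) {
            idx->slots[i] = new_entry;
            return true;
        }
    }
    return false; /* the patch asserts this never happens for live entries */
}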
*/ } else if (ob->type == OBJ_LIST) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { - defragQuicklist(db, de); + defragQuicklist(ob); } else if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else { serverPanic("Unknown list encoding"); } } else if (ob->type == OBJ_SET) { - if (ob->encoding == OBJ_ENCODING_HT) { - defragSet(db, de); + if (ob->encoding == OBJ_ENCODING_HASHTABLE) { + defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; @@ -716,7 +787,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { - defragZsetSkiplist(db, de); + defragZsetSkiplist(ob); } else { serverPanic("Unknown sorted set encoding"); } @@ -724,23 +795,23 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { - defragHash(db, de); + defragHash(ob); } else { serverPanic("Unknown hash encoding"); } } else if (ob->type == OBJ_STREAM) { - defragStream(db, de); + defragStream(ob); } else if (ob->type == OBJ_MODULE) { - defragModule(db, de); + defragModule(db, ob); } else { serverPanic("Unknown object type"); } } /* Defrag scan callback for the main db dictionary. */ -void defragScanCallback(void *privdata, const dictEntry *de) { +static void dbKeysScanCallback(void *privdata, void *elemref) { long long hits_before = server.stat_active_defrag_hits; - defragKey((defragCtx *)privdata, (dictEntry *)de); + defragKey((defragKeysCtx *)privdata, (robj **)elemref); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -748,42 +819,19 @@ void defragScanCallback(void *privdata, const dictEntry *de) { server.stat_active_defrag_scanned++; } -/* Utility function to get the fragmentation ratio from jemalloc. - * It is critical to do that by comparing only heap maps that belong to - * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this - * fragmentation ratio in order to decide if a defrag action should be taken - * or not, a false detection can cause the defragmenter to waste a lot of CPU - * without the possibility of getting any results. */ -float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes); - - /* Calculate the fragmentation ratio as the proportion of wasted memory in small - * bins (which are defraggable) relative to the total allocated memory (including large bins). - * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, - * despite the fact it's not a lot of memory for the user. 
*/ - float frag_pct = (float)frag_smallbins_bytes / allocated * 100; - float rss_pct = ((float)resident / allocated) * 100 - 100; - size_t rss_bytes = resident - allocated; - if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; - serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); - return frag_pct; -} - -/* Defrag scan callback for the pubsub dictionary. */ -void defragPubsubScanCallback(void *privdata, const dictEntry *de) { - defragCtx *ctx = privdata; - defragPubSubCtx *pubsub_ctx = ctx->privdata; - kvstore *pubsub_channels = pubsub_ctx->pubsub_channels; - robj *newchannel, *channel = dictGetKey(de); - dict *newclients, *clients = dictGetVal(de); +/* Defrag scan callback for a pubsub channels hashtable. */ +static void defragPubsubScanCallback(void *privdata, void *elemref) { + defragPubSubCtx *ctx = privdata; + void **channel_dict_ref = (void **)elemref; + dict *newclients, *clients = *channel_dict_ref; + robj *newchannel, *channel = *(robj **)dictMetadata(clients); + size_t allocation_size; /* Try to defrag the channel name. */ serverAssert(channel->refcount == (int)dictSize(clients) + 1); - newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1); + newchannel = activeDefragStringObWithoutFree(channel, &allocation_size); if (newchannel) { - kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry *)de, newchannel); + *(robj **)dictMetadata(clients) = newchannel; /* The channel name is shared by the client's pubsub(shard) and server's * pubsub(shard), after defraging the channel name, we need to update @@ -792,37 +840,27 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) { dictEntry *clientde; while ((clientde = dictNext(di)) != NULL) { client *c = dictGetKey(clientde); - dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel); + dict *client_channels = ctx->getPubSubChannels(c); + dictEntry *pubsub_channel = dictFind(client_channels, newchannel); serverAssert(pubsub_channel); - dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel); + dictSetKey(ctx->getPubSubChannels(c), pubsub_channel, newchannel); } dictReleaseIterator(di); + // Now that we're done correcting the references, we can safely free the old channel robj + allocatorDefragFree(channel, allocation_size); } /* Try to defrag the dictionary of clients that is stored as the value part. */ if ((newclients = dictDefragTables(clients))) - kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry *)de, newclients); + *channel_dict_ref = newclients; server.stat_active_defrag_scanned++; } -/* We may need to defrag other globals, one small allocation can hold a full allocator run. - * so although small, it is still important to defrag these */ -void defragOtherGlobals(void) { - /* there are many more pointers to defrag (e.g. client argv, output / aof buffers, etc. - * but we assume most of these are short lived, we only need to defrag allocations - * that remain static for a long time */ - activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); - moduleDefragGlobals(); - kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables); - kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables); -} - /* returns 0 more work may or may not be needed (see non-zero cursor), * and 1 if time is up and more work is needed. 
*/ -int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int dbid) { - if (de) { - robj *ob = dictGetVal(de); +static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) { + if (ob) { if (ob->type == OBJ_LIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET) { @@ -834,7 +872,14 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); + /* Fun fact (and a bug since forever): The key is passed to + * moduleLateDefrag as an sds string, but the parameter is declared + * to be an robj and it's passed as such to the module type defrag + * callbacks. Nobody can ever have used this, i.e. accessed the key + * name in the defrag module type callback. */ + void *sds_key_passed_as_robj = objectGetKey(ob); + long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); + return moduleLateDefrag(sds_key_passed_as_robj, ob, cursor, endtimeWallClock, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } @@ -844,299 +889,479 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int return 0; } -/* static variables serving defragLaterStep to continue scanning a key from were we stopped last time. */ -static sds defrag_later_current_key = NULL; -static unsigned long defrag_later_cursor = 0; -/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -int defragLaterStep(serverDb *db, int slot, long long endtime) { +// A kvstoreHelperPreContinueFn +static doneStatus defragLaterStep(monotime endtime, void *privdata) { + defragKeysCtx *ctx = privdata; + unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long key_defragged; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_later_cursor) { - listNode *head = listFirst(db->defrag_later); - - /* Move on to next key */ - if (defrag_later_current_key) { - serverAssert(defrag_later_current_key == head->value); - listDelNode(db->defrag_later, head); - defrag_later_cursor = 0; - defrag_later_current_key = NULL; - } + while (defrag_later && listLength(defrag_later) > 0) { + listNode *head = listFirst(defrag_later); + sds key = head->value; + void *found = NULL; + kvstoreHashtableFind(ctx->kvstate.kvs, ctx->kvstate.slot, key, &found); + robj *ob = found; + + long long key_defragged = server.stat_active_defrag_hits; + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->dbid) == 1); + if (key_defragged != server.stat_active_defrag_hits) { + server.stat_active_defrag_key_hits++; + } else { + server.stat_active_defrag_key_misses++; + } - /* stop if we reached the last one. 
*/ - head = listFirst(db->defrag_later); - if (!head) return 0; + if (timeout) break; + + if (defrag_later_cursor == 0) { + // the item is finished, move on + listDelNode(defrag_later, head); + } - /* start a new key */ - defrag_later_current_key = head->value; - defrag_later_cursor = 0; + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || + server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() > endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; } + } + + return (!defrag_later || listLength(defrag_later) == 0) ? DEFRAG_DONE : DEFRAG_NOT_DONE; +} + + +/* This helper function handles most of the work for iterating over a kvstore. 'privdata', if + * provided, MUST begin with 'kvstoreIterState' and this part is automatically updated by this + * function during the iteration. */ +static doneStatus defragStageKvstoreHelper(monotime endtime, + kvstore *kvs, + hashtableScanFunction scan_fn, + kvstoreHelperPreContinueFn precontinue_fn, + void *privdata) { + static kvstoreIterState state; // STATIC - this persists + if (endtime == 0) { + // Starting the stage, set up the state information for this stage + state.kvs = kvs; + state.slot = KVS_SLOT_DEFRAG_LUT; + state.cursor = 0; + return DEFRAG_NOT_DONE; + } + if (kvs != state.kvs) { + // There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. + return DEFRAG_DONE; + } + + unsigned int iterations = 0; + unsigned long long prev_defragged = server.stat_active_defrag_hits; + unsigned long long prev_scanned = server.stat_active_defrag_scanned; - /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */ - dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key); - key_defragged = server.stat_active_defrag_hits; + if (state.slot == KVS_SLOT_DEFRAG_LUT) { + // Before we start scanning the kvstore, handle the main structures do { - int quit = 0; - if (defragLaterItem(de, &defrag_later_cursor, endtime, db->id)) - quit = 1; /* time is up, we didn't finish all the work */ - - /* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields - * (if we have a lot of pointers in one hash bucket, or rehashing), - * check if we reached the time limit. 
*/ - if (quit || (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64)) { - if (quit || ustime() > endtime) { - if (key_defragged != server.stat_active_defrag_hits) - server.stat_active_defrag_key_hits++; - else - server.stat_active_defrag_key_misses++; - return 1; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; + state.cursor = kvstoreHashtableDefragTables(kvs, state.cursor, activeDefragAlloc); + if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; + } while (state.cursor != 0); + state.slot = KVS_SLOT_UNASSIGNED; + } + + while (true) { + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() >= endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; + } + + if (precontinue_fn) { + if (privdata) *(kvstoreIterState *)privdata = state; + if (precontinue_fn(endtime, privdata) == DEFRAG_NOT_DONE) return DEFRAG_NOT_DONE; + } + + if (!state.cursor) { + // If there's no cursor, we're ready to begin a new kvstore slot. + if (state.slot == KVS_SLOT_UNASSIGNED) { + state.slot = kvstoreGetFirstNonEmptyHashtableIndex(kvs); + } else { + state.slot = kvstoreGetNextNonEmptyHashtableIndex(kvs, state.slot); } - } while (defrag_later_cursor); - if (key_defragged != server.stat_active_defrag_hits) - server.stat_active_defrag_key_hits++; - else - server.stat_active_defrag_key_misses++; - } while (1); + + if (state.slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE; + } + + // Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. + if (privdata) *(kvstoreIterState *)privdata = state; + state.cursor = kvstoreHashtableScanDefrag(kvs, state.slot, state.cursor, + scan_fn, privdata, activeDefragAlloc, + HASHTABLE_SCAN_EMIT_REF); + } + + return DEFRAG_NOT_DONE; } -#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) -#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) -/* decide if defrag is needed, and at what CPU effort to invest in it */ -void computeDefragCycles(void) { - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - /* If we're not already running, and below the threshold, exit. */ - if (!server.active_defrag_running) { - if (frag_pct < server.active_defrag_threshold_lower || frag_bytes < server.active_defrag_ignore_bytes) return; +// Target is a DBID +static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; + + static defragKeysCtx ctx; // STATIC - this persists + if (endtime == 0) { + ctx.dbid = dbid; + // Don't return yet. Call the helper with endtime==0 below. } + serverAssert(ctx.dbid == dbid); - /* Calculate the adaptive aggressiveness of the defrag based on the current - * fragmentation and configurations. 
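/* [Editor's illustration, not part of the patch] The helper above is one
 * instance of the stage protocol: a call with endtime==0 initializes static
 * state and returns DEFRAG_NOT_DONE; later calls resume from that state until
 * the deadline passes or the work completes. A stripped-down sketch with
 * hypothetical names: */
#include <stddef.h>
#include <stdint.h>

typedef uint64_t monotime_us;
typedef enum { STAGE_NOT_DONE, STAGE_DONE } stage_status;
extern monotime_us now_us(void); /* assumed monotonic clock */

static stage_status example_stage(monotime_us endtime, size_t total_items) {
    static size_t cursor; /* STATIC - persists across invocations */
    if (endtime == 0) {   /* initialization call */
        cursor = 0;
        return STAGE_NOT_DONE;
    }
    while (cursor < total_items) {
        /* ... defrag the item at 'cursor' here ... */
        cursor++;
        /* Check the clock only once every 16 items. */
        if ((cursor & 0xF) == 0 && now_us() >= endtime) return STAGE_NOT_DONE;
    }
    return STAGE_DONE;
}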
 */
-    int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper,
-                              server.active_defrag_cycle_min, server.active_defrag_cycle_max);
-    cpu_pct = LIMIT(cpu_pct, server.active_defrag_cycle_min, server.active_defrag_cycle_max);
+    return defragStageKvstoreHelper(endtime, db->keys,
+                                    dbKeysScanCallback, defragLaterStep, &ctx);
 }

-    /* Normally we allow increasing the aggressiveness during a scan, but don't
-     * reduce it, since we should not lower the aggressiveness when fragmentation
-     * drops. But when a configuration is made, we should reconsider it. */
-    if (cpu_pct > server.active_defrag_running || server.active_defrag_configuration_changed) {
-        server.active_defrag_running = cpu_pct;
-        server.active_defrag_configuration_changed = 0;
-        serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", frag_pct, frag_bytes,
-                  cpu_pct);
+
+// Target is a DBID
+static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) {
+    UNUSED(privdata);
+    int dbid = (uintptr_t)target;
+    serverDb *db = &server.db[dbid];
+    return defragStageKvstoreHelper(endtime, db->expires,
+                                    scanHashtableCallbackCountScanned, NULL, NULL);
+}
+
+
+static doneStatus defragStagePubsubKvstore(monotime endtime, void *target, void *privdata) {
+    // target is server.pubsub_channels or server.pubsubshard_channels
+    getClientChannelsFnWrapper *fnWrapper = privdata;
+    defragPubSubCtx ctx;
+    ctx.getPubSubChannels = fnWrapper->fn;
+    return defragStageKvstoreHelper(endtime, (kvstore *)target,
+                                    defragPubsubScanCallback, NULL, &ctx);
+}
+
+
+static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdata) {
+    UNUSED(target);
+    UNUSED(privdata);
+    if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization
+    /* If we are in the middle of running a script, we don't want to replace
+     * the script being run, so we just bail out without really defragging here. */
+    if (scriptIsRunning()) return DEFRAG_DONE;
+    activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT);
+    return DEFRAG_DONE;
+}
+
+
+static doneStatus defragModuleGlobals(monotime endtime, void *target, void *privdata) {
+    UNUSED(target);
+    UNUSED(privdata);
+    if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization
+    moduleDefragGlobals();
+    return DEFRAG_DONE;
+}
+
+
+static bool defragIsRunning(void) {
+    return (defrag.timeproc_id > 0);
+}
+
+
+static void addDefragStage(defragStageFn stage_fn, void *target, void *privdata) {
+    StageDescriptor *stage = zmalloc(sizeof(StageDescriptor));
+    stage->stage_fn = stage_fn;
+    stage->target = target;
+    stage->privdata = privdata;
+    listAddNodeTail(defrag.remaining_stages, stage);
+}
+
+
+// Called at the end of a complete defrag cycle, or when defrag is terminated
+static void endDefragCycle(bool normal_termination) {
+    if (normal_termination) {
+        // For normal termination, we expect...
+ serverAssert(!defrag.current_stage); + serverAssert(listLength(defrag.remaining_stages) == 0); + serverAssert(!defrag_later || listLength(defrag_later) == 0); + } else { + // Defrag is being terminated abnormally + aeDeleteTimeEvent(server.el, defrag.timeproc_id); + + if (defrag.current_stage) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + listSetFreeMethod(defrag.remaining_stages, zfree); + } + defrag.timeproc_id = AE_DELETED_EVENT_ID; + + listRelease(defrag.remaining_stages); + defrag.remaining_stages = NULL; + + if (defrag_later) { + listRelease(defrag_later); + defrag_later = NULL; } + defrag_later_cursor = 0; + + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", + (int)elapsedMs(defrag.start_cycle), (int)(server.stat_active_defrag_hits - defrag.start_defrag_hits), + frag_pct, frag_bytes); + + server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); + server.stat_last_active_defrag_time = 0; + server.active_defrag_cpu_percent = 0; } -/* Perform incremental defragmentation work from the serverCron. - * This works in a similar way to activeExpireCycle, in the sense that - * we do incremental work across calls. */ -void activeDefragCycle(void) { - static int slot = -1; - static int current_db = -1; - static int defrag_later_item_in_progress = 0; - static int defrag_stage = 0; - static unsigned long defrag_cursor = 0; - static serverDb *db = NULL; - static long long start_scan, start_stat; - unsigned int iterations = 0; - unsigned long long prev_defragged = server.stat_active_defrag_hits; - unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long start, timelimit, endtime; - mstime_t latency; - int all_stages_finished = 0; - int quit = 0; - if (!server.active_defrag_enabled) { - if (server.active_defrag_running) { - /* if active defrag was disabled mid-run, start from fresh next time. */ - server.active_defrag_running = 0; - server.active_defrag_configuration_changed = 0; - if (db) listEmpty(db->defrag_later); - defrag_later_current_key = NULL; - defrag_later_cursor = 0; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - goto update_metrics; +/* Must be called at the start of the timeProc as it measures the delay from the end of the previous + * timeProc invocation when performing the computation. */ +static int computeDefragCycleUs(void) { + long dutyCycleUs; + + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + static int prevCpuPercent = 0; // STATIC - this persists + if (targetCpuPercent != prevCpuPercent) { + /* If the targetCpuPercent changes, the value might be different from when the last wait + * time was computed. In this case, don't consider wait time. (This is really only an + * issue in crazy tests that dramatically increase CPU while defrag is running.) */ + defrag.timeproc_end_time = 0; + prevCpuPercent = targetCpuPercent; + } + + // Given when the last duty cycle ended, compute time needed to achieve the desired percentage. + if (defrag.timeproc_end_time == 0) { + // Either the first call to the timeProc, or we were paused for some reason. 
+        defrag.timeproc_overage_us = 0;
+        dutyCycleUs = server.active_defrag_cycle_us;
+    } else {
+        long waitedUs = getMonotonicUs() - defrag.timeproc_end_time;
+        /* Given the elapsed wait time between calls, compute the necessary duty time needed to
+         * achieve the desired CPU percentage.
+         * With:  D = duty time, W = wait time, P = percent
+         * Solve:   D          P
+         *        -----   =  -----
+         *        D + W       100
+         * Solving for D:
+         *    D = P * W / (100 - P)
+         *
+         * Note that dutyCycleUs addresses starvation. If the wait time was long, we will compensate
+         * with a proportionately long duty-cycle. This won't significantly affect perceived
+         * latency, because clients are already being impacted by the long cycle time which caused
+         * the starvation of the timer. */
+        dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent);
+
+        // Also adjust for any accumulated overage.
+        dutyCycleUs -= defrag.timeproc_overage_us;
+        defrag.timeproc_overage_us = 0;
+
+        if (dutyCycleUs < server.active_defrag_cycle_us) {
+            /* We never reduce our cycle time, that would increase overhead. Instead, we track this
+             * as part of the overage, and increase wait time between cycles. */
+            defrag.timeproc_overage_us = server.active_defrag_cycle_us - dutyCycleUs;
+            dutyCycleUs = server.active_defrag_cycle_us;
        }
-    }
+    return dutyCycleUs;
+}
+
+
+/* Must be called at the end of the timeProc as it records the timeproc_end_time for use in the next
+ * computeDefragCycleUs computation. */
+static int computeDelayMs(monotime intendedEndtime) {
+    defrag.timeproc_end_time = getMonotonicUs();
+    long overage = defrag.timeproc_end_time - intendedEndtime;
+    defrag.timeproc_overage_us += overage; // track over/under desired CPU
+    /* Allow negative overage (underage) to count against existing overage, but don't allow
+     * underage (from short stages) to be accumulated. */
+    if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0;
+
+    int targetCpuPercent = server.active_defrag_cpu_percent;
+    serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100);
+
+    // Given the desired duty cycle, what inter-cycle delay do we need to achieve that?
+    // We want to achieve a specific CPU percent. To do that, we can't use a skewed computation.
+    // For example, if we run for 1ms and delay 10ms, that's NOT 10%, because the total cycle time is 11ms.
+    // Instead, if we run for 1ms, our total time should be 10ms. So the delay is only 9ms.
+    long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent;
+    long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us;
+    // Only increase delay by the fraction of the overage that would be non-duty-cycle
+    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100;
+    if (delayUs < 0) delayUs = 0;
+    long delayMs = delayUs / 1000; // round down
+    return delayMs;
+}
+
-    if (hasActiveChildProcess()) return; /* Defragging memory while there's a fork will just do damage. */
+/* An independent time proc for defrag. While defrag is running, this is called much more often
+ * than the server cron. Frequent short calls provide low latency impact. */
+static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);
-    /* Once a second, check if the fragmentation justfies starting a scan
-     * or making it more aggressive. */
-    run_with_period(1000) {
-        computeDefragCycles();
+    // This timer shouldn't be registered unless there's work to do.
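/* [Editor's worked example, not part of the patch] Plugging numbers into the
 * two formulas above, with P = 10% CPU, a 1000us base cycle, and a 9000us
 * wait since the previous cycle ended: */
#include <stdio.h>

static long duty_us(long target_pct, long waited_us) {
    return target_pct * waited_us / (100 - target_pct); /* D = P*W/(100-P) */
}

static long delay_us(long target_pct, long cycle_us) {
    return cycle_us * 100 / target_pct - cycle_us; /* total cycle minus duty */
}

int main(void) {
    printf("duty  = %ldus\n", duty_us(10, 9000));  /* 10*9000/90  = 1000us */
    printf("delay = %ldus\n", delay_us(10, 1000)); /* 10000-1000  = 9000us */
    /* Running 1000us out of every 10000us yields the intended 10% CPU. */
    return 0;
}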
+ serverAssert(defrag.current_stage || listLength(defrag.remaining_stages) > 0); + + if (!server.active_defrag_enabled) { + // Defrag has been disabled while running + endDefragCycle(false); + return AE_NOMORE; } - /* Normally it is checked once a second, but when there is a configuration - * change, we want to check it as soon as possible. */ - if (server.active_defrag_configuration_changed) { - computeDefragCycles(); - server.active_defrag_configuration_changed = 0; + if (hasActiveChildProcess()) { + // If there's a child process, pause the defrag, polling until the child completes. + defrag.timeproc_end_time = 0; // prevent starvation recovery + return 100; } - if (!server.active_defrag_running) return; + monotime starttime = getMonotonicUs(); + int dutyCycleUs = computeDefragCycleUs(); + monotime endtime = starttime + dutyCycleUs; + bool haveMoreWork = true; - /* See activeExpireCycle for how timelimit is handled. */ - start = ustime(); - timelimit = 1000000 * server.active_defrag_running / server.hz / 100; - if (timelimit <= 0) timelimit = 1; - endtime = start + timelimit; + mstime_t latency; latencyStartMonitor(latency); - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragEntryStartCb = defragEntryStartCbForKeys, - .defragEntryFinishCb = defragEntryFinishCbForKeys}; do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_stage && !defrag_cursor && (slot < 0)) { - /* finish any leftovers from previous db before moving to the next one */ - if (db && defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } - - /* Move on to next database, and stop if we reached the last one. */ - if (++current_db >= server.dbnum) { - /* defrag other items not part of the db / keys */ - defragOtherGlobals(); - - long long now = ustime(); - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", - (int)((now - start_scan) / 1000), (int)(server.stat_active_defrag_hits - start_stat), - frag_pct, frag_bytes); - - start_scan = now; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - server.active_defrag_running = 0; - - computeDefragCycles(); /* if another scan is needed, start it right away */ - if (server.active_defrag_running != 0 && ustime() < endtime) continue; - break; - } else if (current_db == 0) { - /* Start a scan from the first database. 
*/ - start_scan = ustime(); - start_stat = server.stat_active_defrag_hits; - } + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } - db = &server.db[current_db]; - kvstoreDictLUTDefrag(db->keys, dictDefragTables); - kvstoreDictLUTDefrag(db->expires, dictDefragTables); - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; } - /* This array of structures holds the parameters for all defragmentation stages. */ - typedef struct defragStage { - kvstore *kvs; - dictScanFunction *scanfn; - void *privdata; - } defragStage; - defragStage defrag_stages[] = { - {db->keys, defragScanCallback, db}, - {db->expires, scanCallbackCountScanned, NULL}, - {server.pubsub_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}}, - {server.pubsubshard_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}}, - }; - do { - int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]); - serverAssert(defrag_stage < num_stages); - defragStage *current_stage = &defrag_stages[defrag_stage]; - - /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */ - if (defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } + haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0); + /* If we've completed a stage early, and still have a standard time allotment remaining, + * we'll start another stage. This can happen when defrag is running infrequently, and + * starvation protection has increased the duty-cycle. */ + } while (haveMoreWork && getMonotonicUs() <= endtime - server.active_defrag_cycle_us); - if (!defrag_later_item_in_progress) { - /* Continue defragmentation from the previous stage. - * If slot is -1, it means this stage starts from the first non-empty slot. */ - if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs); - defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor, current_stage->scanfn, - &defragfns, &(defragCtx){current_stage->privdata, slot}); - } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("active-defrag-cycle", latency); - if (!defrag_cursor) { - /* Move to the next slot only if regular and large item scanning has been completed. */ - if (listLength(db->defrag_later) > 0) { - defrag_later_item_in_progress = 1; - continue; - } + if (haveMoreWork) { + return computeDelayMs(endtime); + } else { + endDefragCycle(true); + return AE_NOMORE; // Ends the timer proc + } +} - /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. 
*/ - if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1) defrag_stage++; - defrag_later_item_in_progress = 0; - } - /* Check if all defragmentation stages have been processed. - * If so, mark as finished and reset the stage counter to move on to next database. */ - if (defrag_stage == num_stages) { - all_stages_finished = 1; - defrag_stage = 0; - } +/* During long running scripts, or while loading, there is a periodic function for handling other + * actions. This interface allows defrag to continue running, avoiding a single long defrag step + * after the long operation completes. */ +void defragWhileBlocked(void) { + // This is called infrequently, while timers are not active. We might need to start defrag. + if (!defragIsRunning()) monitorActiveDefrag(); - /* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys - * (if we have a lot of pointers in one hash bucket or rehashing), - * check if we reached the time limit. - * But regardless, don't start a new db in this loop, this is because after - * the last db we call defragOtherGlobals, which must be done in one cycle */ - if (all_stages_finished || ++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64) { - /* Quit if all stages were finished or timeout. */ - if (all_stages_finished || ustime() > endtime) { - quit = 1; - break; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (!all_stages_finished && !quit); - } while (!quit); + if (!defragIsRunning()) return; - latencyEndMonitor(latency); - latencyAddSampleIfNeeded("active-defrag-cycle", latency); + // Save off the timeproc_id. If we have a normal termination, it will be cleared. + long long timeproc_id = defrag.timeproc_id; + + // Simulate a single call of the timer proc + long long reschedule_delay = activeDefragTimeProc(NULL, 0, NULL); + if (reschedule_delay == AE_NOMORE) { + // If it's done, deregister the timer + aeDeleteTimeEvent(server.el, timeproc_id); + } + /* Otherwise, just ignore the reschedule_delay, the timer will pop the next time that the + * event loop can process timers again. 
*/ +} + + +static void beginDefragCycle(void) { + serverAssert(!defragIsRunning()); + + serverAssert(defrag.remaining_stages == NULL); + defrag.remaining_stages = listCreate(); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + addDefragStage(defragStageDbKeys, (void *)(uintptr_t)dbid, NULL); + addDefragStage(defragStageExpiresKvstore, (void *)(uintptr_t)dbid, NULL); + } -update_metrics: - if (server.active_defrag_running > 0) { - if (server.stat_last_active_defrag_time == 0) elapsedStart(&server.stat_last_active_defrag_time); - } else if (server.stat_last_active_defrag_time != 0) { - server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); - server.stat_last_active_defrag_time = 0; + static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; + static getClientChannelsFnWrapper getClientPubSubShardChannelsFn = {getClientPubSubShardChannels}; + addDefragStage(defragStagePubsubKvstore, server.pubsub_channels, &getClientPubSubChannelsFn); + addDefragStage(defragStagePubsubKvstore, server.pubsubshard_channels, &getClientPubSubShardChannelsFn); + + addDefragStage(defragLuaScripts, NULL, NULL); + addDefragStage(defragModuleGlobals, NULL, NULL); + + defrag.current_stage = NULL; + defrag.start_cycle = getMonotonicUs(); + defrag.start_defrag_hits = server.stat_active_defrag_hits; + defrag.timeproc_end_time = 0; + defrag.timeproc_overage_us = 0; + defrag.timeproc_id = aeCreateTimeEvent(server.el, 0, activeDefragTimeProc, NULL, NULL); + + elapsedStart(&server.stat_last_active_defrag_time); +} + + +#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) +#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) + +/* decide if defrag is needed, and at what CPU effort to invest in it */ +static void updateDefragCpuPercent(void) { + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + if (server.active_defrag_cpu_percent == 0) { + if (frag_pct < server.active_defrag_threshold_lower || + frag_bytes < server.active_defrag_ignore_bytes) return; + } + + /* Calculate the adaptive aggressiveness of the defrag based on the current + * fragmentation and configurations. */ + int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, + server.active_defrag_cpu_min, server.active_defrag_cpu_max); + cpu_pct = LIMIT(cpu_pct, server.active_defrag_cpu_min, server.active_defrag_cpu_max); + + /* Normally we allow increasing the aggressiveness during a scan, but don't + * reduce it, since we should not lower the aggressiveness when fragmentation + * drops. But when a configuration is made, we should reconsider it. */ + if (cpu_pct > server.active_defrag_cpu_percent || server.active_defrag_configuration_changed) { + server.active_defrag_configuration_changed = 0; + if (defragIsRunning()) { + serverLog(LL_VERBOSE, "Changing active defrag CPU, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } else { + serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } + server.active_defrag_cpu_percent = cpu_pct; } } + +void monitorActiveDefrag(void) { + if (!server.active_defrag_enabled) return; + + /* Defrag gets paused while a child process is active. So there's no point in starting a new + * cycle or adjusting the CPU percentage for an existing cycle. 
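/* [Editor's worked example, not part of the patch] How the interpolation
 * below maps fragmentation to CPU effort, assuming the common defaults of
 * 10%/100% thresholds and 1%/25% CPU bounds (configured values may differ): */
#include <stdio.h>

#define EX_INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1)))

int main(void) {
    int cpu = EX_INTERPOLATE(40, 10, 100, 1, 25); /* fragmentation measured at 40% */
    printf("cpu=%d%%\n", cpu);                    /* 1 + 30*24/90 = 9% effort */
    return 0;
}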
 */
+    if (hasActiveChildProcess()) return;
+
+    updateDefragCpuPercent();
+
+    if (server.active_defrag_cpu_percent > 0 && !defragIsRunning()) beginDefragCycle();
+}
+
 #else /* HAVE_DEFRAG */
-void activeDefragCycle(void) {
+void monitorActiveDefrag(void) {
     /* Not implemented yet. */
 }
@@ -1150,4 +1375,7 @@ robj *activeDefragStringOb(robj *ob) {
     return NULL;
 }
+void defragWhileBlocked(void) {
+}
+
 #endif
diff --git a/src/dict.c b/src/dict.c
index 48c0f815bb..f75369d533 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -1321,7 +1321,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
 /* Reallocate the dictEntry, key and value allocations in a bucket using the
  * provided allocation functions in order to defrag them. */
-static void dictDefragBucket(dictEntry **bucketref, dictDefragFunctions *defragfns, void *privdata) {
+static void dictDefragBucket(dictEntry **bucketref, const dictDefragFunctions *defragfns, void *privdata) {
     dictDefragAllocFunction *defragalloc = defragfns->defragAlloc;
     dictDefragAllocFunction *defragkey = defragfns->defragKey;
     dictDefragAllocFunction *defragval = defragfns->defragVal;
@@ -1499,7 +1499,7 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *pri
 * where NULL means that no reallocation happened and the old memory is still
 * valid. */
 unsigned long
-dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata) {
+dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata) {
     int htidx0, htidx1;
     const dictEntry *de, *next;
     unsigned long m0, m1;
diff --git a/src/dict.h b/src/dict.h
index 88ebd7bf99..854d026cdc 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -238,7 +238,7 @@ void dictSetHashFunctionSeed(uint8_t *seed);
 uint8_t *dictGetHashFunctionSeed(void);
 unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata);
 unsigned long
-dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata);
+dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata);
 uint64_t dictGetHash(dict *d, const void *key);
 void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size);
diff --git a/src/eval.c b/src/eval.c
index e5d7d56aa2..e9fac531f5 100644
--- a/src/eval.c
+++ b/src/eval.c
@@ -199,10 +199,12 @@ void scriptingInit(int setup) {
     }
     /* Initialize a dictionary we use to map SHAs to scripts.
-     * Initialize a list we use for lua script evictions, it shares the
-     * sha with the dictionary, so free fn is not set. */
+     * Initialize a list we use for lua script evictions.
+     * Note that, because of defrag, we duplicate the sha when adding it to the
+     * LRU list, so each copy has to be freed separately. */
     lctx.lua_scripts = dictCreate(&shaScriptObjectDictType);
     lctx.lua_scripts_lru_list = listCreate();
+    listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid);
     lctx.lua_scripts_mem = 0;
     luaRegisterServerAPI(lua);
@@ -518,9 +520,6 @@ void luaDeleteFunction(client *c, sds sha) {
     dictEntry *de = dictUnlink(lctx.lua_scripts, sha);
     serverAssertWithInfo(c ? c : lctx.lua_client, NULL, de);
     luaScript *l = dictGetVal(de);
-    /* We only delete `EVAL` scripts, which must exist in the LRU list.
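/* [Editor's illustration, not part of the patch] The duplication noted above
 * avoids a dangling pointer: defrag may reallocate the dict's key sds, so a
 * list node sharing that exact pointer could go stale. A toy version of the
 * ownership rule, with plain C strings standing in for sds: */
#include <stdlib.h>
#include <string.h>

typedef struct lru_node {
    char *sha;
    struct lru_node *next;
} lru_node;

/* The list takes its own copy; a later reallocation of the dict's copy
 * (e.g. by defrag) cannot invalidate what the list holds. */
static lru_node *lru_add(lru_node *tail, const char *sha) {
    lru_node *n = malloc(sizeof(*n));
    if (n == NULL) return tail;
    n->sha = strdup(sha); /* independent copy, freed together with the node */
    n->next = NULL;
    if (tail) tail->next = n;
    return n;
}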
*/ - serverAssert(l->node); - listDelNode(lctx.lua_scripts_lru_list, l->node); lctx.lua_scripts_mem -= sdsAllocSize(sha) + getStringObjectSdsUsedMemory(l->body); dictFreeUnlinkedEntry(lctx.lua_scripts, de); } @@ -549,11 +548,12 @@ listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha) { listNode *ln = listFirst(lctx.lua_scripts_lru_list); sds oldest = listNodeValue(ln); luaDeleteFunction(c, oldest); + listDelNode(lctx.lua_scripts_lru_list, ln); server.stat_evictedscripts++; } /* Add current. */ - listAddNodeTail(lctx.lua_scripts_lru_list, sha); + listAddNodeTail(lctx.lua_scripts_lru_list, sdsdup(sha)); return listLast(lctx.lua_scripts_lru_list); } @@ -777,7 +777,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/evict.c b/src/evict.c index 5e4b6220eb..eecd000a4b 100644 --- a/src/evict.c +++ b/src/evict.c @@ -143,26 +143,14 @@ void evictionPoolAlloc(void) { * right. */ int evictionPoolPopulate(serverDb *db, kvstore *samplekvs, struct evictionPoolEntry *pool) { int j, k, count; - dictEntry *samples[server.maxmemory_samples]; + void *samples[server.maxmemory_samples]; - int slot = kvstoreGetFairRandomDictIndex(samplekvs); - count = kvstoreDictGetSomeKeys(samplekvs, slot, samples, server.maxmemory_samples); + int slot = kvstoreGetFairRandomHashtableIndex(samplekvs); + count = kvstoreHashtableSampleEntries(samplekvs, slot, &samples[0], server.maxmemory_samples); for (j = 0; j < count; j++) { unsigned long long idle; - sds key; - robj *o; - dictEntry *de; - - de = samples[j]; - key = dictGetKey(de); - - /* If the dictionary we are sampling from is not the main - * dictionary (but the expires one) we need to lookup the key - * again in the key dictionary to obtain the value object. */ - if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { - if (samplekvs != db->keys) de = kvstoreDictFind(db->keys, slot, key); - o = dictGetVal(de); - } + robj *o = samples[j]; + sds key = objectGetKey(o); /* Calculate the idle time according to the policy. This is called * idle just because the code initially handled LRU, but is in fact @@ -180,7 +168,7 @@ int evictionPoolPopulate(serverDb *db, kvstore *samplekvs, struct evictionPoolEn idle = 255 - LFUDecrAndReturn(o); } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { /* In this case the sooner the expire the better. */ - idle = ULLONG_MAX - (long)dictGetVal(de); + idle = ULLONG_MAX - objectGetExpire(o); } else { serverPanic("Unknown eviction policy in evictionPoolPopulate()"); } @@ -546,8 +534,8 @@ int performEvictions(void) { goto update_metrics; } - if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) { - result = EVICT_FAIL; /* We need to free memory, but policy forbids. */ + if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION || (iAmPrimary() && server.import_mode)) { + result = EVICT_FAIL; /* We need to free memory, but policy forbids or we are in import mode. 
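/* [Editor's illustration, not part of the patch] evictionPoolPopulate above
 * no longer needs a second lookup: the sampled entry is the value object
 * itself, carrying its key and expire. A speculative mock of that shape;
 * obj_key/obj_expire are invented stand-ins for objectGetKey/objectGetExpire. */
typedef struct kv_obj {
    long long expire; /* absolute ms timestamp, or -1 if none */
    char key[32];     /* key bytes embedded in the object */
} kv_obj;

static const char *obj_key(const kv_obj *o) { return o->key; }
static long long obj_expire(const kv_obj *o) { return o->expire; }

/* VOLATILE_TTL scoring: the sooner the expiry, the larger the idle score. */
static unsigned long long ttl_idle(const kv_obj *o) {
    return ~0ULL - (unsigned long long)obj_expire(o);
}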
*/ goto update_metrics; } @@ -568,7 +556,7 @@ int performEvictions(void) { sds bestkey = NULL; int bestdbid; serverDb *db; - dictEntry *de; + robj *valkey; if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU | MAXMEMORY_FLAG_LFU) || server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { @@ -592,7 +580,7 @@ int performEvictions(void) { if (current_db_keys == 0) continue; total_keys += current_db_keys; - int l = kvstoreNumNonEmptyDicts(kvs); + int l = kvstoreNumNonEmptyHashtables(kvs); /* Do not exceed the number of non-empty slots when looping. */ while (l--) { sampled_keys += evictionPoolPopulate(db, kvs, pool); @@ -617,7 +605,8 @@ int performEvictions(void) { } else { kvs = server.db[bestdbid].expires; } - de = kvstoreDictFind(kvs, pool[k].slot, pool[k].key); + void *entry = NULL; + int found = kvstoreHashtableFind(kvs, pool[k].slot, pool[k].key, &entry); /* Remove the entry from the pool. */ if (pool[k].key != pool[k].cached) sdsfree(pool[k].key); @@ -626,8 +615,9 @@ int performEvictions(void) { /* If the key exists, is our pick. Otherwise it is * a ghost and we need to try the next element. */ - if (de) { - bestkey = dictGetKey(de); + if (found) { + valkey = entry; + bestkey = objectGetKey(valkey); break; } else { /* Ghost... Iterate again. */ @@ -651,10 +641,10 @@ int performEvictions(void) { } else { kvs = db->expires; } - int slot = kvstoreGetFairRandomDictIndex(kvs); - de = kvstoreDictGetRandomKey(kvs, slot); - if (de) { - bestkey = dictGetKey(de); + int slot = kvstoreGetFairRandomHashtableIndex(kvs); + int found = kvstoreHashtableRandomEntry(kvs, slot, (void **)&valkey); + if (found) { + bestkey = objectGetKey(valkey); bestdbid = j; break; } diff --git a/src/expire.c b/src/expire.c index 928bb58d86..e4c3b0ec96 100644 --- a/src/expire.c +++ b/src/expire.c @@ -46,8 +46,7 @@ static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, 0.833748, 0.817073, 0.800731, 0.784717, 0.769022, 0.753642, 0.738569, 0.723798}; /* Helper function for the activeExpireCycle() function. - * This function will try to expire the key that is stored in the hash table - * entry 'de' of the 'expires' hash table of a database. + * This function will try to expire the key-value entry 'val'. * * If the key is found to be expired, it is removed from the database and * 1 is returned. Otherwise no operation is performed and 0 is returned. @@ -56,11 +55,12 @@ static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. 
*/ -int activeExpireCycleTryExpire(serverDb *db, dictEntry *de, long long now) { - long long t = dictGetSignedIntegerVal(de); +int activeExpireCycleTryExpire(serverDb *db, robj *val, long long now) { + long long t = objectGetExpire(val); + serverAssert(t >= 0); if (now > t) { enterExecutionUnit(1, 0); - sds key = dictGetKey(de); + sds key = objectGetKey(val); robj *keyobj = createStringObject(key, sdslen(key)); deleteExpiredKeyAndPropagate(db, keyobj); decrRefCount(keyobj); @@ -127,11 +127,11 @@ typedef struct { int ttl_samples; /* num keys with ttl not yet expired */ } expireScanData; -void expireScanCallback(void *privdata, const dictEntry *const_de) { - dictEntry *de = (dictEntry *)const_de; +void expireScanCallback(void *privdata, void *entry) { + robj *val = entry; expireScanData *data = privdata; - long long ttl = dictGetSignedIntegerVal(de) - data->now; - if (activeExpireCycleTryExpire(data->db, de, data->now)) { + long long ttl = objectGetExpire(val) - data->now; + if (activeExpireCycleTryExpire(data->db, val, data->now)) { data->expired++; /* Propagate the DEL command */ postExecutionUnitOperations(); @@ -144,13 +144,13 @@ void expireScanCallback(void *privdata, const dictEntry *const_de) { data->sampled++; } -static inline int isExpiryDictValidForSamplingCb(dict *d) { - long long numkeys = dictSize(d); - unsigned long buckets = dictBuckets(d); +static inline int isExpiryTableValidForSamplingCb(hashtable *ht) { + long long numkeys = hashtableSize(ht); + unsigned long buckets = hashtableBuckets(ht); /* When there are less than 1% filled buckets, sampling the key * space is expensive, so stop here waiting for better times... * The dictionary will be resized asap. */ - if (buckets > DICT_HT_INITIAL_SIZE && (numkeys * 100 / buckets < 1)) { + if (buckets > 0 && (numkeys * 100 / buckets < 1)) { return C_ERR; } return C_OK; @@ -279,14 +279,14 @@ void activeExpireCycle(int type) { * is very fast: we are in the cache line scanning a sequential * array of NULL pointers, so we can scan a lot more buckets * than keys in the same time. */ - long max_buckets = num * 20; + long max_buckets = num * 10; long checked_buckets = 0; int origin_ttl_samples = data.ttl_samples; while (data.sampled < num && checked_buckets < max_buckets) { db->expires_cursor = kvstoreScan(db->expires, db->expires_cursor, -1, expireScanCallback, - isExpiryDictValidForSamplingCb, &data); + isExpiryTableValidForSamplingCb, &data); if (db->expires_cursor == 0) { db_done = 1; break; @@ -422,7 +422,7 @@ void expireReplicaKeys(void) { while (dbids && dbid < server.dbnum) { if ((dbids & 1) != 0) { serverDb *db = server.db + dbid; - dictEntry *expire = dbFindExpires(db, keyname); + robj *expire = dbFindExpires(db, keyname); int expired = 0; if (expire && activeExpireCycleTryExpire(server.db + dbid, expire, start)) { @@ -520,8 +520,11 @@ int checkAlreadyExpired(long long when) { * of a replica instance. * * Instead we add the already expired key to the database with expire time - * (possibly in the past) and wait for an explicit DEL from the primary. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host); + * (possibly in the past) and wait for an explicit DEL from the primary. + * + * If the server is a primary and in import mode, we also add the already + * expired key and wait for an explicit DEL from the import source. 
*/ + return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); } #define EXPIRE_NX (1 << 0) @@ -616,14 +619,16 @@ void expireGenericCommand(client *c, long long basetime, int unit) { } when += basetime; + robj *obj = lookupKeyWrite(c->db, key); + /* No key, return zero. */ - if (lookupKeyWrite(c->db, key) == NULL) { + if (obj == NULL) { addReply(c, shared.czero); return; } if (flag) { - current_expire = getExpire(c->db, key); + current_expire = objectGetExpire(obj); /* NX option is set, check current expiry */ if (flag & EXPIRE_NX) { @@ -671,7 +676,7 @@ void expireGenericCommand(client *c, long long basetime, int unit) { addReply(c, shared.cone); return; } else { - setExpire(c, c->db, key, when); + obj = setExpire(c, c->db, key, when); addReply(c, shared.cone); /* Propagate as PEXPIREAT millisecond-timestamp * Only rewrite the command arg if not already PEXPIREAT */ diff --git a/src/function_lua.c b/src/function_lua.c index fa9983bf7e..b535528906 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -64,17 +64,14 @@ typedef struct luaFunctionCtx { } luaFunctionCtx; typedef struct loadCtx { - functionLibInfo *li; + list *functions; monotime start_time; size_t timeout; } loadCtx; -typedef struct registerFunctionArgs { - sds name; - sds desc; - luaFunctionCtx *lua_f_ctx; - uint64_t f_flags; -} registerFunctionArgs; +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function); /* Hook for FUNCTION LOAD execution. * Used to cancel the execution in case of a timeout (500ms). @@ -93,15 +90,42 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) { } } +static void freeCompiledFunc(ValkeyModuleCtx *module_ctx, + luaEngineCtx *lua_engine_ctx, + void *compiled_func) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction *func = compiled_func; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + luaEngineFreeFunction(module_ctx, lua_engine_ctx, func->function); + zfree(func); +} + /* - * Compile a given blob and save it on the registry. - * Return a function ctx with Lua ref that allows to later retrieve the - * function from the registry. + * Compiles the given script code by generating a set of compiled functions. These + * functions are also saved into the registry of the Lua environment. + * + * Returns an array of compiled functions. The `compiledFunction` struct stores a + * Lua ref that allows the function to be retrieved later from the registry. + * The size of the array is returned in the `out_num_compiled_functions` + * parameter. 
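+ *
+ * Ownership note: the caller owns the returned array and every
+ * compiledFunction in it; functions.c releases them through
+ * freeCompiledFunctions(), which drops each function's name/desc and,
+ * where needed, calls the engine's free_function callback.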
 * * Return NULL on compilation error and sets the error in the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { - int ret = C_ERR; +static compiledFunction **luaEngineCreate(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction **compiled_functions = NULL; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -114,15 +138,15 @@ lua_pop(lua, 1); /* pop the metatable */ /* compile the code */ - if (luaL_loadbuffer(lua, blob, sdslen(blob), "@user_function")) { - *err = sdscatprintf(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + if (luaL_loadbuffer(lua, code, strlen(code), "@user_function")) { + *err = valkey_asprintf("Error compiling function: %s", lua_tostring(lua, -1)); lua_pop(lua, 1); /* pops the error */ goto done; } serverAssert(lua_isfunction(lua, -1)); loadCtx load_ctx = { - .li = li, + .functions = listCreate(), .start_time = getMonotonicUs(), .timeout = timeout, }; @@ -133,13 +157,31 @@ if (lua_pcall(lua, 0, 0, 0)) { errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - *err = sdscatprintf(sdsempty(), "Error registering functions: %s", err_info.msg); + *err = valkey_asprintf("Error registering functions: %s", err_info.msg); lua_pop(lua, 1); /* pops the error */ luaErrorInformationDiscard(&err_info); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + while ((node = listNext(iter)) != NULL) { + freeCompiledFunc(module_ctx, lua_engine_ctx, listNodeValue(node)); + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); goto done; } - ret = C_OK; + compiled_functions = + zcalloc(sizeof(compiledFunction *) * listLength(load_ctx.functions)); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + *out_num_compiled_functions = 0; + while ((node = listNext(iter)) != NULL) { + compiledFunction *func = listNodeValue(node); + compiled_functions[*out_num_compiled_functions] = func; + (*out_num_compiled_functions)++; + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); done: /* restore original globals */ @@ -152,19 +194,23 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_sethook(lua, NULL, 0, 0); /* Disable hook */ luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, NULL); - return ret; + return compiled_functions; } /* * Invoke the given function with the given keys and args */ -static void luaEngineCall(scriptRunCtx *run_ctx, - void *engine_ctx, +static void luaEngineCall(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -177,25 +223,38 @@ static void luaEngineCall(scriptRunCtx *run_ctx, serverAssert(lua_isfunction(lua, -1)); + scriptRunCtx *run_ctx = (scriptRunCtx *)func_ctx; 
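+ /* func_ctx is opaque in the engine interface; the built-in Lua engine is
+ * always invoked with the scriptRunCtx prepared by fcallCommandGeneric(),
+ * so the downcast above is safe. */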
luaCallFunction(run_ctx, lua, keys, nkeys, args, nargs, 0); lua_pop(lua, 1); /* Pop error handler */ } -static size_t luaEngineGetUsedMemoy(void *engine_ctx) { +static engineMemoryInfo luaEngineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; - return luaMemory(lua_engine_ctx->lua); + + return (engineMemoryInfo){ + .used_memory = luaMemory(lua_engine_ctx->lua), + .engine_memory_overhead = zmalloc_size(lua_engine_ctx), + }; } -static size_t luaEngineFunctionMemoryOverhead(void *compiled_function) { +static size_t luaEngineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + return zmalloc_size(compiled_function); } -static size_t luaEngineMemoryOverhead(void *engine_ctx) { - luaEngineCtx *lua_engine_ctx = engine_ctx; - return zmalloc_size(lua_engine_ctx); -} +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); -static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -203,26 +262,19 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { zfree(f_ctx); } -static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, - sds name, - sds desc, +static void luaRegisterFunctionArgsInitialize(compiledFunction *func, + robj *name, + robj *desc, luaFunctionCtx *lua_f_ctx, uint64_t flags) { - *register_f_args = (registerFunctionArgs){ + *func = (compiledFunction){ .name = name, .desc = desc, - .lua_f_ctx = lua_f_ctx, + .function = lua_f_ctx, .f_flags = flags, }; } -static void luaRegisterFunctionArgsDispose(lua_State *lua, registerFunctionArgs *register_f_args) { - sdsfree(register_f_args->name); - if (register_f_args->desc) sdsfree(register_f_args->desc); - lua_unref(lua, register_f_args->lua_f_ctx->lua_function_ref); - zfree(register_f_args->lua_f_ctx); -} - /* Read function flags located on the top of the Lua stack. * On success, return C_OK and set the flags to 'flags' out parameter * Return C_ERR if encounter an unknown flag. 
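 * Script-side sketch of how these flags are supplied ('no-writes' is one of
 * the recognized flags):
 *
 *   server.register_function{function_name='myfunc',
 *                            callback=function(keys, args) end,
 *                            flags={'no-writes'}}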
*/ @@ -267,10 +319,11 @@ static int luaRegisterFunctionReadFlags(lua_State *lua, uint64_t *flags) { return ret; } -static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadNamedArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; - sds desc = NULL; + robj *name = NULL; + robj *desc = NULL; luaFunctionCtx *lua_f_ctx = NULL; uint64_t flags = 0; if (!lua_istable(lua, 1)) { @@ -287,14 +340,15 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs err = "named argument key given to server.register_function is not a string"; goto error; } + const char *key = lua_tostring(lua, -2); if (!strcasecmp(key, "function_name")) { - if (!(name = luaGetStringSds(lua, -1))) { + if (!(name = luaGetStringObject(lua, -1))) { err = "function_name argument given to server.register_function must be a string"; goto error; } } else if (!strcasecmp(key, "description")) { - if (!(desc = luaGetStringSds(lua, -1))) { + if (!(desc = luaGetStringObject(lua, -1))) { err = "description argument given to server.register_function must be a string"; goto error; } @@ -335,13 +389,17 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs goto error; } - luaRegisterFunctionArgsInitialize(register_f_args, name, desc, lua_f_ctx, flags); + luaRegisterFunctionArgsInitialize(func, + name, + desc, + lua_f_ctx, + flags); return C_OK; error: - if (name) sdsfree(name); - if (desc) sdsfree(desc); + if (name) decrRefCount(name); + if (desc) decrRefCount(desc); if (lua_f_ctx) { lua_unref(lua, lua_f_ctx->lua_function_ref); zfree(lua_f_ctx); @@ -350,11 +408,12 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs return C_ERR; } -static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; + robj *name = NULL; luaFunctionCtx *lua_f_ctx = NULL; - if (!(name = luaGetStringSds(lua, 1))) { + if (!(name = luaGetStringObject(lua, 1))) { err = "first argument to server.register_function must be a string"; goto error; } @@ -369,17 +428,17 @@ static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctio lua_f_ctx = zmalloc(sizeof(*lua_f_ctx)); lua_f_ctx->lua_function_ref = lua_function_ref; - luaRegisterFunctionArgsInitialize(register_f_args, name, NULL, lua_f_ctx, 0); + luaRegisterFunctionArgsInitialize(func, name, NULL, lua_f_ctx, 0); return C_OK; error: - if (name) sdsfree(name); + if (name) decrRefCount(name); luaPushError(lua, err); return C_ERR; } -static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadArgs(lua_State *lua, compiledFunction *func) { int argc = lua_gettop(lua); if (argc < 1 || argc > 2) { luaPushError(lua, "wrong number of arguments to server.register_function"); @@ -387,33 +446,28 @@ static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *reg } if (argc == 1) { - return luaRegisterFunctionReadNamedArgs(lua, register_f_args); + return luaRegisterFunctionReadNamedArgs(lua, func); } else { - return luaRegisterFunctionReadPositionalArgs(lua, register_f_args); + return luaRegisterFunctionReadPositionalArgs(lua, func); } } static int luaRegisterFunction(lua_State *lua) { - registerFunctionArgs register_f_args = {0}; + compiledFunction *func = 
zcalloc(sizeof(*func)); loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME); if (!load_ctx) { + zfree(func); luaPushError(lua, "server.register_function can only be called on FUNCTION LOAD command"); return luaError(lua); } - if (luaRegisterFunctionReadArgs(lua, ®ister_f_args) != C_OK) { + if (luaRegisterFunctionReadArgs(lua, func) != C_OK) { + zfree(func); return luaError(lua); } - sds err = NULL; - if (functionLibCreateFunction(register_f_args.name, register_f_args.lua_f_ctx, load_ctx->li, register_f_args.desc, - register_f_args.f_flags, &err) != C_OK) { - luaRegisterFunctionArgsDispose(lua, ®ister_f_args); - luaPushError(lua, err); - sdsfree(err); - return luaError(lua); - } + listAddNodeTail(load_ctx->functions, func); return 0; } @@ -494,16 +548,17 @@ int luaEngineInitEngine(void) { lua_enablereadonlytable(lua_engine_ctx->lua, -1, 1); /* protect the new global table */ lua_replace(lua_engine_ctx->lua, LUA_GLOBALSINDEX); /* set new global table as the new globals */ - - engine *lua_engine = zmalloc(sizeof(*lua_engine)); - *lua_engine = (engine){ - .engine_ctx = lua_engine_ctx, - .create = luaEngineCreate, - .call = luaEngineCall, - .get_used_memory = luaEngineGetUsedMemoy, + engineMethods lua_engine_methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = luaEngineCreate, + .call_function = luaEngineCall, .get_function_memory_overhead = luaEngineFunctionMemoryOverhead, - .get_engine_memory_overhead = luaEngineMemoryOverhead, .free_function = luaEngineFreeFunction, + .get_memory_info = luaEngineGetMemoryInfo, }; - return functionsRegisterEngine(LUA_ENGINE_NAME, lua_engine); + + return functionsRegisterEngine(LUA_ENGINE_NAME, + NULL, + lua_engine_ctx, + &lua_engine_methods); } diff --git a/src/functions.c b/src/functions.c index c9ec42b322..0d003f7fac 100644 --- a/src/functions.c +++ b/src/functions.c @@ -31,6 +31,7 @@ #include "sds.h" #include "dict.h" #include "adlist.h" +#include "module.h" #define LOAD_TIMEOUT_MS 500 @@ -117,9 +118,28 @@ static dict *engines = NULL; /* Libraries Ctx. */ static functionsLibCtx *curr_functions_lib_ctx = NULL; +static void setupEngineModuleCtx(engineInfo *ei, client *c) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleScriptingEngineInitContext(ei->module_ctx, ei->engineModule, c); + } +} + +static void teardownEngineModuleCtx(engineInfo *ei) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleFreeContext(ei->module_ctx); + } +} + static size_t functionMallocSize(functionInfo *fi) { - return zmalloc_size(fi) + sdsAllocSize(fi->name) + (fi->desc ? sdsAllocSize(fi->desc) : 0) + - fi->li->ei->engine->get_function_memory_overhead(fi->function); + setupEngineModuleCtx(fi->li->ei, NULL); + size_t size = zmalloc_size(fi) + + sdsAllocSize(fi->name) + + (fi->desc ? 
sdsAllocSize(fi->desc) : 0) + + fi->li->ei->engine->get_function_memory_overhead(fi->li->ei->module_ctx, fi->function); + teardownEngineModuleCtx(fi->li->ei); + return size; } static size_t libraryMallocSize(functionLibInfo *li) { @@ -141,8 +161,12 @@ static void engineFunctionDispose(void *obj) { if (fi->desc) { sdsfree(fi->desc); } + setupEngineModuleCtx(fi->li->ei, NULL); engine *engine = fi->li->ei->engine; - engine->free_function(engine->engine_ctx, fi->function); + engine->free_function(fi->li->ei->module_ctx, + engine->engine_ctx, + fi->function); + teardownEngineModuleCtx(fi->li->ei); zfree(fi); } @@ -161,9 +185,9 @@ static void engineLibraryDispose(void *obj) { } /* Clear all the functions from the given library ctx */ -void functionsLibCtxClear(functionsLibCtx *lib_ctx) { - dictEmpty(lib_ctx->functions, NULL); - dictEmpty(lib_ctx->libraries, NULL); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)) { + dictEmpty(lib_ctx->functions, callback); + dictEmpty(lib_ctx->libraries, callback); dictIterator *iter = dictGetIterator(lib_ctx->engines_stats); dictEntry *entry = NULL; while ((entry = dictNext(iter))) { @@ -175,19 +199,28 @@ void functionsLibCtxClear(functionsLibCtx *lib_ctx) { lib_ctx->cache_memory = 0; } -void functionsLibCtxClearCurrent(int async) { +void functionsLibCtxClearCurrent(int async, void(callback)(dict *)) { if (async) { functionsLibCtx *old_l_ctx = curr_functions_lib_ctx; curr_functions_lib_ctx = functionsLibCtxCreate(); freeFunctionsAsync(old_l_ctx); } else { - functionsLibCtxClear(curr_functions_lib_ctx); + functionsLibCtxClear(curr_functions_lib_ctx, callback); + } +} + +/* Free the given functions ctx */ +static void functionsLibCtxFreeGeneric(functionsLibCtx *functions_lib_ctx, int async) { + if (async) { + freeFunctionsAsync(functions_lib_ctx); + } else { + functionsLibCtxFree(functions_lib_ctx); } } /* Free the given functions ctx */ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { - functionsLibCtxClear(functions_lib_ctx); + functionsLibCtxClear(functions_lib_ctx, NULL); dictRelease(functions_lib_ctx->functions); dictRelease(functions_lib_ctx->libraries); dictRelease(functions_lib_ctx->engines_stats); @@ -196,8 +229,8 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { /* Swap the current functions ctx with the given one. * Free the old functions ctx. */ -void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx) { - functionsLibCtxFree(curr_functions_lib_ctx); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async) { + functionsLibCtxFreeGeneric(curr_functions_lib_ctx, async); curr_functions_lib_ctx = new_lib_ctx; } @@ -224,6 +257,15 @@ functionsLibCtx *functionsLibCtxCreate(void) { return ret; } +void functionsAddEngineStats(engineInfo *ei) { + serverAssert(curr_functions_lib_ctx != NULL); + dictEntry *entry = dictFind(curr_functions_lib_ctx->engines_stats, ei->name); + if (entry == NULL) { + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(curr_functions_lib_ctx->engines_stats, ei->name, stats); + } +} + /* * Creating a function inside the given library. * On success, return C_OK. @@ -233,24 +275,34 @@ functionsLibCtx *functionsLibCtxCreate(void) { * the function will verify that the given name is following the naming format * and return an error if its not. 
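 * For example, "mylib_func1" passes this check, while "my-func" or an empty
 * name is rejected.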
*/ -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err) { - if (functionsVerifyName(name) != C_OK) { - *err = sdsnew("Library names can only contain letters, numbers, or underscores(_) and must be at least one " - "character long"); +static int functionLibCreateFunction(robj *name, + void *function, + functionLibInfo *li, + robj *desc, + uint64_t f_flags, + sds *err) { + serverAssert(name->type == OBJ_STRING); + serverAssert(desc == NULL || desc->type == OBJ_STRING); + + if (functionsVerifyName(name->ptr) != C_OK) { + *err = sdsnew("Function names can only contain letters, numbers, or " + "underscores(_) and must be at least one character long"); return C_ERR; } - if (dictFetchValue(li->functions, name)) { + sds name_sds = sdsdup(name->ptr); + if (dictFetchValue(li->functions, name_sds)) { *err = sdsnew("Function already exists in the library"); + sdsfree(name_sds); return C_ERR; } functionInfo *fi = zmalloc(sizeof(*fi)); *fi = (functionInfo){ - .name = name, + .name = name_sds, .function = function, .li = li, - .desc = desc, + .desc = desc ? sdsdup(desc->ptr) : NULL, .f_flags = f_flags, }; @@ -339,7 +391,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryDispose); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); @@ -371,7 +423,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l dictReleaseIterator(iter); iter = NULL; - functionsLibCtxClear(functions_lib_ctx_src); + functionsLibCtxClear(functions_lib_ctx_src, NULL); if (old_libraries_list) { listRelease(old_libraries_list); old_libraries_list = NULL; @@ -394,11 +446,24 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l return ret; } -/* Register an engine, should be called once by the engine on startup and give the following: +/* Register an engine, should be called once by the engine on startup and give + * the following: * * - engine_name - name of the engine to register - * - engine_ctx - the engine ctx that should be used by the server to interact with the engine */ -int functionsRegisterEngine(const char *engine_name, engine *engine) { + * + * - engine_module - the valkey module that implements this engine + * + * - engine_ctx - the engine ctx that should be used by the server to interact + * with the engine. + * + * - engine_methods - the struct with the scripting engine callback functions + * pointers. 
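+ *
+ * Returns C_OK on success, or C_ERR if an engine with the same name is
+ * already registered.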
+ * + */ +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + engineCtx *engine_ctx, + engineMethods *engine_methods) { sds engine_name_sds = sdsnew(engine_name); if (dictFetchValue(engines, engine_name_sds)) { serverLog(LL_WARNING, "Same engine was registered twice"); @@ -406,6 +471,16 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { return C_ERR; } + engine *eng = zmalloc(sizeof(engine)); + *eng = (engine){ + .engine_ctx = engine_ctx, + .create = engine_methods->create_functions_library, + .call = engine_methods->call_function, + .get_function_memory_overhead = engine_methods->get_function_memory_overhead, + .free_function = engine_methods->free_function, + .get_memory_info = engine_methods->get_memory_info, + }; + client *c = createClient(NULL); c->flag.deny_blocking = 1; c->flag.script = 1; @@ -413,15 +488,64 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { engineInfo *ei = zmalloc(sizeof(*ei)); *ei = (engineInfo){ .name = engine_name_sds, - .engine = engine, + .engineModule = engine_module, + .module_ctx = engine_module ? moduleAllocateContext() : NULL, + .engine = eng, .c = c, }; dictAdd(engines, engine_name_sds, ei); - engine_cache_memory += zmalloc_size(ei) + sdsAllocSize(ei->name) + zmalloc_size(engine) + - engine->get_engine_memory_overhead(engine->engine_ctx); + functionsAddEngineStats(ei); + + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = eng->get_memory_info(ei->module_ctx, + eng->engine_ctx); + engine_cache_memory += zmalloc_size(ei) + + sdsAllocSize(ei->name) + + zmalloc_size(eng) + + mem_info.engine_memory_overhead; + + teardownEngineModuleCtx(ei); + + return C_OK; +} + +/* Removes a scripting engine from the server. + * + * - engine_name - name of the engine to remove + */ +int functionsUnregisterEngine(const char *engine_name) { + sds engine_name_sds = sdsnew(engine_name); + dictEntry *entry = dictFind(engines, engine_name_sds); + if (entry == NULL) { + serverLog(LL_WARNING, "There's no engine registered with name %s", engine_name); + sdsfree(engine_name_sds); + return C_ERR; + } + + engineInfo *ei = dictGetVal(entry); + + dictIterator *iter = dictGetSafeIterator(curr_functions_lib_ctx->libraries); + while ((entry = dictNext(iter))) { + functionLibInfo *li = dictGetVal(entry); + if (li->ei == ei) { + libraryUnlink(curr_functions_lib_ctx, li); + engineLibraryFree(li); + } + } + dictReleaseIterator(iter); + + zfree(ei->engine); + sdsfree(ei->name); + freeClient(ei->c); + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + zfree(ei->module_ctx); + } + zfree(ei); + sdsfree(engine_name_sds); return C_OK; } @@ -640,11 +764,19 @@ static void fcallCommandGeneric(client *c, int ro) { } scriptRunCtx run_ctx; - if (scriptPrepareForRun(&run_ctx, fi->li->ei->c, c, fi->name, fi->f_flags, ro) != C_OK) return; - - engine->call(&run_ctx, engine->engine_ctx, fi->function, c->argv + 3, numkeys, c->argv + 3 + numkeys, + setupEngineModuleCtx(fi->li->ei, run_ctx.original_client); + + engine->call(fi->li->ei->module_ctx, + engine->engine_ctx, + &run_ctx, + fi->function, + c->argv + 3, + numkeys, + c->argv + 3 + numkeys, c->argc - 3 - numkeys); + + teardownEngineModuleCtx(fi->li->ei); scriptResetRun(&run_ctx); } @@ -769,7 +901,7 @@ void functionRestoreCommand(client *c) { } if (restore_replicy == restorePolicy_Flush) { - functionsLibCtxSwapWithCurrent(functions_lib_ctx); + functionsLibCtxSwapWithCurrent(functions_lib_ctx, server.lazyfree_lazy_user_flush); functions_lib_ctx = 
NULL; /* avoid releasing the f_ctx in the end */ } else { if (libraryJoin(curr_functions_lib_ctx, functions_lib_ctx, restore_replicy == restorePolicy_Replace, &err) != @@ -789,7 +921,7 @@ void functionRestoreCommand(client *c) { addReply(c, shared.ok); } if (functions_lib_ctx) { - functionsLibCtxFree(functions_lib_ctx); + functionsLibCtxFreeGeneric(functions_lib_ctx, server.lazyfree_lazy_user_flush); } } @@ -811,7 +943,7 @@ void functionFlushCommand(client *c) { return; } - functionsLibCtxClearCurrent(async); + functionsLibCtxClearCurrent(async, NULL); /* Indicate that the command changed the data so it will be replicated and * counted as a data change (for persistence configuration) */ @@ -944,14 +1076,40 @@ void functionFreeLibMetaData(functionsLibMetaData *md) { if (md->engine) sdsfree(md->engine); } +static void freeCompiledFunctions(engineInfo *ei, + compiledFunction **compiled_functions, + size_t num_compiled_functions, + size_t free_function_from_idx) { + setupEngineModuleCtx(ei, NULL); + + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + if (i >= free_function_from_idx) { + ei->engine->free_function(ei->module_ctx, + ei->engine->engine_ctx, + func->function); + } + zfree(func); + } + + zfree(compiled_functions); + + teardownEngineModuleCtx(ei); +} + /* Compile and save the given library, return the loaded library name on success * and NULL on failure. In case on failure the err out param is set with relevant error message */ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; - functionLibInfo *new_li = NULL; functionLibInfo *old_li = NULL; functionsLibMetaData md = {0}; + functionLibInfo *new_li = NULL; + if (functionExtractLibMetaData(code, &md, err) != C_OK) { return NULL; } @@ -981,10 +1139,47 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC } new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { + size_t num_compiled_functions = 0; + char *compile_error = NULL; + setupEngineModuleCtx(ei, NULL); + compiledFunction **compiled_functions = + engine->create(ei->module_ctx, + engine->engine_ctx, + md.code, + timeout, + &num_compiled_functions, + &compile_error); + teardownEngineModuleCtx(ei); + if (compiled_functions == NULL) { + serverAssert(num_compiled_functions == 0); + serverAssert(compile_error != NULL); + *err = sdsnew(compile_error); + zfree(compile_error); goto error; } + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + int ret = functionLibCreateFunction(func->name, + func->function, + new_li, + func->desc, + func->f_flags, + err); + if (ret == C_ERR) { + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + i); + goto error; + } + } + + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + num_compiled_functions); + if (dictSize(new_li->functions) == 0) { *err = sdsnew("No functions registered"); goto error; @@ -1054,6 +1249,7 @@ void functionLoadCommand(client *c) { timeout = 0; } if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { + serverAssert(err != NULL); addReplyErrorSds(c, err); return; } @@ -1071,7 +1267,11 @@ unsigned long functionsMemory(void) { 
while ((entry = dictNext(iter))) { engineInfo *ei = dictGetVal(entry); engine *engine = ei->engine; - engines_memory += engine->get_used_memory(engine->engine_ctx); + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = engine->get_memory_info(ei->module_ctx, + engine->engine_ctx); + engines_memory += mem_info.used_memory; + teardownEngineModuleCtx(ei); } dictReleaseIterator(iter); @@ -1111,12 +1311,11 @@ size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { int functionsInit(void) { engines = dictCreate(&engineDictType); + curr_functions_lib_ctx = functionsLibCtxCreate(); + if (luaEngineInitEngine() != C_OK) { return C_ERR; } - /* Must be initialized after engines initialization */ - curr_functions_lib_ctx = functionsLibCtxCreate(); - return C_OK; } diff --git a/src/functions.h b/src/functions.h index da196cf197..89e39fdc56 100644 --- a/src/functions.h +++ b/src/functions.h @@ -54,53 +54,68 @@ typedef struct functionLibInfo functionLibInfo; +/* ValkeyModule type aliases for scripting engine structs and types. */ +typedef ValkeyModuleScriptingEngineCtx engineCtx; +typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx; +typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction; +typedef ValkeyModuleScriptingEngineMemoryInfo engineMemoryInfo; +typedef ValkeyModuleScriptingEngineMethods engineMethods; + typedef struct engine { /* engine specific context */ - void *engine_ctx; - - /* Create function callback, get the engine_ctx, and function code - * engine_ctx - opaque struct that was created on engine initialization - * li - library information that need to be provided and when add functions - * code - the library code - * timeout - timeout for the library creation (0 for no timeout) - * err - description of error (if occurred) - * returns C_ERR on error and set err to be the error message */ - int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err); - - /* Invoking a function, r_ctx is an opaque object (from engine POV). - * The r_ctx should be used by the engine to interaction with the server, + engineCtx *engine_ctx; + + /* Compiles the script code and returns an array of compiled functions + * registered in the script. + * + * Returns NULL on error and sets err to the error message */ + compiledFunction **(*create)( + ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err); + + /* Invoking a function, func_ctx is an opaque object (from engine POV). 
+ * The func_ctx should be used by the engine to interact with the server, * such interaction could be running commands, set resp, or set * replication mode */ - void (*call)(scriptRunCtx *r_ctx, - void *engine_ctx, + void (*call)(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs); - /* get current used memory by the engine */ - size_t (*get_used_memory)(void *engine_ctx); + /* free the given function */ + void (*free_function)(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function); /* Return memory overhead for a given function, * such memory is not counted as engine memory but as general * structs memory that hold different information */ - size_t (*get_function_memory_overhead)(void *compiled_function); + size_t (*get_function_memory_overhead)(ValkeyModuleCtx *module_ctx, + void *compiled_function); - /* Return memory overhead for engine (struct size holding the engine)*/ - size_t (*get_engine_memory_overhead)(void *engine_ctx); + /* Get the memory currently used by the engine */ + engineMemoryInfo (*get_memory_info)(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx); - /* free the given function */ - void (*free_function)(void *engine_ctx, void *compiled_function); } engine; /* Hold information about an engine. * Used on rdb.c so it must be declared here. */ typedef struct engineInfo { - sds name; /* Name of the engine */ - engine *engine; /* engine callbacks that allows to interact with the engine */ - client *c; /* Client that is used to run commands */ + sds name; /* Name of the engine */ + ValkeyModule *engineModule; /* the module that implements the scripting engine */ + ValkeyModuleCtx *module_ctx; /* Scripting engine module context */ + engine *engine; /* engine callbacks that allow interacting with the engine */ + client *c; /* Client that is used to run commands */ } engineInfo; /* Hold information about the specific function. 
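For illustration only (not part of this change): a module-backed scripting engine would fill in these callbacks and register itself once at load time. Every myEngine*/my_* name below is invented for this sketch; the built-in Lua engine wires up the same fields in luaEngineInitEngine() in function_lua.c.

    static engineMethods my_engine_methods = {
        .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION,
        .create_functions_library = myEngineCreate,
        .call_function = myEngineCall,
        .get_function_memory_overhead = myEngineFunctionMemoryOverhead,
        .free_function = myEngineFreeFunction,
        .get_memory_info = myEngineGetMemoryInfo,
    };
    functionsRegisterEngine("myengine", my_module, my_engine_ctx, &my_engine_methods);

Unloading such an engine would go through the matching functionsUnregisterEngine("myengine") added in functions.c.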
@@ -123,7 +138,12 @@ struct functionLibInfo { sds code; /* Library code */ }; -int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + void *engine_ctx, + engineMethods *engine_methods); +int functionsUnregisterEngine(const char *engine_name); + sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout); unsigned long functionsMemory(void); unsigned long functionsMemoryOverhead(void); @@ -133,12 +153,10 @@ dict *functionsLibGet(void); size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx); functionsLibCtx *functionsLibCtxGetCurrent(void); functionsLibCtx *functionsLibCtxCreate(void); -void functionsLibCtxClearCurrent(int async); -void functionsLibCtxFree(functionsLibCtx *lib_ctx); -void functionsLibCtxClear(functionsLibCtx *lib_ctx); -void functionsLibCtxSwapWithCurrent(functionsLibCtx *lib_ctx); - -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); +void functionsLibCtxClearCurrent(int async, void(callback)(dict *)); +void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); int luaEngineInitEngine(void); int functionsInit(void); diff --git a/src/geo.c b/src/geo.c index 9e43a6e93b..75654f85a5 100644 --- a/src/geo.c +++ b/src/geo.c @@ -780,8 +780,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) { if (returned_items) { zsetConvertToListpackIfNeeded(zobj, maxelelen, totelelen); - setKey(c, c->db, storekey, zobj, 0); - decrRefCount(zobj); + setKey(c, c->db, storekey, &zobj, 0); notifyKeyspaceEvent(NOTIFY_ZSET, flags & GEOSEARCH ? "geosearchstore" : "georadiusstore", storekey, c->db->id); server.dirty += returned_items; diff --git a/src/geohash_helper.c b/src/geohash_helper.c index aa4b4743a6..c05c2f2634 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -48,7 +48,7 @@ /// @brief The usual PI/180 constant const double DEG_TO_RAD = 0.017453292519943295769236907684886; -/// @brief Earth's quatratic mean radius for WGS-84 +/// @brief Earth's quadratic mean radius for WGS-84 const double EARTH_RADIUS_IN_METERS = 6372797.560856; const double MERCATOR_MAX = 20037726.37; diff --git a/src/hashtable.c b/src/hashtable.c new file mode 100644 index 0000000000..11ba360800 --- /dev/null +++ b/src/hashtable.c @@ -0,0 +1,2146 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +/* Hashtable + * ========= + * + * This is an implementation of a hash table with cache-line sized buckets. It's + * designed for speed and low memory overhead. It provides the following + * features: + * + * - Incremental rehashing using two tables. + * + * - Stateless iteration using 'scan'. + * + * - A hash table contains pointers to user-defined entries. An entry needs to + * contain a key. Other than that, the hash table implementation doesn't care + * what it contains. To use it as a set, an entry is just a key. Using as a + * key-value map requires combining key and value into an entry object and + * inserting this object into the hash table. A callback for fetching the key + * from within the entry object is provided by the caller when creating the + * hash table. 
+ * + * - The entry type, key type, hash function and other properties are + * configurable as callbacks in a 'type' structure provided when creating a + * hash table. + * + * Conventions + * ----------- + * + * Functions and types are prefixed by "hashtable", macros by "HASHTABLE". Internal + * names don't use the prefix. Internal functions are 'static'. + * + * Credits + * ------- + * + * - The hashtable was designed by Viktor Söderqvist. + * - The bucket chaining is based on an idea by Madelyn Olson. + * - The cache-line sized bucket is inspired by ideas used in 'Swiss tables' + * (Benzaquen, Evlogimenos, Kulukundis, and Perepelitsa et al.). + * - The incremental rehashing using two tables and much of the API is based on + * the design used in dict, designed by Salvatore Sanfilippo. + * - The original scan algorithm was designed by Pieter Noordhuis. + */ +#include "hashtable.h" +#include "serverassert.h" +#include "zmalloc.h" +#include "mt19937-64.h" +#include "monotonic.h" +#include "config.h" + +#include <limits.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +/* The default hashing function uses the SipHash implementation in siphash.c. */ + +uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k); +uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k); + +/* --- Global variables --- */ + +static uint8_t hash_function_seed[16]; +static hashtableResizePolicy resize_policy = HASHTABLE_RESIZE_ALLOW; + +/* --- Fill factor --- */ + +/* We use a soft and a hard limit for the minimum and maximum fill factor. The + * hard limits are used when resizing should be avoided, according to the resize + * policy. Resizing is typically to be avoided when we have a forked child process + * running. Then, we don't want to move too much memory around, since the fork + * is using copy-on-write. + * + * Even if we resize and start inserting new entries in the new table, we can + * avoid actively moving entries from the old table to the new table. When the + * resize policy is AVOID, we perform a step of incremental rehashing only on + * insertions and not on lookups. */ + +#define MAX_FILL_PERCENT_SOFT 100 +#define MAX_FILL_PERCENT_HARD 500 + +#define MIN_FILL_PERCENT_SOFT 13 +#define MIN_FILL_PERCENT_HARD 3 + +/* --- Hash function API --- */ + +/* The seed needs to be 16 bytes. */ +void hashtableSetHashFunctionSeed(const uint8_t *seed) { + memcpy(hash_function_seed, seed, sizeof(hash_function_seed)); +} + +uint8_t *hashtableGetHashFunctionSeed(void) { + return hash_function_seed; +} + +uint64_t hashtableGenHashFunction(const char *buf, size_t len) { + return siphash((const uint8_t *)buf, len, hash_function_seed); +} + +uint64_t hashtableGenCaseHashFunction(const char *buf, size_t len) { + return siphash_nocase((const uint8_t *)buf, len, hash_function_seed); +} + +/* --- Global resize policy API --- */ + +/* The global resize policy is one of + * + * - HASHTABLE_RESIZE_ALLOW: Rehash as required for optimal performance. + * + * - HASHTABLE_RESIZE_AVOID: Don't rehash and move memory if it can be avoided; + * used when there is a fork running and we want to avoid affecting + * copy-on-write memory. + * + * - HASHTABLE_RESIZE_FORBID: Don't rehash at all. Used in a child process which + * doesn't add any keys. + * + * Incremental rehashing works in the following way: A new table is allocated + * and entries are incrementally moved from the old to the new table. + * + * To avoid affecting copy-on-write, we avoid rehashing when there is a forked + * child process. 
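+ * For example (usage sketch), the server would switch to
+ * HASHTABLE_RESIZE_AVOID via hashtableSetResizePolicy() while a forked
+ * child is alive, and back to HASHTABLE_RESIZE_ALLOW once it exits.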
+ * + * We don't completely forbid resizing the table but the fill factor is + * significantly larger when the resize policy is set to HASHTABLE_RESIZE_AVOID + * and we resize with incremental rehashing paused, so new entries are added to + * the new table and the old entries are rehashed only when the child process is + * done. + */ +void hashtableSetResizePolicy(hashtableResizePolicy policy) { + resize_policy = policy; +} + +/* --- Hash table layout --- */ + +#if SIZE_MAX == UINT64_MAX /* 64-bit version */ + +#define ENTRIES_PER_BUCKET 7 +#define BUCKET_BITS_TYPE uint8_t +#define BITS_NEEDED_TO_STORE_POS_WITHIN_BUCKET 3 + +/* Selecting the number of buckets. + * + * When resizing the table, we want to select an appropriate number of buckets + * without an expensive division. Division by a power of two is cheap, but any + * other division is expensive. We pick a fill factor to make division cheap for + * our choice of ENTRIES_PER_BUCKET. + * + * The number of buckets we want is NUM_ENTRIES / (ENTRIES_PER_BUCKET * FILL_FACTOR), + * rounded up. The fill is the number of entries we have, or want to put, in + * the table. + * + * Instead of the above fraction, we multiply by an integer BUCKET_FACTOR and + * divide by a power-of-two BUCKET_DIVISOR. This gives us a fill factor of at + * most MAX_FILL_PERCENT_SOFT, the soft limit for expanding. + * + * NUM_BUCKETS = ceil(NUM_ENTRIES * BUCKET_FACTOR / BUCKET_DIVISOR) + * + * This gives us + * + * FILL_FACTOR = NUM_ENTRIES / (NUM_BUCKETS * ENTRIES_PER_BUCKET) + * = 1 / (BUCKET_FACTOR / BUCKET_DIVISOR) / ENTRIES_PER_BUCKET + * = BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET + */ + +#define BUCKET_FACTOR 5 +#define BUCKET_DIVISOR 32 +/* When resizing, we get a fill of at most 91.43% (32 / 5 / 7). */ + +#define randomSizeT() ((size_t)genrand64_int64()) + +#elif SIZE_MAX == UINT32_MAX /* 32-bit version */ + +#define ENTRIES_PER_BUCKET 12 +#define BUCKET_BITS_TYPE uint16_t +#define BITS_NEEDED_TO_STORE_POS_WITHIN_BUCKET 4 +#define BUCKET_FACTOR 3 +#define BUCKET_DIVISOR 32 +/* When resizing, we get a fill of at most 88.89% (32 / 3 / 12). */ + +#define randomSizeT() ((size_t)random()) + +#else +#error "Only 64-bit or 32-bit architectures are supported" +#endif /* 64-bit vs 32-bit version */ + +#ifndef static_assert +#define static_assert _Static_assert +#endif + +static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_FILL_PERCENT_SOFT, + "Expand must result in a fill below the soft max fill factor"); +static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor"); + +/* --- Random entry --- */ + +#define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 40) +#define WEAK_RANDOM_SAMPLE_SIZE ENTRIES_PER_BUCKET + +/* --- Types --- */ + +/* Design + * ------ + * + * We use a design with buckets of 64 bytes (one cache line). Each bucket + * contains metadata and entry slots for a fixed number of entries. In a 64-bit + * system, there are up to 7 entries per bucket. These are unordered and an + * entry can be inserted in any of the free slots. Additionally, the bucket + * contains metadata for the entries. This includes a few bits of the hash of + * the key of each entry, which are used to rule out false positives when + * looking up entries. + * + * Bucket chaining + * --------------- + * + * Each key hashes to a bucket in the hash table. If a bucket is full, the last + * entry is replaced by a pointer to a separately allocated child bucket. + * Child buckets form a bucket chain. 
+ * + * Bucket Bucket Bucket + * -----+---------------+---------------+---------------+----- + * ... | x x x x x x p | x x x x x x x | x x x x x x x | ... + * -----+-------------|-+---------------+---------------+----- + * | + * v Child bucket + * +---------------+ + * | x x x x x x p | + * +-------------|-+ + * | + * v Child bucket + * +---------------+ + * | x x x x x x x | + * +---------------+ + * + * Bucket layout + * ------------- + * + * Within each bucket chain, the entries are unordered. To avoid false positives + * when looking up an entry, a few bits of the hash value is stored in a bucket + * metadata section in each bucket. The bucket metadata also contains a bit that + * indicates that the bucket has a child bucket. + * + * +------------------------------------------------------------------+ + * | Metadata | Entry | Entry | Entry | Entry | Entry | Entry | Entry | + * +------------------------------------------------------------------+ + * / ` - - . _ _ + * / `- - . _ _ + * / ` - . _ + * +----------------------------------------------+ + * | c ppppppp hash hash hash hash hash hash hash | + * +----------------------------------------------+ + * | | | + * | | One byte of hash for each entry position in the bucket. + * | | + * | Presence bits. One bit for each entry position, indicating if an + * | entry present or not. + * | + * Chained? One bit. If set, the last entry is a child bucket pointer. + * + * 64-bit version, 7 entries per bucket: + * + * 1 bit 7 bits [1 byte] x 7 [8 bytes] x 7 = 64 bytes + * chained presence hashes entries + * + * 32-bit version, 12 entries per bucket: + * + * 1 bit 12 bits 3 bits [1 byte] x 12 2 bytes [4 bytes] x 12 = 64 bytes + * chained presence unused hashes unused entries + */ + +typedef struct hashtableBucket { + BUCKET_BITS_TYPE chained : 1; + BUCKET_BITS_TYPE presence : ENTRIES_PER_BUCKET; + uint8_t hashes[ENTRIES_PER_BUCKET]; + void *entries[ENTRIES_PER_BUCKET]; +} bucket; + +/* A key property is that the bucket size is one cache line. */ +static_assert(sizeof(bucket) == HASHTABLE_BUCKET_SIZE, "Bucket size mismatch"); + +struct hashtable { + hashtableType *type; + ssize_t rehash_idx; /* -1 = rehashing not in progress. */ + bucket *tables[2]; /* 0 = main table, 1 = rehashing target. */ + size_t used[2]; /* Number of entries in each table. */ + int8_t bucket_exp[2]; /* Exponent for num buckets (num = 1 << exp). */ + int16_t pause_rehash; /* Non-zero = rehashing is paused */ + int16_t pause_auto_shrink; /* Non-zero = automatic resizing disallowed. */ + size_t child_buckets[2]; /* Number of allocated child buckets. */ + void *metadata[]; +}; + +typedef struct { + hashtable *hashtable; + bucket *bucket; + long index; + uint16_t pos_in_bucket; + uint8_t table; + uint8_t safe; + union { + /* Unsafe iterator fingerprint for misuse detection. */ + uint64_t fingerprint; + /* Safe iterator temporary storage for bucket chain compaction. */ + uint64_t last_seen_size; + }; +} iter; + +/* The opaque hashtableIterator is defined as a blob of bytes. */ +static_assert(sizeof(hashtableIterator) >= sizeof(iter), + "Opaque iterator size"); + +/* Position, used by some hashtable functions such as two-phase insert and delete. */ +typedef struct { + bucket *bucket; + uint16_t pos_in_bucket; + uint16_t table_index; +} position; + +static_assert(sizeof(hashtablePosition) >= sizeof(position), + "Opaque iterator size"); + +/* State for incremental find. 
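+ * The lookup is driven step by step: each step either checks a candidate
+ * entry (HASHTABLE_CHECK_ENTRY), advances to the next entry or bucket
+ * (HASHTABLE_NEXT_ENTRY / HASHTABLE_NEXT_BUCKET), or terminates
+ * (HASHTABLE_FOUND / HASHTABLE_NOT_FOUND), letting the caller interleave
+ * other work, such as prefetching, between steps.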
*/ +typedef struct { + enum { + HASHTABLE_CHECK_ENTRY, + HASHTABLE_NEXT_ENTRY, + HASHTABLE_NEXT_BUCKET, + HASHTABLE_FOUND, + HASHTABLE_NOT_FOUND + } state; + short table; + short pos; + hashtable *hashtable; + bucket *bucket; + const void *key; + uint64_t hash; +} incrementalFind; + +static_assert(sizeof(hashtableIncrementalFindState) >= sizeof(incrementalFind), + "Opaque incremental find state size"); + +/* Struct used for stats functions. */ +struct hashtableStats { + int table_index; /* 0 or 1 (old or new while rehashing). */ + unsigned long toplevel_buckets; /* Number of buckets in table. */ + unsigned long child_buckets; /* Number of child buckets. */ + unsigned long size; /* Capacity of toplevel buckets. */ + unsigned long used; /* Number of entries in the table. */ + unsigned long max_chain_len; /* Length of longest bucket chain. */ + unsigned long *clvector; /* Chain length vector; entry i counts + * bucket chains of length i. */ +}; + +/* Struct for sampling entries using scan, used by random key functions. */ + +typedef struct { + unsigned size; /* Size of the entries array. */ + unsigned seen; /* Number of entries seen. */ + void **entries; /* Array of sampled entries. */ +} scan_samples; + +/* --- Internal functions --- */ + +static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_bucket, int *table_index); + +static inline void freeEntry(hashtable *ht, void *entry) { + if (ht->type->entryDestructor) ht->type->entryDestructor(entry); +} + +static inline int compareKeys(hashtable *ht, const void *key1, const void *key2) { + if (ht->type->keyCompare != NULL) { + return ht->type->keyCompare(key1, key2); + } else { + return key1 != key2; + } +} + +static inline const void *entryGetKey(hashtable *ht, const void *entry) { + if (ht->type->entryGetKey != NULL) { + return ht->type->entryGetKey(entry); + } else { + return entry; + } +} + +static inline uint64_t hashKey(hashtable *ht, const void *key) { + if (ht->type->hashFunction != NULL) { + return ht->type->hashFunction(key); + } else { + return hashtableGenHashFunction((const char *)&key, sizeof(key)); + } +} + +static inline uint64_t hashEntry(hashtable *ht, const void *entry) { + return hashKey(ht, entryGetKey(ht, entry)); +} + + +/* For the hash bits stored in the bucket, we use the highest bits of the hash + * value, since these are not used for selecting the bucket. */ +static inline uint8_t highBits(uint64_t hash) { + return hash >> (CHAR_BIT * 7); +} + +static inline int numBucketPositions(bucket *b) { + return ENTRIES_PER_BUCKET - (b->chained ? 1 : 0); +} + +static inline int bucketIsFull(bucket *b) { + return b->presence == (1 << numBucketPositions(b)) - 1; +} + +/* Returns non-zero if the position within the bucket is occupied. */ +static inline int isPositionFilled(bucket *b, int position) { + return b->presence & (1 << position); +} +static void resetTable(hashtable *ht, int table_idx) { + ht->tables[table_idx] = NULL; + ht->used[table_idx] = 0; + ht->bucket_exp[table_idx] = -1; + ht->child_buckets[table_idx] = 0; +} + +/* Number of top-level buckets. */ +static inline size_t numBuckets(int exp) { + return exp == -1 ? 0 : (size_t)1 << exp; +} + +/* Bitmask for masking the hash value to get bucket index. */ +static inline size_t expToMask(int exp) { + return exp == -1 ? 0 : numBuckets(exp) - 1; +} + +/* Returns the 'exp', where num_buckets = 1 << exp. The number of + * buckets is a power of two. 
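+ *
+ * Worked example (64-bit, BUCKET_FACTOR = 5, BUCKET_DIVISOR = 32):
+ * min_capacity = 1000 gives min_buckets = ceil(1000 * 5 / 32) = 157,
+ * so exp = 8, i.e. 256 buckets holding up to 256 * 7 = 1792 entries.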
*/ +static signed char nextBucketExp(size_t min_capacity) { + if (min_capacity == 0) return -1; + /* ceil(x / y) = floor((x - 1) / y) + 1 */ + size_t min_buckets = (min_capacity * BUCKET_FACTOR - 1) / BUCKET_DIVISOR + 1; + if (min_buckets >= SIZE_MAX / 2) return CHAR_BIT * sizeof(size_t) - 1; + if (min_buckets == 1) return 0; + return CHAR_BIT * sizeof(size_t) - __builtin_clzl(min_buckets - 1); +} + +/* Swaps the tables and frees the old table. */ +static void rehashingCompleted(hashtable *ht) { + if (ht->type->rehashingCompleted) ht->type->rehashingCompleted(ht); + if (ht->tables[0]) { + zfree(ht->tables[0]); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket) * numBuckets(ht->bucket_exp[0])); + } + } + ht->bucket_exp[0] = ht->bucket_exp[1]; + ht->tables[0] = ht->tables[1]; + ht->used[0] = ht->used[1]; + ht->child_buckets[0] = ht->child_buckets[1]; + resetTable(ht, 1); + ht->rehash_idx = -1; +} + +/* Reverse bits, adapted to use bswap, from + * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ +static size_t rev(size_t v) { +#if SIZE_MAX == UINT64_MAX + /* Swap odd and even bits. */ + v = ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1); + /* Swap consecutive pairs. */ + v = ((v >> 2) & 0x3333333333333333) | ((v & 0x3333333333333333) << 2); + /* Swap nibbles. */ + v = ((v >> 4) & 0x0F0F0F0F0F0F0F0F) | ((v & 0x0F0F0F0F0F0F0F0F) << 4); + /* Reverse bytes. */ + v = __builtin_bswap64(v); +#else + /* 32-bit version. */ + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = __builtin_bswap32(v); +#endif + return v; +} + +/* Advances a scan cursor to the next value. It increments the reverse bit + * representation of the masked bits of v. This algorithm was invented by Pieter + * Noordhuis. */ +size_t nextCursor(size_t v, size_t mask) { + v |= ~mask; /* Set the unmasked (high) bits. */ + v = rev(v); /* Reverse. The unmasked bits are now the low bits. */ + v++; /* Increment the reversed cursor, flipping the unmasked bits to + * 0 and incrementing the masked bits. */ + v = rev(v); /* Reverse the bits back to normal. */ + return v; +} + +/* Returns the next bucket in a bucket chain, or NULL if there's no next. */ +static bucket *bucketNext(bucket *b) { + return b->chained ? b->entries[ENTRIES_PER_BUCKET - 1] : NULL; +} + +/* Attempts to defrag bucket 'b' using the defrag callback function. If the + * defrag callback function returns a pointer to a new allocation, this pointer + * is returned and the 'prev' bucket is updated to point to the new allocation. + * Otherwise, the 'b' pointer is returned. */ +static bucket *bucketDefrag(bucket *prev, bucket *b, void *(*defragfn)(void *)) { + bucket *reallocated = defragfn(b); + if (reallocated == NULL) return b; + prev->entries[ENTRIES_PER_BUCKET - 1] = reallocated; + return reallocated; +} + +/* Rehashes one bucket. */ +static void rehashBucket(hashtable *ht, bucket *b) { + int pos; + for (pos = 0; pos < numBucketPositions(b); pos++) { + if (!isPositionFilled(b, pos)) continue; /* empty */ + void *entry = b->entries[pos]; + uint8_t h2 = b->hashes[pos]; + /* Insert into table 1. */ + uint64_t hash; + /* When shrinking, it's possible to avoid computing the hash. We can + * just use idx as the hash. 
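+ * This works because every entry in old bucket 'idx' satisfies
+ * hash & old_mask == idx, and the smaller new mask selects a subset of
+ * those bits, so idx & new_mask == hash & new_mask.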
*/ + if (ht->bucket_exp[1] < ht->bucket_exp[0]) { + hash = ht->rehash_idx; + } else { + hash = hashEntry(ht, entry); + } + int pos_in_dst_bucket; + bucket *dst = findBucketForInsert(ht, hash, &pos_in_dst_bucket, NULL); + dst->entries[pos_in_dst_bucket] = entry; + dst->hashes[pos_in_dst_bucket] = h2; + dst->presence |= (1 << pos_in_dst_bucket); + ht->used[0]--; + ht->used[1]++; + } + /* Mark the source bucket as empty. */ + b->presence = 0; +} + +static void rehashStep(hashtable *ht) { + assert(hashtableIsRehashing(ht)); + size_t idx = ht->rehash_idx; + bucket *b = &ht->tables[0][idx]; + rehashBucket(ht, b); + if (b->chained) { + /* Rehash and free child buckets. */ + bucket *next = bucketNext(b); + b->chained = 0; + b = next; + while (b != NULL) { + rehashBucket(ht, b); + next = bucketNext(b); + zfree(b); + if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); + ht->child_buckets[0]--; + b = next; + } + } + + /* Advance to the next bucket. */ + ht->rehash_idx++; + if ((size_t)ht->rehash_idx >= numBuckets(ht->bucket_exp[0])) { + rehashingCompleted(ht); + } +} + +/* Called internally on lookup and other reads to the table. */ +static inline void rehashStepOnReadIfNeeded(hashtable *ht) { + if (!hashtableIsRehashing(ht) || ht->pause_rehash) return; + if (resize_policy != HASHTABLE_RESIZE_ALLOW) return; + rehashStep(ht); +} + +/* When inserting or deleting, we first do a find (read) and rehash one step if + * resize policy is set to ALLOW, so here we only do it if resize policy is + * AVOID. The reason for doing it on insert and delete is to ensure that we + * finish rehashing before we need to resize the table again. */ +static inline void rehashStepOnWriteIfNeeded(hashtable *ht) { + if (!hashtableIsRehashing(ht) || ht->pause_rehash) return; + if (resize_policy != HASHTABLE_RESIZE_AVOID) return; + rehashStep(ht); +} + +/* Allocates a new table and initiates incremental rehashing if necessary. + * Returns 1 on resize (success), 0 on no resize (failure). If 0 is returned and + * 'malloc_failed' is provided, it is set to 1 if allocation failed. If + * 'malloc_failed' is not provided, an allocation failure triggers a panic. */ +static int resize(hashtable *ht, size_t min_capacity, int *malloc_failed) { + if (malloc_failed) *malloc_failed = 0; + + /* Adjust minimum size. We don't resize to zero currently. */ + if (min_capacity == 0) min_capacity = 1; + + /* Size of new table. */ + signed char exp = nextBucketExp(min_capacity); + size_t num_buckets = numBuckets(exp); + size_t new_capacity = num_buckets * ENTRIES_PER_BUCKET; + if (new_capacity < min_capacity || num_buckets * sizeof(bucket) < num_buckets) { + /* Overflow */ + return 0; + } + + signed char old_exp = ht->bucket_exp[hashtableIsRehashing(ht) ? 1 : 0]; + size_t alloc_size = num_buckets * sizeof(bucket); + if (exp == old_exp) { + /* Can't resize to same size. */ + return 0; + } + + if (ht->type->resizeAllowed) { + double fill_factor = (double)min_capacity / ((double)numBuckets(old_exp) * ENTRIES_PER_BUCKET); + if (fill_factor * 100 < MAX_FILL_PERCENT_HARD && !ht->type->resizeAllowed(alloc_size, fill_factor)) { + /* Resize callback says no. */ + return 0; + } + } + + /* We can't resize if rehashing is already ongoing. Fast-forward ongoing + * rehashing before we continue. This can happen only in exceptional + * scenarios, such as when many insertions are made while rehashing is + * paused. 
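+     * If rehashing is itself paused, we can't fast-forward it, so we give up
+     * and return 0 rather than running a second rehash alongside the first.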
*/ + if (hashtableIsRehashing(ht)) { + if (hashtableIsRehashingPaused(ht)) return 0; + while (hashtableIsRehashing(ht)) { + rehashStep(ht); + } + } + + /* Allocate the new hash table. */ + bucket *new_table; + if (malloc_failed) { + new_table = ztrycalloc(alloc_size); + if (new_table == NULL) { + *malloc_failed = 1; + return 0; + } + } else { + new_table = zcalloc(alloc_size); + } + if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, alloc_size); + ht->bucket_exp[1] = exp; + ht->tables[1] = new_table; + ht->used[1] = 0; + ht->rehash_idx = 0; + if (ht->type->rehashingStarted) ht->type->rehashingStarted(ht); + + /* If the old table was empty, the rehashing is completed immediately. */ + if (ht->tables[0] == NULL || ht->used[0] == 0) { + rehashingCompleted(ht); + } else if (ht->type->instant_rehashing) { + while (hashtableIsRehashing(ht)) { + rehashStep(ht); + } + } + return 1; +} + +/* Returns 1 if the table is expanded, 0 if not expanded. If 0 is returned and + * 'malloc_failed' is provided, it is set to 1 if malloc failed and 0 + * otherwise. */ +static int expand(hashtable *ht, size_t size, int *malloc_failed) { + if (size < hashtableSize(ht)) { + return 0; + } + return resize(ht, size, malloc_failed); +} + +/* Finds an entry matching the key. If a match is found, returns a pointer to + * the bucket containing the matching entry and points 'pos_in_bucket' to the + * index within the bucket. Returns NULL if no matching entry was found. + * + * If 'table_index' is provided, it is set to the index of the table (0 or 1) + * the returned bucket belongs to. */ +static bucket *findBucket(hashtable *ht, uint64_t hash, const void *key, int *pos_in_bucket, int *table_index) { + if (hashtableSize(ht) == 0) return 0; + uint8_t h2 = highBits(hash); + int table; + + /* Do some incremental rehashing. */ + rehashStepOnReadIfNeeded(ht); + + for (table = 0; table <= 1; table++) { + if (ht->used[table] == 0) continue; + size_t mask = expToMask(ht->bucket_exp[table]); + size_t bucket_idx = hash & mask; + /* Skip already rehashed buckets. */ + if (table == 0 && ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) { + continue; + } + bucket *b = &ht->tables[table][bucket_idx]; + do { + /* Find candidate entries with presence flag set and matching h2 hash. */ + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos) && b->hashes[pos] == h2) { + /* It's a candidate. */ + void *entry = b->entries[pos]; + const void *elem_key = entryGetKey(ht, entry); + if (compareKeys(ht, key, elem_key) == 0) { + /* It's a match. */ + assert(pos_in_bucket != NULL); + *pos_in_bucket = pos; + if (table_index) *table_index = table; + return b; + } + } + } + b = bucketNext(b); + } while (b != NULL); + } + return NULL; +} + +/* Move an entry from one bucket to another. */ +static void moveEntry(bucket *bucket_to, int pos_to, bucket *bucket_from, int pos_from) { + assert(!isPositionFilled(bucket_to, pos_to)); + assert(isPositionFilled(bucket_from, pos_from)); + bucket_to->entries[pos_to] = bucket_from->entries[pos_from]; + bucket_to->hashes[pos_to] = bucket_from->hashes[pos_from]; + bucket_to->presence |= (1 << pos_to); + bucket_from->presence &= ~(1 << pos_from); +} + +/* Converts a full bucket b to a chained bucket and adds a new child bucket. */ +static void bucketConvertToChained(hashtable *ht, bucket *b) { + assert(!b->chained); + /* We'll move the last entry from the bucket to the new child bucket. 
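+     * The last slot then stores the child pointer instead of an entry, e.g.
+     * with seven slots per bucket (the actual count is ENTRIES_PER_BUCKET):
+     *
+     *   before: [e0 e1 e2 e3 e4 e5 e6]           chained = 0
+     *   after:  [e0 e1 e2 e3 e4 e5 child] -> [e6 - - - - - -]   chained = 1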
+     */
+    int pos = ENTRIES_PER_BUCKET - 1;
+    assert(isPositionFilled(b, pos));
+    bucket *child = zcalloc(sizeof(bucket));
+    if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, sizeof(bucket));
+    moveEntry(child, 0, b, pos);
+    b->chained = 1;
+    b->entries[pos] = child;
+}
+
+/* Converts a bucket with a next-bucket pointer to one without one. */
+static void bucketConvertToUnchained(bucket *b) {
+    assert(b->chained);
+    b->chained = 0;
+    assert(!isPositionFilled(b, ENTRIES_PER_BUCKET - 1));
+}
+
+/* If the last bucket is empty, free it. The before-last bucket is converted
+ * back to an "unchained" bucket, becoming the new last bucket in the chain. If
+ * there's only one entry left in the last bucket, it's moved to the
+ * before-last bucket's last position, to take the place of the next-bucket
+ * link.
+ *
+ * This function needs the penultimate 'before_last' bucket in the chain, to be
+ * able to update it when the last bucket is freed. */
+static void pruneLastBucket(hashtable *ht, bucket *before_last, bucket *last, int table_index) {
+    assert(before_last->chained && bucketNext(before_last) == last);
+    assert(!last->chained);
+    assert(last->presence == 0 || __builtin_popcount(last->presence) == 1);
+    bucketConvertToUnchained(before_last);
+    if (last->presence != 0) {
+        /* Move the last remaining entry to the new last position in the
+         * before-last bucket. */
+        int pos_in_last = __builtin_ctz(last->presence);
+        moveEntry(before_last, ENTRIES_PER_BUCKET - 1, last, pos_in_last);
+    }
+    zfree(last);
+    if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket));
+    ht->child_buckets[table_index]--;
+}
+
+/* After removing an entry in a bucket with children, we can fill the hole
+ * with an entry from the end of the bucket chain and potentially free the
+ * last bucket in the chain. */
+static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int table_index) {
+    assert(b->chained && !isPositionFilled(b, pos_in_bucket));
+    /* Find the last bucket. */
+    bucket *before_last = b;
+    bucket *last = bucketNext(b);
+    while (last->chained) {
+        before_last = last;
+        last = bucketNext(last);
+    }
+    /* Unless the last bucket is empty, find an entry in the last bucket and
+     * move it to the hole in b. */
+    if (last->presence != 0) {
+        int pos_in_last = __builtin_ctz(last->presence);
+        assert(pos_in_last < ENTRIES_PER_BUCKET && isPositionFilled(last, pos_in_last));
+        moveEntry(b, pos_in_bucket, last, pos_in_last);
+    }
+    /* Free the last bucket if it becomes empty. */
+    if (last->presence == 0 || __builtin_popcount(last->presence) == 1) {
+        pruneLastBucket(ht, before_last, last, table_index);
+    }
+}
+
+/* When entries are deleted while rehashing is paused, they leave empty holes in
+ * the buckets. This function attempts to fill the holes by moving entries from
+ * the end of the bucket chain to fill the holes and free any empty buckets at
+ * the end of the chain. */
+static void compactBucketChain(hashtable *ht, size_t bucket_index, int table_index) {
+    bucket *b = &ht->tables[table_index][bucket_index];
+    while (b->chained) {
+        bucket *next = bucketNext(b);
+        if (next->chained && next->presence == 0) {
+            /* Empty bucket in the middle of the chain. Remove it from the chain.
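+             * We splice it out by pointing b's next-bucket link past it; the
+             * 'continue' below then re-examines the new next bucket without
+             * advancing b.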
*/ + bucket *next_next = bucketNext(next); + b->entries[ENTRIES_PER_BUCKET - 1] = next_next; + zfree(next); + if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); + ht->child_buckets[table_index]--; + continue; + } + + if (!next->chained && (next->presence == 0 || __builtin_popcount(next->presence) == 1)) { + /* Next is the last bucket and it's empty or has only one entry. + * Delete it and turn b into an "unchained" bucket. */ + pruneLastBucket(ht, b, next, table_index); + return; + } + + if (__builtin_popcount(b->presence) < ENTRIES_PER_BUCKET - 1) { + /* Fill the holes in the bucket. */ + for (int pos = 0; pos < ENTRIES_PER_BUCKET - 1; pos++) { + if (!isPositionFilled(b, pos)) { + fillBucketHole(ht, b, pos, table_index); + if (!b->chained) return; + } + } + } + + /* Bucket is full. Move forward to next bucket. */ + b = next; + } +} + +/* Find an empty position in the table for inserting an entry with the given hash. */ +static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_bucket, int *table_index) { + int table = hashtableIsRehashing(ht) ? 1 : 0; + assert(ht->tables[table]); + size_t mask = expToMask(ht->bucket_exp[table]); + size_t bucket_idx = hash & mask; + bucket *b = &ht->tables[table][bucket_idx]; + /* Find bucket that's not full, or create one. */ + while (bucketIsFull(b)) { + if (!b->chained) { + bucketConvertToChained(ht, b); + ht->child_buckets[table]++; + } + b = bucketNext(b); + } + /* Find a free slot in the bucket. There must be at least one. */ + int pos; + for (pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (!isPositionFilled(b, pos)) break; + } + assert(pos < ENTRIES_PER_BUCKET); + assert(pos_in_bucket != NULL); + *pos_in_bucket = pos; + if (table_index) *table_index = table; + return b; +} + +/* Helper to insert an entry. Doesn't check if an entry with a matching key + * already exists. This must be ensured by the caller. */ +static void insert(hashtable *ht, uint64_t hash, void *entry) { + hashtableExpandIfNeeded(ht); + rehashStepOnWriteIfNeeded(ht); + int pos_in_bucket; + int table_index; + bucket *b = findBucketForInsert(ht, hash, &pos_in_bucket, &table_index); + b->entries[pos_in_bucket] = entry; + b->presence |= (1 << pos_in_bucket); + b->hashes[pos_in_bucket] = highBits(hash); + ht->used[table_index]++; +} + +/* A 64-bit fingerprint of some of the state of the hash table. */ +static uint64_t hashtableFingerprint(hashtable *ht) { + uint64_t integers[6], hash = 0; + integers[0] = (uintptr_t)ht->tables[0]; + integers[1] = ht->bucket_exp[0]; + integers[2] = ht->used[0]; + integers[3] = (uintptr_t)ht->tables[1]; + integers[4] = ht->bucket_exp[1]; + integers[5] = ht->used[1]; + + /* Result = hash(hash(hash(int1)+int2)+int3) */ + for (int j = 0; j < 6; j++) { + hash += integers[j]; + /* Tomas Wang's 64 bit integer hash. */ + hash = (~hash) + (hash << 21); /* hash = (hash << 21) - hash - 1; */ + hash = hash ^ (hash >> 24); + hash = (hash + (hash << 3)) + (hash << 8); /* hash * 265 */ + hash = hash ^ (hash >> 14); + hash = (hash + (hash << 2)) + (hash << 4); /* hash * 21 */ + hash = hash ^ (hash >> 28); + hash = hash + (hash << 31); + } + return hash; +} + +/* Scan callback function used by hashtableGetSomeEntries() for sampling entries + * using scan. */ +static void sampleEntriesScanFn(void *privdata, void *entry) { + scan_samples *samples = privdata; + if (samples->seen < samples->size) { + samples->entries[samples->seen++] = entry; + } else { + /* More entries than we wanted. 
This can happen if there are long + * bucket chains. Replace random entries using reservoir sampling. */ + samples->seen++; + unsigned idx = random() % samples->seen; + if (idx < samples->size) samples->entries[idx] = entry; + } +} + +/* Conversion from internal iterator struct to user-facing opaque type. */ +static inline hashtableIterator *iteratorToOpaque(iter *iterator) { + return (hashtableIterator *)(void *)iterator; +} + +/* Conversion from user-facing opaque iterator type to internal struct. */ +static inline iter *iteratorFromOpaque(hashtableIterator *iterator) { + return (iter *)(void *)iterator; +} + +/* Conversion from user-facing opaque type to internal struct. */ +static inline position *positionFromOpaque(hashtablePosition *p) { + return (position *)(void *)p; +} + +/* Conversion from user-facing opaque type to internal struct. */ +static inline incrementalFind *incrementalFindFromOpaque(hashtableIncrementalFindState *state) { + return (incrementalFind *)(void *)state; +} + +/* --- API functions --- */ + +/* Allocates and initializes a new hashtable specified by the given type. */ +hashtable *hashtableCreate(hashtableType *type) { + size_t metasize = type->getMetadataSize ? type->getMetadataSize() : 0; + size_t alloc_size = sizeof(hashtable) + metasize; + hashtable *ht = zmalloc(alloc_size); + if (metasize > 0) { + memset(&ht->metadata, 0, metasize); + } + ht->type = type; + ht->rehash_idx = -1; + ht->pause_rehash = 0; + ht->pause_auto_shrink = 0; + resetTable(ht, 0); + resetTable(ht, 1); + if (type->trackMemUsage) type->trackMemUsage(ht, alloc_size); + return ht; +} + +/* Deletes all the entries. If a callback is provided, it is called from time + * to time to indicate progress. */ +void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)) { + if (hashtableIsRehashing(ht)) { + /* Pretend rehashing completed. */ + if (ht->type->rehashingCompleted) ht->type->rehashingCompleted(ht); + ht->rehash_idx = -1; + } + for (int table_index = 0; table_index <= 1; table_index++) { + if (ht->bucket_exp[table_index] < 0) { + continue; + } + if (ht->used[table_index] > 0) { + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table_index]); idx++) { + if (callback && (idx & 65535) == 0) callback(ht); + bucket *b = &ht->tables[table_index][idx]; + do { + /* Call the destructor with each entry. */ + if (ht->type->entryDestructor != NULL && b->presence != 0) { + for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (isPositionFilled(b, pos)) { + ht->type->entryDestructor(b->entries[pos]); + } + } + } + bucket *next = bucketNext(b); + + /* Free allocated bucket. */ + if (b != &ht->tables[table_index][idx]) { + zfree(b); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket)); + } + } + b = next; + } while (b != NULL); + } + } + zfree(ht->tables[table_index]); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket) * numBuckets(ht->bucket_exp[table_index])); + } + resetTable(ht, table_index); + } +} + +/* Deletes all the entries and frees the table. */ +void hashtableRelease(hashtable *ht) { + hashtableEmpty(ht, NULL); + /* Call trackMemUsage before zfree, so trackMemUsage can access ht. */ + if (ht->type->trackMemUsage) { + size_t alloc_size = sizeof(hashtable); + if (ht->type->getMetadataSize) alloc_size += ht->type->getMetadataSize(); + ht->type->trackMemUsage(ht, -alloc_size); + } + zfree(ht); +} + +/* Returns the type of the hashtable. 
+ */
+hashtableType *hashtableGetType(hashtable *ht) {
+    return ht->type;
+}
+
+/* Returns a pointer to the table's metadata (userdata) section. */
+void *hashtableMetadata(hashtable *ht) {
+    return &ht->metadata;
+}
+
+/* Returns the number of entries stored. */
+size_t hashtableSize(const hashtable *ht) {
+    return ht->used[0] + ht->used[1];
+}
+
+/* Returns the number of buckets in the hash table itself. */
+size_t hashtableBuckets(hashtable *ht) {
+    return numBuckets(ht->bucket_exp[0]) + numBuckets(ht->bucket_exp[1]);
+}
+
+/* Returns the number of buckets that have a child bucket. Equivalently, the
+ * number of allocated buckets, outside of the hash table itself. */
+size_t hashtableChainedBuckets(hashtable *ht, int table) {
+    return ht->child_buckets[table];
+}
+
+/* Returns the size of the hashtable structures, in bytes (not including the
+ * sizes of the entries, if the entries are pointers to allocated objects). */
+size_t hashtableMemUsage(hashtable *ht) {
+    size_t num_buckets = numBuckets(ht->bucket_exp[0]) + numBuckets(ht->bucket_exp[1]);
+    num_buckets += ht->child_buckets[0] + ht->child_buckets[1];
+    size_t metasize = ht->type->getMetadataSize ? ht->type->getMetadataSize() : 0;
+    return sizeof(hashtable) + metasize + sizeof(bucket) * num_buckets;
+}
+
+/* Pauses automatic shrinking. This can be called before deleting a lot of
+ * entries, to prevent automatic shrinking from being triggered multiple times.
+ * Call hashtableResumeAutoShrink afterwards to restore automatic shrinking. */
+void hashtablePauseAutoShrink(hashtable *ht) {
+    ht->pause_auto_shrink++;
+}
+
+/* Re-enables automatic shrinking, after it has been paused. If you have deleted
+ * many entries while automatic shrinking was paused, you may want to call
+ * hashtableShrinkIfNeeded. */
+void hashtableResumeAutoShrink(hashtable *ht) {
+    ht->pause_auto_shrink--;
+    if (ht->pause_auto_shrink == 0) {
+        hashtableShrinkIfNeeded(ht);
+    }
+}
+
+/* Pauses incremental rehashing. When rehashing is paused, bucket chains are not
+ * automatically compacted when entries are deleted. Deleting entries while
+ * rehashing is paused may therefore leave empty spaces, "holes", in the bucket
+ * chains, which wastes memory. */
+static void hashtablePauseRehashing(hashtable *ht) {
+    ht->pause_rehash++;
+}
+
+/* Resumes incremental rehashing, after pausing it. */
+static void hashtableResumeRehashing(hashtable *ht) {
+    ht->pause_rehash--;
+}
+
+/* Returns 1 if incremental rehashing is paused, 0 if it isn't. */
+int hashtableIsRehashingPaused(hashtable *ht) {
+    return ht->pause_rehash > 0;
+}
+
+/* Returns 1 if incremental rehashing is in progress, 0 otherwise. */
+int hashtableIsRehashing(hashtable *ht) {
+    return ht->rehash_idx != -1;
+}
+
+/* Provides the number of buckets in the old and new tables during rehashing. To
+ * get the sizes in bytes, multiply by HASHTABLE_BUCKET_SIZE. This function can
+ * only be used when rehashing is in progress, and from the rehashingStarted and
+ * rehashingCompleted callbacks. */
+void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size) {
+    assert(hashtableIsRehashing(ht));
+    *from_size = numBuckets(ht->bucket_exp[0]);
+    *to_size = numBuckets(ht->bucket_exp[1]);
+}
+
+/* Performs incremental rehashing for the specified number of microseconds.
+ * Returns the number of rehashed bucket chains.
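+ *
+ * Usage sketch (hypothetical caller, e.g. a server cron job):
+ *
+ *     // Spend at most 1000 microseconds on incremental rehashing.
+ *     int steps = hashtableRehashMicroseconds(ht, 1000);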
+ */
+int hashtableRehashMicroseconds(hashtable *ht, uint64_t us) {
+    if (ht->pause_rehash > 0) return 0;
+    if (resize_policy != HASHTABLE_RESIZE_ALLOW) return 0;
+
+    monotime timer;
+    elapsedStart(&timer);
+    int rehashes = 0;
+
+    while (hashtableIsRehashing(ht)) {
+        rehashStep(ht);
+        rehashes++;
+        if (rehashes % 128 == 0 && elapsedUs(timer) >= us) break;
+    }
+    return rehashes;
+}
+
+/* Return 1 if expand was performed; 0 otherwise. */
+int hashtableExpand(hashtable *ht, size_t size) {
+    return expand(ht, size, NULL);
+}
+
+/* Returns 1 if expand was performed or if expand is not needed. Returns 0 if
+ * expand failed due to memory allocation failure. */
+int hashtableTryExpand(hashtable *ht, size_t size) {
+    int malloc_failed = 0;
+    return expand(ht, size, &malloc_failed) || !malloc_failed;
+}
+
+/* Expanding is done automatically on insertion, but less eagerly if resize
+ * policy is set to AVOID or FORBID. After restoring resize policy to ALLOW, you
+ * may want to call hashtableExpandIfNeeded. Returns 1 if expanding, 0 if not
+ * expanding. */
+int hashtableExpandIfNeeded(hashtable *ht) {
+    size_t min_capacity = ht->used[0] + ht->used[1] + 1;
+    size_t num_buckets = numBuckets(ht->bucket_exp[hashtableIsRehashing(ht) ? 1 : 0]);
+    size_t current_capacity = num_buckets * ENTRIES_PER_BUCKET;
+    unsigned max_fill_percent = resize_policy == HASHTABLE_RESIZE_AVOID ? MAX_FILL_PERCENT_HARD : MAX_FILL_PERCENT_SOFT;
+    if (min_capacity * 100 <= current_capacity * max_fill_percent) {
+        return 0;
+    }
+    return resize(ht, min_capacity, NULL);
+}
+
+/* Shrinking is done automatically on deletion, but less eagerly if resize
+ * policy is set to AVOID and not at all if set to FORBID. After restoring
+ * resize policy to ALLOW, you may want to call hashtableShrinkIfNeeded. */
+int hashtableShrinkIfNeeded(hashtable *ht) {
+    /* Don't shrink if rehashing is already in progress. */
+    if (hashtableIsRehashing(ht) || resize_policy == HASHTABLE_RESIZE_FORBID) {
+        return 0;
+    }
+    size_t current_capacity = numBuckets(ht->bucket_exp[0]) * ENTRIES_PER_BUCKET;
+    unsigned min_fill_percent = resize_policy == HASHTABLE_RESIZE_AVOID ? MIN_FILL_PERCENT_HARD : MIN_FILL_PERCENT_SOFT;
+    if (ht->used[0] * 100 > current_capacity * min_fill_percent) {
+        return 0;
+    }
+    return resize(ht, ht->used[0], NULL);
+}
+
+/* Defragment the main allocations of the hashtable by reallocating them. The
+ * provided defragfn callback should either return NULL (if reallocation is not
+ * necessary) or reallocate the memory like realloc() would do.
+ *
+ * Note that this doesn't cover allocated chained buckets. To defragment them,
+ * you need to do a scan using hashtableScanDefrag with the same 'defragfn'.
+ *
+ * Returns NULL if the hashtable's top-level struct hasn't been reallocated.
+ * Returns non-NULL if the top-level allocation has been reallocated, thus
+ * making the 'ht' pointer invalid. */
+hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) {
+    /* The hashtable struct. */
+    hashtable *ht1 = defragfn(ht);
+    if (ht1 != NULL) ht = ht1;
+    /* The tables. */
+    for (int i = 0; i <= 1; i++) {
+        if (ht->tables[i] == NULL) continue;
+        void *table = defragfn(ht->tables[i]);
+        if (table != NULL) ht->tables[i] = table;
+    }
+    return ht1;
+}
+
+/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've
+ * forked and memory won't be used again.
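+ * Typically called in the child process after a fork, so that the pages
+ * backing the tables can be reclaimed by the kernel instead of being
+ * duplicated on the next write.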
+ * See zmadvise_dontneed(). */
+void dismissHashtable(hashtable *ht) {
+    for (int i = 0; i < 2; i++) {
+        zmadvise_dontneed(ht->tables[i], numBuckets(ht->bucket_exp[i]) * sizeof(bucket));
+    }
+}
+
+/* Returns 1 if an entry was found matching the key. Also points *found to it,
+ * if found is provided. Returns 0 if no matching entry was found. */
+int hashtableFind(hashtable *ht, const void *key, void **found) {
+    if (hashtableSize(ht) == 0) return 0;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    if (b) {
+        if (found) *found = b->entries[pos_in_bucket];
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/* Returns a pointer to where an entry is stored within the hash table, or
+ * NULL if not found. To get the entry, dereference the returned pointer. The
+ * pointer can be used to replace the entry with an equivalent entry (same
+ * key, same hash value), but note that the pointer may be invalidated by future
+ * accesses to the hash table due to incremental rehashing, so use with care. */
+void **hashtableFindRef(hashtable *ht, const void *key) {
+    if (hashtableSize(ht) == 0) return NULL;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    return b ? &b->entries[pos_in_bucket] : NULL;
+}
+
+/* Adds an entry. Returns 1 on success. Returns 0 if there was already an entry
+ * with the same key. */
+int hashtableAdd(hashtable *ht, void *entry) {
+    return hashtableAddOrFind(ht, entry, NULL);
+}
+
+/* Adds an entry and returns 1 on success. Returns 0 if there was already an
+ * entry with the same key and, if an 'existing' pointer is provided, it is
+ * pointed to the existing entry. */
+int hashtableAddOrFind(hashtable *ht, void *entry, void **existing) {
+    const void *key = entryGetKey(ht, entry);
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    if (b != NULL) {
+        if (existing) *existing = b->entries[pos_in_bucket];
+        return 0;
+    } else {
+        insert(ht, hash, entry);
+        return 1;
+    }
+}
+
+/* Finds a position within the hashtable where an entry with the
+ * given key should be inserted using hashtableInsertAtPosition. This is the first
+ * phase in a two-phase insert operation and it can be used if you want to avoid
+ * creating an entry before you know if it already exists in the table or not,
+ * and without a separate lookup to the table.
+ *
+ * The function returns 1 if a position was found where an entry with the
+ * given key can be inserted. The position is stored in the provided 'position'
+ * argument, which can be stack-allocated. This position should then be used in
+ * a call to hashtableInsertAtPosition.
+ *
+ * If the function returns 0, it means that an entry with the given key
+ * already exists in the table. If an 'existing' pointer is provided, it is
+ * pointed to the existing entry with the matching key.
+ *
+ * Example:
+ *
+ *     hashtablePosition position;
+ *     void *existing;
+ *     if (hashtableFindPositionForInsert(ht, key, &position, &existing)) {
+ *         // Position found where we can insert an entry with this key.
+ *         void *entry = createNewEntryWithKeyAndValue(key, some_value);
+ *         hashtableInsertAtPosition(ht, entry, &position);
+ *     } else {
+ *         // Existing entry found with the matching key.
+ * doSomethingWithExistingEntry(existing); + * } + */ +int hashtableFindPositionForInsert(hashtable *ht, void *key, hashtablePosition *pos, void **existing) { + position *p = positionFromOpaque(pos); + uint64_t hash = hashKey(ht, key); + int pos_in_bucket, table_index; + bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL); + if (b != NULL) { + if (existing) *existing = b->entries[pos_in_bucket]; + return 0; + } else { + hashtableExpandIfNeeded(ht); + rehashStepOnWriteIfNeeded(ht); + b = findBucketForInsert(ht, hash, &pos_in_bucket, &table_index); + assert(!isPositionFilled(b, pos_in_bucket)); + + /* Store the hash bits now, so we don't need to compute the hash again + * when hashtableInsertAtPosition() is called. */ + b->hashes[pos_in_bucket] = highBits(hash); + + /* Populate position struct. */ + assert(p != NULL); + p->bucket = b; + p->pos_in_bucket = pos_in_bucket; + p->table_index = table_index; + return 1; + } +} + +/* Inserts an entry at the position previously acquired using + * hashtableFindPositionForInsert(). The entry must match the key provided when + * finding the position. You must not access the hashtable in any way between + * hashtableFindPositionForInsert() and hashtableInsertAtPosition(), since even a + * hashtableFind() may cause incremental rehashing to move entries in memory. */ +void hashtableInsertAtPosition(hashtable *ht, void *entry, hashtablePosition *pos) { + position *p = positionFromOpaque(pos); + bucket *b = p->bucket; + int pos_in_bucket = p->pos_in_bucket; + int table_index = p->table_index; + assert(!isPositionFilled(b, pos_in_bucket)); + b->presence |= (1 << pos_in_bucket); + b->entries[pos_in_bucket] = entry; + ht->used[table_index]++; + /* Hash bits are already set by hashtableFindPositionForInsert. */ +} + +/* Removes the entry with the matching key and returns it. The entry + * destructor is not called. Returns 1 and points 'popped' to the entry if a + * matching entry was found. Returns 0 if no matching entry was found. */ +int hashtablePop(hashtable *ht, const void *key, void **popped) { + if (hashtableSize(ht) == 0) return 0; + uint64_t hash = hashKey(ht, key); + int pos_in_bucket = 0; + int table_index = 0; + bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table_index); + if (b) { + if (popped) *popped = b->entries[pos_in_bucket]; + b->presence &= ~(1 << pos_in_bucket); + ht->used[table_index]--; + if (b->chained && !hashtableIsRehashingPaused(ht)) { + /* Rehashing is paused while iterating and when a scan callback is + * running. In those cases, we do the compaction in the scan and + * iterator code instead. */ + fillBucketHole(ht, b, pos_in_bucket, table_index); + } + hashtableShrinkIfNeeded(ht); + return 1; + } else { + return 0; + } +} + +/* Deletes the entry with the matching key. Returns 1 if an entry was + * deleted, 0 if no matching entry was found. */ +int hashtableDelete(hashtable *ht, const void *key) { + void *entry; + if (hashtablePop(ht, key, &entry)) { + freeEntry(ht, entry); + return 1; + } else { + return 0; + } +} + +/* When an entry has been reallocated, it can be replaced in a hash table + * without dereferencing the old pointer which may no longer be valid. The new + * entry with the same key and hash is used for finding the old entry and + * replacing it with the new entry. Returns 1 if the entry was replaced and 0 if + * the entry wasn't found. 
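+ *
+ * Usage sketch (hypothetical defrag flow; copyToNewLocation stands in for
+ * whatever reallocation the caller performs):
+ *
+ *     void *new_entry = copyToNewLocation(old_entry);
+ *     if (hashtableReplaceReallocatedEntry(ht, old_entry, new_entry)) {
+ *         zfree(old_entry);  // the table now points to the copy
+ *     } else {
+ *         zfree(new_entry);  // no matching entry in this table
+ *     }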
+ */
+int hashtableReplaceReallocatedEntry(hashtable *ht, const void *old_entry, void *new_entry) {
+    const void *key = entryGetKey(ht, new_entry);
+    uint64_t hash = hashKey(ht, key);
+    uint8_t h2 = highBits(hash);
+    for (int table = 0; table <= 1; table++) {
+        if (ht->used[table] == 0) continue;
+        size_t mask = expToMask(ht->bucket_exp[table]);
+        size_t bucket_idx = hash & mask;
+        /* Skip already rehashed buckets. */
+        if (table == 0 && ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) {
+            continue;
+        }
+        bucket *b = &ht->tables[table][bucket_idx];
+        do {
+            for (int pos = 0; pos < numBucketPositions(b); pos++) {
+                if (isPositionFilled(b, pos) && b->hashes[pos] == h2 && b->entries[pos] == old_entry) {
+                    /* It's a match. */
+                    b->entries[pos] = new_entry;
+                    return 1;
+                }
+            }
+            b = bucketNext(b);
+        } while (b != NULL);
+    }
+    return 0;
+}
+
+/* Two-phase pop: Look up an entry, do something with it, then delete it
+ * without searching the hash table again.
+ *
+ * hashtableTwoPhasePopFindRef finds an entry in the table and also the position
+ * of the entry within the table, so that it can be deleted without looking it
+ * up in the table again. The function returns a pointer to the entry pointer
+ * within the hash table, if an entry with a matching key is found, and NULL
+ * otherwise.
+ *
+ * If non-NULL is returned, call 'hashtableTwoPhasePopDelete' with the returned
+ * 'position' afterwards to actually delete the entry from the table. These two
+ * functions are designed to be used as a pair. `hashtableTwoPhasePopFindRef`
+ * pauses rehashing and `hashtableTwoPhasePopDelete` resumes rehashing.
+ *
+ * While hashtablePop finds and returns an entry, the purpose of two-phase pop
+ * is to provide an optimized equivalent of hashtableFindRef followed by
+ * hashtableDelete, where the first call finds the entry but doesn't delete it
+ * from the hash table and the latter doesn't need to look up the entry in the
+ * hash table again.
+ *
+ * Example:
+ *
+ *     hashtablePosition position;
+ *     void **ref = hashtableTwoPhasePopFindRef(ht, key, &position);
+ *     if (ref != NULL) {
+ *         void *entry = *ref;
+ *         // do something with the entry, then...
+ *         hashtableTwoPhasePopDelete(ht, &position);
+ *     }
+ */
+
+/* Like hashtableTwoPhasePopFind, but returns a pointer to where the entry is
+ * stored in the table, or NULL if no matching entry is found. The 'position'
+ * argument is populated with a representation of where the entry is stored.
+ * This must be provided to hashtableTwoPhasePopDelete to complete the
+ * operation. */
+void **hashtableTwoPhasePopFindRef(hashtable *ht, const void *key, hashtablePosition *pos) {
+    position *p = positionFromOpaque(pos);
+    if (hashtableSize(ht) == 0) return NULL;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    int table_index = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table_index);
+    if (b) {
+        hashtablePauseRehashing(ht);
+
+        /* Store position. */
+        assert(p != NULL);
+        p->bucket = b;
+        p->pos_in_bucket = pos_in_bucket;
+        p->table_index = table_index;
+        return &b->entries[pos_in_bucket];
+    } else {
+        return NULL;
+    }
+}
+
+/* Clears the position of the entry in the hashtable and resumes rehashing. The
+ * entry destructor is NOT called. The position is acquired using a preceding
+ * call to hashtableTwoPhasePopFindRef(). */
+void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *pos) {
+    /* Read position.
*/ + position *p = positionFromOpaque(pos); + bucket *b = p->bucket; + int pos_in_bucket = p->pos_in_bucket; + int table_index = p->table_index; + + /* Delete the entry and resume rehashing. */ + assert(isPositionFilled(b, pos_in_bucket)); + b->presence &= ~(1 << pos_in_bucket); + ht->used[table_index]--; + hashtableShrinkIfNeeded(ht); + hashtableResumeRehashing(ht); + if (b->chained && !hashtableIsRehashingPaused(ht)) { + /* Rehashing paused also means bucket chain compaction paused. It is + * paused while iterating and when a scan callback is running, to be + * able to live up to the scan and iterator guarantees. In those cases, + * we do the compaction in the scan and iterator code instead. */ + fillBucketHole(ht, b, pos_in_bucket, table_index); + } +} + +/* Initializes the state for an incremental find operation. + * + * Incremental find can be used to speed up the loading of multiple objects by + * utilizing CPU branch predictions to parallelize memory accesses. Initialize + * the data for a number of incremental find operations. Then call + * hashtableIncrementalFindStep on them in a round-robin order until all of them + * are complete. Finally, if necessary, call hashtableIncrementalFindGetResult. + */ +void hashtableIncrementalFindInit(hashtableIncrementalFindState *state, hashtable *ht, const void *key) { + incrementalFind *data = incrementalFindFromOpaque(state); + if (hashtableSize(ht) == 0) { + data->state = HASHTABLE_NOT_FOUND; + } else { + data->state = HASHTABLE_NEXT_BUCKET; + data->bucket = NULL; + data->hashtable = ht; + data->key = key; + data->hash = hashKey(ht, key); + } +} + +/* Returns 1 if more work is needed, 0 when done. Call this function repeatedly + * until it returns 0. Then use hashtableIncrementalFindGetResult to fetch the + * result. */ +int hashtableIncrementalFindStep(hashtableIncrementalFindState *state) { + incrementalFind *data = incrementalFindFromOpaque(state); + switch (data->state) { + case HASHTABLE_CHECK_ENTRY: + /* Current entry is prefetched. Now check if it's a match. */ + { + hashtable *ht = data->hashtable; + void *entry = data->bucket->entries[data->pos]; + const void *elem_key = entryGetKey(ht, entry); + if (compareKeys(ht, data->key, elem_key) == 0) { + /* It's a match. */ + data->state = HASHTABLE_FOUND; + return 0; + } + /* No match. Look for next candidate entry in the bucket. */ + data->pos++; + } + /* fall through */ + case HASHTABLE_NEXT_ENTRY: + /* Current bucket is prefetched. Prefetch next potential + * matching entry in the current bucket. */ + if (data->bucket->presence != 0 && data->pos < numBucketPositions(data->bucket)) { + bucket *b = data->bucket; + uint8_t h2 = highBits(data->hash); + for (int pos = data->pos; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos) && b->hashes[pos] == h2) { + /* It's a candidate. */ + valkey_prefetch(b->entries[pos]); + data->pos = pos; + data->state = HASHTABLE_CHECK_ENTRY; + return 1; + } + } + } + /* fall through */ + case HASHTABLE_NEXT_BUCKET: + /* Current bucket is prefetched, if any. Find the next bucket in the + * chain, or in next table, and prefetch it. */ + { + hashtable *ht = data->hashtable; + if (data->bucket == NULL) { + data->table = 0; + size_t mask = expToMask(ht->bucket_exp[0]); + size_t bucket_idx = data->hash & mask; + if (ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) { + /* Skip already rehashed bucket in table 0. 
data->table = 1;
+                    mask = expToMask(ht->bucket_exp[1]);
+                    bucket_idx = data->hash & mask;
+                }
+                data->bucket = &ht->tables[data->table][bucket_idx];
+            } else if (bucketNext(data->bucket) != NULL) {
+                data->bucket = bucketNext(data->bucket);
+            } else if (data->table == 0 && ht->rehash_idx >= 0) {
+                data->table = 1;
+                size_t mask = expToMask(ht->bucket_exp[1]);
+                size_t bucket_idx = data->hash & mask;
+                data->bucket = &ht->tables[data->table][bucket_idx];
+            } else {
+                /* No more tables. */
+                data->state = HASHTABLE_NOT_FOUND;
+                return 0;
+            }
+            valkey_prefetch(data->bucket);
+            data->state = HASHTABLE_NEXT_ENTRY;
+            data->pos = 0;
+        }
+        return 1;
+    case HASHTABLE_FOUND:
+        return 0;
+    case HASHTABLE_NOT_FOUND:
+        return 0;
+    }
+    assert(0);
+}
+
+/* Call only when hashtableIncrementalFindStep has returned 0.
+ *
+ * Returns 1 and points 'found' to the entry if an entry was found, 0 if it
+ * was not found. */
+int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void **found) {
+    incrementalFind *data = incrementalFindFromOpaque(state);
+    if (data->state == HASHTABLE_FOUND) {
+        if (found) *found = data->bucket->entries[data->pos];
+        return 1;
+    } else {
+        assert(data->state == HASHTABLE_NOT_FOUND);
+        return 0;
+    }
+}
+
+/* --- Scan --- */
+
+/* Scan is a stateless iterator. It works with a cursor that is returned to the
+ * caller and which should be provided to the next call to continue scanning.
+ * The hash table can be modified in any way between two scan calls. The scan
+ * still continues iterating where it was.
+ *
+ * A full scan is performed like this: Start with a cursor of 0. The scan
+ * callback is invoked for each entry scanned and a new cursor is returned. Next
+ * time, call this function with the new cursor. Continue until the function
+ * returns 0.
+ *
+ * We say that an entry is *emitted* when it's passed to the scan callback.
+ *
+ * Scan guarantees:
+ *
+ * - An entry that is present in the hash table during an entire full scan will
+ *   be returned (emitted) at least once. (Most of the time exactly once, but
+ *   sometimes twice.)
+ *
+ * - An entry that is inserted or deleted during a full scan may or may not be
+ *   returned during the scan.
+ *
+ * Scan callback rules:
+ *
+ * - The scan callback may delete the entry that was passed to it.
+ *
+ * - It may not delete other entries, because that may lead to internal
+ *   fragmentation in the form of "holes" in the bucket chains.
+ *
+ * - The scan callback may insert or replace any entry.
+ */
+size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) {
+    return hashtableScanDefrag(ht, cursor, fn, privdata, NULL, 0);
+}
+
+/* Like hashtableScan, but additionally reallocates the memory used by the
+ * hashtable entries using the provided allocation function. This feature was
+ * added for the active defrag feature.
+ *
+ * The 'defragfn' callback is called with a pointer to memory that the callback
+ * can reallocate. The callback should return a new memory address or NULL,
+ * where NULL means that no reallocation happened and the old memory is still
+ * valid. The 'defragfn' can be NULL if you don't need defrag reallocation.
+ *
+ * The 'flags' argument can be used to tweak the behaviour. It's a bitwise-or
+ * (zero means no flags) of the following:
+ *
+ * - HASHTABLE_SCAN_EMIT_REF: Emit a pointer to the entry's location in the
+ *   table to the scan function instead of the actual entry.
+ *   This can be used for advanced things like reallocating the memory of an
+ *   entry (for the purpose of defragmentation) and updating the pointer to
+ *   the entry inside the hash table.
+ */
+size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags) {
+    if (hashtableSize(ht) == 0) return 0;
+
+    /* Prevent entries from being moved around during the scan call, as a
+     * side-effect of the scan callback. */
+    hashtablePauseRehashing(ht);
+
+    /* Flags. */
+    int emit_ref = (flags & HASHTABLE_SCAN_EMIT_REF);
+
+    if (!hashtableIsRehashing(ht)) {
+        /* Emit entries at the cursor index. */
+        size_t mask = expToMask(ht->bucket_exp[0]);
+        bucket *b = &ht->tables[0][cursor & mask];
+        do {
+            if (b->presence != 0) {
+                int pos;
+                for (pos = 0; pos < ENTRIES_PER_BUCKET; pos++) {
+                    if (isPositionFilled(b, pos)) {
+                        void *emit = emit_ref ? &b->entries[pos] : b->entries[pos];
+                        fn(privdata, emit);
+                    }
+                }
+            }
+            bucket *next = bucketNext(b);
+            if (next != NULL && defragfn != NULL) {
+                next = bucketDefrag(b, next, defragfn);
+            }
+            b = next;
+        } while (b != NULL);
+
+        /* Advance cursor. */
+        cursor = nextCursor(cursor, mask);
+    } else {
+        int table_small, table_large;
+        if (ht->bucket_exp[0] <= ht->bucket_exp[1]) {
+            table_small = 0;
+            table_large = 1;
+        } else {
+            table_small = 1;
+            table_large = 0;
+        }
+
+        size_t mask_small = expToMask(ht->bucket_exp[table_small]);
+        size_t mask_large = expToMask(ht->bucket_exp[table_large]);
+
+        /* Emit entries in the smaller table, if this index hasn't already been
+         * rehashed. */
+        size_t idx = cursor & mask_small;
+        if (table_small == 1 || ht->rehash_idx == -1 || idx >= (size_t)ht->rehash_idx) {
+            size_t used_before = ht->used[table_small];
+            bucket *b = &ht->tables[table_small][idx];
+            do {
+                if (b->presence) {
+                    for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) {
+                        if (isPositionFilled(b, pos)) {
+                            void *emit = emit_ref ? &b->entries[pos] : b->entries[pos];
+                            fn(privdata, emit);
+                        }
+                    }
+                }
+                bucket *next = bucketNext(b);
+                if (next != NULL && defragfn != NULL) {
+                    next = bucketDefrag(b, next, defragfn);
+                }
+                b = next;
+            } while (b != NULL);
+            /* If any entries were deleted, fill the holes. */
+            if (ht->used[table_small] < used_before) {
+                compactBucketChain(ht, idx, table_small);
+            }
+        }
+
+        /* Iterate over indices in the larger table that are the expansion of
+         * the index pointed to by the cursor in the smaller table. */
+        do {
+            /* Emit entries in the larger table at this cursor, if this index
+             * hasn't already been rehashed. */
+            idx = cursor & mask_large;
+            if (table_large == 1 || ht->rehash_idx == -1 || idx >= (size_t)ht->rehash_idx) {
+                size_t used_before = ht->used[table_large];
+                bucket *b = &ht->tables[table_large][idx];
+                do {
+                    if (b->presence) {
+                        for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) {
+                            if (isPositionFilled(b, pos)) {
+                                void *emit = emit_ref ? &b->entries[pos] : b->entries[pos];
+                                fn(privdata, emit);
+                            }
+                        }
+                    }
+                    bucket *next = bucketNext(b);
+                    if (next != NULL && defragfn != NULL) {
+                        next = bucketDefrag(b, next, defragfn);
+                    }
+                    b = next;
+                } while (b != NULL);
+                /* If any entries were deleted, fill the holes. */
+                if (ht->used[table_large] < used_before) {
+                    compactBucketChain(ht, idx, table_large);
+                }
+            }
+
+            /* Increment the reverse cursor not covered by the smaller mask. */
+            cursor = nextCursor(cursor, mask_large);
+
+            /* Continue while the bits covered by the mask difference are non-zero.
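+             * For example, with mask_small = 0b111 (8 buckets) and
+             * mask_large = 0b11111 (32 buckets), each small-table index is
+             * expanded into the four large-table indices that share its low
+             * three bits; all four are visited before the cursor's extra bits
+             * wrap back to zero and the loop exits.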
+             */
+        } while (cursor & (mask_small ^ mask_large));
+    }
+    hashtableResumeRehashing(ht);
+    return cursor;
+}
+
+/* --- Iterator --- */
+
+/* Initialize an iterator that is not allowed to insert, delete or even look up
+ * entries in the hashtable, because such operations can trigger incremental
+ * rehashing which moves entries around and confuses the iterator. Only
+ * hashtableNext is allowed. Each entry is returned exactly once. Call
+ * hashtableResetIterator when you are done. See also
+ * hashtableInitSafeIterator. */
+void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) {
+    iter *iter;
+    iter = iteratorFromOpaque(iterator);
+    iter->hashtable = ht;
+    iter->table = 0;
+    iter->index = -1;
+    iter->safe = 0;
+}
+
+/* Initialize a safe iterator, which is allowed to modify the hash table while
+ * iterating. It pauses incremental rehashing to prevent entries from moving
+ * around. Call hashtableNext to fetch each entry. You must call
+ * hashtableResetIterator when you are done with a safe iterator.
+ *
+ * It's allowed to insert and replace entries. Deleting entries is only allowed
+ * for the entry that was just returned by hashtableNext. Deleting other entries
+ * is possible, but doing so can cause internal fragmentation, so don't.
+ *
+ * Guarantees:
+ *
+ * - Entries that are in the hash table for the entire iteration are returned
+ *   exactly once.
+ *
+ * - Entries that are deleted or replaced after they have been returned are not
+ *   returned again.
+ *
+ * - Entries that are replaced before they've been returned by the iterator will
+ *   be returned.
+ *
+ * - Entries that are inserted during the iteration may or may not be returned
+ *   by the iterator.
+ */
+void hashtableInitSafeIterator(hashtableIterator *iterator, hashtable *ht) {
+    hashtableInitIterator(iterator, ht);
+    iter *iter = iteratorFromOpaque(iterator);
+    iter->safe = 1;
+}
+
+/* Resets a stack-allocated iterator. */
+void hashtableResetIterator(hashtableIterator *iterator) {
+    iter *iter = iteratorFromOpaque(iterator);
+    if (!(iter->index == -1 && iter->table == 0)) {
+        if (iter->safe) {
+            hashtableResumeRehashing(iter->hashtable);
+            assert(iter->hashtable->pause_rehash >= 0);
+        } else {
+            assert(iter->fingerprint == hashtableFingerprint(iter->hashtable));
+        }
+    }
+}
+
+/* Allocates and initializes an iterator. */
+hashtableIterator *hashtableCreateIterator(hashtable *ht) {
+    iter *iter = zmalloc(sizeof(*iter));
+    hashtableIterator *opaque = iteratorToOpaque(iter);
+    hashtableInitIterator(opaque, ht);
+    return opaque;
+}
+
+/* Allocates and initializes a safe iterator. */
+hashtableIterator *hashtableCreateSafeIterator(hashtable *ht) {
+    hashtableIterator *iterator = hashtableCreateIterator(ht);
+    iter *iter = iteratorFromOpaque(iterator);
+    iter->safe = 1;
+    return iterator;
+}
+
+/* Resets and frees the memory of an allocated iterator, i.e. one created using
+ * hashtableCreate(Safe)Iterator. */
+void hashtableReleaseIterator(hashtableIterator *iterator) {
+    hashtableResetIterator(iterator);
+    iter *iter = iteratorFromOpaque(iterator);
+    zfree(iter);
+}
+
+/* Points elemptr to the next entry and returns 1 if there is a next entry.
+ * Returns 0 if there are no more entries. */
+int hashtableNext(hashtableIterator *iterator, void **elemptr) {
+    iter *iter = iteratorFromOpaque(iterator);
+    while (1) {
+        if (iter->index == -1 && iter->table == 0) {
+            /* It's the first call to next.
*/ + if (iter->safe) { + hashtablePauseRehashing(iter->hashtable); + iter->last_seen_size = iter->hashtable->used[iter->table]; + } else { + iter->fingerprint = hashtableFingerprint(iter->hashtable); + } + if (iter->hashtable->tables[0] == NULL) { + /* Empty hashtable. We're done. */ + break; + } + iter->index = 0; + /* Skip already rehashed buckets. */ + if (hashtableIsRehashing(iter->hashtable)) { + iter->index = iter->hashtable->rehash_idx; + } + iter->bucket = &iter->hashtable->tables[iter->table][iter->index]; + iter->pos_in_bucket = 0; + } else { + /* Advance to the next position within the bucket, or to the next + * child bucket in a chain, or to the next bucket index, or to the + * next table. */ + iter->pos_in_bucket++; + if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { + iter->pos_in_bucket = 0; + iter->bucket = bucketNext(iter->bucket); + } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { + /* Bucket index done. */ + if (iter->safe) { + /* If entries in this bucket chain have been deleted, + * they've left empty spaces in the buckets. The chain is + * not automatically compacted when rehashing is paused. If + * this iterator is the only reason for pausing rehashing, + * we can do the compaction now when we're done with a + * bucket chain, before we move on to the next index. */ + if (iter->hashtable->pause_rehash == 1 && + iter->hashtable->used[iter->table] < iter->last_seen_size) { + compactBucketChain(iter->hashtable, iter->index, iter->table); + } + iter->last_seen_size = iter->hashtable->used[iter->table]; + } + iter->pos_in_bucket = 0; + iter->index++; + if ((size_t)iter->index >= numBuckets(iter->hashtable->bucket_exp[iter->table])) { + if (hashtableIsRehashing(iter->hashtable) && iter->table == 0) { + iter->index = 0; + iter->table++; + } else { + /* Done. */ + break; + } + } + iter->bucket = &iter->hashtable->tables[iter->table][iter->index]; + } + } + bucket *b = iter->bucket; + if (!isPositionFilled(b, iter->pos_in_bucket)) { + /* No entry here. */ + continue; + } + /* Return the entry at this position. */ + if (elemptr) { + *elemptr = b->entries[iter->pos_in_bucket]; + } + return 1; + } + return 0; +} + +/* --- Random entries --- */ + +/* Points 'found' to a random entry in the hash table and returns 1. Returns 0 + * if the table is empty. */ +int hashtableRandomEntry(hashtable *ht, void **found) { + void *samples[WEAK_RANDOM_SAMPLE_SIZE]; + unsigned count = hashtableSampleEntries(ht, &samples[0], WEAK_RANDOM_SAMPLE_SIZE); + if (count == 0) return 0; + unsigned idx = random() % count; + *found = samples[idx]; + return 1; +} + +/* Points 'found' to a random entry in the hash table and returns 1. Returns 0 + * if the table is empty. This one is more fair than hashtableRandomEntry(). */ +int hashtableFairRandomEntry(hashtable *ht, void **found) { + void *samples[FAIR_RANDOM_SAMPLE_SIZE]; + unsigned count = hashtableSampleEntries(ht, &samples[0], FAIR_RANDOM_SAMPLE_SIZE); + if (count == 0) return 0; + unsigned idx = random() % count; + *found = samples[idx]; + return 1; +} + +/* This function samples a sequence of entries starting at a random location in + * the hash table. + * + * The sampled entries are stored in the array 'dst' which must have space for + * at least 'count' entries. + * + * The function returns the number of sampled entries, which is 'count' except + * if 'count' is greater than the total number of entries in the hash table. 
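+ *
+ * Usage sketch (hypothetical caller picking eviction candidates):
+ *
+ *     void *candidates[5];
+ *     unsigned n = hashtableSampleEntries(ht, candidates, 5);
+ *     for (unsigned i = 0; i < n; i++) considerForEviction(candidates[i]);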
*/ +unsigned hashtableSampleEntries(hashtable *ht, void **dst, unsigned count) { + /* Adjust count. */ + if (count > hashtableSize(ht)) count = hashtableSize(ht); + scan_samples samples; + samples.size = count; + samples.seen = 0; + samples.entries = dst; + size_t cursor = randomSizeT(); + while (samples.seen < count) { + cursor = hashtableScan(ht, cursor, sampleEntriesScanFn, &samples); + } + rehashStepOnReadIfNeeded(ht); + /* samples.seen is the number of entries scanned. It may be greater than + * the requested count and the size of the dst array. */ + return samples.seen <= count ? samples.seen : count; +} + +/* --- Stats --- */ + +#define HASHTABLE_STATS_VECTLEN 50 +void hashtableFreeStats(hashtableStats *stats) { + zfree(stats->clvector); + zfree(stats); +} + +void hashtableCombineStats(hashtableStats *from, hashtableStats *into) { + into->toplevel_buckets += from->toplevel_buckets; + into->child_buckets += from->child_buckets; + into->max_chain_len = (from->max_chain_len > into->max_chain_len) ? from->max_chain_len : into->max_chain_len; + into->size += from->size; + into->used += from->used; + for (int i = 0; i < HASHTABLE_STATS_VECTLEN; i++) { + into->clvector[i] += from->clvector[i]; + } +} + +hashtableStats *hashtableGetStatsHt(hashtable *ht, int table_index, int full) { + unsigned long *clvector = zcalloc(sizeof(unsigned long) * HASHTABLE_STATS_VECTLEN); + hashtableStats *stats = zcalloc(sizeof(hashtableStats)); + stats->table_index = table_index; + stats->clvector = clvector; + stats->toplevel_buckets = numBuckets(ht->bucket_exp[table_index]); + stats->child_buckets = ht->child_buckets[table_index]; + stats->size = numBuckets(ht->bucket_exp[table_index]) * ENTRIES_PER_BUCKET; + stats->used = ht->used[table_index]; + if (!full) return stats; + /* Compute stats about bucket chain lengths. */ + stats->max_chain_len = 0; + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table_index]); idx++) { + bucket *b = &ht->tables[table_index][idx]; + unsigned long chainlen = 0; + while (b->chained) { + chainlen++; + b = bucketNext(b); + } + if (chainlen > stats->max_chain_len) { + stats->max_chain_len = chainlen; + } + if (chainlen >= HASHTABLE_STATS_VECTLEN) { + chainlen = HASHTABLE_STATS_VECTLEN - 1; + } + clvector[chainlen]++; + } + return stats; +} + +/* Generates human readable stats. */ +size_t hashtableGetStatsMsg(char *buf, size_t bufsize, hashtableStats *stats, int full) { + if (stats->used == 0) { + return snprintf(buf, bufsize, + "Hash table %d stats (%s):\n" + "No stats available for empty hash tables\n", + stats->table_index, + (stats->table_index == 0) ? "main hash table" : "rehashing target"); + } + size_t l = 0; + l += snprintf(buf + l, bufsize - l, + "Hash table %d stats (%s):\n" + " table size: %lu\n" + " number of entries: %lu\n", + stats->table_index, + (stats->table_index == 0) ? 
"main hash table" : "rehashing target", stats->size, + stats->used); + if (full) { + l += snprintf(buf + l, bufsize - l, + " top-level buckets: %lu\n" + " child buckets: %lu\n" + " max chain length: %lu\n" + " avg chain length: %.02f\n" + " chain length distribution:\n", + stats->toplevel_buckets, + stats->child_buckets, + stats->max_chain_len, + (float)stats->child_buckets / stats->toplevel_buckets); + for (unsigned long i = 0; i < HASHTABLE_STATS_VECTLEN - 1; i++) { + if (stats->clvector[i] == 0) continue; + if (l >= bufsize) break; + l += snprintf(buf + l, bufsize - l, " %ld: %ld (%.02f%%)\n", i, stats->clvector[i], + ((float)stats->clvector[i] / stats->toplevel_buckets) * 100); + } + } + + /* Make sure there is a NULL term at the end. */ + buf[bufsize - 1] = '\0'; + /* Unlike snprintf(), return the number of characters actually written. */ + return strlen(buf); +} + +void hashtableGetStats(char *buf, size_t bufsize, hashtable *ht, int full) { + size_t l; + char *orig_buf = buf; + size_t orig_bufsize = bufsize; + + hashtableStats *mainHtStats = hashtableGetStatsHt(ht, 0, full); + l = hashtableGetStatsMsg(buf, bufsize, mainHtStats, full); + hashtableFreeStats(mainHtStats); + buf += l; + bufsize -= l; + if (hashtableIsRehashing(ht) && bufsize > 0) { + hashtableStats *rehashHtStats = hashtableGetStatsHt(ht, 1, full); + hashtableGetStatsMsg(buf, bufsize, rehashHtStats, full); + hashtableFreeStats(rehashHtStats); + } + /* Make sure there is a NULL term at the end. */ + orig_buf[orig_bufsize - 1] = '\0'; +} + +/* --- DEBUG --- */ + +void hashtableDump(hashtable *ht) { + for (int table = 0; table <= 1; table++) { + printf("Table %d, used %zu, exp %d, top-level buckets %zu, child buckets %zu\n", + table, ht->used[table], ht->bucket_exp[table], + numBuckets(ht->bucket_exp[table]), ht->child_buckets[table]); + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table]); idx++) { + bucket *b = &ht->tables[table][idx]; + int level = 0; + do { + printf("Bucket %d:%zu level:%d\n", table, idx, level); + for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + printf(" %d ", pos); + if (isPositionFilled(b, pos)) { + printf("h2 %02x, key \"%s\"\n", b->hashes[pos], + (const char *)entryGetKey(ht, b->entries[pos])); + } else { + printf("(empty)\n"); + } + } + b = bucketNext(b); + level++; + } while (b != NULL); + } + } +} + +/* Prints a histogram-like view of the number of entries in each bucket and + * sub-bucket. 
+ * Example:
+ *
+ * Bucket fill table=0 size=32 children=9 used=200:
+ *     67453462673764475436556656776756
+ *     2 3 2 3 3 45 5 3
+ */
+void hashtableHistogram(hashtable *ht) {
+    for (int table = 0; table <= 1; table++) {
+        if (ht->bucket_exp[table] < 0) continue;
+        size_t size = numBuckets(ht->bucket_exp[table]);
+        bucket *buckets[size];
+        for (size_t idx = 0; idx < size; idx++) {
+            buckets[idx] = &ht->tables[table][idx];
+        }
+        size_t chains_left = size;
+        printf("Bucket fill table=%d size=%zu children=%zu used=%zu:\n",
+               table, size, ht->child_buckets[table], ht->used[table]);
+        do {
+            printf(" ");
+            for (size_t idx = 0; idx < size; idx++) {
+                bucket *b = buckets[idx];
+                if (b == NULL) {
+                    printf(" ");
+                    continue;
+                }
+                printf("%X", __builtin_popcount(b->presence));
+                buckets[idx] = bucketNext(b);
+                if (buckets[idx] == NULL) chains_left--;
+            }
+            printf("\n");
+        } while (chains_left > 0);
+    }
+}
+
+int hashtableLongestBucketChain(hashtable *ht) {
+    int maxlen = 0;
+    for (int table = 0; table <= 1; table++) {
+        if (ht->bucket_exp[table] < 0) {
+            continue; /* table not used */
+        }
+        for (size_t i = 0; i < numBuckets(ht->bucket_exp[table]); i++) {
+            int chainlen = 0;
+            bucket *b = &ht->tables[table][i];
+            while (b->chained) {
+                if (++chainlen > maxlen) {
+                    maxlen = chainlen;
+                }
+                b = bucketNext(b);
+            }
+        }
+    }
+    return maxlen;
+}
diff --git a/src/hashtable.h b/src/hashtable.h
new file mode 100644
index 0000000000..4291cf5a5d
--- /dev/null
+++ b/src/hashtable.h
@@ -0,0 +1,168 @@
+#ifndef HASHTABLE_H
+#define HASHTABLE_H
+
+/* Hash table implementation.
+ *
+ * This is a cache-friendly hash table implementation. For details about the
+ * implementation and documentation of functions, see comments in hashtable.c.
+ *
+ * The entries in a hashtable are of a user-defined type, but an entry needs to
+ * contain a key. It can represent a key-value entry, or it can be just a key,
+ * if set semantics are desired.
+ *
+ * Terminology:
+ *
+ * hashtable
+ *   An instance of the data structure.
+ *
+ * entry
+ *   An entry in the hashtable. This may be of the same type as the key,
+ *   or a struct containing a key and other fields.
+ *
+ * key
+ *   The part of the entry used for looking up the entry in the hashtable.
+ *   May be the entire entry or a struct field within the entry.
+ *
+ * type
+ *   A struct containing callbacks, such as hash function, key comparison
+ *   function and how to get the key in an entry.
+ */
+
+#include "fmacros.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+/* --- Opaque types --- */
+
+typedef struct hashtable hashtable;
+typedef struct hashtableStats hashtableStats;
+
+/* Types that can be stack-allocated. */
+typedef uint64_t hashtableIterator[5];
+typedef uint64_t hashtablePosition[2];
+typedef uint64_t hashtableIncrementalFindState[5];
+
+/* --- Non-opaque types --- */
+
+/* The hashtableType is a set of callbacks for a hashtable. All callbacks are
+ * optional. With all callbacks omitted, the hashtable is effectively a set of
+ * pointer-sized integers. */
+typedef struct {
+    /* If the type of an entry is not the same as the type of a key used for
+     * lookup, this callback needs to return the key within an entry. */
+    const void *(*entryGetKey)(const void *entry);
+    /* Hash function. Defaults to hashing the bits in the pointer, effectively
+     * treating the pointer as an integer. */
+    uint64_t (*hashFunction)(const void *key);
+    /* Compare function, returns 0 if the keys are equal. Defaults to just
+     * comparing the pointers for equality.
*/ + int (*keyCompare)(const void *key1, const void *key2); + /* Callback to free an entry when it's overwritten or deleted. + * Optional. */ + void (*entryDestructor)(void *entry); + /* Callback to control when resizing should be allowed. */ + int (*resizeAllowed)(size_t moreMem, double usedRatio); + /* Invoked at the start of rehashing. */ + void (*rehashingStarted)(hashtable *ht); + /* Invoked at the end of rehashing. */ + void (*rehashingCompleted)(hashtable *ht); + /* Track memory usage using this callback. It is called with a positive + * number when the hashtable allocates some memory and with a negative number + * when freeing. */ + void (*trackMemUsage)(hashtable *ht, ssize_t delta); + /* Allow a hashtable to carry extra caller-defined metadata. The extra memory + * is initialized to 0. */ + size_t (*getMetadataSize)(void); + /* Flag to disable incremental rehashing */ + unsigned instant_rehashing : 1; +} hashtableType; + +typedef enum { + HASHTABLE_RESIZE_ALLOW = 0, + HASHTABLE_RESIZE_AVOID, + HASHTABLE_RESIZE_FORBID, +} hashtableResizePolicy; + +typedef void (*hashtableScanFunction)(void *privdata, void *entry); + +/* Constants */ +#define HASHTABLE_BUCKET_SIZE 64 /* bytes, the most common cache line size */ + +/* Scan flags */ +#define HASHTABLE_SCAN_EMIT_REF (1 << 0) + +/* --- Prototypes --- */ + +/* Hash function (global seed) */ +void hashtableSetHashFunctionSeed(const uint8_t *seed); +uint8_t *hashtableGetHashFunctionSeed(void); +uint64_t hashtableGenHashFunction(const char *buf, size_t len); +uint64_t hashtableGenCaseHashFunction(const char *buf, size_t len); + +/* Global resize policy */ +void hashtableSetResizePolicy(hashtableResizePolicy policy); + +/* Hashtable instance */ +hashtable *hashtableCreate(hashtableType *type); +void hashtableRelease(hashtable *ht); +void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); +hashtableType *hashtableGetType(hashtable *ht); +void *hashtableMetadata(hashtable *ht); +size_t hashtableSize(const hashtable *ht); +size_t hashtableBuckets(hashtable *ht); +size_t hashtableChainedBuckets(hashtable *ht, int table); +size_t hashtableMemUsage(hashtable *ht); +void hashtablePauseAutoShrink(hashtable *ht); +void hashtableResumeAutoShrink(hashtable *ht); +int hashtableIsRehashing(hashtable *ht); +int hashtableIsRehashingPaused(hashtable *ht); +void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size); +int hashtableRehashMicroseconds(hashtable *ht, uint64_t us); +int hashtableExpand(hashtable *ht, size_t size); +int hashtableTryExpand(hashtable *ht, size_t size); +int hashtableExpandIfNeeded(hashtable *ht); +int hashtableShrinkIfNeeded(hashtable *ht); +hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); +void dismissHashtable(hashtable *ht); + +/* Entries */ +int hashtableFind(hashtable *ht, const void *key, void **found); +void **hashtableFindRef(hashtable *ht, const void *key); +int hashtableAdd(hashtable *ht, void *entry); +int hashtableAddOrFind(hashtable *ht, void *entry, void **existing); +int hashtableFindPositionForInsert(hashtable *ht, void *key, hashtablePosition *position, void **existing); +void hashtableInsertAtPosition(hashtable *ht, void *entry, hashtablePosition *position); +int hashtablePop(hashtable *ht, const void *key, void **popped); +int hashtableDelete(hashtable *ht, const void *key); +void **hashtableTwoPhasePopFindRef(hashtable *ht, const void *key, hashtablePosition *position); +void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *position); +int 
hashtableReplaceReallocatedEntry(hashtable *ht, const void *old_entry, void *new_entry); +void hashtableIncrementalFindInit(hashtableIncrementalFindState *state, hashtable *ht, const void *key); +int hashtableIncrementalFindStep(hashtableIncrementalFindState *state); +int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void **found); + +/* Iteration & scan */ +size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); +void hashtableInitIterator(hashtableIterator *iter, hashtable *ht); +void hashtableInitSafeIterator(hashtableIterator *iter, hashtable *ht); +void hashtableResetIterator(hashtableIterator *iter); +hashtableIterator *hashtableCreateIterator(hashtable *ht); +hashtableIterator *hashtableCreateSafeIterator(hashtable *ht); +void hashtableReleaseIterator(hashtableIterator *iter); +int hashtableNext(hashtableIterator *iter, void **elemptr); + +/* Random entries */ +int hashtableRandomEntry(hashtable *ht, void **found); +int hashtableFairRandomEntry(hashtable *ht, void **found); +unsigned hashtableSampleEntries(hashtable *ht, void **dst, unsigned count); + +/* Debug & stats */ + +void hashtableFreeStats(hashtableStats *stats); +void hashtableCombineStats(hashtableStats *from, hashtableStats *into); +hashtableStats *hashtableGetStatsHt(hashtable *ht, int htidx, int full); +size_t hashtableGetStatsMsg(char *buf, size_t bufsize, hashtableStats *stats, int full); +void hashtableGetStats(char *buf, size_t bufsize, hashtable *ht, int full); + +#endif /* HASHTABLE_H */ diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 563c5e7941..6056bc0098 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -35,6 +35,13 @@ #include <stdint.h> #include <math.h> +#ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H +#include <immintrin.h> +#endif + /* The HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -208,6 +215,13 @@ struct hllhdr { static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected"; +#ifdef HAVE_AVX2 +static int simd_enabled = 1; +#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2")) +#else +#define HLL_USE_AVX2 0 +#endif + /* =========================== Low level bit macros ========================= */ /* Macros to access the dense representation. @@ -1064,6 +1078,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations.
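HLL_USE_AVX2 above combines a compile-time gate (HAVE_AVX2, set when the toolchain can emit AVX2 code) with a runtime CPU check. A standalone sketch of the same idiom, with stand-in function names that are not server code:

```c
/* Dispatch sketch: HAVE_AVX2 gates what the compiler may emit,
 * __builtin_cpu_supports() gates what the running CPU executes. */
#include <stddef.h>
#include <stdint.h>

static uint64_t sumScalar(const uint8_t *p, size_t n) {
    uint64_t s = 0;
    for (size_t i = 0; i < n; i++) s += p[i];
    return s;
}

#ifdef HAVE_AVX2
uint64_t sumAVX2(const uint8_t *p, size_t n); /* body built with target("avx2") */
#endif

uint64_t sum(const uint8_t *p, size_t n) {
#ifdef HAVE_AVX2
    /* Taken only when the binary has the AVX2 path *and* the CPU supports it. */
    if (__builtin_cpu_supports("avx2")) return sumAVX2(p, n);
#endif
    return sumScalar(p, n);
}
```

This is why the binary stays runnable on pre-AVX2 CPUs: the fast path is selected per call, never assumed at load time.
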
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + /* Shuffle indices for unpacking bytes of dense registers + * From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * To: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. + * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
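Before the scalar fallback below, a standalone demo of the layout the diagrams above describe: four 6-bit registers packed into three bytes, low bits first, matching {bbaaaaaa|ccccbbbb|ddddddcc}. The function is hand-rolled and equivalent in effect to the HLL_DENSE_GET_REGISTER macro, not copied from the source:

```c
#include <stdio.h>
#include <stdint.h>

/* Extract register i from a dense array: register i occupies 6 bits
 * starting at bit 6*i, spanning at most two adjacent bytes. */
static uint8_t denseGet(const uint8_t *p, int i) {
    unsigned long bit = (unsigned long)i * 6;
    unsigned long byte = bit / 8, shift = bit % 8;
    uint16_t both = p[byte] | ((uint16_t)p[byte + 1] << 8);
    return (both >> shift) & 0x3f;
}

int main(void) {
    /* registers a, b, c, d packed by hand into 3 bytes */
    uint8_t a = 0x21, b = 0x32, c = 0x13, d = 0x04;
    uint8_t dense[4] = {0};
    dense[0] = a | (b << 6);        /* bbaaaaaa */
    dense[1] = (b >> 2) | (c << 4); /* ccccbbbb */
    dense[2] = (c >> 4) | (d << 2); /* ddddddcc */
    for (int i = 0; i < 4; i++) printf("reg[%d] = 0x%02x\n", i, denseGet(dense, i));
    return 0; /* prints 0x21, 0x32, 0x13, 0x04 */
}
```
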
*/ +void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1077,12 +1221,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val, hdr->registers, i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1114,6 +1253,121 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + /* Shuffle indices for packing bytes of dense registers + * From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * To: {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. + * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. 
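For the reverse direction, a standalone counterpart: writing a 6-bit register into the dense array, equivalent in effect to the scalar HLL_DENSE_SET_REGISTER fallback used below (again hand-rolled, not the real macro). The output bytes match the hand-packed values in the previous demo:

```c
#include <stdio.h>
#include <stdint.h>

static void denseSet(uint8_t *p, int i, uint8_t val) {
    unsigned long bit = (unsigned long)i * 6;
    unsigned long byte = bit / 8, shift = bit % 8;
    uint16_t both = p[byte] | ((uint16_t)p[byte + 1] << 8);
    both &= ~((uint16_t)0x3f << shift);      /* clear the 6 destination bits */
    both |= (uint16_t)(val & 0x3f) << shift; /* write the new register */
    p[byte] = both & 0xff;
    p[byte + 1] = both >> 8;
}

int main(void) {
    uint8_t dense[4] = {0};
    denseSet(dense, 0, 0x21);
    denseSet(dense, 1, 0x32);
    denseSet(dense, 2, 0x13);
    denseSet(dense, 3, 0x04);
    printf("%02x %02x %02x\n", dense[0], dense[1], dense[2]); /* a1 3c 11 */
    return 0;
}
```
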
+ */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. */ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1189,7 +1443,7 @@ void pfaddCommand(client *c) { * hold our HLL data structure. sdsnewlen() when NULL is passed * is guaranteed to return bytes initialized to zero. */ o = createHLLObject(); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); updated++; } else { if (isHLLObjectOrReply(c, o) != C_OK) return; @@ -1346,7 +1600,7 @@ void pfmergeCommand(client *c) { * hold our HLL data structure. sdsnewlen() when NULL is passed * is guaranteed to return bytes initialized to zero. */ o = createHLLObject(); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); } else { /* If key exists we are sure it's of the right type/size * since we checked when merging the different HLLs, so we @@ -1363,12 +1617,17 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. 
*/ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; + if (use_dense) { hdr = o->ptr; - switch (hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; - case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; + case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of @@ -1494,6 +1753,7 @@ void pfselftestCommand(client *c) { * PFDEBUG DECODE * PFDEBUG ENCODING * PFDEBUG TODENSE + * PFDEBUG SIMD (ON|OFF) */ void pfdebugCommand(client *c) { char *cmd = c->argv[1]->ptr; @@ -1501,6 +1761,30 @@ void pfdebugCommand(client *c) { robj *o; int j; + if (!strcasecmp(cmd, "simd")) { + if (c->argc != 3) goto arityerr; + + if (!strcasecmp(c->argv[2]->ptr, "on")) { +#ifdef HAVE_AVX2 + simd_enabled = 1; +#endif + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { +#ifdef HAVE_AVX2 + simd_enabled = 0; +#endif + } else { + addReplyError(c, "Argument must be ON or OFF"); + return; + } + + if (HLL_USE_AVX2) { + addReplyStatus(c, "enabled"); + } else { + addReplyStatus(c, "disabled"); + } + + return; + } + o = lookupKeyWrite(c->db, c->argv[2]); if (o == NULL) { addReplyError(c, "The specified key does not exist"); diff --git a/src/io_threads.c b/src/io_threads.c index f4471b96d0..90f5b88700 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -441,8 +441,8 @@ void IOThreadFreeArgv(void *data) { /* This function attempts to offload the client's argv to an IO thread. * Returns C_OK if the client's argv were successfully offloaded to an IO thread, * C_ERR otherwise. */ -int tryOffloadFreeArgvToIOThreads(client *c) { - if (server.active_io_threads_num <= 1 || c->argc == 0) { +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv) { + if (server.active_io_threads_num <= 1 || argc == 0) { return C_ERR; } @@ -456,11 +456,11 @@ int tryOffloadFreeArgvToIOThreads(client *c) { int last_arg_to_free = -1; /* Prepare the argv */ - for (int j = 0; j < c->argc; j++) { - if (c->argv[j]->refcount > 1) { - decrRefCount(c->argv[j]); + for (int j = 0; j < argc; j++) { + if (argv[j]->refcount > 1) { + decrRefCount(argv[j]); /* Set argv[j] to NULL to avoid double free */ - c->argv[j] = NULL; + argv[j] = NULL; } else { last_arg_to_free = j; } @@ -468,17 +468,17 @@ int tryOffloadFreeArgvToIOThreads(client *c) { /* If no argv to free, free the argv array at the main thread */ if (last_arg_to_free == -1) { - zfree(c->argv); + zfree(argv); return C_OK; } /* We set the refcount of the last arg to free to 0 to indicate that * this is the last argument to free. With this approach, we don't need to * send the argc to the IO thread and we can send just the argv ptr. */ - c->argv[last_arg_to_free]->refcount = 0; + argv[last_arg_to_free]->refcount = 0; /* Must succeed as we checked the free space before. */ - IOJobQueue_push(jq, IOThreadFreeArgv, c->argv); + IOJobQueue_push(jq, IOThreadFreeArgv, argv); return C_OK; } @@ -493,6 +493,8 @@ int tryOffloadFreeObjToIOThreads(robj *obj) { if (obj->refcount > 1) return C_ERR; + if (obj->encoding != OBJ_ENCODING_RAW || obj->type != OBJ_STRING) return C_ERR; + /* We select the thread ID in a round-robin fashion.
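The statement that follows this comment folds a running counter into the worker range; thread 0 is the main thread, so offloaded jobs land on threads 1..n-1. A standalone sketch of the same scheme, with hypothetical names:

```c
#include <stdio.h>
#include <stddef.h>

static size_t pickWorker(size_t counter, int active_threads) {
    /* skip index 0 (main thread); cycle over workers 1..active_threads-1 */
    return (counter % (size_t)(active_threads - 1)) + 1;
}

int main(void) {
    for (size_t c = 0; c < 6; c++)
        printf("job %zu -> thread %zu\n", c, pickWorker(c, 4)); /* 1,2,3,1,2,3 */
    return 0;
}
```
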
*/ size_t tid = (server.stat_io_freed_objects % (server.active_io_threads_num - 1)) + 1; @@ -501,7 +503,12 @@ int tryOffloadFreeObjToIOThreads(robj *obj) { return C_ERR; } - IOJobQueue_push(jq, decrRefCountVoid, obj); + /* We offload only the free of the ptr that may be allocated by the I/O thread. + * The object itself was allocated by the main thread and will be freed by the main thread. */ + IOJobQueue_push(jq, sdsfreeVoid, obj->ptr); + obj->ptr = NULL; + decrRefCount(obj); + server.stat_io_freed_objects++; return C_OK; } @@ -554,3 +561,55 @@ void trySendPollJobToIOThreads(void) { aeSetPollProtect(server.el, 1); IOJobQueue_push(jq, IOThreadPoll, server.el); } + +static void ioThreadAccept(void *data) { + client *c = (client *)data; + connAccept(c->conn, NULL); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +/* + * Attempts to offload an Accept operation (currently used for TLS accept) for a client + * connection to I/O threads. + * + * Returns: + * C_OK - If the accept operation was successfully queued for processing + * C_ERR - If the connection is not eligible for offloading + * + * Parameters: + * conn - The connection object to perform the accept operation on + */ +int trySendAcceptToIOThreads(connection *conn) { + if (server.io_threads_num <= 1) { + return C_ERR; + } + + if (!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD)) { + return C_ERR; + } + + client *c = connGetPrivateData(conn); + if (c->io_read_state != CLIENT_IDLE) { + return C_OK; + } + + if (server.active_io_threads_num <= 1) { + return C_ERR; + } + + size_t thread_id = (c->id % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *job_queue = &io_jobs[thread_id]; + + if (IOJobQueue_isFull(job_queue)) { + return C_ERR; + } + + c->io_read_state = CLIENT_PENDING_IO; + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + connSetPostponeUpdateState(c->conn, 1); + server.stat_io_accept_offloaded++; + IOJobQueue_push(job_queue, ioThreadAccept, c); + + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h index f9a9cf762f..a3ff582a77 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -9,9 +9,10 @@ int inMainThread(void); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); -int tryOffloadFreeArgvToIOThreads(client *c); +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); +int trySendAcceptToIOThreads(connection *conn); #endif /* IO_THREADS_H */ diff --git a/src/kvstore.c b/src/kvstore.c index 7142fa0f61..d6db4d3fe1 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -1,11 +1,11 @@ /* * Index-based KV store implementation - * This file implements a KV store comprised of an array of dicts (see dict.c) + * This file implements a KV store comprised of an array of hash tables (see hashtable.c) * The purpose of this KV store is to have easy access to all keys that belong - * in the same dict (i.e. are in the same dict-index) + * in the same hash table (i.e. are in the same hashtable-index) * * For example, when the server is running in cluster mode, we use kvstore to save - * all keys that map to the same hash-slot in a separate dict within the kvstore + * all keys that map to the same hash-slot in a separate hash table within the kvstore * struct. * This enables us to easily access all keys that map to a specific hash-slot. 
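A sketch of the access pattern this layout enables, pairing the cluster module's slot function with the kvstore accessor defined below. This pairing is illustrative, not actual server code:

```c
#include "kvstore.h"

int keyHashSlot(char *key, int keylen); /* CRC16 slot function, declared by the cluster code */

/* Every slot owns a table, so finding the table for a key is
 * hash-to-slot plus an array index. */
hashtable *tableForKey(kvstore *kvs, char *key, int keylen) {
    return kvstoreGetHashtable(kvs, keyHashSlot(key, keylen)); /* slot is 0..16383 */
}
```
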
* @@ -40,6 +40,7 @@ #include <string.h> #include <stddef.h> +#include <stdlib.h> #include "zmalloc.h" #include "kvstore.h" @@ -48,236 +49,248 @@ #define UNUSED(V) ((void)V) -static dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it); +static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it); struct _kvstore { int flags; - dictType *dtype; - dict **dicts; - int num_dicts; - int num_dicts_bits; - list *rehashing; /* List of dictionaries in this kvstore that are currently rehashing. */ - int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used if num_dicts > 1). */ - int allocated_dicts; /* The number of allocated dicts. */ - int non_empty_dicts; /* The number of non-empty dicts. */ - unsigned long long key_count; /* Total number of keys in this kvstore. */ - unsigned long long bucket_count; /* Total number of buckets in this kvstore across dictionaries. */ - unsigned long long *dict_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until - given dict-index. */ - size_t overhead_hashtable_lut; /* The overhead of all dictionaries. */ - size_t overhead_hashtable_rehashing; /* The overhead of dictionaries rehashing. */ + hashtableType *dtype; + hashtable **hashtables; + int num_hashtables; + int num_hashtables_bits; + list *rehashing; /* List of hash tables in this kvstore that are currently rehashing. */ + int resize_cursor; /* Cron job uses this cursor to gradually resize hash tables (only used if num_hashtables > 1). */ + int allocated_hashtables; /* The number of allocated hashtables. */ + int non_empty_hashtables; /* The number of non-empty hashtables. */ + unsigned long long key_count; /* Total number of keys in this kvstore. */ + unsigned long long bucket_count; /* Total number of buckets in this kvstore across hash tables. */ + unsigned long long *hashtable_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until + * given hashtable-index. */ + size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes. */ + size_t overhead_hashtable_rehashing; /* Overhead of hash tables rehashing in bytes. */ }; -/* Structure for kvstore iterator that allows iterating across multiple dicts. */ +/* Structure for kvstore iterator that allows iterating across multiple hashtables. */ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; - dictIterator di; + hashtableIterator di; }; -/* Structure for kvstore dict iterator that allows iterating the corresponding dict. */ -struct _kvstoreDictIterator { +/* Structure for kvstore hashtable iterator that allows iterating the corresponding hashtable. */ +struct _kvstoreHashtableIterator { kvstore *kvs; long long didx; - dictIterator di; + hashtableIterator di; }; -/* Dict metadata for database, used for record the position in rehashing list. */ +/* Hashtable metadata for database, used to record the position in the rehashing list. */ typedef struct { listNode *rehashing_node; /* list node in rehashing list */ kvstore *kvs; -} kvstoreDictMetadata; +} kvstoreHashtableMetadata; /**********************************/ /*** Helpers **********************/ /**********************************/ -/* Get the dictionary pointer based on dict-index. */ -dict *kvstoreGetDict(kvstore *kvs, int didx) { - return kvs->dicts[didx]; +/* Get the hash table pointer based on hashtable-index.
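Among the helpers that follow is a pair that packs the table index into the low bits of the scan cursor, leaving the upper 48 bits for the per-table scan position. A standalone sketch of that encoding and its inverse:

```c
#include <stdio.h>

int main(void) {
    unsigned long long scan_cursor = 0x2a; /* position within one table */
    int bits = 4, didx = 9;                /* 16 tables, currently table #9 */

    /* pack: index in the low bits, scan position above it */
    unsigned long long packed = (scan_cursor << bits) | didx;

    /* unpack: mask off the index, shift the rest back down */
    int didx_out = (int)(packed & ((1 << bits) - 1));
    unsigned long long cursor_out = packed >> bits;

    printf("didx=%d cursor=%llx\n", didx_out, cursor_out); /* didx=9 cursor=2a */
    return 0;
}
```
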
*/ +hashtable *kvstoreGetHashtable(kvstore *kvs, int didx) { + return kvs->hashtables[didx]; } -static dict **kvstoreGetDictRef(kvstore *kvs, int didx) { - return &kvs->dicts[didx]; +static hashtable **kvstoreGetHashtableRef(kvstore *kvs, int didx) { + return &kvs->hashtables[didx]; } -static int kvstoreDictIsRehashingPaused(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - return d ? dictIsRehashingPaused(d) : 0; +static int kvstoreHashtableIsRehashingPaused(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + return ht ? hashtableIsRehashingPaused(ht) : 0; } -/* Returns total (cumulative) number of keys up until given dict-index (inclusive). - * Time complexity is O(log(kvs->num_dicts)). */ +/* Returns total (cumulative) number of keys up until given hashtable-index (inclusive). + * Time complexity is O(log(kvs->num_hashtables)). */ static unsigned long long cumulativeKeyCountRead(kvstore *kvs, int didx) { - if (kvs->num_dicts == 1) { + if (kvs->num_hashtables == 1) { assert(didx == 0); return kvstoreSize(kvs); } int idx = didx + 1; unsigned long long sum = 0; while (idx > 0) { - sum += kvs->dict_size_index[idx]; + sum += kvs->hashtable_size_index[idx]; idx -= (idx & -idx); } return sum; } -static void addDictIndexToCursor(kvstore *kvs, int didx, unsigned long long *cursor) { - if (kvs->num_dicts == 1) return; - /* didx can be -1 when iteration is over and there are no more dicts to visit. */ +static void addHashtableIndexToCursor(kvstore *kvs, int didx, unsigned long long *cursor) { + if (kvs->num_hashtables == 1) return; + /* didx can be -1 when iteration is over and there are no more hashtables to visit. */ if (didx < 0) return; - *cursor = (*cursor << kvs->num_dicts_bits) | didx; + *cursor = (*cursor << kvs->num_hashtables_bits) | didx; } -static int getAndClearDictIndexFromCursor(kvstore *kvs, unsigned long long *cursor) { - if (kvs->num_dicts == 1) return 0; - int didx = (int)(*cursor & (kvs->num_dicts - 1)); - *cursor = *cursor >> kvs->num_dicts_bits; +static int getAndClearHashtableIndexFromCursor(kvstore *kvs, unsigned long long *cursor) { + if (kvs->num_hashtables == 1) return 0; + int didx = (int)(*cursor & (kvs->num_hashtables - 1)); + *cursor = *cursor >> kvs->num_hashtables_bits; return didx; } -/* Updates binary index tree (also known as Fenwick tree), increasing key count for a given dict. +/* Updates binary index tree (also known as Fenwick tree), increasing key count for a given hashtable. * You can read more about this data structure here https://en.wikipedia.org/wiki/Fenwick_tree - * Time complexity is O(log(kvs->num_dicts)). */ + * Time complexity is O(log(kvs->num_hashtables)). */ static void cumulativeKeyCountAdd(kvstore *kvs, int didx, long delta) { kvs->key_count += delta; - dict *d = kvstoreGetDict(kvs, didx); - size_t dsize = dictSize(d); - int non_empty_dicts_delta = dsize == 1 ? 1 : dsize == 0 ? -1 - : 0; - kvs->non_empty_dicts += non_empty_dicts_delta; + hashtable *ht = kvstoreGetHashtable(kvs, didx); + size_t size = hashtableSize(ht); + if (delta < 0 && size == 0) { + kvs->non_empty_hashtables--; /* It became empty. */ + } else if (delta > 0 && size == (size_t)delta) { + kvs->non_empty_hashtables++; /* It was empty before. */ + } - /* BIT does not need to be calculated when there's only one dict. */ - if (kvs->num_dicts == 1) return; + /* BIT does not need to be calculated when there's only one hashtable. 
*/ + if (kvs->num_hashtables == 1) return; /* Update the BIT */ - int idx = didx + 1; /* Unlike dict indices, BIT is 1-based, so we need to add 1. */ - while (idx <= kvs->num_dicts) { + int idx = didx + 1; /* Unlike hashtable indices, BIT is 1-based, so we need to add 1. */ + while (idx <= kvs->num_hashtables) { if (delta < 0) { - assert(kvs->dict_size_index[idx] >= (unsigned long long)labs(delta)); + assert(kvs->hashtable_size_index[idx] >= (unsigned long long)labs(delta)); } - kvs->dict_size_index[idx] += delta; + kvs->hashtable_size_index[idx] += delta; idx += (idx & -idx); } } -/* Create the dict if it does not exist and return it. */ -static dict *createDictIfNeeded(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (d) return d; +/* Create the hashtable if it does not exist and return it. */ +static hashtable *createHashtableIfNeeded(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht) return ht; - kvs->dicts[didx] = dictCreate(kvs->dtype); - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(kvs->dicts[didx]); + kvs->hashtables[didx] = hashtableCreate(kvs->dtype); + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(kvs->hashtables[didx]); metadata->kvs = kvs; - kvs->allocated_dicts++; - return kvs->dicts[didx]; + /* Memory is counted by kvstoreHashtableTrackMemUsage, but when it's invoked + * by hashtableCreate above, we don't know which hashtable it is for, because + * the metadata has not yet been initialized. Account for the newly created + * hashtable here instead. */ + kvs->overhead_hashtable_lut += hashtableMemUsage(kvs->hashtables[didx]); + kvs->allocated_hashtables++; + return kvs->hashtables[didx]; } -/* Called when the dict will delete entries, the function will check - * KVSTORE_FREE_EMPTY_DICTS to determine whether the empty dict needs +/* Called when the hashtable will delete entries, the function will check + * KVSTORE_FREE_EMPTY_HASHTABLES to determine whether the empty hashtable needs * to be freed. * - * Note that for rehashing dicts, that is, in the case of safe iterators - * and Scan, we won't delete the dict. We will check whether it needs + * Note that for rehashing hashtables, that is, in the case of safe iterators + * and Scan, we won't delete the hashtable. We will check whether it needs * to be deleted when we're releasing the iterator. */ -static void freeDictIfNeeded(kvstore *kvs, int didx) { - if (!(kvs->flags & KVSTORE_FREE_EMPTY_DICTS) || !kvstoreGetDict(kvs, didx) || kvstoreDictSize(kvs, didx) != 0 || - kvstoreDictIsRehashingPaused(kvs, didx)) +static void freeHashtableIfNeeded(kvstore *kvs, int didx) { + if (!(kvs->flags & KVSTORE_FREE_EMPTY_HASHTABLES) || !kvstoreGetHashtable(kvs, didx) || kvstoreHashtableSize(kvs, didx) != 0 || + kvstoreHashtableIsRehashingPaused(kvs, didx)) return; - dictRelease(kvs->dicts[didx]); - kvs->dicts[didx] = NULL; - kvs->allocated_dicts--; + hashtableRelease(kvs->hashtables[didx]); + kvs->hashtables[didx] = NULL; + kvs->allocated_hashtables--; } -/**********************************/ -/*** dict callbacks ***************/ -/**********************************/ +/***************************************/ +/*** hashtable callbacks ***************/ +/***************************************/ -/* Adds dictionary to the rehashing list, which allows us +/* Adds hash table to the rehashing list, which allows us * to quickly find rehash targets during incremental rehashing.
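An aside on the bookkeeping above: a standalone, toy-sized demo of the binary indexed tree arithmetic used by cumulativeKeyCountAdd and cumulativeKeyCountRead, where `idx & -idx` isolates the lowest set bit and walks parent/child ranges in O(log N):

```c
#include <stdio.h>

#define N 8 /* number of tables; the BIT is 1-based, so the array holds N+1 */
static unsigned long long bit[N + 1];

static void add(int didx, long delta) { /* mirrors cumulativeKeyCountAdd */
    for (int idx = didx + 1; idx <= N; idx += idx & -idx) bit[idx] += delta;
}

static unsigned long long prefix(int didx) { /* keys in tables 0..didx */
    unsigned long long sum = 0;
    for (int idx = didx + 1; idx > 0; idx -= idx & -idx) sum += bit[idx];
    return sum;
}

int main(void) {
    add(0, 2); add(3, 5); add(6, 1);
    printf("keys in tables 0..3: %llu\n", prefix(3)); /* 7 */
    printf("keys in tables 0..6: %llu\n", prefix(6)); /* 8 */
    return 0;
}
```
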
* - * If there are multiple dicts, updates the bucket count for the given dictionary + * If there are multiple hashtables, updates the bucket count for the given hash table * in a DB, bucket count incremented with the new ht size during the rehashing phase. - * If there's one dict, bucket count can be retrieved directly from single dict bucket. */ -void kvstoreDictRehashingStarted(dict *d) { - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + * If there's one hashtable, bucket count can be retrieved directly from single hashtable bucket. */ +void kvstoreHashtableRehashingStarted(hashtable *ht) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); kvstore *kvs = metadata->kvs; - listAddNodeTail(kvs->rehashing, d); + listAddNodeTail(kvs->rehashing, ht); metadata->rehashing_node = listLast(kvs->rehashing); - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); + size_t from, to; + hashtableRehashingInfo(ht, &from, &to); kvs->bucket_count += to; /* Started rehashing (Add the new ht size) */ - kvs->overhead_hashtable_lut += to; - kvs->overhead_hashtable_rehashing += from; + kvs->overhead_hashtable_rehashing += from * HASHTABLE_BUCKET_SIZE; } -/* Remove dictionary from the rehashing list. +/* Remove hash table from the rehashing list. * - * Updates the bucket count for the given dictionary in a DB. It removes - * the old ht size of the dictionary from the total sum of buckets for a DB. */ -void kvstoreDictRehashingCompleted(dict *d) { - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + * Updates the bucket count for the given hash table in a DB. It removes + * the old ht size of the hash table from the total sum of buckets for a DB. */ +void kvstoreHashtableRehashingCompleted(hashtable *ht) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); kvstore *kvs = metadata->kvs; if (metadata->rehashing_node) { listDelNode(kvs->rehashing, metadata->rehashing_node); metadata->rehashing_node = NULL; } - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); + size_t from, to; + hashtableRehashingInfo(ht, &from, &to); kvs->bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ - kvs->overhead_hashtable_lut -= from; - kvs->overhead_hashtable_rehashing -= from; + kvs->overhead_hashtable_rehashing -= from * HASHTABLE_BUCKET_SIZE; +} + +/* Hashtable callback to keep track of memory usage. */ +void kvstoreHashtableTrackMemUsage(hashtable *ht, ssize_t delta) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->kvs == NULL) { + /* This is the initial allocation by hashtableCreate, when the metadata + * hasn't been initialized yet. */ + return; + } + metadata->kvs->overhead_hashtable_lut += delta; } -/* Returns the size of the DB dict metadata in bytes. */ -size_t kvstoreDictMetadataSize(dict *d) { - UNUSED(d); - return sizeof(kvstoreDictMetadata); +/* Returns the size of the DB hashtable metadata in bytes. */ +size_t kvstoreHashtableMetadataSize(void) { + return sizeof(kvstoreHashtableMetadata); } /**********************************/ /*** API **************************/ /**********************************/ -/* Create an array of dictionaries - * num_dicts_bits is the log2 of the amount of dictionaries needed (e.g. 0 for 1 dict, - * 3 for 8 dicts, etc.) - * - * The kvstore handles `key` based on `dictType` during initialization: - * - If `dictType.embedded-entry` is 1, it clones the `key`. 
- * - Otherwise, it assumes ownership of the `key`. +/* Create an array of hash tables + * num_hashtables_bits is the log2 of the amount of hash tables needed (e.g. 0 for 1 hashtable, + * 3 for 8 hashtables, etc.) */ -kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { - /* We can't support more than 2^16 dicts because we want to save 48 bits - * for the dict cursor, see kvstoreScan */ - assert(num_dicts_bits <= 16); +kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) { + /* We can't support more than 2^16 hashtables because we want to save 48 bits + * for the hashtable cursor, see kvstoreScan */ + assert(num_hashtables_bits <= 16); - /* The dictType of kvstore needs to use the specific callbacks. + /* The hashtableType of kvstore needs to use the specific callbacks. * If there are any changes in the future, it will need to be modified. */ - assert(type->rehashingStarted == kvstoreDictRehashingStarted); - assert(type->rehashingCompleted == kvstoreDictRehashingCompleted); - assert(type->dictMetadataBytes == kvstoreDictMetadataSize); + assert(type->rehashingStarted == kvstoreHashtableRehashingStarted); + assert(type->rehashingCompleted == kvstoreHashtableRehashingCompleted); + assert(type->trackMemUsage == kvstoreHashtableTrackMemUsage); + assert(type->getMetadataSize == kvstoreHashtableMetadataSize); kvstore *kvs = zcalloc(sizeof(*kvs)); kvs->dtype = type; kvs->flags = flags; - kvs->num_dicts_bits = num_dicts_bits; - kvs->num_dicts = 1 << kvs->num_dicts_bits; - kvs->dicts = zcalloc(sizeof(dict *) * kvs->num_dicts); - if (!(kvs->flags & KVSTORE_ALLOCATE_DICTS_ON_DEMAND)) { - for (int i = 0; i < kvs->num_dicts; i++) createDictIfNeeded(kvs, i); + kvs->num_hashtables_bits = num_hashtables_bits; + kvs->num_hashtables = 1 << kvs->num_hashtables_bits; + kvs->hashtables = zcalloc(sizeof(hashtable *) * kvs->num_hashtables); + if (!(kvs->flags & KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND)) { + for (int i = 0; i < kvs->num_hashtables; i++) createHashtableIfNeeded(kvs, i); } kvs->rehashing = listCreate(); kvs->key_count = 0; - kvs->non_empty_dicts = 0; + kvs->non_empty_hashtables = 0; kvs->resize_cursor = 0; - kvs->dict_size_index = kvs->num_dicts > 1 ? zcalloc(sizeof(unsigned long long) * (kvs->num_dicts + 1)) : NULL; + kvs->hashtable_size_index = kvs->num_hashtables > 1 ? 
zcalloc(sizeof(unsigned long long) * (kvs->num_hashtables + 1)) : NULL; kvs->bucket_count = 0; kvs->overhead_hashtable_lut = 0; kvs->overhead_hashtable_rehashing = 0; @@ -285,105 +298,102 @@ kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { return kvs; } -void kvstoreEmpty(kvstore *kvs, void(callback)(dict *)) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); +void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { + for (int didx = 0; didx < kvs->num_hashtables; didx++) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); if (metadata->rehashing_node) metadata->rehashing_node = NULL; - dictEmpty(d, callback); - freeDictIfNeeded(kvs, didx); + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); } listEmpty(kvs->rehashing); kvs->key_count = 0; - kvs->non_empty_dicts = 0; + kvs->non_empty_hashtables = 0; kvs->resize_cursor = 0; kvs->bucket_count = 0; - if (kvs->dict_size_index) memset(kvs->dict_size_index, 0, sizeof(unsigned long long) * (kvs->num_dicts + 1)); - kvs->overhead_hashtable_lut = 0; + if (kvs->hashtable_size_index) memset(kvs->hashtable_size_index, 0, sizeof(unsigned long long) * (kvs->num_hashtables + 1)); kvs->overhead_hashtable_rehashing = 0; } void kvstoreRelease(kvstore *kvs) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + for (int didx = 0; didx < kvs->num_hashtables; didx++) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); if (metadata->rehashing_node) metadata->rehashing_node = NULL; - dictRelease(d); + hashtableRelease(ht); } - zfree(kvs->dicts); + assert(kvs->overhead_hashtable_lut == 0); + zfree(kvs->hashtables); listRelease(kvs->rehashing); - if (kvs->dict_size_index) zfree(kvs->dict_size_index); + if (kvs->hashtable_size_index) zfree(kvs->hashtable_size_index); zfree(kvs); } unsigned long long int kvstoreSize(kvstore *kvs) { - if (kvs->num_dicts != 1) { + if (kvs->num_hashtables != 1) { return kvs->key_count; } else { - return kvs->dicts[0] ? dictSize(kvs->dicts[0]) : 0; + return kvs->hashtables[0] ? hashtableSize(kvs->hashtables[0]) : 0; } } -/* This method provides the cumulative sum of all the dictionary buckets - * across dictionaries in a database. */ +/* This method provides the cumulative sum of all the hash table buckets + * across hash tables in a database. */ unsigned long kvstoreBuckets(kvstore *kvs) { - if (kvs->num_dicts != 1) { + if (kvs->num_hashtables != 1) { return kvs->bucket_count; } else { - return kvs->dicts[0] ? dictBuckets(kvs->dicts[0]) : 0; + return kvs->hashtables[0] ? 
hashtableBuckets(kvs->hashtables[0]) : 0; } } size_t kvstoreMemUsage(kvstore *kvs) { size_t mem = sizeof(*kvs); + mem += kvs->overhead_hashtable_lut; - unsigned long long keys_count = kvstoreSize(kvs); - mem += keys_count * dictEntryMemUsage(NULL) + kvstoreBuckets(kvs) * sizeof(dictEntry *) + - kvs->allocated_dicts * (sizeof(dict) + kvstoreDictMetadataSize(NULL)); - - /* Values are dict* shared with kvs->dicts */ + /* Values are hashtable* shared with kvs->hashtables */ mem += listLength(kvs->rehashing) * sizeof(listNode); - if (kvs->dict_size_index) mem += sizeof(unsigned long long) * (kvs->num_dicts + 1); + if (kvs->hashtable_size_index) mem += sizeof(unsigned long long) * (kvs->num_hashtables + 1); return mem; } /* - * This method is used to iterate over the elements of the entire kvstore specifically across dicts. + * This method is used to iterate over the elements of the entire kvstore specifically across hashtables. * It's a three pronged approach. * - * 1. It uses the provided cursor `cursor` to retrieve the dict index from it. - * 2. If the dictionary is in a valid state checked through the provided callback `dictScanValidFunction`, - * it performs a dictScan over the appropriate `keyType` dictionary of `db`. - * 3. If the dict is entirely scanned i.e. the cursor has reached 0, the next non empty dict is discovered. - * The dict information is embedded into the cursor and returned. + * 1. It uses the provided cursor `cursor` to retrieve the hashtable index from it. + * 2. If the hash table is in a valid state checked through the provided callback `hashtableScanValidFunction`, + * it performs a hashtableScan over the appropriate `keyType` hash table of `db`. + * 3. If the hashtable is entirely scanned i.e. the cursor has reached 0, the next non empty hashtable is discovered. + * The hashtable information is embedded into the cursor and returned. * - * To restrict the scan to a single dict, pass a valid dict index as + * To restrict the scan to a single hashtable, pass a valid hashtable index as * 'onlydidx', otherwise pass -1. */ unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, int onlydidx, - dictScanFunction *scan_cb, - kvstoreScanShouldSkipDict *skip_cb, + hashtableScanFunction scan_cb, + kvstoreScanShouldSkipHashtable *skip_cb, void *privdata) { - unsigned long long _cursor = 0; - /* During dictionary traversal, 48 upper bits in the cursor are used for positioning in the HT. - * Following lower bits are used for the dict index number, ranging from 0 to 2^num_dicts_bits-1. - * Dict index is always 0 at the start of iteration and can be incremented only if there are - * multiple dicts. */ - int didx = getAndClearDictIndexFromCursor(kvs, &cursor); + unsigned long long next_cursor = 0; + /* During hash table traversal, 48 upper bits in the cursor are used for positioning in the HT. + * Following lower bits are used for the hashtable index number, ranging from 0 to 2^num_hashtables_bits-1. + * Hashtable index is always 0 at the start of iteration and can be incremented only if there are + * multiple hashtables. */ + int didx = getAndClearHashtableIndexFromCursor(kvs, &cursor); if (onlydidx >= 0) { if (didx < onlydidx) { /* Fast-forward to onlydidx. 
*/ - assert(onlydidx < kvs->num_dicts); + assert(onlydidx < kvs->num_hashtables); didx = onlydidx; cursor = 0; } else if (didx > onlydidx) { @@ -392,53 +402,60 @@ unsigned long long kvstoreScan(kvstore *kvs, } } - dict *d = kvstoreGetDict(kvs, didx); + hashtable *ht = kvstoreGetHashtable(kvs, didx); - int skip = !d || (skip_cb && skip_cb(d)); + int skip = !ht || (skip_cb && skip_cb(ht)); if (!skip) { - _cursor = dictScan(d, cursor, scan_cb, privdata); - /* In dictScan, scan_cb may delete entries (e.g., in active expire case). */ - freeDictIfNeeded(kvs, didx); + next_cursor = hashtableScan(ht, cursor, scan_cb, privdata); + /* In hashtableScan, scan_cb may delete entries (e.g., in active expire case). */ + freeHashtableIfNeeded(kvs, didx); } - /* scanning done for the current dictionary or if the scanning wasn't possible, move to the next dict index. */ - if (_cursor == 0 || skip) { + /* scanning done for the current hash table or if the scanning wasn't possible, move to the next hashtable index. */ + if (next_cursor == 0 || skip) { if (onlydidx >= 0) return 0; - didx = kvstoreGetNextNonEmptyDictIndex(kvs, didx); + didx = kvstoreGetNextNonEmptyHashtableIndex(kvs, didx); } if (didx == -1) { return 0; } - addDictIndexToCursor(kvs, didx, &_cursor); - return _cursor; + addHashtableIndexToCursor(kvs, didx, &next_cursor); + return next_cursor; } /* * This functions increases size of kvstore to match desired number. - * It resizes all individual dictionaries, unless skip_cb indicates otherwise. + * It resizes all individual hash tables, unless skip_cb indicates otherwise. * - * Based on the parameter `try_expand`, appropriate dict expand API is invoked. - * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. - * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). - * `DICT_OK` response is for successful expansion. However, `DICT_ERR` response signifies failure in allocation in - * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. + * Based on the parameter `try_expand`, appropriate hashtable expand API is invoked. + * if try_expand is set to 1, `hashtableTryExpand` is used else `hashtableExpand`. + * The return code is either 1 or 0 for both the API(s). + * 1 response is for successful expansion. However, 0 response signifies failure in allocation in + * `hashtableTryExpand` call and in case of `hashtableExpand` call it signifies no expansion was performed. */ -int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb) { - for (int i = 0; i < kvs->num_dicts; i++) { - dict *d = kvstoreGetDict(kvs, i); - if (!d || (skip_cb && skip_cb(i))) continue; - int result = try_expand ? dictTryExpand(d, newsize) : dictExpand(d, newsize); - if (try_expand && result == DICT_ERR) return 0; +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipHashtableIndex *skip_cb) { + if (newsize == 0) return 1; + for (int i = 0; i < kvs->num_hashtables; i++) { + if (skip_cb && skip_cb(i)) continue; + /* If the hash table doesn't exist, create it. */ + hashtable *ht = createHashtableIfNeeded(kvs, i); + if (try_expand) { + if (!hashtableTryExpand(ht, newsize)) return 0; + } else { + hashtableExpand(ht, newsize); + } } return 1; } -/* Returns fair random dict index, probability of each dict being returned is proportional to the number of elements - * that dictionary holds. 
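The fair-random lookup rewritten below draws a target weighted by per-table key counts and then descends the BIT, mirroring kvstoreFindHashtableIndexByKeyIndex further down. A standalone sketch with toy sizes and hypothetical names:

```c
#include <stdio.h>
#include <stdlib.h>

#define BITS 3
#define N (1 << BITS)                  /* number of tables */
static unsigned long long tree[N + 1]; /* 1-based Fenwick tree */

static void add(int didx, long delta) {
    for (int i = didx + 1; i <= N; i += i & -i) tree[i] += delta;
}

static int findByKeyIndex(unsigned long target) {
    /* target is in [1..total]; since tree[N] == total, `current` never
     * walks past the end of the array. */
    int result = 0;
    for (int i = N; i != 0; i >>= 1) {
        int current = result + i;
        if (target > tree[current]) {
            target -= tree[current];
            result = current;
        }
    }
    return result; /* the +1/-1 adjustments cancel, as explained below */
}

int main(void) {
    add(0, 2); add(1, 4); add(3, 7); /* table sizes: 2, 4, 0, 7 */
    unsigned long long total = 2 + 4 + 7;
    unsigned long target = (unsigned long)(random() % total) + 1;
    printf("key #%lu lives in table %d\n", target, findByKeyIndex(target));
    return 0;
}
```
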
This function guarantees that it returns a dict-index of a non-empty dict, unless the entire - * kvstore is empty. Time complexity of this function is O(log(kvs->num_dicts)). */ -int kvstoreGetFairRandomDictIndex(kvstore *kvs) { - unsigned long target = kvstoreSize(kvs) ? (randomULong() % kvstoreSize(kvs)) + 1 : 0; - return kvstoreFindDictIndexByKeyIndex(kvs, target); +/* Returns fair random hashtable index, probability of each hashtable being + * returned is proportional to the number of elements that hash table holds. + * This function guarantees that it returns a hashtable-index of a non-empty + * hashtable, unless the entire kvstore is empty. Time complexity of this + * function is O(log(kvs->num_hashtables)). */ +int kvstoreGetFairRandomHashtableIndex(kvstore *kvs) { + unsigned long target = kvstoreSize(kvs) ? (random() % kvstoreSize(kvs)) + 1 : 0; + return kvstoreFindHashtableIndexByKeyIndex(kvs, target); } void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { @@ -447,40 +464,40 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { size_t l; char *orig_buf = buf; size_t orig_bufsize = bufsize; - dictStats *mainHtStats = NULL; - dictStats *rehashHtStats = NULL; - dict *d; + hashtableStats *mainHtStats = NULL; + hashtableStats *rehashHtStats = NULL; + hashtable *ht; kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs); - while ((d = kvstoreIteratorNextDict(kvs_it))) { - dictStats *stats = dictGetStatsHt(d, 0, full); + while ((ht = kvstoreIteratorNextHashtable(kvs_it))) { + hashtableStats *stats = hashtableGetStatsHt(ht, 0, full); if (!mainHtStats) { mainHtStats = stats; } else { - dictCombineStats(stats, mainHtStats); - dictFreeStats(stats); + hashtableCombineStats(stats, mainHtStats); + hashtableFreeStats(stats); } - if (dictIsRehashing(d)) { - stats = dictGetStatsHt(d, 1, full); + if (hashtableIsRehashing(ht)) { + stats = hashtableGetStatsHt(ht, 1, full); if (!rehashHtStats) { rehashHtStats = stats; } else { - dictCombineStats(stats, rehashHtStats); - dictFreeStats(stats); + hashtableCombineStats(stats, rehashHtStats); + hashtableFreeStats(stats); } } } kvstoreIteratorRelease(kvs_it); if (mainHtStats && bufsize > 0) { - l = dictGetStatsMsg(buf, bufsize, mainHtStats, full); - dictFreeStats(mainHtStats); + l = hashtableGetStatsMsg(buf, bufsize, mainHtStats, full); + hashtableFreeStats(mainHtStats); buf += l; bufsize -= l; } if (rehashHtStats && bufsize > 0) { - l = dictGetStatsMsg(buf, bufsize, rehashHtStats, full); - dictFreeStats(rehashHtStats); + l = hashtableGetStatsMsg(buf, bufsize, rehashHtStats, full); + hashtableFreeStats(rehashHtStats); buf += l; bufsize -= l; } @@ -488,142 +505,143 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { if (orig_bufsize) orig_buf[orig_bufsize - 1] = '\0'; } -/* Finds a dict containing target element in a key space ordered by dict index. - * Consider this example. Dictionaries are represented by brackets and keys by dots: +/* Finds a hashtable containing target element in a key space ordered by hashtable index. + * Consider this example. Hash Tables are represented by brackets and keys by dots: * #0 #1 #2 #3 #4 * [..][....][...][.......][.] * ^ * target * - * In this case dict #3 contains key that we are trying to find. + * In this case hashtable #3 contains key that we are trying to find. * - * The return value is 0 based dict-index, and the range of the target is [1..kvstoreSize], kvstoreSize inclusive. 
+ * The return value is 0 based hashtable-index, and the range of the target is [1..kvstoreSize], kvstoreSize inclusive. * - * To find the dict, we start with the root node of the binary index tree and search through its children - * from the highest index (2^num_dicts_bits in our case) to the lowest index. At each node, we check if the target + * To find the hashtable, we start with the root node of the binary index tree and search through its children + * from the highest index (2^num_hashtables_bits in our case) to the lowest index. At each node, we check if the target * value is greater than the node's value. If it is, we remove the node's value from the target and recursively * search for the new target using the current node as the parent. - * Time complexity of this function is O(log(kvs->num_dicts)) + * Time complexity of this function is O(log(kvs->num_hashtables)) */ -int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target) { - if (kvs->num_dicts == 1 || kvstoreSize(kvs) == 0) return 0; +int kvstoreFindHashtableIndexByKeyIndex(kvstore *kvs, unsigned long target) { + if (kvs->num_hashtables == 1 || kvstoreSize(kvs) == 0) return 0; assert(target <= kvstoreSize(kvs)); - int result = 0, bit_mask = 1 << kvs->num_dicts_bits; + int result = 0, bit_mask = 1 << kvs->num_hashtables_bits; for (int i = bit_mask; i != 0; i >>= 1) { int current = result + i; /* When the target index is greater than 'current' node value the we will update * the target and search in the 'current' node tree. */ - if (target > kvs->dict_size_index[current]) { - target -= kvs->dict_size_index[current]; + if (target > kvs->hashtable_size_index[current]) { + target -= kvs->hashtable_size_index[current]; result = current; } } - /* Adjust the result to get the correct dict: + /* Adjust the result to get the correct hashtable: * 1. result += 1; - * After the calculations, the index of target in dict_size_index should be the next one, + * After the calculations, the index of target in hashtable_size_index should be the next one, * so we should add 1. * 2. result -= 1; - * Unlike BIT(dict_size_index is 1-based), dict indices are 0-based, so we need to subtract 1. + * Unlike BIT(hashtable_size_index is 1-based), hashtable indices are 0-based, so we need to subtract 1. * As the addition and subtraction cancel each other out, we can simply return the result. */ return result; } -/* Wrapper for kvstoreFindDictIndexByKeyIndex to get the first non-empty dict index in the kvstore. */ -int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs) { - return kvstoreFindDictIndexByKeyIndex(kvs, 1); +/* Wrapper for kvstoreFindHashtableIndexByKeyIndex to get the first non-empty hashtable index in the kvstore. */ +int kvstoreGetFirstNonEmptyHashtableIndex(kvstore *kvs) { + return kvstoreFindHashtableIndexByKeyIndex(kvs, 1); } -/* Returns next non-empty dict index strictly after given one, or -1 if provided didx is the last one. */ -int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx) { - if (kvs->num_dicts == 1) { +/* Returns next non-empty hashtable index strictly after given one, or -1 if provided didx is the last one. */ +int kvstoreGetNextNonEmptyHashtableIndex(kvstore *kvs, int didx) { + if (kvs->num_hashtables == 1) { assert(didx == 0); return -1; } unsigned long long next_key = cumulativeKeyCountRead(kvs, didx) + 1; - return next_key <= kvstoreSize(kvs) ? kvstoreFindDictIndexByKeyIndex(kvs, next_key) : -1; + return next_key <= kvstoreSize(kvs) ? 
kvstoreFindHashtableIndexByKeyIndex(kvs, next_key) : -1; } -int kvstoreNumNonEmptyDicts(kvstore *kvs) { - return kvs->non_empty_dicts; +int kvstoreNumNonEmptyHashtables(kvstore *kvs) { + return kvs->non_empty_hashtables; } -int kvstoreNumAllocatedDicts(kvstore *kvs) { - return kvs->allocated_dicts; +int kvstoreNumAllocatedHashtables(kvstore *kvs) { + return kvs->allocated_hashtables; } -int kvstoreNumDicts(kvstore *kvs) { - return kvs->num_dicts; +int kvstoreNumHashtables(kvstore *kvs) { + return kvs->num_hashtables; } -/* Returns kvstore iterator that can be used to iterate through sub-dictionaries. +/* Returns kvstore iterator that can be used to iterate through sub-hash tables. * * The caller should free the resulting kvs_it with kvstoreIteratorRelease. */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; - kvs_it->next_didx = kvstoreGetFirstNonEmptyDictIndex(kvs_it->kvs); /* Finds first non-empty dict index. */ - dictInitSafeIterator(&kvs_it->di, NULL); + kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. */ + hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; } /* Free the kvs_it returned by kvstoreIteratorInit. */ void kvstoreIteratorRelease(kvstoreIterator *kvs_it) { - dictIterator *iter = &kvs_it->di; - dictResetIterator(iter); + hashtableIterator *iter = &kvs_it->di; + hashtableResetIterator(iter); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); zfree(kvs_it); } -/* Returns next dictionary from the iterator, or NULL if iteration is complete. */ -static dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it) { +/* Returns next hash table from the iterator, or NULL if iteration is complete. */ +static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) { if (kvs_it->next_didx == -1) return NULL; - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - if (kvs_it->didx != -1 && kvstoreGetDict(kvs_it->kvs, kvs_it->didx)) { - /* Before we move to the next dict, reset the iter of the previous dict. */ - dictIterator *iter = &kvs_it->di; - dictResetIterator(iter); + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. */ + if (kvs_it->didx != -1 && kvstoreGetHashtable(kvs_it->kvs, kvs_it->didx)) { + /* Before we move to the next hashtable, reset the iter of the previous hashtable. */ + hashtableIterator *iter = &kvs_it->di; + hashtableResetIterator(iter); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); } kvs_it->didx = kvs_it->next_didx; - kvs_it->next_didx = kvstoreGetNextNonEmptyDictIndex(kvs_it->kvs, kvs_it->didx); - return kvs_it->kvs->dicts[kvs_it->didx]; + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx); + return kvs_it->kvs->hashtables[kvs_it->didx]; } -int kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it) { - assert(kvs_it->didx >= 0 && kvs_it->didx < kvs_it->kvs->num_dicts); +int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) { + assert(kvs_it->didx >= 0 && kvs_it->didx < kvs_it->kvs->num_hashtables); return kvs_it->didx; } -/* Returns next entry. */ -dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it) { - dictEntry *de = kvs_it->di.d ? 
dictNext(&kvs_it->di) : NULL; - if (!de) { /* No current dict or reached the end of the dictionary. */ - dict *d = kvstoreIteratorNextDict(kvs_it); - if (!d) return NULL; - dictInitSafeIterator(&kvs_it->di, d); - de = dictNext(&kvs_it->di); +/* Fetches the next element and returns 1. Returns 0 if there are no more elements. */ +int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { + if (kvs_it->didx != -1 && hashtableNext(&kvs_it->di, next)) { + return 1; + } else { + /* No current hashtable or reached the end of the hash table. */ + hashtable *ht = kvstoreIteratorNextHashtable(kvs_it); + if (!ht) return 0; + hashtableInitSafeIterator(&kvs_it->di, ht); + return hashtableNext(&kvs_it->di, next); } - return de; } -/* This method traverses through kvstore dictionaries and triggers a resize. +/* This method traverses through kvstore hash tables and triggers a resize. * It first tries to shrink if needed, and if not, it tries to expand. */ -void kvstoreTryResizeDicts(kvstore *kvs, int limit) { - if (limit > kvs->num_dicts) limit = kvs->num_dicts; +void kvstoreTryResizeHashtables(kvstore *kvs, int limit) { + if (limit > kvs->num_hashtables) limit = kvs->num_hashtables; for (int i = 0; i < limit; i++) { int didx = kvs->resize_cursor; - dict *d = kvstoreGetDict(kvs, didx); - if (d && dictShrinkIfNeeded(d) == DICT_ERR) { - dictExpandIfNeeded(d); + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht && !hashtableShrinkIfNeeded(ht)) { + hashtableExpandIfNeeded(ht); } - kvs->resize_cursor = (didx + 1) % kvs->num_dicts; + kvs->resize_cursor = (didx + 1) % kvs->num_hashtables; } } @@ -637,14 +655,14 @@ void kvstoreTryResizeDicts(kvstore *kvs, int limit) { uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us) { if (listLength(kvs->rehashing) == 0) return 0; - /* Our goal is to rehash as many dictionaries as we can before reaching threshold_us, - * after each dictionary completes rehashing, it removes itself from the list. */ + /* Our goal is to rehash as many hash tables as we can before reaching threshold_us; + * after each hash table completes rehashing, it removes itself from the list. */ listNode *node; monotime timer; uint64_t elapsed_us = 0; elapsedStart(&timer); while ((node = listFirst(kvs->rehashing))) { - dictRehashMicroseconds(listNodeValue(node), threshold_us - elapsed_us); + hashtableRehashMicroseconds(listNodeValue(node), threshold_us - elapsed_us); elapsed_us = elapsedUs(timer); if (elapsed_us >= threshold_us) { @@ -654,176 +672,197 @@ uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us) { return elapsed_us; } +/* Size in bytes of the lookup tables (bucket arrays) used by the hash tables. 
*/ size_t kvstoreOverheadHashtableLut(kvstore *kvs) { - return kvs->overhead_hashtable_lut * sizeof(dictEntry *); + return kvs->overhead_hashtable_lut; } size_t kvstoreOverheadHashtableRehashing(kvstore *kvs) { - return kvs->overhead_hashtable_rehashing * sizeof(dictEntry *); + return kvs->overhead_hashtable_rehashing; } -unsigned long kvstoreDictRehashingCount(kvstore *kvs) { +unsigned long kvstoreHashtableRehashingCount(kvstore *kvs) { return listLength(kvs->rehashing); } -unsigned long kvstoreDictSize(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictSize(d); +unsigned long kvstoreHashtableSize(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableSize(ht); } -kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx) { - kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx) { + kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - dictInitIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); return kvs_di; } -kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx) { - kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); +kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx) { + kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - dictInitSafeIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + hashtableInitSafeIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); return kvs_di; } -/* Free the kvs_di returned by kvstoreGetDictIterator and kvstoreGetDictSafeIterator. */ -void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_di) { - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - if (kvstoreGetDict(kvs_di->kvs, kvs_di->didx)) { - dictResetIterator(&kvs_di->di); +/* Free the kvs_di returned by kvstoreGetHashtableIterator and kvstoreGetHashtableSafeIterator. */ +void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_di) { + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. */ + if (kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx)) { + hashtableResetIterator(&kvs_di->di); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_di->kvs, kvs_di->didx); + freeHashtableIfNeeded(kvs_di->kvs, kvs_di->didx); } zfree(kvs_di); } -/* Get the next element of the dict through kvstoreDictIterator and dictNext. */ -dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di) { - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - dict *d = kvstoreGetDict(kvs_di->kvs, kvs_di->didx); - if (!d) return NULL; - - return dictNext(&kvs_di->di); +/* Get the next element of the hashtable through kvstoreHashtableIterator and hashtableNext. */ +int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next) { + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. 
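
A sketch of the two iteration flavors above; the entry-returning calls now report status through the return value and hand the entry back via an out-parameter (illustrative only; `kvs` and `didx` are assumed valid):

    /* Whole-keyspace iteration across all non-empty sub-hashtables. */
    kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs);
    void *entry;
    while (kvstoreIteratorNext(kvs_it, &entry)) {
        /* `entry` is whatever the hashtableType stores, e.g. a robj *. */
    }
    kvstoreIteratorRelease(kvs_it);

    /* Single sub-hashtable iteration with the safe iterator, which
     * tolerates deleting entries while iterating. */
    kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableSafeIterator(kvs, didx);
    while (kvstoreHashtableIteratorNext(kvs_di, &entry)) {
        /* ... */
    }
    kvstoreReleaseHashtableIterator(kvs_di);
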
*/ + hashtable *ht = kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx); + if (!ht) return 0; + return hashtableNext(&kvs_di->di, next); } -dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictGetRandomKey(d); +int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **entry) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableRandomEntry(ht, entry); } -dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictGetFairRandomKey(d); +int kvstoreHashtableFairRandomEntry(kvstore *kvs, int didx, void **entry) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableFairRandomEntry(ht, entry); } -unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictGetSomeKeys(d, des, count); +unsigned int kvstoreHashtableSampleEntries(kvstore *kvs, int didx, void **dst, unsigned int count) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableSampleEntries(ht, dst, count); } -int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return DICT_ERR; - return dictExpand(d, size); +int kvstoreHashtableExpand(kvstore *kvs, int didx, unsigned long size) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableExpand(ht, size); } -unsigned long kvstoreDictScanDefrag(kvstore *kvs, - int didx, - unsigned long v, - dictScanFunction *fn, - dictDefragFunctions *defragfns, - void *privdata) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictScanDefrag(d, v, fn, defragfns, privdata); +unsigned long kvstoreHashtableScanDefrag(kvstore *kvs, + int didx, + unsigned long v, + hashtableScanFunction fn, + void *privdata, + void *(*defragfn)(void *), + int flags) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableScanDefrag(ht, v, fn, privdata, defragfn, flags); } -/* Unlike kvstoreDictScanDefrag(), this method doesn't defrag the data(keys and values) - * within dict, it only reallocates the memory used by the dict structure itself using - * the provided allocation function. This feature was added for the active defrag feature. +/* This function doesn't defrag the data (keys and values) within hashtable. It + * only reallocates the memory used by the hashtable structure itself using the + * provided allocation function. This feature was added for the active defrag + * feature. + * + * A "cursor" is used to perform the operation iteratively. When first called, a + * cursor value of 0 should be provided. The return value is an updated cursor which should be + * provided on the next iteration. The operation is complete when 0 is returned. * - * The 'defragfn' callback is called with a reference to the dict - * that callback can reallocate. */ -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { - dict **d = kvstoreGetDictRef(kvs, didx), *newd; - if (!*d) continue; - if ((newd = defragfn(*d))) *d = newd; + * The provided defragfn callback should return either NULL (if reallocation + * isn't necessary) or return a pointer to reallocated memory like realloc(). 
*/ +unsigned long kvstoreHashtableDefragTables(kvstore *kvs, unsigned long cursor, void *(*defragfn)(void *)) { + for (int didx = cursor; didx < kvs->num_hashtables; didx++) { + hashtable **ref = kvstoreGetHashtableRef(kvs, didx), *new; + if (!*ref) continue; + new = hashtableDefragTables(*ref, defragfn); + if (new) { + *ref = new; + kvstoreHashtableMetadata *metadata = hashtableMetadata(new); + if (metadata->rehashing_node) metadata->rehashing_node->value = new; + } + return (didx + 1); } + return 0; } uint64_t kvstoreGetHash(kvstore *kvs, const void *key) { return kvs->dtype->hashFunction(key); } -void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictFetchValue(d, key); +int kvstoreHashtableFind(kvstore *kvs, int didx, void *key, void **found) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableFind(ht, key, found); } -dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictFind(d, key); +void **kvstoreHashtableFindRef(kvstore *kvs, int didx, const void *key) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return NULL; + return hashtableFindRef(ht, key); } -/* - * The kvstore handles `key` based on `dictType` during initialization: - * - If `dictType.embedded-entry` is 1, it clones the `key`. - * - Otherwise, it assumes ownership of the `key`. - * The caller must ensure the `key` is properly freed. - * - * kvstore current usage: - * - * 1. keyspace (db.keys) kvstore - creates a copy of the key. - * 2. expiry (db.expires), pubsub_channels and pubsubshard_channels kvstore - takes ownership of the key. - */ -dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing) { - dict *d = createDictIfNeeded(kvs, didx); - dictEntry *ret = dictAddRaw(d, key, existing); +int kvstoreHashtableAddOrFind(kvstore *kvs, int didx, void *key, void **existing) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + int ret = hashtableAddOrFind(ht, key, existing); + if (ret) cumulativeKeyCountAdd(kvs, didx, 1); + return ret; +} + +int kvstoreHashtableAdd(kvstore *kvs, int didx, void *entry) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + int ret = hashtableAdd(ht, entry); if (ret) cumulativeKeyCountAdd(kvs, didx, 1); return ret; } -void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry *de, void *key) { - dict *d = kvstoreGetDict(kvs, didx); - dictSetKey(d, de, key); +int kvstoreHashtableFindPositionForInsert(kvstore *kvs, int didx, void *key, hashtablePosition *position, void **existing) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + return hashtableFindPositionForInsert(ht, key, position, existing); } -void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val) { - UNUSED(kvs); - UNUSED(didx); - dictSetVal(NULL, de, val); +/* Must be used together with kvstoreHashtableFindPositionForInsert, with returned + * position and with the same didx. 
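
The cursor protocol described above is driven by feeding the return value back in until it reaches 0; a minimal sketch (illustrative only; `defragAllocFn` stands in for the caller's realloc-like callback, and the real server interleaves this with time-limit checks instead of looping to completion):

    static void defragAllKvstoreTables(kvstore *kvs, void *(*defragAllocFn)(void *)) {
        unsigned long cursor = 0;
        do {
            cursor = kvstoreHashtableDefragTables(kvs, cursor, defragAllocFn);
        } while (cursor != 0);
    }
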
*/ +void kvstoreHashtableInsertAtPosition(kvstore *kvs, int didx, void *entry, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + hashtableInsertAtPosition(ht, entry, position); + cumulativeKeyCountAdd(kvs, didx, 1); } -dictEntry * -kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictTwoPhaseUnlinkFind(kvstoreGetDict(kvs, didx), key, plink, table_index); +void **kvstoreHashtableTwoPhasePopFindRef(kvstore *kvs, int didx, const void *key, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return NULL; + return hashtableTwoPhasePopFindRef(ht, key, position); } -void kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index) { - dict *d = kvstoreGetDict(kvs, didx); - dictTwoPhaseUnlinkFree(d, he, plink, table_index); +void kvstoreHashtableTwoPhasePopDelete(kvstore *kvs, int didx, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + hashtableTwoPhasePopDelete(ht, position); cumulativeKeyCountAdd(kvs, didx, -1); - freeDictIfNeeded(kvs, didx); + freeHashtableIfNeeded(kvs, didx); +} + +int kvstoreHashtablePop(kvstore *kvs, int didx, const void *key, void **popped) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + int ret = hashtablePop(ht, key, popped); + if (ret) { + cumulativeKeyCountAdd(kvs, didx, -1); + freeHashtableIfNeeded(kvs, didx); + } + return ret; } -int kvstoreDictDelete(kvstore *kvs, int didx, const void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return DICT_ERR; - int ret = dictDelete(d, key); - if (ret == DICT_OK) { +int kvstoreHashtableDelete(kvstore *kvs, int didx, const void *key) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + int ret = hashtableDelete(ht, key); + if (ret) { cumulativeKeyCountAdd(kvs, didx, -1); - freeDictIfNeeded(kvs, didx); + freeHashtableIfNeeded(kvs, didx); } return ret; } diff --git a/src/kvstore.h b/src/kvstore.h index 81a0d9a96e..1a8c74a6b9 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -1,20 +1,20 @@ -#ifndef DICTARRAY_H_ -#define DICTARRAY_H_ +#ifndef KVSTORE_H +#define KVSTORE_H -#include "dict.h" +#include "hashtable.h" #include "adlist.h" typedef struct _kvstore kvstore; typedef struct _kvstoreIterator kvstoreIterator; -typedef struct _kvstoreDictIterator kvstoreDictIterator; +typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator; -typedef int(kvstoreScanShouldSkipDict)(dict *d); -typedef int(kvstoreExpandShouldSkipDictIndex)(int didx); +typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d); +typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx); -#define KVSTORE_ALLOCATE_DICTS_ON_DEMAND (1 << 0) -#define KVSTORE_FREE_EMPTY_DICTS (1 << 1) -kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags); -void kvstoreEmpty(kvstore *kvs, void(callback)(dict *)); +#define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0) +#define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) +kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); +void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); @@ -22,64 +22,69 @@ size_t kvstoreMemUsage(kvstore *kvs); unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, int onlydidx, - dictScanFunction *scan_cb, - 
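
A sketch of the two-phase insert pairing required by the comment above, assuming the hashtable convention that a non-zero return from the position lookup means the key is absent and a slot was reserved (illustrative only):

    static void insertIfAbsent(kvstore *kvs, int didx, void *key, void *new_entry) {
        hashtablePosition pos;
        void *existing = NULL;
        if (kvstoreHashtableFindPositionForInsert(kvs, didx, key, &pos, &existing)) {
            /* Key absent: insert at the reserved position, unmodified and
             * with the same didx, so the key-count bookkeeping stays right. */
            kvstoreHashtableInsertAtPosition(kvs, didx, new_entry, &pos);
        } else {
            /* Key already present; `existing` refers to the current entry. */
            (void)existing;
        }
    }
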
kvstoreScanShouldSkipDict *skip_cb, + hashtableScanFunction scan_cb, + kvstoreScanShouldSkipHashtable *skip_cb, void *privdata); -int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb); -int kvstoreGetFairRandomDictIndex(kvstore *kvs); +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipHashtableIndex *skip_cb); +int kvstoreGetFairRandomHashtableIndex(kvstore *kvs); void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full); -int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target); -int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs); -int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx); -int kvstoreNumNonEmptyDicts(kvstore *kvs); -int kvstoreNumAllocatedDicts(kvstore *kvs); -int kvstoreNumDicts(kvstore *kvs); +int kvstoreFindHashtableIndexByKeyIndex(kvstore *kvs, unsigned long target); +int kvstoreGetFirstNonEmptyHashtableIndex(kvstore *kvs); +int kvstoreGetNextNonEmptyHashtableIndex(kvstore *kvs, int didx); +int kvstoreNumNonEmptyHashtables(kvstore *kvs); +int kvstoreNumAllocatedHashtables(kvstore *kvs); +int kvstoreNumHashtables(kvstore *kvs); uint64_t kvstoreGetHash(kvstore *kvs, const void *key); -void kvstoreDictRehashingStarted(dict *d); -void kvstoreDictRehashingCompleted(dict *d); -size_t kvstoreDictMetadataSize(dict *d); +void kvstoreHashtableRehashingStarted(hashtable *d); +void kvstoreHashtableRehashingCompleted(hashtable *d); +void kvstoreHashtableTrackMemUsage(hashtable *s, ssize_t delta); +size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); -int kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it); -dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it); +int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); +int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); /* Rehashing */ -void kvstoreTryResizeDicts(kvstore *kvs, int limit); +void kvstoreTryResizeHashtables(kvstore *kvs, int limit); uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us); size_t kvstoreOverheadHashtableLut(kvstore *kvs); size_t kvstoreOverheadHashtableRehashing(kvstore *kvs); -unsigned long kvstoreDictRehashingCount(kvstore *kvs); +unsigned long kvstoreHashtableRehashingCount(kvstore *kvs); -/* Specific dict access by dict-index */ -unsigned long kvstoreDictSize(kvstore *kvs, int didx); -kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx); -kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx); -void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id); -dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di); -dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx); -dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx); -unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count); -int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size); -unsigned long kvstoreDictScanDefrag(kvstore *kvs, - int didx, - unsigned long v, - dictScanFunction *fn, - dictDefragFunctions *defragfns, - void *privdata); -typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn); -void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); -dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); -dictEntry 
*kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing); -void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry *de, void *key); -void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val); -dictEntry *kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index); -void kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index); -int kvstoreDictDelete(kvstore *kvs, int didx, const void *key); -dict *kvstoreGetDict(kvstore *kvs, int didx); +/* Specific hashtable access by hashtable-index */ +unsigned long kvstoreHashtableSize(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx); +void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_id); +int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next); +int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **found); +int kvstoreHashtableFairRandomEntry(kvstore *kvs, int didx, void **found); +unsigned int kvstoreHashtableSampleEntries(kvstore *kvs, int didx, void **dst, unsigned int count); +int kvstoreHashtableExpand(kvstore *kvs, int didx, unsigned long size); +unsigned long kvstoreHashtableScanDefrag(kvstore *kvs, + int didx, + unsigned long v, + hashtableScanFunction fn, + void *privdata, + void *(*defragfn)(void *), + int flags); +unsigned long kvstoreHashtableDefragTables(kvstore *kvs, unsigned long cursor, void *(*defragfn)(void *)); +int kvstoreHashtableFind(kvstore *kvs, int didx, void *key, void **found); +void **kvstoreHashtableFindRef(kvstore *kvs, int didx, const void *key); +int kvstoreHashtableAddOrFind(kvstore *kvs, int didx, void *key, void **existing); +int kvstoreHashtableAdd(kvstore *kvs, int didx, void *entry); -#endif /* DICTARRAY_H_ */ +int kvstoreHashtableFindPositionForInsert(kvstore *kvs, int didx, void *key, hashtablePosition *position, void **existing); +void kvstoreHashtableInsertAtPosition(kvstore *kvs, int didx, void *entry, void *position); + +void **kvstoreHashtableTwoPhasePopFindRef(kvstore *kvs, int didx, const void *key, void *position); +void kvstoreHashtableTwoPhasePopDelete(kvstore *kvs, int didx, void *position); +int kvstoreHashtablePop(kvstore *kvs, int didx, const void *key, void **popped); +int kvstoreHashtableDelete(kvstore *kvs, int didx, const void *key); +hashtable *kvstoreGetHashtable(kvstore *kvs, int didx); + +#endif /* KVSTORE_H */ diff --git a/src/latency.c b/src/latency.c index 783f04b197..2beb4859d1 100644 --- a/src/latency.c +++ b/src/latency.c @@ -526,13 +526,12 @@ void fillCommandCDF(client *c, struct hdr_histogram *histogram) { /* latencyCommand() helper to produce for all commands, * a per command cumulative distribution of latencies. 
*/ -void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data) { - dictIterator *di = dictGetSafeIterator(commands); - dictEntry *de; - struct serverCommand *cmd; - - while ((de = dictNext(di)) != NULL) { - cmd = (struct serverCommand *)dictGetVal(de); +void latencyAllCommandsFillCDF(client *c, hashtable *commands, int *command_with_data) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->latency_histogram) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); fillCommandCDF(c, cmd->latency_histogram); @@ -540,10 +539,10 @@ void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data } if (cmd->subcommands) { - latencyAllCommandsFillCDF(c, cmd->subcommands_dict, command_with_data); + latencyAllCommandsFillCDF(c, cmd->subcommands_ht, command_with_data); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* latencyCommand() helper to produce for a specific command set, @@ -564,19 +563,19 @@ void latencySpecificCommandsFillCDF(client *c) { command_with_data++; } - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (sub->latency_histogram) { addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname)); fillCommandCDF(c, sub->latency_histogram); command_with_data++; } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } } setDeferredMapLen(c, replylen, command_with_data); diff --git a/src/lazyfree.c b/src/lazyfree.c index 6176b43440..4b4c7f06ad 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -116,9 +116,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = obj->ptr; return ql->len; - } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; @@ -186,14 +186,14 @@ void freeObjAsync(robj *key, robj *obj, int dbid) { * lazy freeing. 
*/ void emptyDbAsync(serverDb *db) { int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } kvstore *oldkeys = db->keys, *oldexpires = db->expires; - db->keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - db->expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + db->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + db->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); atomic_fetch_add_explicit(&lazyfree_objects, kvstoreSize(oldkeys), memory_order_relaxed); bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires); } diff --git a/src/listpack.c b/src/listpack.c index 2dfb321f56..76c2f9ea38 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index d888170176..ef6a6c6d02 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -9,65 +9,16 @@ #include "memory_prefetch.h" #include "server.h" -#include "dict.h" - -/* Forward declarations of dict.c functions */ -dictEntry *dictGetNext(const dictEntry *de); - -/* Forward declarations of kvstore.c functions */ -dict *kvstoreGetDict(kvstore *kvs, int didx); - -typedef enum { - HT_IDX_FIRST = 0, - HT_IDX_SECOND = 1, - HT_IDX_INVALID = -1 -} HashTableIndex; typedef enum { - PREFETCH_BUCKET, /* Initial state, determines which hash table to use and prefetch the table's bucket */ - PREFETCH_ENTRY, /* prefetch entries associated with the given key's hash */ - PREFETCH_VALUE, /* prefetch the value object of the entry found in the previous step */ - PREFETCH_VALUE_DATA, /* prefetch the value object's data (if applicable) */ - PREFETCH_DONE /* Indicates that prefetching for this key is complete */ + PREFETCH_ENTRY, /* Initial state, prefetch entries associated with the given key's hash */ + PREFETCH_VALUE, /* prefetch the value object of the entry found in the previous step */ + PREFETCH_DONE /* Indicates that prefetching for this key is complete */ } PrefetchState; - -/************************************ State machine diagram for the prefetch operation. 
******************************** - │ - start - │ - ┌────────▼─────────┐ - ┌─────────►│ PREFETCH_BUCKET ├────►────────┐ - │ └────────┬─────────┘ no more tables -> done - | bucket|found | - │ | │ - entry not found - goto next table ┌────────▼────────┐ │ - └────◄─────┤ PREFETCH_ENTRY | ▼ - ┌────────────►└────────┬────────┘ │ - | Entry│found │ - │ | │ - value not found - goto next entry ┌───────▼────────┐ | - └───────◄──────┤ PREFETCH_VALUE | ▼ - └───────┬────────┘ │ - Value│found │ - | | - ┌───────────▼──────────────┐ │ - │ PREFETCH_VALUE_DATA │ ▼ - └───────────┬──────────────┘ │ - | │ - ┌───────-─▼─────────────┐ │ - │ PREFETCH_DONE │◄────────┘ - └───────────────────────┘ -**********************************************************************************************************************/ - -typedef void *(*GetValueDataFunc)(const void *val); - typedef struct KeyPrefetchInfo { - PrefetchState state; /* Current state of the prefetch operation */ - HashTableIndex ht_idx; /* Index of the current hash table (0 or 1 for rehashing) */ - uint64_t bucket_idx; /* Index of the bucket in the current hash table */ - uint64_t key_hash; /* Hash value of the key being prefetched */ - dictEntry *current_entry; /* Pointer to the current entry being processed */ + PrefetchState state; /* Current state of the prefetch operation */ + hashtableIncrementalFindState hashtab_state; } KeyPrefetchInfo; /* PrefetchCommandsBatch structure holds the state of the current batch of client commands being processed. */ @@ -81,9 +32,7 @@ typedef struct PrefetchCommandsBatch { int *slots; /* Array of slots for each key */ void **keys; /* Array of keys to prefetch in the current batch */ client **clients; /* Array of clients in the current batch */ - dict **keys_dicts; /* Main dict for each key */ - dict **expire_dicts; /* Expire dict for each key */ - dict **current_dicts; /* Points to either keys_dicts or expire_dicts */ + hashtable **keys_tables; /* Main table for each key */ KeyPrefetchInfo *prefetch_info; /* Prefetch info for each key */ } PrefetchCommandsBatch; @@ -96,8 +45,7 @@ void freePrefetchCommandsBatch(void) { zfree(batch->clients); zfree(batch->keys); - zfree(batch->keys_dicts); - zfree(batch->expire_dicts); + zfree(batch->keys_tables); zfree(batch->slots); zfree(batch->prefetch_info); zfree(batch); @@ -116,8 +64,7 @@ void prefetchCommandsBatchInit(void) { batch->max_prefetch_size = max_prefetch_size; batch->clients = zcalloc(max_prefetch_size * sizeof(client *)); batch->keys = zcalloc(max_prefetch_size * sizeof(void *)); - batch->keys_dicts = zcalloc(max_prefetch_size * sizeof(dict *)); - batch->expire_dicts = zcalloc(max_prefetch_size * sizeof(dict *)); + batch->keys_tables = zcalloc(max_prefetch_size * sizeof(hashtable *)); batch->slots = zcalloc(max_prefetch_size * sizeof(int)); batch->prefetch_info = zcalloc(max_prefetch_size * sizeof(KeyPrefetchInfo)); } @@ -132,10 +79,8 @@ void onMaxBatchSizeChange(void) { prefetchCommandsBatchInit(); } -/* Prefetch the given pointer and move to the next key in the batch. */ -static void prefetchAndMoveToNextKey(void *addr) { - valkey_prefetch(addr); - /* While the prefetch is in progress, we can continue to the next key */ +/* Move to the next key in the batch. 
*/ +static void moveToNextKey(void) { batch->cur_idx = (batch->cur_idx + 1) % batch->key_count; } @@ -156,144 +101,64 @@ static KeyPrefetchInfo *getNextPrefetchInfo(void) { return NULL; } -static void initBatchInfo(dict **dicts) { - batch->current_dicts = dicts; - +static void initBatchInfo(hashtable **tables) { /* Initialize the prefetch info */ for (size_t i = 0; i < batch->key_count; i++) { KeyPrefetchInfo *info = &batch->prefetch_info[i]; - if (!batch->current_dicts[i] || dictSize(batch->current_dicts[i]) == 0) { + if (!tables[i] || hashtableSize(tables[i]) == 0) { info->state = PREFETCH_DONE; batch->keys_done++; continue; } - info->ht_idx = HT_IDX_INVALID; - info->current_entry = NULL; - info->state = PREFETCH_BUCKET; - info->key_hash = dictHashKey(batch->current_dicts[i], batch->keys[i]); - } -} - -/* Prefetch the bucket of the next hash table index. - * If no tables are left, move to the PREFETCH_DONE state. */ -static void prefetchBucket(KeyPrefetchInfo *info) { - size_t i = batch->cur_idx; - - /* Determine which hash table to use */ - if (info->ht_idx == HT_IDX_INVALID) { - info->ht_idx = HT_IDX_FIRST; - } else if (info->ht_idx == HT_IDX_FIRST && dictIsRehashing(batch->current_dicts[i])) { - info->ht_idx = HT_IDX_SECOND; - } else { - /* No more tables left - mark as done. */ - markKeyAsdone(info); - return; + info->state = PREFETCH_ENTRY; + hashtableIncrementalFindInit(&info->hashtab_state, tables[i], batch->keys[i]); } - - /* Prefetch the bucket */ - info->bucket_idx = info->key_hash & DICTHT_SIZE_MASK(batch->current_dicts[i]->ht_size_exp[info->ht_idx]); - prefetchAndMoveToNextKey(&batch->current_dicts[i]->ht_table[info->ht_idx][info->bucket_idx]); - info->current_entry = NULL; - info->state = PREFETCH_ENTRY; } -/* Prefetch the next entry in the bucket and move to the PREFETCH_VALUE state. - * If no more entries in the bucket, move to the PREFETCH_BUCKET state to look at the next table. */ static void prefetchEntry(KeyPrefetchInfo *info) { - size_t i = batch->cur_idx; - - if (info->current_entry) { - /* We already found an entry in the bucket - move to the next entry */ - info->current_entry = dictGetNext(info->current_entry); + if (hashtableIncrementalFindStep(&info->hashtab_state) == 1) { + /* Not done yet */ + moveToNextKey(); } else { - /* Go to the first entry in the bucket */ - info->current_entry = batch->current_dicts[i]->ht_table[info->ht_idx][info->bucket_idx]; - } - - if (info->current_entry) { - prefetchAndMoveToNextKey(info->current_entry); info->state = PREFETCH_VALUE; - } else { - /* No entry found in the bucket - try the bucket in the next table */ - info->state = PREFETCH_BUCKET; } } -/* Prefetch the entry's value. If the value is found, move to the PREFETCH_VALUE_DATA state. - * If the value is not found, move to the PREFETCH_ENTRY state to look at the next entry in the bucket. */ +/* Prefetch the entry's value. 
If the value is found and is a raw-encoded string object, its payload is prefetched too; the key is then marked as done. */ static void prefetchValue(KeyPrefetchInfo *info) { - size_t i = batch->cur_idx; - void *value = dictGetVal(info->current_entry); - - if (dictGetNext(info->current_entry) == NULL && !dictIsRehashing(batch->current_dicts[i])) { - /* If this is the last element, we assume a hit and don't compare the keys */ - prefetchAndMoveToNextKey(value); - info->state = PREFETCH_VALUE_DATA; - return; - } - - void *current_entry_key = dictGetKey(info->current_entry); - if (batch->keys[i] == current_entry_key || - dictCompareKeys(batch->current_dicts[i], batch->keys[i], current_entry_key)) { - /* If the key is found, prefetch the value */ - prefetchAndMoveToNextKey(value); - info->state = PREFETCH_VALUE_DATA; - } else { - /* Move to the next entry */ - info->state = PREFETCH_ENTRY; + void *entry; + if (hashtableIncrementalFindGetResult(&info->hashtab_state, &entry)) { + robj *val = entry; + if (val->encoding == OBJ_ENCODING_RAW && val->type == OBJ_STRING) { + valkey_prefetch(val->ptr); + } } -} -/* Prefetch the value data if available. */ -static void prefetchValueData(KeyPrefetchInfo *info, GetValueDataFunc get_val_data_func) { - if (get_val_data_func) { - void *value_data = get_val_data_func(dictGetVal(info->current_entry)); - if (value_data) prefetchAndMoveToNextKey(value_data); - } markKeyAsdone(info); } -/* Prefetch dictionary data for an array of keys. +/* Prefetch hashtable data for an array of keys. * - * This function takes an array of dictionaries and keys, attempting to bring - * data closer to the L1 cache that might be needed for dictionary operations + * This function takes an array of tables and keys, attempting to bring + * data closer to the L1 cache that might be needed for hashtable operations * on those keys. * - * The dictFind algorithm: - * 1. Evaluate the hash of the key - * 2. Access the index in the first table - * 3. Walk the entries linked list until the key is found - * If the key hasn't been found and the dictionary is in the middle of rehashing, - * access the index on the second table and repeat step 3 - * - * dictPrefetch executes the same algorithm as dictFind, but one step at a time - * for each key. Instead of waiting for data to be read from memory, it prefetches - * the data and then moves on to execute the next prefetch for another key. - * - * dicts - An array of dictionaries to prefetch data from. - * get_val_data_func - A callback function that dictPrefetch can invoke + * tables - An array of hashtables to prefetch data from. + * Values of raw-encoded string objects are also prefetched, in order * to bring the key's value data closer to the L1 cache as well. */ -static void dictPrefetch(dict **dicts, GetValueDataFunc get_val_data_func) { - initBatchInfo(dicts); +static void hashtablePrefetch(hashtable **tables) { + initBatchInfo(tables); KeyPrefetchInfo *info; while ((info = getNextPrefetchInfo())) { switch (info->state) { - case PREFETCH_BUCKET: prefetchBucket(info); break; case PREFETCH_ENTRY: prefetchEntry(info); break; case PREFETCH_VALUE: prefetchValue(info); break; - case PREFETCH_VALUE_DATA: prefetchValueData(info, get_val_data_func); break; default: serverPanic("Unknown prefetch state %d", info->state); } } } -/* Helper function to get the value pointer of an object. */ -static void *getObjectValuePtr(const void *val) { - robj *o = (robj *)val; - return (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_RAW) ? 
o->ptr : NULL; -} - static void resetCommandsBatch(void) { batch->cur_idx = 0; batch->keys_done = 0; @@ -304,7 +169,7 @@ static void resetCommandsBatch(void) { /* Prefetch command-related data: * 1. Prefetch the command arguments allocated by the I/O thread to bring them closer to the L1 cache. - * 2. Prefetch the keys and values for all commands in the current batch from the main and expires dictionaries. */ + * 2. Prefetch the keys and values for all commands in the current batch from the main hashtable. */ static void prefetchCommands(void) { /* Prefetch argv's for all clients */ for (size_t i = 0; i < batch->client_count; i++) { @@ -332,13 +197,11 @@ static void prefetchCommands(void) { batch->keys[i] = ((robj *)batch->keys[i])->ptr; } - /* Prefetch dict keys for all commands. Prefetching is beneficial only if there are more than one key. */ + /* Prefetch hashtable keys for all commands. Prefetching is beneficial only if there are more than one key. */ if (batch->key_count > 1) { server.stat_total_prefetch_batches++; - /* Prefetch keys from the main dict */ - dictPrefetch(batch->keys_dicts, getObjectValuePtr); - /* Prefetch keys from the expires dict - no value data to prefetch */ - dictPrefetch(batch->expire_dicts, NULL); + /* Prefetch keys from the main hashtable */ + hashtablePrefetch(batch->keys_tables); } } @@ -388,8 +251,7 @@ int addCommandToBatchAndProcessIfFull(client *c) { for (int i = 0; i < num_keys && batch->key_count < batch->max_prefetch_size; i++) { batch->keys[batch->key_count] = c->argv[result.keys[i].pos]; batch->slots[batch->key_count] = c->slot > 0 ? c->slot : 0; - batch->keys_dicts[batch->key_count] = kvstoreGetDict(c->db->keys, batch->slots[batch->key_count]); - batch->expire_dicts[batch->key_count] = kvstoreGetDict(c->db->expires, batch->slots[batch->key_count]); + batch->keys_tables[batch->key_count] = kvstoreGetHashtable(c->db->keys, batch->slots[batch->key_count]); batch->key_count++; } getKeysFreeResult(&result); diff --git a/src/module.c b/src/module.c index 1e98b36f30..db493dd8bc 100644 --- a/src/module.c +++ b/src/module.c @@ -62,6 +62,7 @@ #include "crc16_slottable.h" #include "valkeymodule.h" #include "io_threads.h" +#include "functions.h" #include #include #include @@ -681,6 +682,7 @@ void moduleReleaseTempClient(client *c) { c->bufpos = 0; c->raw_flag = 0; c->flag.module = 1; + c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; if (c->bstate.async_rm_call_handle) { @@ -717,7 +719,7 @@ int moduleCreateEmptyKey(ValkeyModuleKey *key, int type) { case VALKEYMODULE_KEYTYPE_STREAM: obj = createStreamObject(); break; default: return VALKEYMODULE_ERR; } - dbAdd(key->db, key->key, obj); + dbAdd(key->db, key->key, &obj); key->value = obj; moduleInitKeyTypeSpecific(key); return VALKEYMODULE_OK; @@ -878,6 +880,15 @@ void moduleCallCommandUnblockedHandler(client *c) { moduleReleaseTempClient(c); } +/* Allocates the memory necessary to hold the ValkeyModuleCtx structure, and + * returns the pointer to the allocated memory. + * + * Used by the scripting engines implementation to cache the context structure. + */ +ValkeyModuleCtx *moduleAllocateContext(void) { + return (ValkeyModuleCtx *)zcalloc(sizeof(ValkeyModuleCtx)); +} + /* Create a module ctx and keep track of the nesting level. 
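
The prefetch path above leans on the hashtable's incremental-find API; condensed to a single key, the protocol looks roughly like this (illustrative only; the real batch interleaves the steps of many keys instead of looping on one):

    static void prefetchOneKey(hashtable *table, void *key) {
        hashtableIncrementalFindState st;
        hashtableIncrementalFindInit(&st, table, key);
        /* Each step prefetches the next piece of memory on the lookup path
         * and returns 1 while the lookup is still in progress. */
        while (hashtableIncrementalFindStep(&st) == 1) {
            /* A real batch advances other keys here to hide memory latency. */
        }
        void *entry;
        if (hashtableIncrementalFindGetResult(&st, &entry)) {
            robj *val = entry;
            if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_RAW)
                valkey_prefetch(val->ptr); /* Pull in the raw string payload too. */
        }
    }
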
* * Note: When creating ctx for threads (VM_GetThreadSafeContext and @@ -920,6 +931,16 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx } } +/* Initialize a module context to be used by scripting engines callback + * functions. + */ +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client) { + moduleCreateContext(out_ctx, module, VALKEYMODULE_CTX_NONE); + out_ctx->client = client; +} + /* This command binds the normal command invocation with commands * exported by modules. */ void ValkeyModuleCommandDispatcher(client *c) { @@ -1297,8 +1318,8 @@ int VM_CreateCommand(ValkeyModuleCtx *ctx, cp->serverCmd->arity = cmdfunc ? -1 : -2; /* Default value, can be changed later via dedicated API */ /* Drain IO queue before modifying commands dictionary to prevent concurrent access while modifying it. */ drainIOThreadsQueue(); - serverAssert(dictAdd(server.commands, sdsdup(declared_name), cp->serverCmd) == DICT_OK); - serverAssert(dictAdd(server.orig_commands, sdsdup(declared_name), cp->serverCmd) == DICT_OK); + serverAssert(hashtableAdd(server.commands, cp->serverCmd)); + serverAssert(hashtableAdd(server.orig_commands, cp->serverCmd)); cp->serverCmd->id = ACLGetCommandID(declared_name); /* ID used for ACL. */ return VALKEYMODULE_OK; } @@ -1430,7 +1451,7 @@ int VM_CreateSubcommand(ValkeyModuleCommand *parent, /* Check if the command name is busy within the parent command. */ sds declared_name = sdsnew(name); - if (parent_cmd->subcommands_dict && lookupSubcommand(parent_cmd, declared_name) != NULL) { + if (parent_cmd->subcommands_ht && lookupSubcommand(parent_cmd, declared_name) != NULL) { sdsfree(declared_name); return VALKEYMODULE_ERR; } @@ -1440,7 +1461,7 @@ int VM_CreateSubcommand(ValkeyModuleCommand *parent, moduleCreateCommandProxy(parent->module, declared_name, fullname, cmdfunc, flags, firstkey, lastkey, keystep); cp->serverCmd->arity = -2; - commandAddSubcommand(parent_cmd, cp->serverCmd, name); + commandAddSubcommand(parent_cmd, cp->serverCmd); return VALKEYMODULE_OK; } @@ -2254,6 +2275,27 @@ int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd) { return (cp->module == module_handle); } +/* ValkeyModule_UpdateRuntimeArgs can be used to update the module argument values. + * The function parameter 'argc' indicates the number of updated arguments, and 'argv' + * represents the values of the updated arguments. + * Once 'CONFIG REWRITE' command is called, the updated argument values can be saved into conf file. + * + * The function always returns VALKEYMODULE_OK. */ +int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + struct moduleLoadQueueEntry *loadmod = ctx->module->loadmod; + for (int i = 0; i < loadmod->argc; i++) { + decrRefCount(loadmod->argv[i]); + } + zfree(loadmod->argv); + loadmod->argv = argc - 1 ? 
zmalloc(sizeof(robj *) * (argc - 1)) : NULL; + loadmod->argc = argc - 1; + for (int i = 1; i < argc; i++) { + loadmod->argv[i - 1] = argv[i]; + incrRefCount(loadmod->argv[i - 1]); + } + return VALKEYMODULE_OK; +} + /* -------------------------------------------------------------------------- * ## Module information and time measurement * -------------------------------------------------------------------------- */ @@ -4174,7 +4216,7 @@ int VM_SetExpire(ValkeyModuleKey *key, mstime_t expire) { return VALKEYMODULE_ERR; if (expire != VALKEYMODULE_NO_EXPIRE) { expire += commandTimeSnapshot(); - setExpire(key->ctx->client, key->db, key->key, expire); + key->value = setExpire(key->ctx->client, key->db, key->key, expire); } else { removeExpire(key->db, key->key); } @@ -4203,7 +4245,7 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) { if (!(key->mode & VALKEYMODULE_WRITE) || key->value == NULL || (expire < 0 && expire != VALKEYMODULE_NO_EXPIRE)) return VALKEYMODULE_ERR; if (expire != VALKEYMODULE_NO_EXPIRE) { - setExpire(key->ctx->client, key->db, key->key, expire); + key->value = setExpire(key->ctx->client, key->db, key->key, expire); } else { removeExpire(key->db, key->key); } @@ -4264,7 +4306,9 @@ int VM_GetToDbIdFromOptCtx(ValkeyModuleKeyOptCtx *ctx) { int VM_StringSet(ValkeyModuleKey *key, ValkeyModuleString *str) { if (!(key->mode & VALKEYMODULE_WRITE) || key->iter) return VALKEYMODULE_ERR; VM_DeleteKey(key); - setKey(key->ctx->client, key->db, key->key, str, SETKEY_NO_SIGNAL); + /* Retain str so setKey copies it to db rather than reallocating it. */ + incrRefCount(str); + setKey(key->ctx->client, key->db, key->key, &str, SETKEY_NO_SIGNAL); key->value = str; return VALKEYMODULE_OK; } @@ -4344,9 +4388,8 @@ int VM_StringTruncate(ValkeyModuleKey *key, size_t newlen) { if (key->value == NULL) { /* Empty key: create it with the new size. */ robj *o = createObject(OBJ_STRING, sdsnewlen(NULL, newlen)); - setKey(key->ctx->client, key->db, key->key, o, SETKEY_NO_SIGNAL); + setKey(key->ctx->client, key->db, key->key, &o, SETKEY_NO_SIGNAL); key->value = o; - decrRefCount(o); } else { /* Unshare and resize. */ key->value = dbUnshareStringValue(key->db, key->key, key->value); @@ -6911,8 +6954,7 @@ int VM_ModuleTypeSetValue(ValkeyModuleKey *key, moduleType *mt, void *value) { if (!(key->mode & VALKEYMODULE_WRITE) || key->iter) return VALKEYMODULE_ERR; VM_DeleteKey(key); robj *o = createModuleObject(mt, value); - setKey(key->ctx->client, key->db, key->key, o, SETKEY_NO_SIGNAL); - decrRefCount(o); + setKey(key->ctx->client, key->db, key->key, &o, SETKEY_NO_SIGNAL); key->value = o; return VALKEYMODULE_OK; } @@ -10377,7 +10419,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. */ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } @@ -10878,10 +10920,10 @@ typedef struct ValkeyModuleScanCursor { int done; } ValkeyModuleScanCursor; -static void moduleScanCallback(void *privdata, const dictEntry *de) { +static void moduleScanCallback(void *privdata, void *element) { ScanCBData *data = privdata; - sds key = dictGetKey(de); - robj *val = dictGetVal(de); + robj *val = element; + sds key = objectGetKey(val); ValkeyModuleString *keyname = createObject(OBJ_STRING, sdsdup(key)); /* Setup the key handle. 
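
A sketch of how a module might use the new VM_UpdateRuntimeArgs API above, exposed to modules as ValkeyModule_UpdateRuntimeArgs (the command callback is hypothetical; note the API skips argv[0] and stores the remaining argc - 1 values, mirroring a load-time argument vector):

    /* Hypothetical module command: record new module arguments so that a
     * later CONFIG REWRITE persists them to the config file. */
    static int SetArgs_Command(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
        ValkeyModule_UpdateRuntimeArgs(ctx, argv, argc);
        return ValkeyModule_ReplyWithSimpleString(ctx, "OK");
    }
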
*/ @@ -10995,20 +11037,20 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { +static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; - if (o->type == OBJ_SET) { - value = NULL; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { sds val = dictGetVal(de); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double *)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); + } else { + serverPanic("unexpected object type"); } data->fn(data->key, field, value, data->user_data); @@ -11016,6 +11058,17 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { if (value) decrRefCount(value); } +static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { + ScanKeyCBData *data = privdata; + robj *o = data->key->value; + serverAssert(o->type == OBJ_SET); + sds key = entry; + robj *field = createStringObject(key, sdslen(key)); + + data->fn(data->key, field, NULL, data->user_data); + decrRefCount(field); +} + /* Scan api that allows a module to scan the elements in a hash, set or sorted set key * * Callback for scan implementation. @@ -11069,14 +11122,15 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *ht = NULL; + dict *d = NULL; + hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict; } else { errno = EINVAL; return 0; @@ -11086,9 +11140,16 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul return 0; } int ret = 1; - if (ht) { + if (d) { ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data); + cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data); + if (cursor->cursor == 0) { + cursor->done = 1; + ret = 0; + } + } else if (ht) { + ScanKeyCBData data = {key, privdata, fn}; + cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; @@ -12058,20 +12119,21 @@ int moduleFreeCommand(struct ValkeyModule *module, struct serverCommand *cmd) { moduleFreeArgs(cmd->args, cmd->num_args); zfree(cp); - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (moduleFreeCommand(module, sub) != C_OK) continue; - serverAssert(dictDelete(cmd->subcommands_dict, sub->declared_name) == DICT_OK); + serverAssert(hashtableDelete(cmd->subcommands_ht, sub->declared_name)); sdsfree((sds)sub->declared_name); 
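
A sketch of driving VM_ScanKey from a module, now that sets are dispatched to the hashtable callback (with a NULL value) while hashes and zsets go through the dict callback (illustrative only; the cursor helpers are the existing module scan API):

    static void fieldCallback(ValkeyModuleKey *key, ValkeyModuleString *field,
                              ValkeyModuleString *value, void *privdata) {
        /* For OBJ_SET keys, `value` arrives as NULL. */
        (void)key; (void)field; (void)value; (void)privdata;
    }

    static void scanWholeKey(ValkeyModuleKey *key) {
        ValkeyModuleScanCursor *cursor = ValkeyModule_ScanCursorCreate();
        while (ValkeyModule_ScanKey(key, cursor, fieldCallback, NULL)) {
            /* Non-zero return: more elements remain. */
        }
        ValkeyModule_ScanCursorDestroy(cursor);
    }
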
sdsfree(sub->fullname); zfree(sub); } - dictReleaseIterator(di); - dictRelease(cmd->subcommands_dict); + hashtableResetIterator(&iter); + hashtableRelease(cmd->subcommands_ht); } return C_OK; @@ -12081,19 +12143,20 @@ void moduleUnregisterCommands(struct ValkeyModule *module) { /* Drain IO queue before modifying commands dictionary to prevent concurrent access while modifying it. */ drainIOThreadsQueue(); /* Unregister all the commands registered by this module. */ - dictIterator *di = dictGetSafeIterator(server.commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (moduleFreeCommand(module, cmd) != C_OK) continue; - serverAssert(dictDelete(server.commands, cmd->fullname) == DICT_OK); - serverAssert(dictDelete(server.orig_commands, cmd->fullname) == DICT_OK); + serverAssert(hashtableDelete(server.commands, cmd->fullname)); + serverAssert(hashtableDelete(server.orig_commands, cmd->fullname)); sdsfree((sds)cmd->declared_name); sdsfree(cmd->fullname); zfree(cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* We parse argv to add sds "NAME VALUE" pairs to the server.module_configs_queue list of configs. @@ -13031,6 +13094,60 @@ int VM_RdbSave(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { return VALKEYMODULE_OK; } +/* Registers a new scripting engine in the server. + * + * - `module_ctx`: the module context object. + * + * - `engine_name`: the name of the scripting engine. This name will match + * against the engine name specified in the script header using a shebang. + * + * - `engine_ctx`: engine specific context pointer. + * + * - `engine_methods`: the struct with the scripting engine callback functions + * pointers. + * + * Returns VALKEYMODULE_OK if the engine is successfully registered, and + * VALKEYMODULE_ERR in case some failure occurs. In case of a failure, an error + * message is logged. + */ +int VM_RegisterScriptingEngine(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) { + serverLog(LL_DEBUG, "Registering a new scripting engine: %s", engine_name); + + if (engine_methods->version > VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION) { + serverLog(LL_WARNING, "The engine implementation version is greater " + "than what this server supports. Server ABI " + "Version: %lu, Engine ABI version: %lu", + VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + (unsigned long)engine_methods->version); + return VALKEYMODULE_ERR; + } + + if (functionsRegisterEngine(engine_name, + module_ctx->module, + engine_ctx, + engine_methods) != C_OK) { + return VALKEYMODULE_ERR; + } + + return VALKEYMODULE_OK; +} + +/* Removes the scripting engine from the server. + * + * `engine_name` is the name of the scripting engine. + * + * Returns VALKEYMODULE_OK. + * + */ +int VM_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name) { + UNUSED(ctx); + functionsUnregisterEngine(engine_name); + return VALKEYMODULE_OK; +} + /* MODULE command. * * MODULE LIST @@ -13343,7 +13460,7 @@ const char *VM_GetCurrentCommandName(ValkeyModuleCtx *ctx) { * defrag callback. */ struct ValkeyModuleDefragCtx { - long long int endtime; + monotime endtime; unsigned long *cursor; struct serverObject *key; /* Optional name of key processed, NULL when unknown. 
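
A sketch of registering an engine from a module's OnLoad with the new API (only the `version` field of the methods struct is visible in this patch; the remaining callbacks and the engine context are placeholders):

    ValkeyModuleScriptingEngineMethods methods = {
        .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION,
        /* ... engine callback pointers, not shown in this excerpt ... */
    };
    if (ValkeyModule_RegisterScriptingEngine(ctx, "myengine", engine_ctx, &methods) !=
        VALKEYMODULE_OK) {
        return VALKEYMODULE_ERR;
    }
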
*/ int dbid; /* The dbid of the key being processed, -1 when unknown. */ @@ -13372,7 +13489,7 @@ int VM_RegisterDefragFunc(ValkeyModuleCtx *ctx, ValkeyModuleDefragFunc cb) { * so it generally makes sense to do small batches of work in between calls. */ int VM_DefragShouldStop(ValkeyModuleDefragCtx *ctx) { - return (ctx->endtime != 0 && ctx->endtime < ustime()); + return (ctx->endtime != 0 && ctx->endtime <= getMonotonicUs()); } /* Store an arbitrary cursor value for future re-use. @@ -13454,7 +13571,7 @@ ValkeyModuleString *VM_DefragValkeyModuleString(ValkeyModuleDefragCtx *ctx, Valk * Returns a zero value (and initializes the cursor) if no more needs to be done, * or a non-zero value otherwise. */ -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid) { +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid) { moduleValue *mv = value->ptr; moduleType *mt = mv->type; @@ -13559,6 +13676,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(SetModuleAttribs); REGISTER_API(IsModuleNameBusy); REGISTER_API(WrongArity); + REGISTER_API(UpdateRuntimeArgs); REGISTER_API(ReplyWithLongLong); REGISTER_API(ReplyWithError); REGISTER_API(ReplyWithErrorFormat); @@ -13900,4 +14018,6 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(RdbStreamFree); REGISTER_API(RdbLoad); REGISTER_API(RdbSave); + REGISTER_API(RegisterScriptingEngine); + REGISTER_API(UnregisterScriptingEngine); } diff --git a/src/module.h b/src/module.h new file mode 100644 index 0000000000..f61ef1e3cb --- /dev/null +++ b/src/module.h @@ -0,0 +1,17 @@ +#ifndef _MODULE_H_ +#define _MODULE_H_ + +/* This header file exposes a set of functions defined in module.c that are + * not part of the module API, but are used by the core to interact with modules + */ + +typedef struct ValkeyModuleCtx ValkeyModuleCtx; +typedef struct ValkeyModule ValkeyModule; + +ValkeyModuleCtx *moduleAllocateContext(void); +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client); +void moduleFreeContext(ValkeyModuleCtx *ctx); + +#endif /* _MODULE_H_ */ diff --git a/src/modules/hellodict.c b/src/modules/hellodict.c index e0af06ba2f..db2fd17e8a 100644 --- a/src/modules/hellodict.c +++ b/src/modules/hellodict.c @@ -109,13 +109,13 @@ int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int arg if (ValkeyModule_Init(ctx, "hellodict", 1, VALKEYMODULE_APIVER_1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; /* Create our global dictionary. Here we'll set our keys and values. 
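
With the defrag end time now a monotonic timestamp, a module defrag callback cooperates by polling VM_DefragShouldStop between small batches; a sketch (the work loop and the two helpers are hypothetical):

    static int myTypeDefrag(ValkeyModuleDefragCtx *ctx, ValkeyModuleString *key, void **value) {
        (void)key;
        while (moreWorkRemains(*value)) {  /* moreWorkRemains() is hypothetical. */
            defragOneBatch(value);         /* defragOneBatch() is hypothetical. */
            if (ValkeyModule_DefragShouldStop(ctx)) {
                /* Save progress (e.g. via ValkeyModule_DefragCursorSet()) and
                 * return non-zero to be resumed later. */
                return 1;
            }
        }
        return 0;
    }
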
*/ diff --git a/src/multi.c b/src/multi.c index bcffb90912..9e1f019244 100644 --- a/src/multi.c +++ b/src/multi.c @@ -238,6 +238,10 @@ void execCommand(client *c) { c->mstate.commands[j].argv = c->argv; c->mstate.commands[j].argv_len = c->argv_len; c->mstate.commands[j].cmd = c->cmd; + + /* The original argv has already been processed for slowlog and monitor, + * so we can safely free it before proceeding to the next command. */ + freeClientOriginalArgv(c); } // restore old DENY_BLOCKING value diff --git a/src/networking.c b/src/networking.c index 4791055b5a..9f36f24275 100644 --- a/src/networking.c +++ b/src/networking.c @@ -134,6 +134,7 @@ client *createClient(connection *conn) { if (server.tcpkeepalive) connKeepAlive(conn, server.tcpkeepalive); connSetReadHandler(conn, readQueryFromClient); connSetPrivateData(conn, c); + conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD; } c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); selectDb(c, 0); @@ -314,7 +315,11 @@ int prepareClientToWrite(client *c) { * is set. */ if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR; - if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ + /* Skip fake clients, such as the fake client used for AOF loading. + * CLIENT_ID_CACHED_RESPONSE is allowed, though: it is a fake client + * that has a connection to cache the response. */ + if (c->flag.fake && c->id != CLIENT_ID_CACHED_RESPONSE) return C_ERR; + serverAssert(c->conn); /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ @@ -348,6 +353,9 @@ sds aggregateClientOutputBuffer(client *c) { * It needs be paired with `deleteCachedResponseClient` function to stop caching. */ client *createCachedResponseClient(int resp) { struct client *recording_client = createClient(NULL); + /* It is a fake client but with a connection. Set a special client id + * so we can identify it as a fake cached response client. */ + recording_client->id = CLIENT_ID_CACHED_RESPONSE; recording_client->resp = resp; /* Allocating the `conn` allows to prepare the caching client before adding * data to the clients output buffer by `prepareClientToWrite`.
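The CLIENT_ID_CACHED_RESPONSE id is what lets the relaxed prepareClientToWrite() check accept this one fake client. The intended lifecycle, pieced together from the functions named in this hunk (the reply call is only an example):

/* Record a reply once, then reuse the rendered bytes. */
client *rc = createCachedResponseClient(2);   /* cache in RESP2 encoding */
addReplyLongLong(rc, 42);                     /* replies land in rc's buffers */
sds cached = aggregateClientOutputBuffer(rc); /* flatten buffers into one sds */
/* ... send 'cached' wherever it is needed ... */
sdsfree(cached);
deleteCachedResponseClient(rc);               /* named in the comment above */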
*/ @@ -549,7 +557,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; @@ -1481,14 +1489,19 @@ void freeClientOriginalArgv(client *c) { /* We didn't rewrite this client */ if (!c->original_argv) return; - for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); - zfree(c->original_argv); + if (tryOffloadFreeArgvToIOThreads(c, c->original_argc, c->original_argv) == C_ERR) { + for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); + zfree(c->original_argv); + } + c->original_argv = NULL; c->original_argc = 0; } void freeClientArgv(client *c) { - if (tryOffloadFreeArgvToIOThreads(c) == C_ERR) { + /* If original_argv exists, 'c->argv' was allocated by the main thread, + * so it's more efficient to free it directly here rather than offloading to IO threads */ + if (c->original_argv || tryOffloadFreeArgvToIOThreads(c, c->argc, c->argv) == C_ERR) { for (int j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); } @@ -1713,10 +1726,10 @@ void freeClient(client *c) { /* Log link disconnection with replica */ if (getClientType(c) == CLIENT_TYPE_REPLICA) { - serverLog(LL_NOTICE, - c->flag.repl_rdb_channel ? "Replica %s rdb channel disconnected." - : "Connection with replica %s lost.", - replicationGetReplicaName(c)); + if (c->flag.repl_rdb_channel) + dualChannelServerLog(LL_NOTICE, "Replica %s rdb channel disconnected.", replicationGetReplicaName(c)); + else + serverLog(LL_NOTICE, "Connection with replica %s lost.", replicationGetReplicaName(c)); } /* Free the query buffer */ @@ -1753,7 +1766,7 @@ void freeClient(client *c) { /* Free data structures. */ listRelease(c->reply); c->reply = NULL; - zfree(c->buf); + zfree_with_size(c->buf, c->buf_usable_size); c->buf = NULL; freeReplicaReferencedReplBuffer(c); freeClientArgv(c); @@ -1963,14 +1976,15 @@ int freeClientsInAsyncFreeQueue(void) { if (!c->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); c->rdb_client_disconnect_time = server.unixtime; - serverLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long long)c->id, - replicationGetReplicaName(c), server.wait_before_rdb_client_free); + dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", + (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; - serverLog(LL_NOTICE, - "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " - "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + dualChannelServerLog( + LL_NOTICE, + "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " + "Freeing RDB client %llu.", + (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } @@ -2537,6 +2551,7 @@ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? 
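The afterErrorReply() hunk above swaps a cast of sdsfree to void (*)(void *) for a dedicated sdsfreeVoid; calling a function through a pointer of the wrong type is undefined behavior in C, even when it happens to work on common ABIs. The wrapper's definition is not part of this hunk, but it is presumably just:

/* sds.c (presumed): matches the list free-method signature without a cast. */
void sdsfreeVoid(void *s) {
    sdsfree((sds)s);
}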
c->cmd->proc : NULL; freeClientArgv(c); + freeClientOriginalArgv(c); c->cur_script = NULL; c->reqtype = 0; c->multibulklen = 0; @@ -3326,6 +3341,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { if (client->flag.readonly) *p++ = 'r'; if (client->flag.no_evict) *p++ = 'e'; if (client->flag.no_touch) *p++ = 'T'; + if (client->flag.import_source) *p++ = 'I'; if (p == flags) *p++ = 'N'; *p++ = '\0'; @@ -3385,6 +3401,29 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { return ret; } +/* Concatenate a string representing the state of a client in a human + * readable format, into the sds string 's'. + * + * This is a simplified and shortened version of catClientInfoString; + * it only adds some basic fields for tracking clients. */ +sds catClientInfoShortString(sds s, client *client, int hide_user_data) { + if (!server.crashed) waitForClientIO(client); + char conninfo[CONN_INFO_LEN]; + + sds ret = sdscatfmt( + s, + FMTARGS( + "id=%U", (unsigned long long)client->id, + " addr=%s", getClientPeerId(client), + " laddr=%s", getClientSockname(client), + " %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)), + " name=%s", hide_user_data ? "*redacted*" : (client->name ? (char *)client->name->ptr : ""), + " user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"), + " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", + " lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "")); + return ret; +} + sds getAllClientsInfoString(int type, int hide_user_data) { listNode *ln; listIter li; @@ -3585,6 +3624,10 @@ void clientCommand(client *c) { " Protect current client connection from eviction.", "NO-TOUCH (ON|OFF)", " Will not touch LRU/LFU stats when this mode is on.", + "IMPORT-SOURCE (ON|OFF)", + " Mark this connection as an import source if import-mode is enabled.", + " Sync tools can set their connections into 'import-source' state to access", + " expired keys.", NULL}; addReplyHelp(c, help); } else if (!strcasecmp(c->argv[1]->ptr, "id") && c->argc == 2) { @@ -4058,6 +4101,22 @@ void clientCommand(client *c) { } } addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "import-source")) { + /* CLIENT IMPORT-SOURCE ON|OFF */ + if (!server.import_mode && strcasecmp(c->argv[2]->ptr, "off")) { + addReplyError(c, "Server is not in import mode"); + return; + } + if (!strcasecmp(c->argv[2]->ptr, "on")) { + c->flag.import_source = 1; + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { + c->flag.import_source = 0; + addReply(c, shared.ok); + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } } else { addReplySubcommandSyntaxError(c); } @@ -4197,16 +4256,53 @@ void securityWarningCommand(client *c) { freeClientAsync(c); } -/* Keep track of the original command arguments so that we can generate - * an accurate slowlog entry after the command has been executed. */ -static void retainOriginalCommandVector(client *c) { - /* We already rewrote this command, so don't rewrite it again */ - if (c->original_argv) return; - c->original_argc = c->argc; - c->original_argv = zmalloc(sizeof(robj *) * (c->argc)); - for (int j = 0; j < c->argc; j++) { - c->original_argv[j] = c->argv[j]; - incrRefCount(c->argv[j]); +/* This function preserves the original command arguments for accurate slowlog recording.
+ * + * It performs the following operations: + * - Stores the initial command vector if not already saved + * - Manages memory allocation for command argument modifications + * + * new_argc - The new number of arguments to allocate space for if necessary. + * new_argv - Optional pointer to a new argument vector. If NULL, space will be + * allocated for new_argc arguments, preserving the existing arguments. + */ +static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) { + robj **old_argv = c->argv; + int old_argc = c->argc; + + /* Store original arguments if not already saved */ + if (!c->original_argv) { + c->original_argc = old_argc; + c->original_argv = old_argv; + } + + /* Handle direct argv replacement */ + if (new_argv) { + c->argv = new_argv; + } else if (c->original_argv == old_argv || new_argc > old_argc) { + /* Allocate new array if necessary */ + c->argv = zmalloc(sizeof(robj *) * new_argc); + + for (int i = 0; i < old_argc && i < new_argc; i++) { + c->argv[i] = old_argv[i]; + incrRefCount(c->argv[i]); + } + + /* Initialize new argument slots to NULL */ + for (int i = old_argc; i < new_argc; i++) { + c->argv[i] = NULL; + } + } + + c->argc = new_argc; + c->argv_len = new_argc; + + /* Clean up old argv if necessary */ + if (c->argv != old_argv && c->original_argv != old_argv) { + for (int i = 0; i < old_argc; i++) { + if (old_argv[i]) decrRefCount(old_argv[i]); + } + zfree(old_argv); } } @@ -4214,7 +4310,7 @@ static void retainOriginalCommandVector(client *c) { * in the slowlog. This information is stored in the * original_argv array. */ void redactClientCommandArgument(client *c, int argc) { - retainOriginalCommandVector(c); + backupAndUpdateClientArgv(c, c->argc, NULL); if (c->original_argv[argc] == shared.redacted) { /* This argument has already been redacted */ return; @@ -4247,10 +4343,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Completely replace the client command vector with the provided one. */ void replaceClientCommandVector(client *c, int argc, robj **argv) { int j; - retainOriginalCommandVector(c); - freeClientArgv(c); - c->argv = argv; - c->argc = argc; + backupAndUpdateClientArgv(c, argc, argv); c->argv_len_sum = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) c->argv_len_sum += getStringObjectLen(c->argv[j]); @@ -4271,19 +4364,9 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { * free the no longer used objects on c->argv. */ void rewriteClientCommandArgument(client *c, int i, robj *newval) { robj *oldval; - retainOriginalCommandVector(c); + int new_argc = (i >= c->argc) ? i + 1 : c->argc; + backupAndUpdateClientArgv(c, new_argc, NULL); - /* We need to handle both extending beyond argc (just update it and - * initialize the new element) or beyond argv_len (realloc is needed). - */ - if (i >= c->argc) { - if (i >= c->argv_len) { - c->argv = zrealloc(c->argv, sizeof(robj *) * (i + 1)); - c->argv_len = i + 1; - } - c->argc = i + 1; - c->argv[i] = NULL; - } oldval = c->argv[i]; if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); if (newval) c->argv_len_sum += getStringObjectLen(newval); @@ -4455,7 +4538,8 @@ int checkClientOutputBufferLimits(client *c) { * * Returns 1 if client was (flagged) closed. */ int closeClientOnOutputBufferLimitReached(client *c, int async) { - if (!c->conn) return 0; /* It is unsafe to free fake clients. */ + if (c->flag.fake) return 0; /* It is unsafe to free fake clients. 
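One consequence of backupAndUpdateClientArgv() worth spelling out: the first rewrite adopts the old vector as c->original_argv wholesale, with no copying or refcount churn, and growing past the old argc leaves the fresh slots NULL for the caller to fill. For example (illustrative fragment, not a real call site):

/* First rewrite silently adopts the current vector as c->original_argv;
 * later same-size rewrites then modify c->argv in place. With argc == 4,
 * rewriting index 4 grows the vector and appends a fifth argument. */
rewriteClientCommandArgument(c, 2, createStringObjectFromLongLong(1));
rewriteClientCommandArgument(c, 4, createStringObject("EX", 2));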
*/ + serverAssert(c->conn); serverAssert(c->reply_bytes < SIZE_MAX - (1024 * 64)); /* Note that c->reply_bytes is irrelevant for replica clients * (they use the global repl buffers). */ @@ -4722,9 +4806,14 @@ int processIOThreadsReadDone(void) { processed++; server.stat_io_reads_processed++; + /* Save the current conn state, as connUpdateState may modify it */ + int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); + /* In accept state, no client's data was read - stop here. */ + if (in_accept_state) continue; + /* On read error - stop here. */ if (handleReadResult(c) == C_ERR) { continue; diff --git a/src/object.c b/src/object.c index 8c1cf64892..15363f31b8 100644 --- a/src/object.c +++ b/src/object.c @@ -41,18 +41,68 @@ #define strtold(a, b) ((long double)strtod((a), (b))) #endif +/* For objects with large embedded keys, we reserve space for an expire field, + * so if expire is set later, we don't need to reallocate the object. */ +#define KEY_SIZE_TO_INCLUDE_EXPIRE_THRESHOLD 128 + /* ===================== Creation and parsing of objects ==================== */ -robj *createObject(int type, void *ptr) { - robj *o = zmalloc(sizeof(*o)); +/* Creates an object, optionally with embedded key and expire fields. The key + * and expire fields can be omitted by passing NULL and -1, respectively. */ +robj *createObjectWithKeyAndExpire(int type, void *ptr, const sds key, long long expire) { + /* Calculate sizes */ + int has_expire = (expire != -1 || + (key != NULL && sdslen(key) >= KEY_SIZE_TO_INCLUDE_EXPIRE_THRESHOLD)); + size_t key_sds_size = 0; + size_t min_size = sizeof(robj); + if (has_expire) { + min_size += sizeof(long long); + } + if (key != NULL) { + /* Size of embedded key, incl. 1 byte for prefixed sds hdr size. */ + key_sds_size = sdscopytobuffer(NULL, 0, key, NULL); + min_size += 1 + key_sds_size; + } + /* Allocate and set the declared fields. */ + size_t bufsize = 0; + robj *o = zmalloc_usable(min_size, &bufsize); o->type = type; o->encoding = OBJ_ENCODING_RAW; o->ptr = ptr; o->refcount = 1; o->lru = 0; + o->hasembkey = (key != NULL); + + /* If the allocation has enough space for an expire field, add it even if we + * don't need it now. Then we don't need to realloc if it's needed later. */ + if (key != NULL && !has_expire && bufsize >= min_size + sizeof(long long)) { + has_expire = 1; + min_size += sizeof(long long); + } + o->hasexpire = has_expire; + + /* The memory after the struct where we embedded data. */ + unsigned char *data = (void *)(o + 1); + + /* Set the expire field. */ + if (o->hasexpire) { + *(long long *)data = expire; + data += sizeof(long long); + } + + /* Copy embedded key. */ + if (o->hasembkey) { + sdscopytobuffer(data + 1, key_sds_size, key, data); + data += 1 + key_sds_size; + } + return o; } +robj *createObject(int type, void *ptr) { + return createObjectWithKeyAndExpire(type, ptr, NULL, -1); +} + void initObjectLRUOrLFU(robj *o) { if (o->refcount == OBJ_SHARED_REFCOUNT) return; /* Set the LRU to the current lruclock (minutes resolution), or @@ -88,39 +138,85 @@ robj *createRawStringObject(const char *ptr, size_t len) { return createObject(OBJ_STRING, sdsnewlen(ptr, len)); } -/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is - * an object where the sds string is actually an unmodifiable string - * allocated in the same chunk as the object itself. 
*/ -robj *createEmbeddedStringObject(const char *ptr, size_t len) { - size_t bufsize = 0; - size_t sds_hdrlen = sizeof(struct sdshdr8); - robj *o = zmalloc_usable(sizeof(robj) + sds_hdrlen + len + 1, &bufsize); - struct sdshdr8 *sh = (void *)(o + 1); +/* Creates a new embedded string object and copies the content of key, val and + * expire to the new object. LRU is set to 0. */ +static robj *createEmbeddedStringObjectWithKeyAndExpire(const char *val_ptr, + size_t val_len, + const sds key, + long long expire) { + /* Calculate sizes */ + size_t key_sds_size = 0; + size_t min_size = sizeof(robj); + if (expire != -1) { + min_size += sizeof(long long); + } + if (key != NULL) { + /* Size of embedded key, incl. 1 byte for prefixed sds hdr size. */ + key_sds_size = sdscopytobuffer(NULL, 0, key, NULL); + min_size += 1 + key_sds_size; + } + /* Size of embedded value (EMBSTR) including \0 term. */ + min_size += sizeof(struct sdshdr8) + val_len + 1; + /* Allocate and set the declared fields. */ + size_t bufsize = 0; + robj *o = zmalloc_usable(min_size, &bufsize); o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; - o->ptr = sh + 1; o->refcount = 1; o->lru = 0; + o->hasexpire = (expire != -1); + o->hasembkey = (key != NULL); + + /* If the allocation has enough space for an expire field, add it even if we + * don't need it now. Then we don't need to realloc if it's needed later. */ + if (!o->hasexpire && bufsize >= min_size + sizeof(long long)) { + o->hasexpire = 1; + min_size += sizeof(long long); + } + + /* The memory after the struct where we embedded data. */ + unsigned char *data = (void *)(o + 1); - sh->len = len; - size_t usable = bufsize - (sizeof(robj) + sds_hdrlen + 1); - sh->alloc = usable; - /* Overflow check. This must not happen as we use embedded strings only - * for sds strings that fit into SDS_TYPE_8. */ - serverAssert(usable == sh->alloc); + /* Set the expire field. */ + if (o->hasexpire) { + *(long long *)data = expire; + data += sizeof(long long); + } + + /* Copy embedded key. */ + if (o->hasembkey) { + sdscopytobuffer(data + 1, key_sds_size, key, data); + data += 1 + key_sds_size; + } + + /* Copy embedded value (EMBSTR). */ + struct sdshdr8 *sh = (void *)data; sh->flags = SDS_TYPE_8; - if (ptr == SDS_NOINIT) - sh->buf[len] = '\0'; - else if (ptr) { - memcpy(sh->buf, ptr, len); - sh->buf[len] = '\0'; + sh->len = val_len; + size_t capacity = bufsize - (min_size - val_len); + sh->alloc = capacity; + serverAssert(capacity == sh->alloc); /* Overflow check. */ + if (val_ptr == SDS_NOINIT) { + sh->buf[val_len] = '\0'; + } else if (val_ptr != NULL) { + memcpy(sh->buf, val_ptr, val_len); + sh->buf[val_len] = '\0'; } else { - memset(sh->buf, 0, len + 1); + memset(sh->buf, 0, val_len + 1); } + o->ptr = sh->buf; + return o; } +/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is + * an object where the sds string is actually an unmodifiable string + * allocated in the same chunk as the object itself. */ +robj *createEmbeddedStringObject(const char *ptr, size_t len) { + return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, NULL, -1); +} + /* Create a string object with EMBSTR encoding if it is smaller than * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is * used. @@ -135,6 +231,96 @@ robj *createStringObject(const char *ptr, size_t len) { return createRawStringObject(ptr, len); } +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire) { + /* When to embed? Embed when the sum is up to 64 bytes. 
There may be better + * heuristics, e.g. we can look at the jemalloc sizes (16-byte intervals up + * to 128 bytes). */ + size_t size = sizeof(robj); + size += (key != NULL) * (sdslen(key) + 3); /* hdr size (1) + hdr (1) + nullterm (1) */ + size += (expire != -1) * sizeof(long long); + size += 4 + len; /* embstr header (3) + nullterm (1) */ + if (size <= 64) { + return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire); + } else { + return createObjectWithKeyAndExpire(OBJ_STRING, sdsnewlen(ptr, len), key, expire); + } +} + +sds objectGetKey(const robj *val) { + unsigned char *data = (void *)(val + 1); + if (val->hasexpire) { + /* Skip expire field */ + data += sizeof(long long); + } + if (val->hasembkey) { + uint8_t hdr_size = *(uint8_t *)data; + data += 1 + hdr_size; + return (sds)data; + } + return NULL; +} + +long long objectGetExpire(const robj *val) { + unsigned char *data = (void *)(val + 1); + if (val->hasexpire) { + return *(long long *)data; + } else { + return -1; + } +} + +/* This function may reallocate the value. The new allocation is returned and + * the old object's reference counter is decremented and possibly freed. Use the + * returned object instead of 'val' after calling this function. */ +robj *objectSetExpire(robj *val, long long expire) { + if (val->hasexpire) { + /* Update existing expire field. */ + unsigned char *data = (void *)(val + 1); + *(long long *)data = expire; + return val; + } else if (expire == -1) { + return val; + } else { + return objectSetKeyAndExpire(val, objectGetKey(val), expire); + } +} + +/* This function may reallocate the value. The new allocation is returned and + * the old object's reference counter is decremented and possibly freed. Use the + * returned object instead of 'val' after calling this function. */ +robj *objectSetKeyAndExpire(robj *val, sds key, long long expire) { + if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_EMBSTR) { + robj *new = createStringObjectWithKeyAndExpire(val->ptr, sdslen(val->ptr), key, expire); + new->lru = val->lru; + decrRefCount(val); + return new; + } + + /* Create a new object with embedded key. Reuse ptr if possible. */ + void *ptr; + if (val->refcount == 1) { + /* Reuse the ptr. There are no other references to val. */ + ptr = val->ptr; + val->ptr = NULL; + } else if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_INT) { + /* The pointer is not allocated memory. We can just copy the pointer. */ + ptr = val->ptr; + } else if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_RAW) { + /* Dup the string. */ + ptr = sdsdup(val->ptr); + } else { + serverAssert(val->type != OBJ_STRING); + /* There are multiple references to this non-string object. Most types + * can be duplicated, but for a module type it is not always possible.
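A worked example of the embedding arithmetic and the accessors above, assuming sizeof(robj) is 16 bytes on a 64-bit build:

sds key = sdsnew("user:1001");  /* 9 bytes -> key part is 9 + 3 = 12 */
robj *val = createStringObjectWithKeyAndExpire("alice", 5, key, 1700000000000LL);
/* 16 (robj) + 12 (key) + 8 (expire) + 9 (embstr hdr + value + term) = 45,
 * which is <= 64, so everything lands in a single allocation. */
serverAssert(val->encoding == OBJ_ENCODING_EMBSTR);
serverAssert(sdscmp(objectGetKey(val), key) == 0);
serverAssert(objectGetExpire(val) == 1700000000000LL);
sdsfree(key); /* the object holds its own embedded copy */
/* Changing the expire may reallocate: always keep the returned pointer. */
val = objectSetExpire(val, -1);
decrRefCount(val);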
*/ + serverPanic("Not implemented"); + } + robj *new = createObjectWithKeyAndExpire(val->type, ptr, key, expire); + new->encoding = val->encoding; + new->lru = val->lru; + decrRefCount(val); + return new; +} + /* Same as CreateRawStringObject, can return NULL if allocation fails */ robj *tryCreateRawStringObject(const char *ptr, size_t len) { sds str = sdstrynewlen(ptr, len); @@ -179,18 +365,10 @@ robj *createStringObjectFromLongLong(long long value) { return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); } -/* The function avoids returning a shared integer when LFU/LRU info - * are needed, that is, when the object is used as a value in the key - * space(for instance when the INCR command is used), and the server is - * configured to evict based on LFU/LRU, so we want LFU/LRU values - * specific for each key. */ +/* The function doesn't return a shared integer when the object is used as a + * value in the key space (for instance when the INCR command is used). */ robj *createStringObjectFromLongLongForValue(long long value) { - if (server.maxmemory == 0 || !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) { - /* If the maxmemory policy permits, we can still return shared integers */ - return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); - } else { - return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_SHARED); - } + return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_SHARED); } /* Create a string object that contains an sds inside it. That means it can't be @@ -251,9 +429,9 @@ robj *createListListpackObject(void) { } robj *createSetObject(void) { - dict *d = dictCreate(&setDictType); - robj *o = createObject(OBJ_SET, d); - o->encoding = OBJ_ENCODING_HT; + hashtable *ht = hashtableCreate(&setHashtableType); + robj *o = createObject(OBJ_SET, ht); + o->encoding = OBJ_ENCODING_HASHTABLE; return o; } @@ -328,7 +506,7 @@ void freeListObject(robj *o) { void freeSetObject(robj *o) { switch (o->encoding) { - case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; + case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break; case OBJ_ENCODING_INTSET: case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown set encoding type"); @@ -381,15 +559,17 @@ void incrRefCount(robj *o) { void decrRefCount(robj *o) { if (o->refcount == 1) { - switch (o->type) { - case OBJ_STRING: freeStringObject(o); break; - case OBJ_LIST: freeListObject(o); break; - case OBJ_SET: freeSetObject(o); break; - case OBJ_ZSET: freeZsetObject(o); break; - case OBJ_HASH: freeHashObject(o); break; - case OBJ_MODULE: freeModuleObject(o); break; - case OBJ_STREAM: freeStreamObject(o); break; - default: serverPanic("Unknown object type"); break; + if (o->ptr != NULL) { + switch (o->type) { + case OBJ_STRING: freeStringObject(o); break; + case OBJ_LIST: freeListObject(o); break; + case OBJ_SET: freeSetObject(o); break; + case OBJ_ZSET: freeZsetObject(o); break; + case OBJ_HASH: freeHashObject(o); break; + case OBJ_MODULE: freeModuleObject(o); break; + case OBJ_STREAM: freeStreamObject(o); break; + default: serverPanic("Unknown object type"); break; + } } zfree(o); } else { @@ -398,9 +578,14 @@ void decrRefCount(robj *o) { } } -/* See dismissObject() */ +/* See dismissObject(). sds is an exception, because the allocation + * size is known. Instead of dismissing it with madvise(MADV_DONTNEED) + * we free it via the allocator, which has minimal overhead when the + * size is known. 
This has the advantage that it allows the allocator to + * accumulate free buffers to free whole pages, while madvise is a no-op + * if the buffer is smaller than a page. */ void dismissSds(sds s) { - dismissMemory(sdsAllocPtr(s), sdsAllocSize(s)); + sdsfree(s); } /* See dismissObject() */ @@ -437,23 +622,23 @@ void dismissListObject(robj *o, size_t size_hint) { /* See dismissObject() */ void dismissSetObject(robj *o, size_t size_hint) { - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - serverAssert(dictSize(set) != 0); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + serverAssert(hashtableSize(ht) != 0); /* We iterate all nodes only when average member size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ - if (size_hint / dictSize(set) >= server.page_size) { - dictEntry *de; - dictIterator *di = dictGetIterator(set); - while ((de = dictNext(di)) != NULL) { - dismissSds(dictGetKey(de)); + if (size_hint / hashtableSize(ht) >= server.page_size) { + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + sds item = next; + dismissSds(item); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } - /* Dismiss hash table memory. */ - dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -543,7 +728,7 @@ void dismissStreamObject(robj *o, size_t size_hint) { * modifies any keys due to write traffic, it'll cause CoW which consume * physical memory. In the child process, after serializing the key and value, * the data is definitely not accessed again, so to avoid unnecessary CoW, we - * try to release their memory back to OS. see dismissMemory(). + * try to release their memory back to OS. see zmadvise_dontneed(). * * Because of the cost of iterating all node/field/member/entry of complex data * types, we iterate and dismiss them only when approximate average we estimate @@ -574,13 +759,6 @@ void dismissObject(robj *o, size_t size_hint) { #endif } -/* This variant of decrRefCount() gets its argument as void, and is useful - * as free method in data structures that expect a 'void free_object(void*)' - * prototype for the free method. */ -void decrRefCountVoid(void *o) { - decrRefCount(o); -} - int checkType(client *c, robj *o, int type) { /* A NULL is considered an empty key */ if (o && o->type != type) { @@ -648,23 +826,15 @@ robj *tryObjectEncodingEx(robj *o, int try_trim) { * representable as a 32 nor 64 bit integer. */ len = sdslen(s); if (len <= 20 && string2l(s, len, &value)) { - /* This object is encodable as a long. Try to use a shared object. - * Note that we avoid using shared integers when maxmemory is used - * because every object needs to have a private LRU field for the LRU - * algorithm to work well. */ - if (canUseSharedObject() && value >= 0 && value < OBJ_SHARED_INTEGERS) { + /* This object is encodable as a long.
*/ + if (o->encoding == OBJ_ENCODING_RAW) { + sdsfree(o->ptr); + o->encoding = OBJ_ENCODING_INT; + o->ptr = (void *)value; + return o; + } else if (o->encoding == OBJ_ENCODING_EMBSTR) { decrRefCount(o); - return shared.integers[value]; - } else { - if (o->encoding == OBJ_ENCODING_RAW) { - sdsfree(o->ptr); - o->encoding = OBJ_ENCODING_INT; - o->ptr = (void *)value; - return o; - } else if (o->encoding == OBJ_ENCODING_EMBSTR) { - decrRefCount(o); - return createStringObjectFromLongLongForValue(value); - } + return createStringObjectFromLongLongForValue(value); } } @@ -939,6 +1109,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_RAW: return "raw"; case OBJ_ENCODING_INT: return "int"; case OBJ_ENCODING_HT: return "hashtable"; + case OBJ_ENCODING_HASHTABLE: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; case OBJ_ENCODING_INTSET: return "intset"; @@ -990,17 +1161,20 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) { - d = o->ptr; - di = dictGetIterator(d); - asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); - while ((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - elesize += dictEntryMemUsage(de) + sdsAllocSize(ele); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + asize = sizeof(*o) + hashtableMemUsage(ht); + + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next) && samples < sample_size) { + sds element = next; + elesize += sdsAllocSize(element); samples++; } - dictReleaseIterator(di); - if (samples) asize += (double)elesize / samples * dictSize(d); + hashtableResetIterator(&iter); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -1194,7 +1368,7 @@ struct serverMemOverhead *getMemoryOverheadData(void) { for (j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; - if (!kvstoreNumAllocatedDicts(db->keys)) continue; + if (!kvstoreNumAllocatedHashtables(db->keys)) continue; unsigned long long keyscount = kvstoreSize(db->keys); @@ -1216,8 +1390,8 @@ struct serverMemOverhead *getMemoryOverheadData(void) { mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->expires); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->keys); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->expires); - mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->keys); - mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->expires); + mh->db_dict_rehashing_count += kvstoreHashtableRehashingCount(db->keys); + mh->db_dict_rehashing_count += kvstoreHashtableRehashingCount(db->expires); } mh->overhead_total = mem_total; @@ -1515,7 +1689,6 @@ void memoryCommand(client *c) { }; addReplyHelp(c, help); } else if (!strcasecmp(c->argv[1]->ptr, "usage") && c->argc >= 3) { - dictEntry *de; long long samples = OBJ_COMPUTE_SIZE_DEF_SAMPLES; for (int j = 3; j < c->argc; j++) { if (!strcasecmp(c->argv[j]->ptr, "samples") && j + 1 < c->argc) { @@ -1531,12 +1704,12 @@ void memoryCommand(client *c) { return; } } - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + robj *obj = dbFind(c->db, c->argv[2]->ptr); + if (obj == NULL) 
{ addReplyNull(c); return; } - size_t usage = objectComputeSize(c->argv[2], dictGetVal(de), samples, c->db->id); - usage += dictEntryMemUsage(de); + size_t usage = objectComputeSize(c->argv[2], obj, samples, c->db->id); addReplyLongLong(c, usage); } else if (!strcasecmp(c->argv[1]->ptr, "stats") && c->argc == 2) { struct serverMemOverhead *mh = getMemoryOverheadData(); diff --git a/src/pubsub.c b/src/pubsub.c index 5b037b5721..3781fa39aa 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -258,7 +258,6 @@ void unmarkClientAsPubSub(client *c) { /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { - dictEntry *de, *existing; dict *clients = NULL; int retval = 0; unsigned int slot = 0; @@ -272,15 +271,18 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { slot = getKeySlot(channel->ptr); } - de = kvstoreDictAddRaw(*type.serverPubSubChannels, slot, channel, &existing); - - if (existing) { - clients = dictGetVal(existing); - channel = dictGetKey(existing); + hashtablePosition pos; + void *existing; + if (!kvstoreHashtableFindPositionForInsert(*type.serverPubSubChannels, slot, channel, &pos, &existing)) { + clients = existing; + channel = *(robj **)dictMetadata(clients); } else { + /* Store pointer to channel name in the dict's metadata. */ clients = dictCreate(&clientDictType); - kvstoreDictSetVal(*type.serverPubSubChannels, slot, de, clients); + *(robj **)dictMetadata(clients) = channel; incrRefCount(channel); + /* Insert this dict in the kvstore at the position returned above. */ + kvstoreHashtableInsertAtPosition(*type.serverPubSubChannels, slot, clients, &pos); } serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); @@ -295,7 +297,6 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype type) { - dictEntry *de; dict *clients; int retval = 0; int slot = 0; @@ -309,15 +310,16 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty if (server.cluster_enabled && type.shard) { slot = getKeySlot(channel->ptr); } - de = kvstoreDictFind(*type.serverPubSubChannels, slot, channel); - serverAssertWithInfo(c, NULL, de != NULL); - clients = dictGetVal(de); + void *found; + kvstoreHashtableFind(*type.serverPubSubChannels, slot, channel, &found); + serverAssertWithInfo(c, NULL, found); + clients = found; serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); if (dictSize(clients) == 0) { /* Free the dict and associated hash entry at all if this was * the latest client, so that it will be possible to abuse * PUBSUB creating millions of channels. */ - kvstoreDictDelete(*type.serverPubSubChannels, slot, channel); + kvstoreHashtableDelete(*type.serverPubSubChannels, slot, channel); } } /* Notify the client */ @@ -330,13 +332,13 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty /* Unsubscribe all shard channels in a slot. 
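Since the kvstore hashtable stores only one pointer per entry, the subscriber dict itself is the entry and the channel name rides along in the dict's metadata. The pattern in both directions (this assumes clientDictType reserves sizeof(robj *) of dict metadata, which is declared outside this hunk):

/* Creation side: the dict becomes the kvstore element, with the channel
 * robj stashed in its metadata as a back-pointer. */
dict *clients = dictCreate(&clientDictType);
*(robj **)dictMetadata(clients) = channel;
incrRefCount(channel);

/* Iteration side: recover the channel name from any subscriber dict. */
robj *ch = *(robj **)dictMetadata(clients);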
*/ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { - if (!kvstoreDictSize(server.pubsubshard_channels, slot)) return; + if (!kvstoreHashtableSize(server.pubsubshard_channels, slot)) return; - kvstoreDictIterator *kvs_di = kvstoreGetDictSafeIterator(server.pubsubshard_channels, slot); - dictEntry *de; - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - robj *channel = dictGetKey(de); - dict *clients = dictGetVal(de); + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableSafeIterator(server.pubsubshard_channels, slot); + void *element; + while (kvstoreHashtableIteratorNext(kvs_di, &element)) { + dict *clients = element; + robj *channel = *(robj **)dictMetadata(clients); /* For each client subscribed to the channel, unsubscribe it. */ dictIterator *iter = dictGetIterator(clients); dictEntry *entry; @@ -352,9 +354,9 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { } } dictReleaseIterator(iter); - kvstoreDictDelete(server.pubsubshard_channels, slot, channel); + kvstoreHashtableDelete(server.pubsubshard_channels, slot, channel); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to @@ -474,6 +476,7 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify) { */ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) { int receivers = 0; + void *element; dictEntry *de; dictIterator *di; int slot = -1; @@ -482,9 +485,8 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) if (server.cluster_enabled && type.shard) { slot = keyHashSlot(channel->ptr, sdslen(channel->ptr)); } - de = kvstoreDictFind(*type.serverPubSubChannels, (slot == -1) ? 0 : slot, channel); - if (de) { - dict *clients = dictGetVal(de); + if (kvstoreHashtableFind(*type.serverPubSubChannels, (slot == -1) ? 0 : slot, channel, &element)) { + dict *clients = element; dictEntry *entry; dictIterator *iter = dictGetIterator(clients); while ((entry = dictNext(iter)) != NULL) { @@ -650,8 +652,9 @@ void pubsubCommand(client *c) { addReplyArrayLen(c, (c->argc - 2) * 2); for (j = 2; j < c->argc; j++) { - dict *d = kvstoreDictFetchValue(server.pubsub_channels, 0, c->argv[j]); - + void *found = NULL; + kvstoreHashtableFind(server.pubsub_channels, 0, c->argv[j], &found); + dict *d = found; addReplyBulk(c, c->argv[j]); addReplyLongLong(c, d ? dictSize(d) : 0); } @@ -669,8 +672,9 @@ void pubsubCommand(client *c) { for (j = 2; j < c->argc; j++) { sds key = c->argv[j]->ptr; unsigned int slot = server.cluster_enabled ? keyHashSlot(key, (int)sdslen(key)) : 0; - dict *clients = kvstoreDictFetchValue(server.pubsubshard_channels, slot, c->argv[j]); - + void *found = NULL; + kvstoreHashtableFind(server.pubsubshard_channels, slot, c->argv[j], &found); + dict *clients = found; addReplyBulk(c, c->argv[j]); addReplyLongLong(c, clients ? 
dictSize(clients) : 0); } @@ -682,15 +686,16 @@ void pubsubCommand(client *c) { void channelList(client *c, sds pat, kvstore *pubsub_channels) { long mblen = 0; void *replylen; - unsigned int slot_cnt = kvstoreNumDicts(pubsub_channels); + unsigned int slot_cnt = kvstoreNumHashtables(pubsub_channels); replylen = addReplyDeferredLen(c); for (unsigned int i = 0; i < slot_cnt; i++) { - if (!kvstoreDictSize(pubsub_channels, i)) continue; - kvstoreDictIterator *kvs_di = kvstoreGetDictIterator(pubsub_channels, i); - dictEntry *de; - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - robj *cobj = dictGetKey(de); + if (!kvstoreHashtableSize(pubsub_channels, i)) continue; + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableIterator(pubsub_channels, i); + void *next; + while (kvstoreHashtableIteratorNext(kvs_di, &next)) { + dict *clients = next; + robj *cobj = *(robj **)dictMetadata(clients); sds channel = cobj->ptr; if (!pat || stringmatchlen(pat, sdslen(pat), channel, sdslen(channel), 0)) { @@ -698,7 +703,7 @@ void channelList(client *c, sds pat, kvstore *pubsub_channels) { mblen++; } } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } setDeferredArrayLen(c, replylen, mblen); } diff --git a/src/rdb.c b/src/rdb.c index 1c200e54f5..5fb77a2897 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -49,6 +49,9 @@ #include #include +/* Size of the static buffer used for rdbcompression */ +#define LZF_STATIC_BUFFER_SIZE (8 * 1024) + /* This macro is called when the internal RDB structure is corrupt */ #define rdbReportCorruptRDB(...) rdbReportError(1, __LINE__, __VA_ARGS__) /* This macro is called when RDB read failed (possibly a short read) */ @@ -388,18 +391,20 @@ ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, size_t origina ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { size_t comprlen, outlen; void *out; + static void *buffer = NULL; /* We require at least four bytes compression for this to be worth it */ if (len <= 4) return 0; outlen = len - 4; - if ((out = zmalloc(outlen + 1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { - zfree(out); - return 0; + if (outlen < LZF_STATIC_BUFFER_SIZE) { + if (!buffer) buffer = zmalloc(LZF_STATIC_BUFFER_SIZE); + out = buffer; + } else { + if ((out = zmalloc(outlen + 1)) == NULL) return 0; } - ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); - zfree(out); + comprlen = lzf_compress(s, len, out, outlen); + ssize_t nwritten = comprlen ? 
rdbSaveLzfBlob(rdb, out, comprlen, len) : 0; + if (out != buffer) zfree(out); return nwritten; } @@ -687,7 +692,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_SET: if (o->encoding == OBJ_ENCODING_INTSET) return rdbSaveType(rdb, RDB_TYPE_SET_INTSET); - else if (o->encoding == OBJ_ENCODING_HT) + else if (o->encoding == OBJ_ENCODING_HASHTABLE) return rdbSaveType(rdb, RDB_TYPE_SET); else if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb, RDB_TYPE_SET_LISTPACK); @@ -871,26 +876,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { } } else if (o->type == OBJ_SET) { /* Save a set value */ - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *set = o->ptr; - if ((n = rdbSaveLen(rdb, dictSize(set))) == -1) { - dictReleaseIterator(di); + if ((n = rdbSaveLen(rdb, hashtableSize(set))) == -1) { return -1; } nwritten += n; - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); + hashtableIterator iterator; + hashtableInitIterator(&iterator, set); + void *next; + while (hashtableNext(&iterator, &next)) { + sds ele = next; if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iterator); return -1; } nwritten += n; } - dictReleaseIterator(di); + hashtableResetIterator(&iterator); } else if (o->encoding == OBJ_ENCODING_INTSET) { size_t l = intsetBlobLen((intset *)o->ptr); @@ -1316,7 +1321,6 @@ ssize_t rdbSaveFunctions(rio *rdb) { } ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { - dictEntry *de; ssize_t written = 0; ssize_t res; kvstoreIterator *kvs_it = NULL; @@ -1345,12 +1349,14 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { kvs_it = kvstoreIteratorInit(db->keys); int last_slot = -1; /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - int curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; + int curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); /* Save slot info. 
*/ if (server.cluster_enabled && curr_slot != last_slot) { - sds slot_info = sdscatprintf(sdsempty(), "%i,%lu,%lu", curr_slot, kvstoreDictSize(db->keys, curr_slot), - kvstoreDictSize(db->expires, curr_slot)); + sds slot_info = sdscatprintf(sdsempty(), "%i,%lu,%lu", curr_slot, kvstoreHashtableSize(db->keys, curr_slot), + kvstoreHashtableSize(db->expires, curr_slot)); if ((res = rdbSaveAuxFieldStrStr(rdb, "slot-info", slot_info)) < 0) { sdsfree(slot_info); goto werr; @@ -1359,8 +1365,8 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { last_slot = curr_slot; sdsfree(slot_info); } - sds keystr = dictGetKey(de); - robj key, *o = dictGetVal(de); + sds keystr = objectGetKey(o); + robj key; long long expire; size_t rdb_bytes_before_key = rdb->processed_bytes; @@ -1903,8 +1909,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + if (!hashtableTryExpand(o->ptr, len)) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; } @@ -1943,8 +1949,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { * of many small ones. It's OK since lpSafeToAdd doesn't * care about individual elements, only the total size. */ setTypeConvert(o, OBJ_ENCODING_LISTPACK); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1964,8 +1970,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { return NULL; } o->ptr = lpAppend(o->ptr, (unsigned char *)sdsele, elelen); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1974,8 +1980,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. 
*/ - if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict *)o->ptr, sdsele, NULL) != DICT_OK) { + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableAdd((hashtable *)o->ptr, sdsele)) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2350,7 +2356,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; - if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; @@ -2370,7 +2376,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); goto emptykey; } - if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_ZSET_ZIPLIST: { unsigned char *lp = lpNew(encoded_len); @@ -3141,8 +3147,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (server.cluster_enabled) { /* In cluster mode we resize individual slot specific dictionaries based on the number of keys that * slot holds. */ - kvstoreDictExpand(db->keys, slot_id, slot_size); - kvstoreDictExpand(db->expires, slot_id, expires_slot_size); + kvstoreHashtableExpand(db->keys, slot_id, slot_size); + kvstoreHashtableExpand(db->expires, slot_id, expires_slot_size); should_expand_db = 0; } } else { @@ -3300,7 +3306,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin initStaticStringObject(keyobj, key); /* Add the new object in the hash table */ - int added = dbAddRDBLoad(db, key, val); + int added = dbAddRDBLoad(db, key, &val); server.rdb_last_load_keys_loaded++; if (!added) { if (rdbflags & RDBFLAGS_ALLOW_DUP) { @@ -3308,7 +3314,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin * When it's set we allow new keys to replace the current * keys with the same name. */ dbSyncDelete(db, &keyobj); - dbAddRDBLoad(db, key, val); + added = dbAddRDBLoad(db, key, &val); + serverAssert(added); } else { serverLog(LL_WARNING, "RDB has duplicated key '%s' in DB %d", key, db->id); serverPanic("Duplicated key found in RDB file"); @@ -3317,7 +3324,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin /* Set the expire time if needed */ if (expiretime != -1) { - setExpire(NULL, db, &keyobj, expiretime); + val = setExpire(NULL, db, &keyobj, expiretime); } /* Set usage information (for eviction). 
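The loader changes follow from objectSetExpire()/objectSetKeyAndExpire() in object.c: storing the key or a TTL into the value may reallocate the robj, so dbAddRDBLoad() now takes robj ** and setExpire() hands back the (possibly new) pointer. Condensed, the calling convention looks like this (sketch, error handling elided):

robj *val = rdbLoadObject(type, rdb, key, db->id, &error);
if (dbAddRDBLoad(db, key, &val) == 0) { /* may rewrite 'val' in place */
    /* duplicate-key handling elided */
}
if (expiretime != -1) {
    val = setExpire(NULL, db, &keyobj, expiretime); /* may realloc 'val' */
}
/* From here on, only the latest 'val' pointer may be used. */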
*/ diff --git a/src/rdma.c b/src/rdma.c index 7cdcb24913..7fe65ad2d2 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -10,9 +10,10 @@ #define VALKEYMODULE_CORE_MODULE #include "server.h" - -#if defined USE_RDMA && defined __linux__ /* currently RDMA is only supported on Linux */ #include "connection.h" + +#if defined __linux__ /* currently RDMA is only supported on Linux */ +#if (USE_RDMA == 1 /* BUILD_YES */) || ((USE_RDMA == 2 /* BUILD_MODULE */) && (BUILD_RDMA_MODULE == 2)) #include "connhelpers.h" #include @@ -76,9 +77,12 @@ typedef enum ValkeyRdmaOpcode { #define VALKEY_RDMA_INVALID_OPCODE 0xffff #define VALKEY_RDMA_KEEPALIVE_MS 3000 +#define RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 0) + typedef struct rdma_connection { connection c; struct rdma_cm_id *cm_id; + int flags; int last_errno; listNode *pending_list_node; } rdma_connection; @@ -128,12 +132,10 @@ typedef struct rdma_listener { static list *pending_list; static rdma_listener *rdma_listeners; +static serverRdmaContextConfig *rdma_config; static ConnectionType CT_RDMA; -static int valkey_rdma_rx_size = VALKEY_RDMA_DEFAULT_RX_SIZE; -static int valkey_rdma_comp_vector = -1; /* -1 means a random one */ - static void serverRdmaError(char *err, const char *fmt, ...) { va_list ap; @@ -272,7 +274,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) { /* setup recv buf & MR */ access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE; - length = valkey_rdma_rx_size; + length = rdma_config->rx_size; ctx->rx.addr = page_aligned_zalloc(length); ctx->rx.length = length; ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access); @@ -295,6 +297,7 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { struct ibv_comp_channel *comp_channel = NULL; struct ibv_cq *cq = NULL; struct ibv_pd *pd = NULL; + int comp_vector = rdma_config->completion_vector; if (ibv_query_device(cm_id->verbs, &device_attr)) { serverLog(LL_WARNING, "RDMA: ibv ibv query device failed"); @@ -317,8 +320,13 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { ctx->comp_channel = comp_channel; + /* negative number means a random one */ + if (comp_vector < 0) { + comp_vector = abs((int)random()); + } + cq = ibv_create_cq(cm_id->verbs, VALKEY_RDMA_MAX_WQE * 2, NULL, comp_channel, - valkey_rdma_comp_vector % cm_id->verbs->num_comp_vectors); + comp_vector % cm_id->verbs->num_comp_vectors); if (!cq) { serverLog(LL_WARNING, "RDMA: ibv create cq failed"); return C_ERR; @@ -688,7 +696,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* uplayer should read all */ - while (ctx->rx.pos < ctx->rx.offset) { + while (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->rx.pos < ctx->rx.offset) { if (conn->read_handler && (callHandler(conn, conn->read_handler) == C_ERR)) { return; } @@ -700,7 +708,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* RDMA comp channel has no POLLOUT event, try to send remaining buffer */ - if ((ctx->tx.offset < ctx->tx.length) && conn->write_handler) { + if (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->tx.offset < ctx->tx.length && conn->write_handler) { callHandler(conn, conn->write_handler); } } @@ -879,6 +887,9 @@ static void connRdmaAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m } static int connRdmaSetRwHandler(connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (rdma_conn->flags & 
RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) return C_OK; + /* IB channel only has POLLIN event */ if (conn->read_handler || conn->write_handler) { if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE, conn->type->ae_handler, conn) == AE_ERR) { @@ -1610,9 +1621,28 @@ int connRdmaListen(connListener *listener) { rdma_listener++; } + rdma_config = listener->priv; return C_OK; } +static void connRdmaCloseListener(connListener *listener) { + /* Close old servers */ + for (int i = 0; i < listener->count; i++) { + if (listener->fd[i] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); + listener->fd[i] = -1; + struct rdma_listener *rdma_listener = &rdma_listeners[i]; + rdma_destroy_id(rdma_listener->cm_id); + rdma_destroy_event_channel(rdma_listener->cm_channel); + } + + listener->count = 0; + zfree(rdma_listeners); + rdma_listeners = NULL; + rdma_config = NULL; +} + static int connRdmaAddr(connection *conn, char *ip, size_t ip_len, int *port, int remote) { rdma_connection *rdma_conn = (rdma_connection *)conn; struct rdma_cm_id *cm_id = rdma_conn->cm_id; @@ -1697,12 +1727,12 @@ static int rdmaProcessPendingData(void) { listNode *ln; rdma_connection *rdma_conn; connection *conn; - int processed; + int processed = 0; - processed = listLength(pending_list); listRewind(pending_list, &li); while ((ln = listNext(&li))) { rdma_conn = listNodeValue(ln); + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) continue; conn = &rdma_conn->c; /* a connection can be disconnected by remote peer, CM event mark state as CONN_STATE_CLOSED, kick connection @@ -1717,15 +1747,32 @@ static int rdmaProcessPendingData(void) { callHandler(conn, conn->write_handler); } + ++processed; continue; } connRdmaEventHandler(NULL, -1, rdma_conn, 0); + ++processed; } return processed; } +static void postPoneUpdateRdmaState(struct connection *conn, int postpone) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (postpone) { + rdma_conn->flags |= RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } else { + rdma_conn->flags &= ~RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } +} + +static void updateRdmaState(struct connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + connRdmaSetRwHandler(conn); + connRdmaEventHandler(NULL, -1, rdma_conn, 0); +} + static ConnectionType CT_RDMA = { /* connection type */ .get_type = connRdmaGetType, @@ -1740,6 +1787,7 @@ static ConnectionType CT_RDMA = { //.cluster_accept_handler = NULL, .is_local = connRdmaIsLocal, .listen = connRdmaListen, + .closeListener = connRdmaCloseListener, .addr = connRdmaAddr, /* create/close connection */ @@ -1767,19 +1815,10 @@ static ConnectionType CT_RDMA = { /* pending data */ .has_pending_data = rdmaHasPendingData, .process_pending_data = rdmaProcessPendingData, + .postpone_update_state = postPoneUpdateRdmaState, + .update_state = updateRdmaState, }; -static struct connListener *rdmaListener(void) { - static struct connListener *listener = NULL; - - if (listener) return listener; - - listener = listenerByType(CONN_TYPE_RDMA); - serverAssert(listener != NULL); - - return listener; -} - ConnectionType *connectionTypeRdma(void) { static ConnectionType *ct_rdma = NULL; @@ -1791,133 +1830,28 @@ ConnectionType *connectionTypeRdma(void) { return ct_rdma; } -/* rdma listener has different create/close logic from TCP, we can't re-use 'int changeListener(connListener *listener)' - * directly */ -static int rdmaChangeListener(void) { - struct connListener *listener = rdmaListener(); - - /* Close old servers */ - for (int i = 
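These hooks plug RDMA into the IO-thread handoff protocol that processIOThreadsReadDone() drives in networking.c: while a connection is parked in an IO thread, the event handler, connRdmaSetRwHandler() and rdmaProcessPendingData() all back off, and updateRdmaState() replays the skipped work once the main thread takes the connection back. Roughly (the postpone(1) call site is assumed; only the 0 case appears in this section):

/* Main thread, before handing the connection to an IO thread: */
connSetPostponeUpdateState(c->conn, 1);  /* sets RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE */

/* ... IO thread works on the connection; RDMA handlers back off ... */

/* Main thread, when taking the connection back: */
connSetPostponeUpdateState(c->conn, 0);
connUpdateState(c->conn);  /* -> updateRdmaState(): reinstall rw handlers
                            * and drain anything that arrived meanwhile */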
0; i < listener->count; i++) { - if (listener->fd[i] == -1) continue; - - aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); - listener->fd[i] = -1; - struct rdma_listener *rdma_listener = &rdma_listeners[i]; - rdma_destroy_id(rdma_listener->cm_id); - rdma_destroy_event_channel(rdma_listener->cm_channel); - } - - listener->count = 0; - zfree(rdma_listeners); - rdma_listeners = NULL; - - closeListener(listener); - - /* Just close the server if port disabled */ - if (listener->port == 0) { - if (server.set_proc_title) serverSetProcTitle(NULL); - return VALKEYMODULE_OK; - } - - /* Re-create listener */ - if (connListen(listener) != C_OK) { - return VALKEYMODULE_ERR; - } - - /* Create event handlers */ - if (createSocketAcceptHandler(listener, listener->ct->accept_handler) != C_OK) { - serverPanic("Unrecoverable error creating %s accept handler.", listener->ct->get_type(NULL)); - } - - if (server.set_proc_title) serverSetProcTitle(NULL); - - return VALKEYMODULE_OK; -} - -#ifdef BUILD_RDMA_MODULE - -#include "release.h" - -static long long rdmaGetPort(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - struct connListener *listener = rdmaListener(); - - return listener->port; +int RegisterConnectionTypeRdma(void) { + return connTypeRegister(&CT_RDMA); } -static int rdmaSetPort(const char *name, long long val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(privdata); - UNUSED(err); - struct connListener *listener = rdmaListener(); - listener->port = val; - - return VALKEYMODULE_OK; -} - -static ValkeyModuleString *rdma_bind; - -static void rdmaBuildBind(void *ctx) { - struct connListener *listener = rdmaListener(); - - if (rdma_bind) ValkeyModule_FreeString(NULL, rdma_bind); +#else - sds rdma_bind_str = sdsjoin(listener->bindaddr, listener->bindaddr_count, " "); - rdma_bind = ValkeyModule_CreateString(ctx, rdma_bind_str, sdslen(rdma_bind_str)); -} - -static ValkeyModuleString *rdmaGetBind(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - - return rdma_bind; +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s not builtin", CONN_TYPE_RDMA); + return C_ERR; } -static int rdmaSetBind(const char *name, ValkeyModuleString *val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(err); - struct connListener *listener = rdmaListener(); - const char *bind = ValkeyModule_StringPtrLen(val, NULL); - int nexts; - sds *exts = sdssplitlen(bind, strlen(bind), " ", 1, &nexts); - - if (nexts > CONFIG_BINDADDR_MAX) { - serverLog(LL_WARNING, "RDMA: Unsupported bind ( > %d)", CONFIG_BINDADDR_MAX); - return VALKEYMODULE_ERR; - } - - /* Free old bind addresses */ - for (int j = 0; j < listener->bindaddr_count; j++) { - zfree(listener->bindaddr[j]); - } - - for (int j = 0; j < nexts; j++) listener->bindaddr[j] = zstrdup(exts[j]); - listener->bindaddr_count = nexts; - - sdsfreesplitres(exts, nexts); - rdmaBuildBind(privdata); - - return VALKEYMODULE_OK; -} +#endif -static int rdmaApplyListener(ValkeyModuleCtx *ctx, void *privdata, ValkeyModuleString **err) { - UNUSED(ctx); - UNUSED(privdata); - UNUSED(err); +#if BUILD_RDMA_MODULE == 2 /* BUILD_MODULE */ - return rdmaChangeListener(); -} +#include "release.h" -static void rdmaListenerAddConfig(void *ctx) { - serverAssert(ValkeyModule_RegisterNumericConfig(ctx, "port", 0, VALKEYMODULE_CONFIG_DEFAULT, 0, 65535, rdmaGetPort, - rdmaSetPort, rdmaApplyListener, NULL) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_RegisterStringConfig(ctx, "bind", "", 
VALKEYMODULE_CONFIG_DEFAULT, rdmaGetBind, - rdmaSetBind, rdmaApplyListener, ctx) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_LoadConfigs(ctx) == VALKEYMODULE_OK); -} int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + UNUSED(argc); + /* Connection modules MUST be part of the same build as valkey. */ if (strcmp(REDIS_BUILD_ID_RAW, serverBuildIdRaw())) { serverLog(LL_NOTICE, "Connection type %s was not built together with the valkey-server used.", CONN_TYPE_RDMA); @@ -1936,40 +1870,6 @@ int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { if (connTypeRegister(&CT_RDMA) != C_OK) return VALKEYMODULE_ERR; - rdmaListenerAddConfig(ctx); - - struct connListener *listener = rdmaListener(); - listener->ct = connectionTypeRdma(); - listener->bindaddr = zcalloc_num(CONFIG_BINDADDR_MAX, sizeof(listener->bindaddr[0])); - - for (int i = 0; i < argc; i++) { - robj *str = (robj *)argv[i]; - int nexts; - sds *exts = sdssplitlen(str->ptr, strlen(str->ptr), "=", 1, &nexts); - if (nexts != 2) { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - if (!strcasecmp(exts[0], "bind")) { - listener->bindaddr[listener->bindaddr_count++] = zstrdup(exts[1]); - } else if (!strcasecmp(exts[0], "port")) { - listener->port = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "rx-size")) { - valkey_rdma_rx_size = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "comp-vector")) { - valkey_rdma_comp_vector = atoi(exts[1]); - } else { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - sdsfreesplitres(exts, nexts); - } - - rdmaBuildBind(ctx); - if (valkey_rdma_comp_vector == -1) valkey_rdma_comp_vector = abs((int)random()); - return VALKEYMODULE_OK; } @@ -1981,4 +1881,11 @@ int ValkeyModule_OnUnload(void *arg) { #endif /* BUILD_RDMA_MODULE */ -#endif /* USE_RDMA && __linux__ */ +#else /* __linux__ */ + +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s is supported on Linux only", CONN_TYPE_RDMA); + return C_ERR; +} + +#endif /* __linux__ */ diff --git a/src/replication.c b/src/replication.c index 48f02cf658..3a207a1d0f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -227,9 +227,9 @@ void addRdbReplicaToPsyncWait(client *replica_rdb_client) { tail->refcount++; } } - serverLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", - replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, - tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); + dualChannelServerLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", + replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, + tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); replica_rdb_client->ref_repl_buf_node = tail ? ln : NULL; /* Prevent rdb client from being freed before psync is established. 
*/ replica_rdb_client->flag.protected_rdb_channel = 1; @@ -252,8 +252,8 @@ void backfillRdbReplicasToPsyncWait(void) { if (replica_rdb_client->ref_repl_buf_node) continue; replica_rdb_client->ref_repl_buf_node = ln; head->refcount++; - serverLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", - (long long unsigned int)replica_rdb_client->id); + dualChannelServerLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", + (long long unsigned int)replica_rdb_client->id); } raxStop(&iter); } @@ -271,10 +271,10 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { } replica_rdb_client->ref_repl_buf_node = NULL; replica_rdb_client->flag.protected_rdb_channel = 0; - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", - replicationGetReplicaName(replica_main_client), - (long long unsigned int)replica_main_client->associated_rdb_client_id, - o ? "ref count decreased" : "doesn't exist"); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", + replicationGetReplicaName(replica_main_client), + (long long unsigned int)replica_main_client->associated_rdb_client_id, + o ? "ref count decreased" : "doesn't exist"); uint64_t id = htonu64(replica_rdb_client->id); raxRemove(server.replicas_waiting_psync, (unsigned char *)&id, sizeof(id), NULL); } @@ -282,7 +282,7 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void *))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { @@ -391,8 +391,8 @@ void freeReplicaReferencedReplBuffer(client *replica) { if (replica->flag.repl_rdb_channel) { uint64_t rdb_cid = htonu64(replica->id); if (raxRemove(server.replicas_waiting_psync, (unsigned char *)&rdb_cid, sizeof(rdb_cid), NULL)) { - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", - replicationGetReplicaName(replica), (long long unsigned int)replica->id); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", + replicationGetReplicaName(replica), (long long unsigned int)replica->id); } } if (replica->ref_repl_buf_node != NULL) { @@ -1051,7 +1051,7 @@ void syncCommand(client *c) { } else { replicationUnsetPrimary(); } - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (failover request from '%s')", client); sdsfree(client); } else { @@ -1121,10 +1121,11 @@ void syncCommand(client *c) { * resync. */ if (primary_replid[0] != '?') server.stat_sync_partial_err++; if (c->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { - serverLog(LL_NOTICE, - "Replica %s is capable of dual channel synchronization, and partial sync isn't possible. " - "Full sync will continue with dedicated RDB channel.", - replicationGetReplicaName(c)); + dualChannelServerLog(LL_NOTICE, + "Replica %s is capable of dual channel synchronization, and partial sync " + "isn't possible. 
" + "Full sync will continue with dedicated RDB channel.", + replicationGetReplicaName(c)); const char *buf = "+DUALCHANNELSYNC\r\n"; if (connWrite(c->conn, buf, strlen(buf)) != (int)strlen(buf)) { freeClientAsync(c); @@ -1885,7 +1886,7 @@ void replicationSendNewlineToPrimary(void) { /* Callback used by emptyData() while flushing away old data to load * the new dataset received by the primary and by discardTempDb() * after loading succeeded or failed. */ -void replicationEmptyDbCallback(dict *d) { +void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); } @@ -1981,7 +1982,20 @@ serverDb *disklessLoadInitTempDb(void) { /* Helper function for readSyncBulkPayload() to discard our tempDb * when the loading succeeded or failed. */ void disklessLoadDiscardTempDb(serverDb *tempDb) { - discardTempDb(tempDb, replicationEmptyDbCallback); + discardTempDb(tempDb); +} + +/* Helper function for to initialize temp function lib context. + * The temp ctx may be populated by functionsLibCtxSwapWithCurrent or + * freed by disklessLoadDiscardFunctionsLibCtx later. */ +functionsLibCtx *disklessLoadFunctionsLibCtxCreate(void) { + return functionsLibCtxCreate(); +} + +/* Helper function to discard our temp function lib context + * when the loading succeeded or failed. */ +void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) { + freeFunctionsAsync(temp_functions_lib_ctx); } /* If we know we got an entirely different data set from our primary @@ -2091,8 +2105,7 @@ void readSyncBulkPayload(connection *conn) { } serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", (nread == -1) ? connGetLastError(conn) : "connection lost"); - cancelReplicationHandshake(1); - return; + goto error; } server.stat_net_repl_input_bytes += nread; @@ -2187,7 +2200,7 @@ void readSyncBulkPayload(connection *conn) { if (use_diskless_load && server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { /* Initialize empty tempDb dictionaries. */ diskless_load_tempDb = disklessLoadInitTempDb(); - temp_functions_lib_ctx = functionsLibCtxCreate(); + temp_functions_lib_ctx = disklessLoadFunctionsLibCtxCreate(); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_STARTED, NULL); } @@ -2227,7 +2240,6 @@ void readSyncBulkPayload(connection *conn) { dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); - functionsLibCtxClear(functions_lib_ctx); } rioInitWithConn(&rdb, conn, server.repl_transfer_size); @@ -2257,7 +2269,6 @@ void readSyncBulkPayload(connection *conn) { if (loadingFailed) { stopLoading(0); - cancelReplicationHandshake(1); rioFreeConn(&rdb, NULL); if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { @@ -2266,7 +2277,7 @@ void readSyncBulkPayload(connection *conn) { NULL); disklessLoadDiscardTempDb(diskless_load_tempDb); - functionsLibCtxFree(temp_functions_lib_ctx); + disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); } else { /* Remove the half-loaded data in case we started with an empty replica. */ @@ -2277,7 +2288,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on SYNC * failure, it'll be restarted when sync succeeds or the replica * gets promoted. */ - return; + goto error; } /* RDB loading succeeded if we reach this point. 
*/ @@ -2291,7 +2302,7 @@ void readSyncBulkPayload(connection *conn) { swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ - functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx); + functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx, 1); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_COMPLETED, NULL); @@ -2319,8 +2330,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync the temp DB to disk in " "PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* Rename rdb like renaming rewrite aof asynchronously. */ @@ -2330,9 +2340,8 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to rename the temp DB into %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); if (old_rdb_fd != -1) close(old_rdb_fd); - return; + goto error; } /* Close old rdb asynchronously. */ if (old_rdb_fd != -1) bioCreateCloseJob(old_rdb_fd, 0, 0); @@ -2343,8 +2352,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync DB directory %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* We will soon start loading the RDB from disk, the replication history is changed, @@ -2361,7 +2369,6 @@ void readSyncBulkPayload(connection *conn) { if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " "DB from disk, check server logs."); - cancelReplicationHandshake(1); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2375,7 +2382,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on sync failure, it'll be restarted when sync succeeds or replica promoted. */ - return; + goto error; } /* Cleanup. */ @@ -2398,10 +2405,10 @@ void readSyncBulkPayload(connection *conn) { } else { replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; /* Send the initial ACK immediately to put this replica in online state. */ replicationSendAck(); } - server.repl_down_since = 0; /* Fire the primary link modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); @@ -2559,7 +2566,7 @@ void freePendingReplDataBuf(void) { * provisional primary struct, and free local replication buffer. 
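The hunks above convert each `cancelReplicationHandshake(1); return;` pair in readSyncBulkPayload() into a jump to one shared error label, so the abort logic lives in a single place. A minimal sketch of that shape; readChunk() and persistChunk() are hypothetical stand-ins for the real transfer steps:

    /* Hypothetical step helpers, declared only to make the sketch whole. */
    static int readChunk(connection *conn);
    static int persistChunk(connection *conn);

    static void bulkLoadSketch(connection *conn) {
        if (readChunk(conn) != C_OK) goto error;    /* I/O failure */
        if (persistChunk(conn) != C_OK) goto error; /* disk failure */
        return;
    error:
        cancelReplicationHandshake(1); /* single cleanup site for all failures */
    }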
*/ void replicationAbortDualChannelSyncTransfer(void) { serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); - serverLog(LL_NOTICE, "Aborting dual channel sync"); + dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); if (server.repl_rdb_transfer_s) { connClose(server.repl_rdb_transfer_s); server.repl_rdb_transfer_s = NULL; @@ -2588,8 +2595,9 @@ int sendCurrentOffsetToReplica(client *replica) { int buflen; buflen = snprintf(buf, sizeof(buf), "$ENDOFF:%lld %s %d %llu\r\n", server.primary_repl_offset, server.replid, server.db->id, (long long unsigned int)replica->id); - serverLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", - replicationGetReplicaName(replica), server.primary_repl_offset, (long long unsigned int)replica->id); + dualChannelServerLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", + replicationGetReplicaName(replica), server.primary_repl_offset, + (long long unsigned int)replica->id); if (connSyncWrite(replica->conn, buf, buflen, server.repl_syncio_timeout * 1000) != buflen) { freeClientAsync(replica); return C_ERR; @@ -2598,7 +2606,7 @@ int sendCurrentOffsetToReplica(client *replica) { } static int dualChannelReplHandleHandshake(connection *conn, sds *err) { - serverLog(LL_DEBUG, "Received first reply from primary using rdb connection."); + dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[] = {"AUTH", NULL, NULL}; @@ -2614,7 +2622,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { argc++; *err = sendCommandArgv(conn, argc, args, lens); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } } @@ -2624,14 +2632,14 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { NULL); sdsfree(portstr); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; @@ -2640,11 +2648,11 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); + dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; } if ((*err)[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); + dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; @@ -2654,17 +2662,17 @@ static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { static int 
dualChannelReplHandleReplconfReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); + dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; } if (*err[0] == '-') { - serverLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", - *err); + dualChannelServerLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", + *err); return C_ERR; } if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); return C_ERR; } return C_OK; @@ -2678,7 +2686,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { } if (*err[0] == '\0') { /* Retry again later */ - serverLog(LL_DEBUG, "Received empty $ENDOFF response"); + dualChannelServerLog(LL_DEBUG, "Received empty $ENDOFF response"); return C_RETRY; } long long reploffset; @@ -2687,7 +2695,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Parse end offset response */ char *endoff_format = "$ENDOFF:%lld %40s %d %llu"; if (sscanf(*err, endoff_format, &reploffset, primary_replid, &dbid, &rdb_client_id) != 4) { - serverLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); + dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } server.rdb_client_id = rdb_client_id; @@ -2735,7 +2743,8 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", + connGetLastError(conn)); goto error; } switch (server.repl_rdb_channel_state) { @@ -2824,13 +2833,13 @@ int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t int nread = connRead(conn, data_block->buf + data_block->used, read); if (nread == -1) { if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); + dualChannelServerLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); cancelReplicationHandshake(1); } return C_ERR; } if (nread == 0) { - serverLog(LL_VERBOSE, "Provisional primary closed connection"); + dualChannelServerLog(LL_VERBOSE, "Provisional primary closed connection"); cancelReplicationHandshake(1); return C_ERR; } @@ -2859,7 +2868,7 @@ void bufferReplData(connection *conn) { if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { - serverLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); + dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. 
*/ connSetReadHandler(conn, NULL); break; @@ -2932,7 +2941,7 @@ void dualChannelSyncSuccess(void) { /* Wait for the accumulated buffer to be processed before reading any more replication updates */ if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { /* Sync session aborted during repl data streaming. */ - serverLog(LL_WARNING, "Failed to stream local replication buffer into memory"); + dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { replicationAbortDualChannelSyncTransfer(); @@ -2941,7 +2950,7 @@ void dualChannelSyncSuccess(void) { return; } freePendingReplDataBuf(); - serverLog(LL_NOTICE, "Successfully streamed replication data into memory"); + dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ replicationSteadyStateInit(); replicationSendAck(); /* Send ACK to notify primary that replica is synced */ @@ -2957,7 +2966,7 @@ int dualChannelSyncHandlePsync(void) { if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { - serverLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); + dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); cancelReplicationHandshake(1); return C_ERR; } @@ -2966,7 +2975,7 @@ int dualChannelSyncHandlePsync(void) { } serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ - serverLog(LL_DEBUG, "Dual channel sync - psync established after rdb load"); + dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); dualChannelSyncSuccess(); return C_OK; } @@ -3060,8 +3069,9 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* While in dual channel replication, we should use our prepared repl id and offset. */ psync_replid = server.repl_provisional_primary.replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); - serverLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, - psync_offset); + dualChannelServerLog(LL_NOTICE, + "Trying a partial resynchronization using main channel (request %s:%s).", + psync_replid, psync_offset); } else if (server.cached_primary) { psync_replid = server.cached_primary->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->reploff + 1); @@ -3208,7 +3218,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* A response of +DUALCHANNELSYNC from the primary implies that partial * synchronization is not possible and that the primary supports full * sync using dedicated RDB channel. Full sync will continue that way. 
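All dual-channel messages in these hunks move from serverLog() to dualChannelServerLog(). Its definition is not part of this diff; a plausible sketch of such a wrapper (the prefix text is an assumption) relies on C string-literal concatenation:

    /* Hypothetical sketch: tag dual-channel replication logs with a common,
     * greppable prefix. Requires the format argument to be a string literal. */
    #define dualChannelServerLog(level, ...) \
        serverLog(level, "<Dual Channel> " __VA_ARGS__)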
*/ - serverLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); + dualChannelServerLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); sdsfree(reply); return PSYNC_FULLRESYNC_DUAL_CHANNEL; } @@ -3252,7 +3262,7 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) return C_ERR; if ((*err)[0] == '-') { - serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); + dualChannelServerLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); return C_ERR; } return C_OK; @@ -3261,7 +3271,7 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { if (server.debug_pause_after_fork) debugPauseProcess(); if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Write error."); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Write error."); *err = sdsnew(connGetLastError(conn)); return C_ERR; } @@ -3273,8 +3283,8 @@ int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); + dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", + server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); @@ -3322,7 +3332,7 @@ void dualChannelSetupMainConnForPsync(connection *conn) { } if (ret == C_ERR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? 
err : ""); cancelReplicationHandshake(1); } sdsfree(err); @@ -3379,15 +3389,15 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │ │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * │ │ │ │ │+CONTINUE │ - * │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * │ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼──────────┐ │ │ │Done loading │ │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │ │ │ └───────┬───────────────┘ │ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ * ┌────────▼───┐ │ │ │ │ │ * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ * └─┬──────────┘ │ │ │buffer into memory │ │ @@ -3408,7 +3418,6 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * establish a connection with the primary. */ void syncWithPrimary(connection *conn) { char tmpfile[256], *err = NULL; - int dfd = -1, maxtries = 5; int psync_result; /* If this event fired after the user turned the instance into a primary @@ -3589,6 +3598,7 @@ void syncWithPrimary(connection *conn) { sdsfree(err); err = NULL; server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + return; } /* Receive VERSION reply. */ @@ -3677,11 +3687,16 @@ void syncWithPrimary(connection *conn) { /* Prepare a suitable temp file for bulk transfer */ if (!useDisklessLoad()) { + int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); dfd = open(tmpfile, O_CREAT | O_WRONLY | O_EXCL, 0644); if (dfd != -1) break; + /* We save the errno of open to prevent some systems from modifying it after + * the sleep call. For example, sleep in Mac will change errno to ETIMEDOUT. 
*/ + int saved_errno = errno; sleep(1); + errno = saved_errno; } if (dfd == -1) { serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", @@ -3706,8 +3721,8 @@ void syncWithPrimary(connection *conn) { } if (connSetReadHandler(conn, NULL) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; @@ -3733,7 +3748,6 @@ void syncWithPrimary(connection *conn) { /* Fall through to regular error handling */ error: - if (dfd != -1) close(dfd); connClose(conn); server.repl_transfer_s = NULL; if (server.repl_rdb_transfer_s) { @@ -3976,7 +3990,7 @@ void replicaofCommand(client *c) { if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { if (server.primary_host) { replicationUnsetPrimary(); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); sdsfree(client); } @@ -4005,7 +4019,7 @@ void replicaofCommand(client *c) { /* There was no previous primary or the user specified a different one, * we can continue. */ replicationSetPrimary(c->argv[1]->ptr, port, 0); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, server.primary_port, client); sdsfree(client); diff --git a/src/resp_parser.c b/src/resp_parser.c index 950d2227b7..101e883d2f 100644 --- a/src/resp_parser.c +++ b/src/resp_parser.c @@ -58,6 +58,8 @@ #include "resp_parser.h" #include "server.h" +#include "valkey_strtod.h" + static int parseBulk(ReplyParser *parser, void *p_ctx) { const char *proto = parser->curr_location; char *p = strchr(proto + 1, '\r'); @@ -150,13 +152,11 @@ static int parseDouble(ReplyParser *parser, void *p_ctx) { parser->curr_location = p + 2; /* for \r\n */ char buf[MAX_LONG_DOUBLE_CHARS + 1]; size_t len = p - proto - 1; - double d; + double d = 0; if (len <= MAX_LONG_DOUBLE_CHARS) { memcpy(buf, proto + 1, len); buf[len] = '\0'; - d = strtod(buf, NULL); /* We expect a valid representation. */ - } else { - d = 0; + d = valkey_strtod(buf, NULL); /* We expect a valid representation. */ } parser->callbacks.double_callback(p_ctx, d, proto, parser->curr_location - proto); return C_OK; diff --git a/src/script.h b/src/script.h index 7fff34a40b..194cc8bd05 100644 --- a/src/script.h +++ b/src/script.h @@ -67,6 +67,8 @@ #define SCRIPT_ALLOW_CROSS_SLOT (1ULL << 8) /* Indicate that the current script may access keys from multiple slots */ typedef struct scriptRunCtx scriptRunCtx; +/* This struct stores the necessary information to manage the execution of + * scripts using EVAL and FCALL. */ struct scriptRunCtx { const char *funcname; client *c; diff --git a/src/script_lua.c b/src/script_lua.c index 5093fa944f..29d352d44b 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1258,15 +1258,15 @@ static void luaLoadLibraries(lua_State *lua) { /* Return sds of the string value located on stack at the given index. 
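In the resp_parser.c hunk above, parseDouble() switches from strtod() to valkey_strtod(). Assuming valkey_strtod() is a locale-independent parser (its implementation is outside this diff), the change keeps RESP doubles parsing identically regardless of LC_NUMERIC. A sketch of the failure mode plain strtod() has:

    #include <locale.h>
    #include <stdlib.h>
    #include "valkey_strtod.h" /* assumed to declare valkey_strtod() */

    void localeSketch(void) {
        /* Under a decimal-comma locale, strtod() stops at the '.'. */
        setlocale(LC_NUMERIC, "de_DE.UTF-8");   /* availability varies by system */
        double a = strtod("3.14", NULL);        /* may yield 3.0 here */
        double b = valkey_strtod("3.14", NULL); /* assumed locale-independent: 3.14 */
        (void)a; (void)b;
    }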
* Return NULL if the value is not a string. */ -sds luaGetStringSds(lua_State *lua, int index) { +robj *luaGetStringObject(lua_State *lua, int index) { if (!lua_isstring(lua, index)) { return NULL; } size_t len; const char *str = lua_tolstring(lua, index, &len); - sds str_sds = sdsnewlen(str, len); - return str_sds; + robj *str_obj = createStringObject(str, len); + return str_obj; } static int luaProtectedTableError(lua_State *lua) { diff --git a/src/script_lua.h b/src/script_lua.h index 35edf46af6..6c60754bbc 100644 --- a/src/script_lua.h +++ b/src/script_lua.h @@ -67,7 +67,7 @@ typedef struct errorInfo { } errorInfo; void luaRegisterServerAPI(lua_State *lua); -sds luaGetStringSds(lua_State *lua, int index); +robj *luaGetStringObject(lua_State *lua, int index); void luaRegisterGlobalProtectionFunction(lua_State *lua); void luaSetErrorMetatable(lua_State *lua); void luaSetAllowListProtection(lua_State *lua); diff --git a/src/sds.c b/src/sds.c index 4dd7d709aa..97be74ea47 100644 --- a/src/sds.c +++ b/src/sds.c @@ -194,12 +194,12 @@ sds sdsdup(const sds s) { /* * This method returns the minimum amount of bytes required to store the sds (header + data + NULL terminator). */ -static inline size_t sdsminlen(sds s) { +static inline size_t sdsminlen(const sds s) { return sdslen(s) + sdsHdrSize(s[-1]) + 1; } /* This method copies the sds `s` into `buf` which is the target character buffer. */ -size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size) { +size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, const sds s, uint8_t *hdr_size) { size_t required_keylen = sdsminlen(s); if (buf == NULL) { return required_keylen; @@ -216,6 +216,13 @@ void sdsfree(sds s) { s_free_with_size(sdsAllocPtr(s), sdsAllocSize(s)); } +/* This variant of sdsfree() gets its argument as a void pointer, and is useful + * as the free method in data structures that expect a 'void free_object(void*)' + * prototype for the free method. */ +void sdsfreeVoid(void *s) { + sdsfree(s); +} + /* Set the sds string length to the length as obtained with strlen(), so * considering as content only up to the first null term character.
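The new sdsfreeVoid() above exists because generic containers store their free callback as void (*)(void *); calling sdsfree() through a cast function pointer is undefined behavior, so a thin adapter with the exact expected signature is the portable fix. A usage sketch with the adlist API seen elsewhere in this diff:

    /* Sketch: register the adapter instead of casting sdsfree. */
    void listFreeSketch(void) {
        list *l = listCreate();
        listSetFreeMethod(l, sdsfreeVoid);   /* matches void (*)(void *) exactly */
        listAddNodeTail(l, sdsnew("hello"));
        listRelease(l); /* node values are freed through sdsfreeVoid */
    }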
* @@ -954,23 +961,30 @@ void sdsfreesplitres(sds *tokens, int count) { sds sdscatrepr(sds s, const char *p, size_t len) { s = sdsMakeRoomFor(s, len + 2); s = sdscatlen(s, "\"", 1); - while (len--) { - switch (*p) { - case '\\': - case '"': s = sdscatprintf(s, "\\%c", *p); break; - case '\n': s = sdscatlen(s, "\\n", 2); break; - case '\r': s = sdscatlen(s, "\\r", 2); break; - case '\t': s = sdscatlen(s, "\\t", 2); break; - case '\a': s = sdscatlen(s, "\\a", 2); break; - case '\b': s = sdscatlen(s, "\\b", 2); break; - default: - if (isprint(*p)) - s = sdscatlen(s, p, 1); - else + while (len) { + if (isprint(*p)) { + const char *start = p; + while (len && isprint(*p)) { + len--; + p++; + } + s = sdscatlen(s, start, p - start); + } else { + switch (*p) { + case '\\': + case '"': s = sdscatprintf(s, "\\%c", *p); break; + case '\n': s = sdscatlen(s, "\\n", 2); break; + case '\r': s = sdscatlen(s, "\\r", 2); break; + case '\t': s = sdscatlen(s, "\\t", 2); break; + case '\a': s = sdscatlen(s, "\\a", 2); break; + case '\b': s = sdscatlen(s, "\\b", 2); break; + default: s = sdscatprintf(s, "\\x%02x", (unsigned char)*p); - break; + break; + } + p++; + len--; } - p++; } return sdscatlen(s, "\"", 1); } diff --git a/src/sds.h b/src/sds.h index e9c4a95f9a..e1b8531955 100644 --- a/src/sds.h +++ b/src/sds.h @@ -183,6 +183,7 @@ sds sdsempty(void); sds sdsdup(const sds s); size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size); void sdsfree(sds s); +void sdsfreeVoid(void *s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); diff --git a/src/server.c b/src/server.c index 12691df8ee..3cdec9fa9b 100644 --- a/src/server.c +++ b/src/server.c @@ -372,6 +372,7 @@ void dictDictDestructor(void *val) { dictRelease((dict *)val); } +/* Returns 1 when keys match */ int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; l1 = sdslen((sds)key1); @@ -380,6 +381,12 @@ int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +/* Returns 0 when keys match */ +int hashtableSdsKeyCompare(const void *key1, const void *key2) { + const sds sds1 = (const sds)key1, sds2 = (const sds)key2; + return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); +} + size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint8_t *key_offset) { return sdscopytobuffer(buf, buf_len, (sds)key, key_offset); } @@ -390,6 +397,11 @@ int dictSdsKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } +/* Case insensitive key comparison */ +int hashtableStringKeyCaseCompare(const void *key1, const void *key2) { + return strcasecmp(key1, key2); +} + void dictObjectDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. */ decrRefCount(val); @@ -489,21 +501,30 @@ uint64_t dictEncObjHash(const void *key) { } } -/* Return 1 if currently we allow dict to expand. Dict may allocate huge - * memory to contain hash buckets when dict expands, that may lead the server to - * reject user's requests or evict some keys, we can stop dict to expand - * provisionally if used memory will be over maxmemory after dict expands, - * but to guarantee the performance of the server, we still allow dict to expand - * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */ -int dictResizeAllowed(size_t moreMem, double usedRatio) { - /* for debug purposes: dict is not allowed to be resized. */ +/* Return 1 if we allow a hash table to expand. 
It may allocate a huge amount of + * memory to contain hash buckets when it expands, which may lead the server to + * reject user's requests or evict some keys. We can prevent expansion + * provisionally if used memory will be over maxmemory after it expands, + * but to guarantee the performance of the server, we still allow it to expand + * if the load factor exceeds the hard limit defined in hashtable.c. */ +int hashtableResizeAllowed(size_t moreMem, double usedRatio) { + UNUSED(usedRatio); + + /* For debug purposes, not allowed to be resized. */ if (!server.dict_resizing) return 0; - if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) { - return !overMaxmemoryAfterAlloc(moreMem); - } else { - return 1; - } + /* Avoid resizing over max memory. */ + return !overMaxmemoryAfterAlloc(moreMem); +} + +const void *hashtableCommandGetKey(const void *element) { + struct serverCommand *command = (struct serverCommand *)element; + return command->fullname; +} + +const void *hashtableSubcommandGetKey(const void *element) { + struct serverCommand *command = (struct serverCommand *)element; + return command->declared_name; } /* Generic hash table type where keys are Objects, Values @@ -528,17 +549,11 @@ dictType objectKeyHeapPointerValueDictType = { NULL /* allow to expand */ }; -/* Set dictionary type. Keys are SDS strings, values are not used. */ -dictType setDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_value = 1, /* no values in this dict */ - .keys_are_odd = 1 /* an SDS string is always an odd pointer */ -}; +/* Set hashtable type. Items are SDS strings. */ +hashtableType setHashtableType = { + .hashFunction = dictSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = dictSdsDestructor}; /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ dictType zsetDictType = { @@ -550,44 +565,61 @@ NULL, /* allow to expand */ }; +uint64_t hashtableSdsHash(const void *key) { + return hashtableGenHashFunction((const char *)key, sdslen((char *)key)); +} + +const void *hashtableObjectGetKey(const void *entry) { + return objectGetKey(entry); +} + +int hashtableObjKeyCompare(const void *key1, const void *key2) { + const robj *o1 = key1, *o2 = key2; + return hashtableSdsKeyCompare(o1->ptr, o2->ptr); +} + +void hashtableObjectDestructor(void *val) { + if (val == NULL) return; /* Lazy freeing will set value to NULL. */ + decrRefCount(val); +} + /* Kvstore->keys, keys are sds strings, vals are Objects.
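Note the two comparison conventions documented above: dictSdsKeyCompare() returns 1 on a match (a boolean), while hashtableSdsKeyCompare() returns nonzero on a mismatch, in the memcmp()/strcmp() style. Call sites must test them accordingly:

    /* Sketch: the same equality test under each convention. */
    void compareSketch(const sds k1, const sds k2) {
        if (dictSdsKeyCompare(k1, k2)) { /* dict-style: nonzero == match */ }
        if (hashtableSdsKeyCompare(k1, k2) == 0) { /* memcmp-style: zero == match */ }
    }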
*/ -dictType kvstoreKeysDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key is embedded in the dictEntry and freed internally */ - dictObjectDestructor, /* val destructor */ - dictResizeAllowed, /* allow to resize */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, - .embedKey = dictSdsEmbedKey, - .embedded_entry = 1, +hashtableType kvstoreKeysHashtableType = { + .entryGetKey = hashtableObjectGetKey, + .hashFunction = hashtableSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = hashtableObjectDestructor, + .resizeAllowed = hashtableResizeAllowed, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; /* Kvstore->expires */ -dictType kvstoreExpiresDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - dictResizeAllowed, /* allow to resize */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, +hashtableType kvstoreExpiresHashtableType = { + .entryGetKey = hashtableObjectGetKey, + .hashFunction = hashtableSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = NULL, /* shared with keyspace table */ + .resizeAllowed = hashtableResizeAllowed, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; -/* Command table. sds string -> command struct pointer. */ -dictType commandTableDictType = { - dictSdsCaseHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_incremental_rehash = 1, /* no incremental rehash as the command table may be accessed from IO threads. */ -}; +/* Command set, hashed by sds string, stores serverCommand structs. */ +hashtableType commandSetType = {.entryGetKey = hashtableCommandGetKey, + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableStringKeyCaseCompare, + .instant_rehashing = 1}; + +/* Sub-command set, hashed by char* string, stores serverCommand structs. */ +hashtableType subcommandSetType = {.entryGetKey = hashtableSubcommandGetKey, + .hashFunction = dictCStrCaseHash, + .keyCompare = hashtableStringKeyCaseCompare, + .instant_rehashing = 1}; /* Hash type hash table (note that small hashes are represented with listpacks) */ dictType hashDictType = { @@ -609,6 +641,11 @@ dictType sdsReplyDictType = { NULL /* allow to expand */ }; +/* Hashtable type without destructor */ +hashtableType sdsReplyHashtableType = { + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableSdsKeyCompare}; + /* Keylist hash table type has unencoded Objects as keys and * lists as values. It's used for blocking operations (BLPOP) and to * map swapped keys to a list of clients waiting for this keys to be loaded. */ @@ -632,18 +669,33 @@ dictType objToDictDictType = { NULL /* allow to expand */ }; -/* Same as objToDictDictType, added some kvstore callbacks, it's used - * for PUBSUB command to track clients subscribing the channels. 
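The hashtable types above have no separate value slot: the table stores whole entries (robj pointers, serverCommand pointers) and derives each entry's key on demand through entryGetKey. A probe can be sketched, with a hypothetical helper, as:

    /* Sketch: one bucket scan in an entry-based table. The entry itself
     * is the result; its key is recovered via the entryGetKey callback. */
    static void *probeSketch(hashtableType *t, void **entries, int n, const void *key) {
        for (int i = 0; i < n; i++) {
            if (t->keyCompare(key, t->entryGetKey(entries[i])) == 0)
                return entries[i];
        }
        return NULL;
    }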
*/ -dictType kvstoreChannelDictType = { - dictObjHash, /* hash function */ - NULL, /* key dup */ - dictObjKeyCompare, /* key compare */ - dictObjectDestructor, /* key destructor */ - dictDictDestructor, /* val destructor */ - NULL, /* allow to expand */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, +/* Callback used for hash tables where the entries are dicts and the key + * (channel name) is stored in each dict's metadata. */ +const void *hashtableChannelsDictGetKey(const void *entry) { + const dict *d = entry; + return *((const void **)dictMetadata(d)); +} + +void hashtableChannelsDictDestructor(void *entry) { + dict *d = entry; + robj *channel = *((void **)dictMetadata(d)); + decrRefCount(channel); + dictRelease(d); +} + +/* Similar to objToDictDictType, but converted to a hashtable with some kvstore + * callbacks added. It's used by the PUBSUB commands to track the clients + * subscribed to each channel. The elements are dicts where the keys are clients. + * The metadata in each dict stores a pointer to the channel name. */ +hashtableType kvstoreChannelHashtableType = { + .entryGetKey = hashtableChannelsDictGetKey, + .hashFunction = dictObjHash, + .keyCompare = hashtableObjKeyCompare, + .entryDestructor = hashtableChannelsDictDestructor, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; /* Modules system dictionary type. Keys are module name, @@ -700,12 +752,18 @@ dictType sdsHashDictType = { NULL /* allow to expand */ }; +size_t clientSetDictTypeMetadataBytes(dict *d) { + UNUSED(d); + return sizeof(void *); +} + /* Client Set dictionary type. Keys are client, values are not used. */ dictType clientDictType = { dictClientHash, /* hash function */ NULL, /* key dup */ dictClientKeyCompare, /* key compare */ - .no_value = 1 /* no values in this dict */ + .dictMetadataBytes = clientSetDictTypeMetadataBytes, + .no_value = 1 /* no values in this dict */ }; /* This function is called once a background process of some kind terminates, @@ -715,12 +773,16 @@ dictType clientDictType = { * for dict.c to resize or rehash the tables accordingly to the fact we have an * active fork child running.
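kvstoreChannelHashtableType above stores one dict of subscribed clients per channel and recovers the channel name from the dict's metadata; clientDictType now reserves sizeof(void *) of metadata for exactly that. The idiom, sketched (the store step is inferred from the read side shown above):

    /* Sketch: stash the key (channel) in per-dict metadata at creation,
     * so hashtable callbacks can later derive it from the entry alone. */
    void channelDictSketch(robj *channel) {
        dict *clients = dictCreate(&clientDictType);   /* metadata reserved */
        *((void **)dictMetadata(clients)) = channel;   /* store the key pointer */
        robj *key = *((robj **)dictMetadata(clients)); /* recover it later */
        (void)key;
    }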
*/ void updateDictResizePolicy(void) { - if (server.in_fork_child != CHILD_TYPE_NONE) + if (server.in_fork_child != CHILD_TYPE_NONE) { dictSetResizeEnabled(DICT_RESIZE_FORBID); - else if (hasActiveChildProcess()) + hashtableSetResizePolicy(HASHTABLE_RESIZE_FORBID); + } else if (hasActiveChildProcess()) { dictSetResizeEnabled(DICT_RESIZE_AVOID); - else + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + } else { dictSetResizeEnabled(DICT_RESIZE_ENABLE); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + } } const char *strChildType(int type) { @@ -889,9 +951,10 @@ int clientsCronResizeOutputBuffer(client *c, mstime_t now_ms) { if (new_buffer_size) { oldbuf = c->buf; + size_t oldbuf_size = c->buf_usable_size; c->buf = zmalloc_usable(new_buffer_size, &c->buf_usable_size); memcpy(c->buf, oldbuf, c->bufpos); - zfree(oldbuf); + zfree_with_size(oldbuf, oldbuf_size); } return 0; } @@ -970,9 +1033,10 @@ void updateClientMemoryUsage(client *c) { } int clientEvictionAllowed(client *c) { - if (server.maxmemory_clients == 0 || c->flag.no_evict || !c->conn) { + if (server.maxmemory_clients == 0 || c->flag.no_evict || c->flag.fake) { return 0; } + serverAssert(c->conn); int type = getClientType(c); return (type == CLIENT_TYPE_NORMAL || type == CLIENT_TYPE_PUBSUB); } @@ -1131,15 +1195,15 @@ void databasesCron(void) { /* Expire keys by random sampling. Not required for replicas * as primary will synthesize DELs for us. */ if (server.active_expire_enabled) { - if (iAmPrimary()) { - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); - } else { + if (!iAmPrimary()) { expireReplicaKeys(); + } else if (!server.import_mode) { + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); } } - /* Defrag keys gradually. */ - activeDefragCycle(); + /* Start active defrag cycle or adjust defrag CPU if needed. */ + monitorActiveDefrag(); /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1158,8 +1222,8 @@ void databasesCron(void) { for (j = 0; j < dbs_per_call; j++) { serverDb *db = &server.db[resize_db % server.dbnum]; - kvstoreTryResizeDicts(db->keys, CRON_DICTS_PER_DB); - kvstoreTryResizeDicts(db->expires, CRON_DICTS_PER_DB); + kvstoreTryResizeHashtables(db->keys, CRON_DICTS_PER_DB); + kvstoreTryResizeHashtables(db->expires, CRON_DICTS_PER_DB); resize_db++; } @@ -1297,8 +1361,8 @@ void cronUpdateMemoryStats(void) { * allocations, and allocator reserved pages that can be pursed (all not actual frag) */ zmalloc_get_allocator_info( &server.cron_malloc_stats.allocator_allocated, &server.cron_malloc_stats.allocator_active, - &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy, - &server.cron_malloc_stats.allocator_frag_smallbins_bytes); + &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy); + server.cron_malloc_stats.allocator_frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); /* in case the allocator isn't providing these stats, fake them so that * fragmentation info still shows some (inaccurate metrics) */ if (!server.cron_malloc_stats.allocator_resident) { @@ -1606,27 +1670,16 @@ void whileBlockedCron(void) { * latency monitor if this function is called too often. */ if (server.blocked_last_cron >= server.mstime) return; - mstime_t latency; - latencyStartMonitor(latency); - - /* In some cases we may be called with big intervals, so we may need to do - * extra work here. 
This is because some of the functions in serverCron rely - * on the fact that it is performed every 10 ms or so. For instance, if - * activeDefragCycle needs to utilize 25% cpu, it will utilize 2.5ms, so we - * need to call it multiple times. */ + /* Increment server.cronloops so that run_with_period works. */ long hz_ms = 1000 / server.hz; - while (server.blocked_last_cron < server.mstime) { - /* Defrag keys gradually. */ - activeDefragCycle(); - - server.blocked_last_cron += hz_ms; + int cronloops = (server.mstime - server.blocked_last_cron + (hz_ms - 1)) / hz_ms; // rounding up + server.blocked_last_cron += cronloops * hz_ms; + server.cronloops += cronloops; - /* Increment cronloop so that run_with_period works. */ - server.cronloops++; - } + mstime_t latency; + latencyStartMonitor(latency); - /* Other cron jobs do not need to be done in a loop. No need to check - * server.blocked_last_cron since we have an early exit at the top. */ + defragWhileBlocked(); /* Update memory stats during loading (excluding blocked scripts) */ if (server.loading) cronUpdateMemoryStats(); @@ -1649,7 +1702,7 @@ static void sendGetackToReplicas(void) { robj *argv[3]; argv[0] = shared.replconf; argv[1] = shared.getack; - argv[2] = shared.special_asterick; /* Not used argument. */ + argv[2] = shared.special_asterisk; /* Not used argument. */ replicationFeedReplicas(-1, argv, 3); } @@ -1727,7 +1780,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Run a fast expire cycle (the called function will return * ASAP if a fast cycle is not needed). */ - if (server.active_expire_enabled && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); + if (server.active_expire_enabled && !server.import_mode && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); if (moduleCount()) { moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_BEFORE_SLEEP, NULL); @@ -2035,7 +2088,7 @@ void createSharedObjects(void) { shared.load = createStringObject("LOAD", 4); shared.createconsumer = createStringObject("CREATECONSUMER", 14); shared.getack = createStringObject("GETACK", 6); - shared.special_asterick = createStringObject("*", 1); + shared.special_asterisk = createStringObject("*", 1); shared.special_equals = createStringObject("=", 1); shared.redacted = makeObjectShared(createStringObject("(redacted)", 10)); @@ -2118,7 +2171,7 @@ void initServerConfig(void) { server.aof_flush_postponed_start = 0; server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; - server.active_defrag_running = 0; + server.active_defrag_cpu_percent = 0; server.active_defrag_configuration_changed = 0; server.notify_keyspace_events = 0; server.blocked_clients = 0; @@ -2133,6 +2186,7 @@ void initServerConfig(void) { server.extended_redis_compat = 0; server.pause_cron = 0; server.dict_resizing = 1; + server.import_mode = 0; server.latency_tracking_info_percentiles_len = 3; server.latency_tracking_info_percentiles = zmalloc(sizeof(double) * (server.latency_tracking_info_percentiles_len)); @@ -2191,8 +2245,8 @@ void initServerConfig(void) { /* Command table -- we initialize it here as it is part of the * initial configuration, since command names may be changed via * valkey.conf using the rename-command directive. 
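One detail from the whileBlockedCron() hunk above: the old per-tick catch-up loop is replaced by a single ceiling division. Worked example: at server.hz = 10, hz_ms = 1000 / 10 = 100; after a 250 ms stall, (250 + 99) / 100 = 3 cron iterations are credited at once.

    /* Sketch of the catch-up arithmetic with concrete numbers. */
    void blockedCronSketch(void) {
        long hz_ms = 1000 / 10;                     /* server.hz = 10 -> 100 ms per tick */
        long stall = 250;                           /* ms spent blocked */
        long loops = (stall + (hz_ms - 1)) / hz_ms; /* ceil(250/100) = 3 */
        (void)loops;
    }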
*/ - server.commands = dictCreate(&commandTableDictType); - server.orig_commands = dictCreate(&commandTableDictType); + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); populateCommandTable(); /* Debugging */ @@ -2480,19 +2534,6 @@ void checkTcpBacklogSettings(void) { #endif } -void closeListener(connListener *sfd) { - int j; - - for (j = 0; j < sfd->count; j++) { - if (sfd->fd[j] == -1) continue; - - aeDeleteFileEvent(server.el, sfd->fd[j], AE_READABLE); - close(sfd->fd[j]); - } - - sfd->count = 0; -} - /* Create an event handler for accepting new connections in TCP or TLS domain sockets. * This works atomically for all socket fds */ int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler) { @@ -2556,7 +2597,7 @@ int listenToPort(connListener *sfd) { continue; /* Rollback successful listens before exiting */ - closeListener(sfd); + connCloseListener(sfd); return C_ERR; } if (server.socket_mark_id > 0) anetSetSockMarkId(NULL, sfd->fd[sfd->count], server.socket_mark_id); @@ -2604,6 +2645,7 @@ void resetServerStats(void) { server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; server.stat_io_freed_objects = 0; + server.stat_io_accept_offloaded = 0; server.stat_poll_processed_by_io_threads = 0; server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; @@ -2692,6 +2734,7 @@ void initServer(void) { server.blocking_op_nesting = 0; server.thp_enabled = 0; server.cluster_drop_packet_filter = -1; + server.debug_cluster_disable_random_ping = 0; server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; @@ -2716,14 +2759,14 @@ void initServer(void) { /* Create the databases, and initialize other internal state. */ int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } for (j = 0; j < server.dbnum; j++) { - server.db[j].keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - server.db[j].expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + server.db[j].keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + server.db[j].expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); server.db[j].expires_cursor = 0; server.db[j].blocking_keys = dictCreate(&keylistDictType); server.db[j].blocking_keys_unblock_on_nokey = dictCreate(&objectKeyPointerValueDictType); @@ -2731,17 +2774,15 @@ void initServer(void) { server.db[j].watched_keys = dictCreate(&keylistDictType); server.db[j].id = j; server.db[j].avg_ttl = 0; - server.db[j].defrag_later = listCreate(); - listSetFreeMethod(server.db[j].defrag_later, (void (*)(void *))sdsfree); } evictionPoolAlloc(); /* Initialize the LRU keys pool. 
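In initServer() above, the per-database kvstores are created with KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND, and cluster mode adds KVSTORE_FREE_EMPTY_HASHTABLES, since a clustered kvstore fronts one table per slot (2^CLUSTER_SLOT_MASK_BITS = 16384) and most slots on a given node stay empty. The intent, sketched:

    /* Sketch: lazy per-slot tables; empty ones are freed again so
     * sparsely used slots cost no memory. */
    void dbKvstoreSketch(void) {
        int slot_bits = server.cluster_enabled ? CLUSTER_SLOT_MASK_BITS : 0;
        int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND;
        if (server.cluster_enabled) flags |= KVSTORE_FREE_EMPTY_HASHTABLES;
        kvstore *keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_bits, flags);
        (void)keys;
    }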
*/ /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which * seems odd) just to make the code cleaner by making it be the same type as server.pubsubshard_channels * (which has to be kvstore), see pubsubtype.serverPubSubChannels */ - server.pubsub_channels = kvstoreCreate(&kvstoreChannelDictType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + server.pubsub_channels = kvstoreCreate(&kvstoreChannelHashtableType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); server.pubsub_patterns = dictCreate(&objToDictDictType); - server.pubsubshard_channels = kvstoreCreate(&kvstoreChannelDictType, slot_count_bits, - KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + server.pubsubshard_channels = kvstoreCreate(&kvstoreChannelHashtableType, slot_count_bits, + KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); server.pubsub_clients = 0; server.watching_clients = 0; server.cronloops = 0; @@ -2896,6 +2937,17 @@ void initListeners(void) { listener->priv = &server.unix_ctx_config; /* Unix socket specified */ } + if (server.rdma_ctx_config.port != 0) { + conn_index = connectionIndexByType(CONN_TYPE_RDMA); + if (conn_index < 0) serverPanic("Failed finding connection listener of %s", CONN_TYPE_RDMA); + listener = &server.listeners[conn_index]; + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + listener->priv = &server.rdma_ctx_config; + } + /* create all the configured listener, and add handler to start to accept */ int listen_fds = 0; for (int j = 0; j < CONN_TYPE_MAX; j++) { @@ -3034,13 +3086,13 @@ sds catSubCommandFullname(const char *parent_name, const char *sub_name) { return sdscatfmt(sdsempty(), "%s|%s", parent_name, sub_name); } -void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand, const char *declared_name) { - if (!parent->subcommands_dict) parent->subcommands_dict = dictCreate(&commandTableDictType); +void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand) { + if (!parent->subcommands_ht) parent->subcommands_ht = hashtableCreate(&subcommandSetType); subcommand->parent = parent; /* Assign the parent command */ subcommand->id = ACLGetCommandID(subcommand->fullname); /* Assign the ID used for ACL. */ - serverAssert(dictAdd(parent->subcommands_dict, sdsnew(declared_name), subcommand) == DICT_OK); + serverAssert(hashtableAdd(parent->subcommands_ht, subcommand)); } /* Set implicit ACl categories (see comment above the definition of @@ -3092,7 +3144,7 @@ int populateCommandStructure(struct serverCommand *c) { sub->fullname = catSubCommandFullname(c->declared_name, sub->declared_name); if (populateCommandStructure(sub) == C_ERR) continue; - commandAddSubcommand(c, sub, sub->declared_name); + commandAddSubcommand(c, sub); } } @@ -3116,22 +3168,20 @@ void populateCommandTable(void) { c->fullname = sdsnew(c->declared_name); if (populateCommandStructure(c) == C_ERR) continue; - retval1 = dictAdd(server.commands, sdsdup(c->fullname), c); + retval1 = hashtableAdd(server.commands, c); /* Populate an additional dictionary that will be unaffected * by rename-command statements in valkey.conf. 
*/ - retval2 = dictAdd(server.orig_commands, sdsdup(c->fullname), c); - serverAssert(retval1 == DICT_OK && retval2 == DICT_OK); + retval2 = hashtableAdd(server.orig_commands, c); + serverAssert(retval1 && retval2); } } -void resetCommandTableStats(dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { - c = (struct serverCommand *)dictGetVal(de); +void resetCommandTableStats(hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; c->microseconds = 0; c->calls = 0; c->rejected_calls = 0; @@ -3140,9 +3190,9 @@ void resetCommandTableStats(dict *commands) { hdr_close(c->latency_histogram); c->latency_histogram = NULL; } - if (c->subcommands_dict) resetCommandTableStats(c->subcommands_dict); + if (c->subcommands_ht) resetCommandTableStats(c->subcommands_ht); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } void resetErrorTableStats(void) { @@ -3189,13 +3239,18 @@ void serverOpArrayFree(serverOpArray *oa) { /* ====================== Commands lookup and execution ===================== */ int isContainerCommandBySds(sds s) { - struct serverCommand *base_cmd = dictFetchValue(server.commands, s); - int has_subcommands = base_cmd && base_cmd->subcommands_dict; + void *entry; + int found_command = hashtableFind(server.commands, s, &entry); + struct serverCommand *base_cmd = entry; + int has_subcommands = found_command && base_cmd->subcommands_ht; return has_subcommands; } struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name) { - return dictFetchValue(container->subcommands_dict, sub_name); + void *entry = NULL; + hashtableFind(container->subcommands_ht, sub_name, &entry); + struct serverCommand *subcommand = entry; + return subcommand; } /* Look up a command by argv and argc @@ -3206,9 +3261,11 @@ struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_ * name (e.g. in COMMAND INFO) rather than to find the command * a user requested to execute (in processCommand). */ -struct serverCommand *lookupCommandLogic(dict *commands, robj **argv, int argc, int strict) { - struct serverCommand *base_cmd = dictFetchValue(commands, argv[0]->ptr); - int has_subcommands = base_cmd && base_cmd->subcommands_dict; +struct serverCommand *lookupCommandLogic(hashtable *commands, robj **argv, int argc, int strict) { + void *entry = NULL; + int found_command = hashtableFind(commands, argv[0]->ptr, &entry); + struct serverCommand *base_cmd = entry; + int has_subcommands = found_command && base_cmd->subcommands_ht; if (argc == 1 || !has_subcommands) { if (strict && argc != 1) return NULL; /* Note: It is possible that base_cmd->proc==NULL (e.g. 
CONFIG) */ @@ -3224,7 +3281,7 @@ struct serverCommand *lookupCommand(robj **argv, int argc) { return lookupCommandLogic(server.commands, argv, argc, 0); } -struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s) { +struct serverCommand *lookupCommandBySdsLogic(hashtable *commands, sds s) { int argc, j; sds *strings = sdssplitlen(s, sdslen(s), "|", 1, &argc); if (strings == NULL) return NULL; @@ -3251,7 +3308,7 @@ struct serverCommand *lookupCommandBySds(sds s) { return lookupCommandBySdsLogic(server.commands, s); } -struct serverCommand *lookupCommandByCStringLogic(dict *commands, const char *s) { +struct serverCommand *lookupCommandByCStringLogic(hashtable *commands, const char *s) { struct serverCommand *cmd; sds name = sdsnew(s); @@ -3676,10 +3733,6 @@ void call(client *c, int flags) { replicationFeedMonitors(c, server.monitors, c->db->id, argv, argc); } - /* Clear the original argv. - * If the client is blocked we will handle slowlog when it is unblocked. */ - if (!c->flag.blocked) freeClientOriginalArgv(c); - /* Populate the per-command and per-slot statistics that we show in INFO commandstats and CLUSTER SLOT-STATS, * respectively. If the client is blocked we will handle latency stats and duration when it is unblocked. */ if (update_command_stats && !c->flag.blocked) { @@ -4323,7 +4376,7 @@ int prepareForShutdown(client *c, int flags) { server.shutdown_flags = flags; if (c != NULL) { - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "User requested shutdown... (user request from '%s')", client); sdsfree(client); } else { @@ -4898,23 +4951,25 @@ void addReplyCommandSubCommands(client *c, struct serverCommand *cmd, void (*reply_function)(client *, struct serverCommand *), int use_map) { - if (!cmd->subcommands_dict) { + if (!cmd->subcommands_ht) { addReplySetLen(c, 0); return; } if (use_map) - addReplyMapLen(c, dictSize(cmd->subcommands_dict)); + addReplyMapLen(c, hashtableSize(cmd->subcommands_ht)); else - addReplyArrayLen(c, dictSize(cmd->subcommands_dict)); - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = (struct serverCommand *)dictGetVal(de); + addReplyArrayLen(c, hashtableSize(cmd->subcommands_ht)); + + void *next; + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (use_map) addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname)); reply_function(c, sub); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Output the representation of a server command. Used by the COMMAND command and COMMAND INFO. 
*/ @@ -4960,7 +5015,7 @@ void addReplyCommandDocs(client *c, struct serverCommand *cmd) { if (cmd->reply_schema) maplen++; #endif if (cmd->args) maplen++; - if (cmd->subcommands_dict) maplen++; + if (cmd->subcommands_ht) maplen++; addReplyMapLen(c, maplen); if (cmd->summary) { @@ -5010,7 +5065,7 @@ void addReplyCommandDocs(client *c, struct serverCommand *cmd) { addReplyBulkCString(c, "arguments"); addReplyCommandArgList(c, cmd->args, cmd->num_args); } - if (cmd->subcommands_dict) { + if (cmd->subcommands_ht) { addReplyBulkCString(c, "subcommands"); addReplyCommandSubCommands(c, cmd, addReplyCommandDocs, 1); } @@ -5067,20 +5122,20 @@ void getKeysSubcommand(client *c) { /* COMMAND (no args) */ void commandCommand(client *c) { - dictIterator *di; - dictEntry *de; - - addReplyArrayLen(c, dictSize(server.commands)); - di = dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - addReplyCommandInfo(c, dictGetVal(de)); + hashtableIterator iter; + void *next; + addReplyArrayLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; + addReplyCommandInfo(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND COUNT */ void commandCountCommand(client *c) { - addReplyLongLong(c, dictSize(server.commands)); + addReplyLongLong(c, hashtableSize(server.commands)); } typedef enum { @@ -5126,39 +5181,39 @@ int shouldFilterFromCommandList(struct serverCommand *cmd, commandListFilter *fi } /* COMMAND LIST FILTERBY (MODULE <module-name>|ACLCAT <cat>|PATTERN <pattern>) */ -void commandListWithFilter(client *c, dict *commands, commandListFilter filter, int *numcmds) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void commandListWithFilter(client *c, hashtable *commands, commandListFilter filter, int *numcmds) { + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (!shouldFilterFromCommandList(cmd, &filter)) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*numcmds)++; } - if (cmd->subcommands_dict) { - commandListWithFilter(c, cmd->subcommands_dict, filter, numcmds); + if (cmd->subcommands_ht) { + commandListWithFilter(c, cmd->subcommands_ht, filter, numcmds); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND LIST */ -void commandListWithoutFilter(client *c, dict *commands, int *numcmds) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void commandListWithoutFilter(client *c, hashtable *commands, int *numcmds) { + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*numcmds)++; - if (cmd->subcommands_dict) { - commandListWithoutFilter(c, cmd->subcommands_dict, numcmds); + if (cmd->subcommands_ht) { + commandListWithoutFilter(c, cmd->subcommands_ht, numcmds); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND LIST [FILTERBY (MODULE <module-name>|ACLCAT <cat>|PATTERN <pattern>)] */ @@ -5207,14 +5262,15 @@ void commandInfoCommand(client *c) { int i; if (c->argc == 2) { - dictIterator *di; - dictEntry *de; - addReplyArrayLen(c, dictSize(server.commands)); - di =
dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - addReplyCommandInfo(c, dictGetVal(de)); + hashtableIterator iter; + void *next; + addReplyArrayLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; + addReplyCommandInfo(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { addReplyArrayLen(c, c->argc - 2); for (i = 2; i < c->argc; i++) { @@ -5228,16 +5284,16 @@ void commandDocsCommand(client *c) { int i; if (c->argc == 2) { /* Reply with an array of all commands */ - dictIterator *di; - dictEntry *de; - addReplyMapLen(c, dictSize(server.commands)); - di = dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + void *next; + addReplyMapLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); addReplyCommandDocs(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { /* Reply with an array of the requested commands (if we find them) */ int numcmds = 0; @@ -5357,14 +5413,13 @@ const char *getSafeInfoString(const char *s, size_t len, char **tmp) { return memmapchars(new, len, unsafe_info_chars, unsafe_info_chars_substs, sizeof(unsafe_info_chars) - 1); } -sds genValkeyInfoStringCommandStats(sds info, dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { +sds genValkeyInfoStringCommandStats(sds info, hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; char *tmpsafe; - c = (struct serverCommand *)dictGetVal(de); if (c->calls || c->failed_calls || c->rejected_calls) { info = sdscatprintf(info, "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f" @@ -5374,11 +5429,11 @@ sds genValkeyInfoStringCommandStats(sds info, dict *commands) { c->rejected_calls, c->failed_calls); if (tmpsafe != NULL) zfree(tmpsafe); } - if (c->subcommands_dict) { - info = genValkeyInfoStringCommandStats(info, c->subcommands_dict); + if (c->subcommands_ht) { + info = genValkeyInfoStringCommandStats(info, c->subcommands_ht); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); return info; } @@ -5395,24 +5450,23 @@ sds genValkeyInfoStringACLStats(sds info) { return info; } -sds genValkeyInfoStringLatencyStats(sds info, dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { +sds genValkeyInfoStringLatencyStats(sds info, hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; char *tmpsafe; - c = (struct serverCommand *)dictGetVal(de); if (c->latency_histogram) { info = fillPercentileDistributionLatencies( info, getSafeInfoString(c->fullname, sdslen(c->fullname), &tmpsafe), c->latency_histogram); if (tmpsafe != NULL) zfree(tmpsafe); } - if (c->subcommands_dict) { - info = genValkeyInfoStringLatencyStats(info, c->subcommands_dict); + if (c->subcommands_ht) { + info = genValkeyInfoStringLatencyStats(info, 
c->subcommands_ht); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); return info; } @@ -5702,7 +5756,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "mem_aof_buffer:%zu\r\n", mh->aof_buffer, "mem_allocator:%s\r\n", ZMALLOC_LIB, "mem_overhead_db_hashtable_rehashing:%zu\r\n", mh->overhead_db_hashtable_rehashing, - "active_defrag_running:%d\r\n", server.active_defrag_running, + "active_defrag_running:%d\r\n", server.active_defrag_cpu_percent, "lazyfree_pending_objects:%zu\r\n", lazyfreeGetPendingObjectsCount(), "lazyfreed_objects:%zu\r\n", lazyfreeGetFreedObjectsCount())); freeMemoryOverheadData(mh); @@ -5862,6 +5916,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, "io_threaded_freed_objects:%lld\r\n", server.stat_io_freed_objects, + "io_threaded_accept_processed:%lld\r\n", server.stat_io_accept_offloaded, "io_threaded_poll_processed:%lld\r\n", server.stat_poll_processed_by_io_threads, "io_threaded_total_prefetch_batches:%lld\r\n", server.stat_total_prefetch_batches, "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, @@ -6294,7 +6349,7 @@ connListener *listenerByType(const char *typename) { /* Close original listener, re-create a new listener from the updated bind address & port */ int changeListener(connListener *listener) { /* Close old servers */ - closeListener(listener); + connCloseListener(listener); /* Just close the server if port disabled */ if (listener->port == 0) { @@ -6469,27 +6524,7 @@ void sendChildInfo(childInfoType info_type, size_t keys, char *pname) { sendChildInfoGeneric(info_type, keys, -1, pname); } -/* Try to release pages back to the OS directly (bypassing the allocator), - * in an effort to decrease CoW during fork. For small allocations, we can't - * release any full page, so in an effort to avoid getting the size of the - * allocation from the allocator (malloc_size) when we already know it's small, - * we check the size_hint. If the size is not already known, passing a size_hint - * of 0 will lead the checking the real size of the allocation. - * Also please note that the size may be not accurate, so in order to make this - * solution effective, the judgement for releasing memory pages should not be - * too strict. */ -void dismissMemory(void *ptr, size_t size_hint) { - if (ptr == NULL) return; - - /* madvise(MADV_DONTNEED) can not release pages if the size of memory - * is too small, we try to release only for the memory which the size - * is more than half of page size. */ - if (size_hint && size_hint <= server.page_size / 2) return; - - zmadvise_dontneed(ptr); -} - -/* Dismiss big chunks of memory inside a client structure, see dismissMemory() */ +/* Dismiss big chunks of memory inside a client structure, see zmadvise_dontneed() */ void dismissClientMemory(client *c) { /* Dismiss client query buffer and static reply buffer. */ dismissMemory(c->buf, c->buf_usable_size); @@ -6520,7 +6555,7 @@ void dismissClientMemory(client *c) { /* In the child process, we don't need some buffers anymore, and these are * likely to change in the parent when there's heavy write traffic. * We dismiss them right away, to avoid CoW. - * see dismissMemory(). */ + * see zmadvise_dontneed(). */ void dismissMemoryInChild(void) { /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. 
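 * With THP the range may be backed by 2MB huge pages; a 4k-granularity hint
 * over such a range may be ignored or may split the page, so the child
 * skips dismissing entirely in that case (the early return below).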
*/ if (server.thp_enabled) return; @@ -6793,7 +6828,10 @@ __attribute__((weak)) int main(int argc, char **argv) { #endif tzset(); /* Populates 'timezone' global. */ zmalloc_set_oom_handler(serverOutOfMemoryHandler); - +#if defined(HAVE_DEFRAG) + int res = allocatorDefragInit(); + serverAssert(res == 0); +#endif /* To achieve entropy, in case of containers, their time() and getpid() can * be the same. But value of tv_usec is fast enough to make the difference */ gettimeofday(&tv, NULL); @@ -6811,6 +6849,7 @@ __attribute__((weak)) int main(int argc, char **argv) { uint8_t hashseed[16]; getRandomBytes(hashseed, sizeof(hashseed)); dictSetHashFunctionSeed(hashseed); + hashtableSetHashFunctionSeed(hashseed); char *exec_name = strrchr(argv[0], '/'); if (exec_name == NULL) exec_name = argv[0]; diff --git a/src/server.h b/src/server.h index 5ef04a9080..841db70614 100644 --- a/src/server.h +++ b/src/server.h @@ -35,6 +35,7 @@ #include "solarisfixes.h" #include "rio.h" #include "commands.h" +#include "allocator_defrag.h" #include #include @@ -66,7 +67,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "ae.h" /* Event driven programming library */ #include "sds.h" /* Dynamic safe strings */ -#include "dict.h" /* Hash tables */ +#include "dict.h" /* Hash tables (old implementation) */ +#include "hashtable.h" /* Hash tables (new implementation) */ #include "kvstore.h" /* Slot-based hash table */ #include "adlist.h" /* Linked lists */ #include "zmalloc.h" /* total memory usage aware version of malloc/free */ @@ -81,6 +83,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "connection.h" /* Connection abstraction */ #include "memory_prefetch.h" +#define dismissMemory zmadvise_dontneed + #define VALKEYMODULE_CORE 1 typedef struct serverObject robj; #include "valkeymodule.h" /* Modules API defines. */ @@ -144,6 +148,11 @@ struct hdr_histogram; #define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ * channel to establish psync. */ #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */ +#if !defined(DEBUG_FORCE_DEFRAG) +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 0 +#else +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 +#endif /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. */ @@ -208,9 +217,6 @@ struct hdr_histogram; extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; -/* Hash table parameters */ -#define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* Maximum hash table load factor. */ - /* Command flags. Please check the definition of struct serverCommand in this file * for more information about the meaning of every flag. */ #define CMD_WRITE (1ULL << 0) @@ -874,13 +880,15 @@ struct ValkeyModuleDigest { #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */ #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ +#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ #define LRU_CLOCK_RESOLUTION 1000 /* LRU clock resolution in ms */ -#define OBJ_SHARED_REFCOUNT INT_MAX /* Global object never destroyed. */ -#define OBJ_STATIC_REFCOUNT (INT_MAX - 1) /* Object allocated in the stack. */ +#define OBJ_REFCOUNT_BITS 30 +#define OBJ_SHARED_REFCOUNT ((1 << OBJ_REFCOUNT_BITS) - 1) /* Global object never destroyed. 
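+ * The refcount shrank from a plain int to a 30-bit bitfield (see struct
+ * serverObject below), freeing two bits for the new hasexpire/hasembkey
+ * flags; the two largest 30-bit values take over from INT_MAX and
+ * INT_MAX - 1 as the reserved sentinels.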
*/ +#define OBJ_STATIC_REFCOUNT ((1 << OBJ_REFCOUNT_BITS) - 2) /* Object allocated on the stack. */ #define OBJ_FIRST_SPECIAL_REFCOUNT OBJ_STATIC_REFCOUNT struct serverObject { unsigned type : 4; @@ -888,7 +896,9 @@ struct serverObject { unsigned lru : LRU_BITS; /* LRU time (relative to global lru_clock) or * LFU data (least significant 8 bits frequency * and most significant 16 bits access time). */ - int refcount; + unsigned hasexpire : 1; + unsigned hasembkey : 1; + unsigned refcount : OBJ_REFCOUNT_BITS; void *ptr; }; @@ -906,6 +916,8 @@ char *getObjectTypeName(robj *); _var.refcount = OBJ_STATIC_REFCOUNT; \ _var.type = OBJ_STRING; \ _var.encoding = OBJ_ENCODING_RAW; \ + _var.hasexpire = 0; \ + _var.hasembkey = 0; \ _var.ptr = _ptr; \ } while (0) @@ -960,7 +972,6 @@ typedef struct serverDb { int id; /* Database ID */ long long avg_ttl; /* Average TTL, just for stats */ unsigned long expires_cursor; /* Cursor of the active expire cycle. */ - list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. */ } serverDb; /* forward declaration for functions ctx */ @@ -1093,9 +1104,10 @@ typedef struct { /* With multiplexing we need to take per-client state. * Clients are taken in a linked list. */ -#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ need more reserved IDs use UINT64_MAX-1, \ -2, ... and so forth. */ +#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ need more reserved IDs use UINT64_MAX-1, \ -2, ... and so forth. */ +#define CLIENT_ID_CACHED_RESPONSE (UINT64_MAX - 1) /* Client for cached response, see createCachedResponseClient. */ /* Replication backlog is not a separate memory, it just is one consumer of * the global replication buffer. This structure records the reference of @@ -1233,7 +1245,8 @@ typedef struct ClientFlags { * knows that it does not need the cache and required a full sync. With this * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ - uint64_t reserved : 5; /* Reserved for future use */ + uint64_t import_source : 1; /* This client is importing data to the server and may access expired keys. */ + uint64_t reserved : 4; /* Reserved for future use */ } ClientFlags; typedef struct client { @@ -1431,7 +1444,7 @@ struct sharedObjectsStruct { *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim, *script, *replconf, *eval, *persist, *set, *pexpireat, *pexpire, *time, *pxat, *absttl, *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *getack, - *special_asterick, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, + *special_asterisk, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, *smessagebulk, *select[PROTO_SHARED_SELECT_CMDS], *integers[OBJ_SHARED_INTEGERS], *mbulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "*<value>\r\n" */ *bulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "$<value>\r\n" */ @@ -1447,6 +1460,10 @@ typedef struct zskiplistNode { struct zskiplistNode *backward; struct zskiplistLevel { struct zskiplistNode *forward; + /* At each level we keep the span, which is the number of elements which are on the "subtree" + * from this node at this level to the next node at the same level. + * One exception is the value at level 0.
In level 0 the span can only be 1 or 0 (0 in the case of the last element in the list), + * so we use it to hold the height of the node, which is the number of levels. */ unsigned long span; } level[]; } zskiplistNode; @@ -1611,6 +1628,17 @@ typedef struct serverUnixContextConfig { unsigned int perm; /* UNIX socket permission (see mode_t) */ } serverUnixContextConfig; +/*----------------------------------------------------------------------------- + * RDMA Context Configuration + *----------------------------------------------------------------------------*/ +typedef struct serverRdmaContextConfig { + char *bindaddr[CONFIG_BINDADDR_MAX]; + int bindaddr_count; + int port; + int rx_size; + int completion_vector; +} serverRdmaContextConfig; + /*----------------------------------------------------------------------------- * AOF manifest definition *----------------------------------------------------------------------------*/ @@ -1676,8 +1704,8 @@ struct valkeyServer { int hz; /* serverCron() calls frequency in hertz */ int in_fork_child; /* indication that this is a fork child */ serverDb *db; - dict *commands; /* Command table */ - dict *orig_commands; /* Command table before command renaming. */ + hashtable *commands; /* Command table */ + hashtable *orig_commands; /* Command table before command renaming. */ aeEventLoop *el; _Atomic AeIoState io_poll_state; /* Indicates the state of the IO polling. */ int io_ae_fired_events; /* Number of poll events received by the IO thread. */ @@ -1688,7 +1716,7 @@ struct valkeyServer { int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ int activerehashing; /* Incremental rehash in serverCron() */ - int active_defrag_running; /* Active defragmentation running (holds current scan aggressiveness) */ + int active_defrag_cpu_percent; /* Current desired CPU percentage for active defrag */ char *pidfile; /* PID file path */ int arch_bits; /* 32 or 64 depending on sizeof(long) */ int cronloops; /* Number of times the cron function has run */ @@ -1841,6 +1869,7 @@ struct valkeyServer { long long stat_io_reads_processed; /* Number of read events processed by IO threads */ long long stat_io_writes_processed; /* Number of write events processed by IO threads */ long long stat_io_freed_objects; /* Number of objects freed by IO threads */ + long long stat_io_accept_offloaded; /* Number of offloaded accepts */ long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ long long stat_total_reads_processed; /* Total number of read events processed */ long long stat_total_writes_processed; /* Total number of write events processed */ @@ -1880,13 +1909,13 @@ struct valkeyServer { int sanitize_dump_payload; /* Enables deep sanitization for ziplist and listpack in RDB and RESTORE. */ int skip_checksum_validation; /* Disable checksum validation for RDB and RESTORE payload. */ int jemalloc_bg_thread; /* Enable jemalloc background thread */ - int active_defrag_configuration_changed; /* defrag configuration has been changed and need to reconsider - * active_defrag_running in computeDefragCycles. */ + int active_defrag_configuration_changed; /* Config changed; need to recompute active_defrag_cpu_percent.
*/ size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ - int active_defrag_cycle_min; /* minimal effort for defrag in CPU percentage */ - int active_defrag_cycle_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cpu_min; /* minimal effort for defrag in CPU percentage */ + int active_defrag_cpu_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cycle_us; /* standard duration of defrag cycle */ unsigned long active_defrag_max_scan_fields; /* maximum number of fields of set/hash/zset/list to process from within the main dict scan */ size_t client_max_querybuf_len; /* Limit for client query buffer length */ @@ -2089,6 +2118,8 @@ struct valkeyServer { char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ long long primary_initial_offset; /* Primary PSYNC offset. */ int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + /* Import Mode */ + int import_mode; /* If true, server is in import mode and forbids expiration and eviction. */ /* Synchronous replication. */ list *clients_waiting_acks; /* Clients waiting in WAIT or WAITAOF. */ int get_ack_from_replicas; /* If true we send REPLCONF GETACK. */ @@ -2190,6 +2221,8 @@ struct valkeyServer { int cluster_slot_stats_enabled; /* Cluster slot usage statistics tracking enabled. */ /* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */ uint32_t debug_cluster_close_link_on_packet_drop : 1; + /* Debug config to control the random ping. When set, we will disable the random ping in clusterCron. */ + uint32_t debug_cluster_disable_random_ping : 1; sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */ /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ @@ -2222,6 +2255,7 @@ struct valkeyServer { int tls_auth_clients; serverTLSContextConfig tls_ctx_config; serverUnixContextConfig unix_ctx_config; + serverRdmaContextConfig rdma_ctx_config; /* cpu affinity */ char *server_cpulist; /* cpu affinity list of server main/io thread. */ char *bio_cpulist; /* cpu affinity list of bio thread. */ @@ -2554,12 +2588,12 @@ struct serverCommand { bit set in the bitmap of allowed commands. */ sds fullname; /* A SDS string representing the command fullname. */ struct hdr_histogram - *latency_histogram; /*points to the command latency command histogram (unit of time nanosecond) */ + *latency_histogram; /* Points to the command latency histogram (unit: nanoseconds). */ keySpec legacy_range_key_spec; /* The legacy (first,last,step) key spec is * still maintained (if applicable) so that * we can still support the reply format of * COMMAND INFO and COMMAND GETKEYS */ - dict *subcommands_dict; /* A dictionary that holds the subcommands, the key is the subcommand sds name + hashtable *subcommands_ht; /* Subcommands hash table. The key is the subcommand sds name * (not the fullname), and the value is the serverCommand structure pointer.
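 * As with server.commands, the entry stored is the serverCommand pointer
 * itself, so commandAddSubcommand() no longer duplicates the declared name
 * as a separate key and lookupSubcommand() retrieves entries with
 * hashtableFind() (both in server.c above).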
*/ struct serverCommand *parent; struct ValkeyModuleCommand *module_cmd; /* A pointer to the module command data (NULL if native command) */ @@ -2609,7 +2643,7 @@ typedef struct { robj *subject; int encoding; int ii; /* intset iterator */ - dictIterator *di; + hashtableIterator *hashtable_iterator; unsigned char *lpi; /* listpack iterator */ } setTypeIterator; @@ -2640,11 +2674,11 @@ extern struct valkeyServer server; extern struct sharedObjectsStruct shared; extern dictType objectKeyPointerValueDictType; extern dictType objectKeyHeapPointerValueDictType; -extern dictType setDictType; +extern hashtableType setHashtableType; extern dictType BenchmarkDictType; extern dictType zsetDictType; -extern dictType kvstoreKeysDictType; -extern dictType kvstoreExpiresDictType; +extern hashtableType kvstoreKeysHashtableType; +extern hashtableType kvstoreExpiresHashtableType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; extern dictType hashDictType; extern dictType stringSetDictType; @@ -2652,9 +2686,10 @@ extern dictType externalStringType; extern dictType sdsHashDictType; extern dictType clientDictType; extern dictType objToDictDictType; -extern dictType kvstoreChannelDictType; +extern hashtableType kvstoreChannelHashtableType; extern dictType modulesDictType; extern dictType sdsReplyDictType; +extern hashtableType sdsReplyHashtableType; extern dictType keylistDictType; extern dict *modules; @@ -2713,7 +2748,7 @@ size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); @@ -2847,6 +2882,7 @@ char *getClientPeerId(client *client); char *getClientSockName(client *client); int isClientConnIpV6(client *c); sds catClientInfoString(sds s, client *client, int hide_user_data); +sds catClientInfoShortString(sds s, client *client, int hide_user_data); sds getAllClientsInfoString(int type, int hide_user_data); int clientSetName(client *c, robj *name, const char **err); void rewriteClientCommandVector(client *c, int argc, ...); @@ -2975,7 +3011,6 @@ void execCommandAbort(client *c, sds error); /* Object implementation */ void decrRefCount(robj *o); -void decrRefCountVoid(void *o); void incrRefCount(robj *o); robj *makeObjectShared(robj *o); void freeStringObject(robj *o); @@ -2988,7 +3023,6 @@ robj *createObject(int type, void *ptr); void initObjectLRUOrLFU(robj *o); robj *createStringObject(const char *ptr, size_t len); robj *createRawStringObject(const char *ptr, size_t len); -robj *createEmbeddedStringObject(const char *ptr, size_t len); robj *tryCreateRawStringObject(const char *ptr, size_t len); robj *tryCreateStringObject(const char *ptr, size_t len); robj *dupStringObject(const robj *o); @@ -3029,11 +3063,15 @@ int collateStringObjects(const robj *a, const robj *b); int equalStringObjects(robj *a, robj *b); unsigned long long estimateObjectIdleTime(robj *o); void trimStringObjectIfNeeded(robj *o, int trim_small_values); -static inline int canUseSharedObject(void) { - return server.maxmemory == 0 || !(server.maxmemory_policy & 
MAXMEMORY_FLAG_NO_SHARED_INTEGERS); -} #define sdsEncodedObject(objptr) (objptr->encoding == OBJ_ENCODING_RAW || objptr->encoding == OBJ_ENCODING_EMBSTR) +/* Objects with key attached, AKA valkey (val+key) objects */ +robj *createObjectWithKeyAndExpire(int type, void *ptr, const sds key, long long expire); +robj *objectSetKeyAndExpire(robj *val, sds key, long long expire); +robj *objectSetExpire(robj *val, long long expire); +sds objectGetKey(const robj *val); +long long objectGetExpire(const robj *val); + /* Synchronous I/O with timeout */ ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout); ssize_t syncRead(int fd, char *ptr, ssize_t size, long long timeout); @@ -3285,12 +3323,11 @@ void setupSignalHandlers(void); int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler); connListener *listenerByType(const char *typename); int changeListener(connListener *listener); -void closeListener(connListener *listener); struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name); struct serverCommand *lookupCommand(robj **argv, int argc); -struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s); +struct serverCommand *lookupCommandBySdsLogic(hashtable *commands, sds s); struct serverCommand *lookupCommandBySds(sds s); -struct serverCommand *lookupCommandByCStringLogic(dict *commands, const char *s); +struct serverCommand *lookupCommandByCStringLogic(hashtable *commands, const char *s); struct serverCommand *lookupCommandByCString(const char *s); struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc); int commandCheckExistence(client *c, sds *err); @@ -3324,7 +3361,7 @@ void serverLogRawFromHandler(int level, const char *msg); void usage(void); void updateDictResizePolicy(void); void populateCommandTable(void); -void resetCommandTableStats(dict *commands); +void resetCommandTableStats(hashtable *commands); void resetErrorTableStats(void); void adjustOpenFilesLimit(void); void incrementErrorCount(const char *fullerr, size_t namelen); @@ -3334,7 +3371,8 @@ void bytesToHuman(char *s, size_t size, unsigned long long n); void enterExecutionUnit(int update_cached_time, long long us); void exitExecutionUnit(void); void resetServerStats(void); -void activeDefragCycle(void); +void monitorActiveDefrag(void); +void defragWhileBlocked(void); unsigned int getLRUClock(void); unsigned int LRU_CLOCK(void); const char *evictPolicyToString(void); @@ -3346,7 +3384,6 @@ void rejectCommandFormat(client *c, const char *fmt, ...); void *activeDefragAlloc(void *ptr); robj *activeDefragStringOb(robj *ob); void dismissSds(sds s); -void dismissMemory(void *ptr, size_t size_hint); void dismissMemoryInChild(void); #define RESTART_SERVER_NONE 0 @@ -3359,10 +3396,10 @@ int calculateKeySlot(sds key); /* kvstore wrappers */ int dbExpand(serverDb *db, uint64_t db_size, int try_expand); int dbExpandExpires(serverDb *db, uint64_t db_size, int try_expand); -dictEntry *dbFind(serverDb *db, void *key); -dictEntry *dbFindExpires(serverDb *db, void *key); +robj *dbFind(serverDb *db, sds key); +robj *dbFindExpires(serverDb *db, sds key); unsigned long long dbSize(serverDb *db); -unsigned long long dbScan(serverDb *db, unsigned long long cursor, dictScanFunction *scan_cb, void *privdata); +unsigned long long dbScan(serverDb *db, unsigned long long cursor, hashtableScanFunction scan_cb, void *privdata); /* Set data type */ robj *setTypeCreate(sds value, size_t size_hint); @@ -3521,7 +3558,7 @@ void deleteExpiredKeyFromOverwriteAndPropagate(client *c, 
robj *keyobj); void propagateDeletion(serverDb *db, robj *key, int lazy); int keyIsExpired(serverDb *db, robj *key); long long getExpire(serverDb *db, robj *key); -void setExpire(client *c, serverDb *db, robj *key, long long when); +robj *setExpire(client *c, serverDb *db, robj *key, long long when); int checkAlreadyExpired(long long when); robj *lookupKeyRead(serverDb *db, robj *key); robj *lookupKeyWrite(serverDb *db, robj *key); @@ -3541,16 +3578,16 @@ int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, long lo #define LOOKUP_NOEFFECTS \ (LOOKUP_NONOTIFY | LOOKUP_NOSTATS | LOOKUP_NOTOUCH | LOOKUP_NOEXPIRE) /* Avoid any effects from fetching the key */ -void dbAdd(serverDb *db, robj *key, robj *val); -int dbAddRDBLoad(serverDb *db, sds key, robj *val); -void dbReplaceValue(serverDb *db, robj *key, robj *val); +void dbAdd(serverDb *db, robj *key, robj **valref); +int dbAddRDBLoad(serverDb *db, sds key, robj **valref); +void dbReplaceValue(serverDb *db, robj *key, robj **valref); #define SETKEY_KEEPTTL 1 #define SETKEY_NO_SIGNAL 2 #define SETKEY_ALREADY_EXIST 4 #define SETKEY_DOESNT_EXIST 8 #define SETKEY_ADD_OR_UPDATE 16 /* Key most likely doesn't exists */ -void setKey(client *c, serverDb *db, robj *key, robj *val, int flags); +void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags); robj *dbRandomKey(serverDb *db); int dbGenericDelete(serverDb *db, robj *key, int async, int flags); int dbSyncDelete(serverDb *db, robj *key); @@ -3560,14 +3597,12 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o); #define EMPTYDB_NO_FLAGS 0 /* No flags. */ #define EMPTYDB_ASYNC (1 << 0) /* Reclaim memory in another thread. */ #define EMPTYDB_NOFUNCTIONS (1 << 1) /* Indicate not to flush the functions. */ -long long emptyData(int dbnum, int flags, void(callback)(dict *)); -long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(dict *)); +long long emptyData(int dbnum, int flags, void(callback)(hashtable *)); +long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(hashtable *)); void flushAllDataAndResetRDB(int flags); long long dbTotalServerKeyCount(void); serverDb *initTempDb(void); -void discardTempDb(serverDb *tempDb, void(callback)(dict *)); - - +void discardTempDb(serverDb *tempDb); int selectDb(client *c, int id); void signalModifiedKey(client *c, serverDb *db, robj *key); void signalFlushedDb(int dbid, int async); @@ -4021,7 +4056,7 @@ int memtest_preserving_test(unsigned long *m, size_t bytes, int passes); void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); sds catSubCommandFullname(const char *parent_name, const char *sub_name); -void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand, const char *declared_name); +void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand); void debugDelay(int usec); void killThreads(void); void makeThreadKillable(void); @@ -4037,6 +4072,11 @@ void debugPauseProcess(void); _serverLog(level, __VA_ARGS__); \ } while (0) +/* dualChannelServerLog - Log messages related to dual-channel operations + * This macro wraps the serverLog function, prepending "<Dual Channel>" + * to the log message. */ +#define dualChannelServerLog(level, ...) serverLog(level, "<Dual Channel> " __VA_ARGS__) + #define serverDebug(fmt, ...)
printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__) #define serverDebugMark() printf("-- MARK %s:%d --\n", __FILE__, __LINE__) diff --git a/src/socket.c b/src/socket.c index 7344d66ad8..d89e6c8767 100644 --- a/src/socket.c +++ b/src/socket.c @@ -339,6 +339,19 @@ static int connSocketListen(connListener *listener) { return listenToPort(listener); } +static void connSocketCloseListener(connListener *listener) { + int j; + + for (j = 0; j < listener->count; j++) { + if (listener->fd[j] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[j], AE_READABLE); + close(listener->fd[j]); + } + + listener->count = 0; +} + static int connSocketBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { int fd = anetTcpNonBlockConnect(NULL, addr, port); if (fd == -1) { @@ -395,6 +408,7 @@ static ConnectionType CT_Socket = { .addr = connSocketAddr, .is_local = connSocketIsLocal, .listen = connSocketListen, + .closeListener = connSocketCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateSocket, diff --git a/src/sort.c b/src/sort.c index 92777b068c..b1723daff0 100644 --- a/src/sort.c +++ b/src/sort.c @@ -34,6 +34,8 @@ #include /* isnan() */ #include "cluster.h" +#include "valkey_strtod.h" + zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); serverSortOperation *createSortOperation(int type, robj *pattern) { @@ -479,9 +481,9 @@ void sortCommandGeneric(client *c, int readonly) { } else { if (sdsEncodedObject(byval)) { char *eptr; - - vector[j].u.score = strtod(byval->ptr, &eptr); - if (eptr[0] != '\0' || errno == ERANGE || isnan(vector[j].u.score)) { + errno = 0; + vector[j].u.score = valkey_strtod(byval->ptr, &eptr); + if (eptr[0] != '\0' || errno == ERANGE || errno == EINVAL || isnan(vector[j].u.score)) { int_conversion_error = 1; } } else if (byval->encoding == OBJ_ENCODING_INT) { @@ -577,7 +579,10 @@ void sortCommandGeneric(client *c, int readonly) { } if (outputlen) { listTypeTryConversion(sobj, LIST_CONV_AUTO, NULL, NULL); - setKey(c, c->db, storekey, sobj, 0); + setKey(c, c->db, storekey, &sobj, 0); + /* Ownership of sobj transferred to the db. Set to NULL to prevent + * freeing it below. 
*/ + sobj = NULL; notifyKeyspaceEvent(NOTIFY_LIST, "sortstore", storekey, c->db->id); server.dirty += outputlen; } else if (dbDelete(c->db, storekey)) { @@ -585,7 +590,7 @@ void sortCommandGeneric(client *c, int readonly) { notifyKeyspaceEvent(NOTIFY_GENERIC, "del", storekey, c->db->id); server.dirty++; } - decrRefCount(sobj); + if (sobj != NULL) decrRefCount(sobj); addReplyLongLong(c, outputlen); } diff --git a/src/t_hash.c b/src/t_hash.c index dabe279808..1aa37968b7 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -432,7 +432,7 @@ robj *hashTypeLookupWriteOrCreate(client *c, robj *key) { if (o == NULL) { o = createHashObject(); - dbAdd(c->db, key, o); + dbAdd(c->db, key, &o); } return o; } diff --git a/src/t_list.c b/src/t_list.c index ffe3e9b08a..57a47ee681 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -471,7 +471,7 @@ void pushGenericCommand(client *c, int where, int xx) { } lobj = createListListpackObject(); - dbAdd(c->db, c->argv[1], lobj); + dbAdd(c->db, c->argv[1], &lobj); } listTypeTryConversionAppend(lobj, c->argv, 2, c->argc - 1, NULL, NULL); @@ -1068,7 +1068,7 @@ void lmoveHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value, int whe /* Create the list if the key does not exist */ if (!dstobj) { dstobj = createListListpackObject(); - dbAdd(c->db, dstkey, dstobj); + dbAdd(c->db, dstkey, &dstobj); } listTypeTryConversionAppend(dstobj, &value, 0, 0, NULL, NULL); listTypePush(dstobj, value, where); diff --git a/src/t_set.c b/src/t_set.c index a540c3c49b..4279baf82f 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "hashtable.h" #include "intset.h" /* Compact integer set structure */ /*----------------------------------------------------------------------------- @@ -50,7 +51,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { /* We may oversize the set by using the hint if the hint is not accurate, * but we will assume this is acceptable to maximize performance. */ robj *o = createSetObject(); - dictExpand(o->ptr, size_hint); + hashtableExpand(o->ptr, size_hint); return o; } @@ -59,7 +60,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { void setTypeMaybeConvert(robj *set, size_t size_hint) { if ((set->encoding == OBJ_ENCODING_LISTPACK && size_hint > server.set_max_listpack_entries) || (set->encoding == OBJ_ENCODING_INTSET && size_hint > server.set_max_intset_entries)) { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, size_hint, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, size_hint, 1); } } @@ -74,7 +75,7 @@ static size_t intsetMaxEntries(void) { /* Converts intset to HT if it contains too many entries. */ static void maybeConvertIntset(robj *subject) { serverAssert(subject->encoding == OBJ_ENCODING_INTSET); - if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HT); + if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HASHTABLE); } /* When you know all set elements are integers, call this to convert the set to @@ -91,7 +92,7 @@ static void maybeConvertToIntset(robj *set) { while (setTypeNext(si, &str, &len, &llval) != -1) { if (str) { /* If the element is returned as a string, we may be able to convert - * it to integer. This happens for OBJ_ENCODING_HT. */ + * it to integer. This happens for OBJ_ENCODING_HASHTABLE. 
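+ * Hashtable-encoded sets keep their elements as sds strings, so a set
+ * holding only integers still yields them as strings here; that is why the
+ * string2ll() below is asserted to succeed.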
*/ serverAssert(string2ll(str, len, (long long *)&llval)); } uint8_t success = 0; @@ -134,20 +135,21 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd } serverAssert(str); - if (set->encoding == OBJ_ENCODING_HT) { + if (set->encoding == OBJ_ENCODING_HASHTABLE) { /* Avoid duping the string if it is an sds string. */ sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len); - dict *ht = set->ptr; - void *position = dictFindPositionForInsert(ht, sdsval, NULL); - if (position) { + hashtable *ht = set->ptr; + hashtablePosition position; + if (hashtableFindPositionForInsert(ht, sdsval, &position, NULL)) { /* Key doesn't already exist in the set. Add it but dup the key. */ if (sdsval == str) sdsval = sdsdup(sdsval); - dictInsertAtPosition(ht, sdsval, position); + hashtableInsertAtPosition(ht, sdsval, &position); + return 1; } else if (sdsval != str) { /* String is already a member. Free our temporary sds copy. */ sdsfree(sdsval); + return 0; } - return (position != NULL); } else if (set->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *lp = set->ptr; unsigned char *p = lpFirst(lp); @@ -166,8 +168,8 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; } else { /* Size limit is reached. Convert to hashtable and add. */ - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, lpLength(lp) + 1, 1); - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, lpLength(lp) + 1, 1); + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); } return 1; } @@ -204,10 +206,10 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; return 1; } else { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, intsetLen(set->ptr) + 1, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, intsetLen(set->ptr) + 1, 1); /* The set *was* an intset and this value is not integer - * encodable, so dictAdd should always work. */ - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + * encodable, so hashtableAdd should always work. */ + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); return 1; } } @@ -242,9 +244,9 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str str_is_sds = 0; } - if (setobj->encoding == OBJ_ENCODING_HT) { + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = str_is_sds ? 
(sds)str : sdsnewlen(str, len); - int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK); + int deleted = hashtableDelete(setobj->ptr, sdsval); if (sdsval != str) sdsfree(sdsval); /* free temp copy */ return deleted; } else if (setobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -298,11 +300,11 @@ int setTypeIsMemberAux(robj *set, char *str, size_t len, int64_t llval, int str_ } else if (set->encoding == OBJ_ENCODING_INTSET) { long long llval; return string2ll(str, len, &llval) && intsetFind(set->ptr, llval); - } else if (set->encoding == OBJ_ENCODING_HT && str_is_sds) { - return dictFind(set->ptr, (sds)str) != NULL; - } else if (set->encoding == OBJ_ENCODING_HT) { + } else if (set->encoding == OBJ_ENCODING_HASHTABLE && str_is_sds) { + return hashtableFind(set->ptr, (sds)str, NULL); + } else if (set->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = sdsnewlen(str, len); - int result = dictFind(set->ptr, sdsval) != NULL; + int result = hashtableFind(set->ptr, sdsval, NULL); sdsfree(sdsval); return result; } else { @@ -314,8 +316,8 @@ setTypeIterator *setTypeInitIterator(robj *subject) { setTypeIterator *si = zmalloc(sizeof(setTypeIterator)); si->subject = subject; si->encoding = subject->encoding; - if (si->encoding == OBJ_ENCODING_HT) { - si->di = dictGetIterator(subject->ptr); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + si->hashtable_iterator = hashtableCreateIterator(subject->ptr); } else if (si->encoding == OBJ_ENCODING_INTSET) { si->ii = 0; } else if (si->encoding == OBJ_ENCODING_LISTPACK) { @@ -327,7 +329,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) { } void setTypeReleaseIterator(setTypeIterator *si) { - if (si->encoding == OBJ_ENCODING_HT) dictReleaseIterator(si->di); + if (si->encoding == OBJ_ENCODING_HASHTABLE) hashtableReleaseIterator(si->hashtable_iterator); zfree(si); } @@ -340,7 +342,7 @@ void setTypeReleaseIterator(setTypeIterator *si) { * (str and len) or (llele) depending on whether the value is stored as a string * or as an integer internally. * - * If OBJ_ENCODING_HT is returned, then str points to an sds string and can be + * If OBJ_ENCODING_HASHTABLE is returned, then str points to an sds string and can be * used as such. If OBJ_ENCODING_INTSET, then llele is populated and str is * pointed to NULL. If OBJ_ENCODING_LISTPACK is returned, the value can be * either a string or an integer. If *str is not NULL, then str and len are @@ -353,10 +355,10 @@ void setTypeReleaseIterator(setTypeIterator *si) { * * When there are no more elements -1 is returned. */ int setTypeNext(setTypeIterator *si, char **str, size_t *len, int64_t *llele) { - if (si->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictNext(si->di); - if (de == NULL) return -1; - *str = dictGetKey(de); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(si->hashtable_iterator, &next)) return -1; + *str = next; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (si->encoding == OBJ_ENCODING_INTSET) { @@ -406,15 +408,16 @@ sds setTypeNextObject(setTypeIterator *si) { * object. The return value of the function is the object->encoding * field of the object and can be used by the caller to check if the * int64_t pointer or the str and len pointers were populated, as for - * setTypeNext. If OBJ_ENCODING_HT is returned, str is pointed to a + * setTypeNext. If OBJ_ENCODING_HASHTABLE is returned, str is pointed to a * string which is actually an sds string and it can be used as such. 
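 * A minimal calling sketch (an illustration only; use_integer() and
 * use_string() are hypothetical consumers, not functions in this codebase):
 *
 *     char *str;
 *     size_t len;
 *     int64_t llele;
 *     setTypeRandomElement(set, &str, &len, &llele);
 *     if (str == NULL)
 *         use_integer(llele);      element was stored as an integer
 *     else
 *         use_string(str, len);    sds owned by the set; copy, don't free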
* * Note that both the str, len and llele pointers should be passed and cannot * be NULL. If str is set to NULL, the value is an integer stored in llele. */ int setTypeRandomElement(robj *setobj, char **str, size_t *len, int64_t *llele) { - if (setobj->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictGetFairRandomKey(setobj->ptr); - *str = dictGetKey(de); + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { + void *entry = NULL; + hashtableFairRandomEntry(setobj->ptr, &entry); + *str = entry; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (setobj->encoding == OBJ_ENCODING_INTSET) { @@ -457,14 +460,14 @@ robj *setTypePopRandom(robj *set) { obj = createStringObject(str, len); else obj = createStringObjectFromLongLong(llele); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } return obj; } unsigned long setTypeSize(const robj *subject) { - if (subject->encoding == OBJ_ENCODING_HT) { - return dictSize((const dict *)subject->ptr); + if (subject->encoding == OBJ_ENCODING_HASHTABLE) { + return hashtableSize((const hashtable *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_INTSET) { return intsetLen((const intset *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_LISTPACK) { @@ -474,7 +477,7 @@ unsigned long setTypeSize(const robj *subject) { } } -/* Convert the set to specified encoding. The resulting dict (when converting +/* Convert the set to specified encoding. The resulting hashtable (when converting * to a hash table) is presized to hold the number of elements in the original * set. */ void setTypeConvert(robj *setobj, int enc) { @@ -489,28 +492,28 @@ int setTypeConvertAndExpand(robj *setobj, int enc, unsigned long cap, int panic) setTypeIterator *si; serverAssertWithInfo(NULL, setobj, setobj->type == OBJ_SET && setobj->encoding != enc); - if (enc == OBJ_ENCODING_HT) { - dict *d = dictCreate(&setDictType); + if (enc == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = hashtableCreate(&setHashtableType); sds element; - /* Presize the dict to avoid rehashing */ + /* Presize the hashtable to avoid rehashing */ if (panic) { - dictExpand(d, cap); - } else if (dictTryExpand(d, cap) != DICT_OK) { - dictRelease(d); + hashtableExpand(ht, cap); + } else if (!hashtableTryExpand(ht, cap)) { + hashtableRelease(ht); return C_ERR; } /* To add the elements we extract integers and create Objects */ si = setTypeInitIterator(setobj); while ((element = setTypeNextObject(si)) != NULL) { - serverAssert(dictAdd(d, element, NULL) == DICT_OK); + serverAssert(hashtableAdd(ht, element)); } setTypeReleaseIterator(si); freeSetObject(setobj); /* frees the internals but not setobj itself */ - setobj->encoding = OBJ_ENCODING_HT; - setobj->ptr = d; + setobj->encoding = OBJ_ENCODING_HASHTABLE; + setobj->ptr = ht; } else if (enc == OBJ_ENCODING_LISTPACK) { /* Preallocate the minimum two bytes per element (enc/value + backlen) */ size_t estcap = cap * 2; @@ -568,10 +571,10 @@ robj *setTypeDup(robj *o) { memcpy(new_lp, lp, sz); set = createObject(OBJ_SET, new_lp); set->encoding = OBJ_ENCODING_LISTPACK; - } else if (o->encoding == OBJ_ENCODING_HT) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { set = createSetObject(); - dict *d = o->ptr; - dictExpand(set->ptr, dictSize(d)); + hashtable *ht = o->ptr; + hashtableExpand(set->ptr, hashtableSize(ht)); si = setTypeInitIterator(o); char *str; size_t len; @@ -595,7 +598,7 @@ void saddCommand(client *c) { if (set == NULL) { 
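 /* Key does not exist: create a set presized for the number of values
 * being added; dbAdd() takes &set and may leave an updated pointer in it. */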
set = setTypeCreate(c->argv[2]->ptr, c->argc - 2); - dbAdd(c->db, c->argv[1], set); + dbAdd(c->db, c->argv[1], &set); } else { setTypeMaybeConvert(set, c->argc - 2); } @@ -674,7 +677,7 @@ void smoveCommand(client *c) { /* Create the destination set when it doesn't exist */ if (!dstset) { dstset = setTypeCreate(ele->ptr, 1); - dbAdd(c->db, c->argv[2], dstset); + dbAdd(c->db, c->argv[2], &dstset); } signalModifiedKey(c, c->db, c->argv[1]); @@ -891,8 +894,8 @@ void spopWithCountCommand(client *c) { if (!newset) { newset = str ? createSetListpackObject() : createIntsetObject(); } - setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HT); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } } @@ -919,7 +922,7 @@ void spopWithCountCommand(client *c) { setTypeReleaseIterator(si); /* Assign the new set as the key value. */ - dbReplaceValue(c->db, c->argv[1], newset); + dbReplaceValue(c->db, c->argv[1], &newset); } /* Replicate/AOF the remaining elements as an SREM operation */ @@ -1001,8 +1004,6 @@ void srandmemberWithCountCommand(client *c) { size_t len; int64_t llele; - dict *d; - if (getRangeLongFromObjectOrReply(c, c->argv[2], -LONG_MAX, LONG_MAX, &l, NULL) != C_OK) return; if (l >= 0) { count = (unsigned long)l; @@ -1111,8 +1112,8 @@ void srandmemberWithCountCommand(client *c) { return; } - /* For CASE 3 and CASE 4 we need an auxiliary dictionary. */ - d = dictCreate(&sdsReplyDictType); + /* For CASE 3 and CASE 4 we need an auxiliary hashtable. */ + hashtable *ht = hashtableCreate(&sdsReplyHashtableType); /* CASE 3: * The number of elements inside the set is not greater than @@ -1126,29 +1127,25 @@ void srandmemberWithCountCommand(client *c) { if (count * SRANDMEMBER_SUB_STRATEGY_MUL > size) { setTypeIterator *si; - /* Add all the elements into the temporary dictionary. */ + /* Add all the elements into the temporary hashtable. */ si = setTypeInitIterator(set); - dictExpand(d, size); + hashtableExpand(ht, size); while (setTypeNext(si, &str, &len, &llele) != -1) { - int retval = DICT_ERR; - if (str == NULL) { - retval = dictAdd(d, sdsfromlonglong(llele), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsfromlonglong(llele))); } else { - retval = dictAdd(d, sdsnewlen(str, len), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsnewlen(str, len))); } - serverAssert(retval == DICT_OK); } setTypeReleaseIterator(si); - serverAssert(dictSize(d) == size); + serverAssert(hashtableSize(ht) == size); /* Remove random elements to reach the right count. */ while (size > count) { - dictEntry *de; - de = dictGetFairRandomKey(d); - dictUnlink(d, dictGetKey(de)); - sdsfree(dictGetKey(de)); - dictFreeUnlinkedEntry(d, de); + void *element; + hashtableFairRandomEntry(ht, &element); + hashtableDelete(ht, element); + sdsfree((sds)element); size--; } } @@ -1161,7 +1158,7 @@ void srandmemberWithCountCommand(client *c) { unsigned long added = 0; sds sdsele; - dictExpand(d, count); + hashtableExpand(ht, count); while (added < count) { setTypeRandomElement(set, &str, &len, &llele); if (str == NULL) { @@ -1172,7 +1169,7 @@ void srandmemberWithCountCommand(client *c) { /* Try to add the object to the dictionary. If it already exists * free it, otherwise increment the number of objects we have * in the result dictionary. 
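 * hashtableAdd() returns true only when the element was actually inserted,
 * mirroring the old dictAdd() == DICT_OK test; in sketch form:
 *
 *     sds ele = sdsnewlen(str, len);
 *     if (!hashtableAdd(ht, ele)) sdsfree(ele);    duplicate: free our copy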
*/ - if (dictAdd(d, sdsele, NULL) == DICT_OK) + if (hashtableAdd(ht, sdsele)) added++; else sdsfree(sdsele); @@ -1181,14 +1178,15 @@ void srandmemberWithCountCommand(client *c) { /* CASE 3 & 4: send the result to the user. */ { - dictIterator *di; - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); addReplyArrayLen(c, count); - di = dictGetIterator(d); - while ((de = dictNext(di)) != NULL) addReplyBulkSds(c, dictGetKey(de)); - dictReleaseIterator(di); - dictRelease(d); + serverAssert(count == hashtableSize(ht)); + void *element; + while (hashtableNext(&iter, &element)) addReplyBulkSds(c, (sds)element); + hashtableResetIterator(&iter); + hashtableRelease(ht); } } @@ -1336,7 +1334,7 @@ void sinterGenericCommand(client *c, while ((encoding = setTypeNext(si, &str, &len, &intobj)) != -1) { for (j = 1; j < setnum; j++) { if (sets[j] == sets[0]) continue; - if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HT)) break; + if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE)) break; } /* Only take action when all sets contain the member */ @@ -1355,7 +1353,7 @@ void sinterGenericCommand(client *c, } else { if (str && only_integers) { /* It may be an integer although we got it as a string. */ - if (encoding == OBJ_ENCODING_HT && string2ll(str, len, (long long *)&intobj)) { + if (encoding == OBJ_ENCODING_HASHTABLE && string2ll(str, len, (long long *)&intobj)) { if (dstset->encoding == OBJ_ENCODING_LISTPACK || dstset->encoding == OBJ_ENCODING_INTSET) { /* Adding it as an integer is more efficient. */ str = NULL; @@ -1365,7 +1363,7 @@ void sinterGenericCommand(client *c, only_integers = 0; } } - setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HT); + setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE); } } } @@ -1383,7 +1381,7 @@ void sinterGenericCommand(client *c, * frequent reallocs. Therefore, we shrink it now. */ dstset->ptr = lpShrinkToFit(dstset->ptr); } - setKey(c, c->db, dstkey, dstset, 0); + setKey(c, c->db, dstkey, &dstset, 0); addReplyLongLong(c, setTypeSize(dstset)); notifyKeyspaceEvent(NOTIFY_SET, "sinterstore", dstkey, c->db->id); server.dirty++; @@ -1394,8 +1392,8 @@ void sinterGenericCommand(client *c, signalModifiedKey(c, c->db, dstkey); notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); } + decrRefCount(dstset); } - decrRefCount(dstset); } else { setDeferredSetLen(c, replylen, cardinality); } @@ -1467,7 +1465,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* For a SET's encoding, according to the factory method setTypeCreate(), currently have 3 types: * 1. OBJ_ENCODING_INTSET * 2. OBJ_ENCODING_LISTPACK - * 3. OBJ_ENCODING_HT + * 3. OBJ_ENCODING_HASHTABLE * 'dstset_encoding' is used to determine which kind of encoding to use when initialize 'dstset'. * * If all sets are all OBJ_ENCODING_INTSET encoding or 'dstkey' is not null, keep 'dstset' @@ -1478,8 +1476,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke * the hashtable is more efficient when find and compare than the listpack. The corresponding * time complexity are O(1) vs O(n). 
*/ if (!dstkey && dstset_encoding == OBJ_ENCODING_INTSET && - (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HT)) { - dstset_encoding = OBJ_ENCODING_HT; + (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HASHTABLE)) { + dstset_encoding = OBJ_ENCODING_HASHTABLE; } sets[j] = setobj; if (j > 0 && sets[0] == sets[j]) { @@ -1536,7 +1534,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } setTypeReleaseIterator(si); } @@ -1556,11 +1554,11 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke for (j = 1; j < setnum; j++) { if (!sets[j]) continue; /* no key is an empty set. */ if (sets[j] == sets[0]) break; /* same set! */ - if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HT)) break; + if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HASHTABLE)) break; } if (j == setnum) { /* There is no other set with this element. Add it. */ - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); @@ -1578,9 +1576,9 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { if (j == 0) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } else { - cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); @@ -1607,7 +1605,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* If we have a target key where to store the resulting set * create this key with the result set inside */ if (setTypeSize(dstset) > 0) { - setKey(c, c->db, dstkey, dstset, 0); + setKey(c, c->db, dstkey, &dstset, 0); addReplyLongLong(c, setTypeSize(dstset)); notifyKeyspaceEvent(NOTIFY_SET, op == SET_OP_UNION ? "sunionstore" : "sdiffstore", dstkey, c->db->id); server.dirty++; @@ -1618,8 +1616,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke signalModifiedKey(c, c->db, dstkey); notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); } + decrRefCount(dstset); } - decrRefCount(dstset); } zfree(sets); } diff --git a/src/t_stream.c b/src/t_stream.c index a42822dabc..17254b58dd 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1 << 30) void streamFreeCG(streamCG *cg); +void streamFreeCGVoid(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, @@ -86,8 +87,8 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. 
 */
void freeStream(stream *s) {
-    raxFreeWithCallback(s->rax, (void (*)(void *))lpFree);
-    if (s->cgroups) raxFreeWithCallback(s->cgroups, (void (*)(void *))streamFreeCG);
+    raxFreeWithCallback(s->rax, lpFreeVoid);
+    if (s->cgroups) raxFreeWithCallback(s->cgroups, streamFreeCGVoid);
     zfree(s);
 }
@@ -1839,7 +1840,7 @@ robj *streamTypeLookupWriteOrCreate(client *c, robj *key, int no_create) {
             return NULL;
         }
         o = createStreamObject();
-        dbAdd(c->db, key, o);
+        dbAdd(c->db, key, &o);
     }
     return o;
 }
@@ -2454,6 +2455,11 @@ void streamFreeConsumer(streamConsumer *sc) {
     zfree(sc);
 }
+/* Used for generic free functions. */
+static void streamFreeConsumerVoid(void *sc) {
+    streamFreeConsumer((streamConsumer *)sc);
+}
+
 /* Create a new consumer group in the context of the stream 's', having the
  * specified name, last server ID and reads counter. If a consumer group with
  * the same name already exists NULL is returned, otherwise the pointer to the
@@ -2473,11 +2479,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo
 /* Free a consumer group and all its associated data. */
 void streamFreeCG(streamCG *cg) {
-    raxFreeWithCallback(cg->pel, (void (*)(void *))streamFreeNACK);
-    raxFreeWithCallback(cg->consumers, (void (*)(void *))streamFreeConsumer);
+    raxFreeWithCallback(cg->pel, zfree);
+    raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid);
     zfree(cg);
 }
+/* Used for generic free functions. */
+void streamFreeCGVoid(void *cg) {
+    streamFreeCG((streamCG *)cg);
+}
+
 /* Look up the consumer group in the specified stream and return its
  * pointer; if there is no such group, NULL is returned. */
 streamCG *streamLookupCG(stream *s, sds groupname) {
@@ -2645,7 +2656,7 @@ void xgroupCommand(client *c) {
     if (s == NULL) {
         serverAssert(mkstream);
         o = createStreamObject();
-        dbAdd(c->db, c->argv[2], o);
+        dbAdd(c->db, c->argv[2], &o);
         s = o->ptr;
         signalModifiedKey(c, c->db, c->argv[2]);
     }
diff --git a/src/t_string.c b/src/t_string.c
index 1c90eabf3e..da8953ee08 100644
--- a/src/t_string.c
+++ b/src/t_string.c
@@ -67,15 +67,18 @@ static int checkStringLength(client *c, long long size, long long append) {
  * If abort_reply is NULL, "$-1" is used. */
 #define OBJ_NO_FLAGS 0
-#define OBJ_SET_NX (1 << 0) /* Set if key not exists. */
-#define OBJ_SET_XX (1 << 1) /* Set if key exists. */
-#define OBJ_EX (1 << 2) /* Set if time in seconds is given */
-#define OBJ_PX (1 << 3) /* Set if time in ms in given */
-#define OBJ_KEEPTTL (1 << 4) /* Set and keep the ttl */
-#define OBJ_SET_GET (1 << 5) /* Set if want to get key before set */
-#define OBJ_EXAT (1 << 6) /* Set if timestamp in second is given */
-#define OBJ_PXAT (1 << 7) /* Set if timestamp in ms is given */
-#define OBJ_PERSIST (1 << 8) /* Set if we need to remove the ttl */
+#define OBJ_SET_NX (1 << 0)     /* Set if the key does not exist. */
+#define OBJ_SET_XX (1 << 1)     /* Set if the key exists. */
+#define OBJ_EX (1 << 2)         /* Set if a time in seconds is given */
+#define OBJ_PX (1 << 3)         /* Set if a time in ms is given */
+#define OBJ_KEEPTTL (1 << 4)    /* Set and keep the ttl */
+#define OBJ_SET_GET (1 << 5)    /* Set if we want to get the key before set */
+#define OBJ_EXAT (1 << 6)       /* Set if a timestamp in seconds is given */
+#define OBJ_PXAT (1 << 7)       /* Set if a timestamp in ms is given */
+#define OBJ_PERSIST (1 << 8)    /* Set if we need to remove the ttl */
+#define OBJ_SET_IFEQ (1 << 9)   /* Set if we need a compare-and-set */
+#define OBJ_ARGV3 (1 << 10)     /* Set if the value is at argv[3]; otherwise it's \
                                 * at argv[2].
*/ /* Forward declaration */ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int unit, long long *milliseconds); @@ -87,7 +90,8 @@ void setGenericCommand(client *c, robj *expire, int unit, robj *ok_reply, - robj *abort_reply) { + robj *abort_reply, + robj *comparison) { long long milliseconds = 0; /* initialized to avoid any harmness warning */ int found = 0; int setkey_flags = 0; @@ -100,7 +104,27 @@ void setGenericCommand(client *c, if (getGenericCommand(c) == C_ERR) return; } - found = (lookupKeyWrite(c->db, key) != NULL); + robj *existing_value = lookupKeyWrite(c->db, key); + found = existing_value != NULL; + + /* Handle the IFEQ conditional check */ + if (flags & OBJ_SET_IFEQ && found) { + if (!(flags & OBJ_SET_GET) && checkType(c, existing_value, OBJ_STRING)) { + return; + } + + if (compareStringObjects(existing_value, comparison) != 0) { + if (!(flags & OBJ_SET_GET)) { + addReply(c, abort_reply ? abort_reply : shared.null[c->resp]); + } + return; + } + } else if (flags & OBJ_SET_IFEQ && !found) { + if (!(flags & OBJ_SET_GET)) { + addReply(c, abort_reply ? abort_reply : shared.null[c->resp]); + } + return; + } if ((flags & OBJ_SET_NX && found) || (flags & OBJ_SET_XX && !found)) { if (!(flags & OBJ_SET_GET)) { @@ -123,12 +147,18 @@ void setGenericCommand(client *c, setkey_flags |= ((flags & OBJ_KEEPTTL) || expire) ? SETKEY_KEEPTTL : 0; setkey_flags |= found ? SETKEY_ALREADY_EXIST : SETKEY_DOESNT_EXIST; - setKey(c, c->db, key, val, setkey_flags); + setKey(c, c->db, key, &val, setkey_flags); + if (expire) val = setExpire(c, c->db, key, milliseconds); + + /* By setting the reallocated value back into argv, we can avoid duplicating + * a large string value when adding it to the db. */ + c->argv[(flags & OBJ_ARGV3) ? 3 : 2] = val; + incrRefCount(val); + server.dirty++; notifyKeyspaceEvent(NOTIFY_STRING, "set", key, c->db->id); if (expire) { - setExpire(c, c->db, key, milliseconds); /* Propagate as SET Key Value PXAT millisecond-timestamp if there is * EX/PX/EXAT flag. */ if (!(flags & OBJ_PXAT)) { @@ -208,7 +238,7 @@ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int * string arguments used in SET and GET command. * * Get specific commands - PERSIST/DEL - * Set specific commands - XX/NX/GET + * Set specific commands - XX/NX/GET/IFEQ * Common commands - EX/EXAT/PX/PXAT/KEEPTTL * * Function takes pointers to client, flags, unit, pointer to pointer of expire obj if needed @@ -219,7 +249,7 @@ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int * Input flags are updated upon parsing the arguments. Unit and expire are updated if there are any * EX/EXAT/PX/PXAT arguments. Unit is updated to millisecond if PX/PXAT is set. */ -int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj **expire, int command_type) { +int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj **expire, robj **compare_val, int command_type) { int j = command_type == COMMAND_GET ? 
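/* The observable behavior of the new IFEQ branch above, as a hypothetical
 * client transcript (replies inferred from the abort_reply/shared.null
 * handling in setGenericCommand()):
 *
 *   SET key v1             -> OK
 *   SET key v2 IFEQ v1     -> OK     (compareStringObjects() matches; write applied)
 *   SET key v3 IFEQ v1     -> (nil)  (stored value differs; write aborted)
 *   SET nokey v1 IFEQ v0   -> (nil)  (a missing key also fails the comparison)
 *
 * When GET is combined with IFEQ, the reply is the old value in all cases,
 * since getGenericCommand() runs before the comparison. */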
2 : 3; for (; j < c->argc; j++) { char *opt = c->argv[j]->ptr; @@ -228,14 +258,23 @@ int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj * /* clang-format off */ if ((opt[0] == 'n' || opt[0] == 'N') && (opt[1] == 'x' || opt[1] == 'X') && opt[2] == '\0' && - !(*flags & OBJ_SET_XX) && (command_type == COMMAND_SET)) + !(*flags & OBJ_SET_XX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) { *flags |= OBJ_SET_NX; } else if ((opt[0] == 'x' || opt[0] == 'X') && (opt[1] == 'x' || opt[1] == 'X') && opt[2] == '\0' && - !(*flags & OBJ_SET_NX) && (command_type == COMMAND_SET)) + !(*flags & OBJ_SET_NX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) { *flags |= OBJ_SET_XX; + } else if ((opt[0] == 'i' || opt[0] == 'I') && + (opt[1] == 'f' || opt[1] == 'F') && + (opt[2] == 'e' || opt[2] == 'E') && + (opt[3] == 'q' || opt[3] == 'Q') && opt[4] == '\0' && + next && !(*flags & OBJ_SET_NX || *flags & OBJ_SET_XX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) + { + *flags |= OBJ_SET_IFEQ; + *compare_val = next; + j++; } else if ((opt[0] == 'g' || opt[0] == 'G') && (opt[1] == 'e' || opt[1] == 'E') && (opt[2] == 't' || opt[2] == 'T') && opt[3] == '\0' && @@ -304,34 +343,36 @@ int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj * return C_OK; } -/* SET key value [NX] [XX] [KEEPTTL] [GET] [EX ] [PX ] - * [EXAT ][PXAT ] */ +/* SET key value [NX | XX | IFEQ comparison-value] [GET] + * [EX seconds | PX milliseconds | + * EXAT seconds-timestamp | PXAT milliseconds-timestamp | KEEPTTL] */ void setCommand(client *c) { robj *expire = NULL; + robj *comparison = NULL; int unit = UNIT_SECONDS; int flags = OBJ_NO_FLAGS; - if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, COMMAND_SET) != C_OK) { + if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, &comparison, COMMAND_SET) != C_OK) { return; } c->argv[2] = tryObjectEncoding(c->argv[2]); - setGenericCommand(c, flags, c->argv[1], c->argv[2], expire, unit, NULL, NULL); + setGenericCommand(c, flags, c->argv[1], c->argv[2], expire, unit, NULL, NULL, comparison); } void setnxCommand(client *c) { c->argv[2] = tryObjectEncoding(c->argv[2]); - setGenericCommand(c, OBJ_SET_NX, c->argv[1], c->argv[2], NULL, 0, shared.cone, shared.czero); + setGenericCommand(c, OBJ_SET_NX, c->argv[1], c->argv[2], NULL, 0, shared.cone, shared.czero, NULL); } void setexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_EX, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL); + setGenericCommand(c, OBJ_EX | OBJ_ARGV3, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL, NULL); } void psetexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_PX, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL); + setGenericCommand(c, OBJ_PX | OBJ_ARGV3, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL, NULL); } int getGenericCommand(client *c) { @@ -377,7 +418,7 @@ void getexCommand(client *c) { int unit = UNIT_SECONDS; int flags = OBJ_NO_FLAGS; - if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, COMMAND_GET) != C_OK) { + if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, NULL, COMMAND_GET) != C_OK) { return; } @@ -406,7 +447,7 @@ void getexCommand(client *c) { * has already elapsed so delete the key in that case. 
*/ deleteExpiredKeyFromOverwriteAndPropagate(c, c->argv[1]); } else if (expire) { - setExpire(c, c->db, c->argv[1], milliseconds); + o = setExpire(c, c->db, c->argv[1], milliseconds); /* Propagate as PXEXPIREAT millisecond-timestamp if there is * EX/PX/EXAT/PXAT flag and the key has not expired. */ robj *milliseconds_obj = createStringObjectFromLongLong(milliseconds); @@ -439,7 +480,8 @@ void getdelCommand(client *c) { void getsetCommand(client *c) { if (getGenericCommand(c) == C_ERR) return; c->argv[2] = tryObjectEncoding(c->argv[2]); - setKey(c, c->db, c->argv[1], c->argv[2], 0); + setKey(c, c->db, c->argv[1], &c->argv[2], 0); + incrRefCount(c->argv[2]); notifyKeyspaceEvent(NOTIFY_STRING, "set", c->argv[1], c->db->id); server.dirty++; @@ -473,7 +515,7 @@ void setrangeCommand(client *c) { return; o = createObject(OBJ_STRING, sdsnewlen(NULL, offset + sdslen(value))); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); } else { size_t olen; @@ -587,8 +629,10 @@ void msetGenericCommand(client *c, int nx) { int setkey_flags = nx ? SETKEY_DOESNT_EXIST : 0; for (j = 1; j < c->argc; j += 2) { - c->argv[j + 1] = tryObjectEncoding(c->argv[j + 1]); - setKey(c, c->db, c->argv[j], c->argv[j + 1], setkey_flags); + robj *val = tryObjectEncoding(c->argv[j + 1]); + setKey(c, c->db, c->argv[j], &val, setkey_flags); + incrRefCount(val); + c->argv[j + 1] = val; notifyKeyspaceEvent(NOTIFY_STRING, "set", c->argv[j], c->db->id); /* In MSETNX, It could be that we're overriding the same key, we can't be sure it doesn't exist. */ if (nx) @@ -623,16 +667,15 @@ void incrDecrCommand(client *c, long long incr) { value += incr; if (o && o->refcount == 1 && o->encoding == OBJ_ENCODING_INT && - (value < 0 || value >= OBJ_SHARED_INTEGERS) && value >= LONG_MIN && value <= LONG_MAX) { new = o; o->ptr = (void *)((long)value); } else { new = createStringObjectFromLongLongForValue(value); if (o) { - dbReplaceValue(c->db, c->argv[1], new); + dbReplaceValue(c->db, c->argv[1], &new); } else { - dbAdd(c->db, c->argv[1], new); + dbAdd(c->db, c->argv[1], &new); } } signalModifiedKey(c, c->db, c->argv[1]); @@ -685,9 +728,9 @@ void incrbyfloatCommand(client *c) { } new = createStringObjectFromLongDouble(value, 1); if (o) - dbReplaceValue(c->db, c->argv[1], new); + dbReplaceValue(c->db, c->argv[1], &new); else - dbAdd(c->db, c->argv[1], new); + dbAdd(c->db, c->argv[1], &new); signalModifiedKey(c, c->db, c->argv[1]); notifyKeyspaceEvent(NOTIFY_STRING, "incrbyfloat", c->argv[1], c->db->id); server.dirty++; @@ -709,7 +752,7 @@ void appendCommand(client *c) { if (o == NULL) { /* Create the key */ c->argv[2] = tryObjectEncoding(c->argv[2]); - dbAdd(c->db, c->argv[1], c->argv[2]); + dbAdd(c->db, c->argv[1], &c->argv[2]); incrRefCount(c->argv[2]); totlen = stringObjectLen(c->argv[2]); } else { diff --git a/src/t_zset.c b/src/t_zset.c index 069ab0924a..e8c5a369b7 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -60,6 +60,8 @@ #include "intset.h" /* Compact integer set structure */ #include +#include "valkey_strtod.h" + /*----------------------------------------------------------------------------- * Skiplist implementation of the low level API *----------------------------------------------------------------------------*/ @@ -70,12 +72,51 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap); zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); +static inline unsigned long 
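/* A recurring pattern in the t_string.c hunks above: setKey(), dbAdd() and
 * dbReplaceValue() now take robj ** because the database may swap the stored
 * object for a reallocated one. A minimal sketch of the calling convention
 * (the argv index is hypothetical):
 *
 *   robj *val = tryObjectEncoding(c->argv[2]);
 *   setKey(c, c->db, c->argv[1], &val, 0); // may replace val with the object
 *                                          // actually stored in the db
 *   incrRefCount(val);                     // argv keeps its own reference
 *   c->argv[2] = val;                      // ... to the (possibly new) object
 *
 * Per the comment in setGenericCommand() above, handing the value object to
 * the db this way avoids duplicating a large string when adding it. */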
zslGetNodeSpanAtLevel(zskiplistNode *x, int level) { + /* We use the level 0 span in order to hold the node height, so in case the span is requested on + * level 0 and this is not the last node we return 1 and 0 otherwise. For the rest of the levels we just return + * the recorded span in that level. */ + if (level > 0) return x->level[level].span; + return x->level[level].forward ? 1 : 0; +} + +static inline void zslSetNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long span) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span = span; +} + +static inline void zslIncrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long incr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span += incr; +} + +static inline void zslDecrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long decr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span -= decr; +} + +static inline unsigned long zslGetNodeHeight(zskiplistNode *x) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + return x->level[0].span; +} + +static inline void zslSetNodeHeight(zskiplistNode *x, int height) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + x->level[0].span = height; +} + /* Create a skiplist node with the specified number of levels. * The SDS string 'ele' is referenced by the node after the call. */ -zskiplistNode *zslCreateNode(int level, double score, sds ele) { - zskiplistNode *zn = zmalloc(sizeof(*zn) + level * sizeof(struct zskiplistLevel)); +zskiplistNode *zslCreateNode(int height, double score, sds ele) { + zskiplistNode *zn = zmalloc(sizeof(*zn) + height * sizeof(struct zskiplistLevel)); zn->score = score; zn->ele = ele; + zslSetNodeHeight(zn, height); return zn; } @@ -145,7 +186,7 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { - rank[i] += x->level[i].span; + rank[i] += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -159,9 +200,10 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { for (i = zsl->level; i < level; i++) { rank[i] = 0; update[i] = zsl->header; - update[i]->level[i].span = zsl->length; + zslSetNodeSpanAtLevel(update[i], i, zsl->length); } zsl->level = level; + zslSetNodeHeight(zsl->header, level); } x = zslCreateNode(level, score, ele); for (i = 0; i < level; i++) { @@ -169,13 +211,13 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { update[i]->level[i].forward = x; /* update span covered by update[i] as x is inserted here */ - x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]); - update[i]->level[i].span = (rank[0] - rank[i]) + 1; + zslSetNodeSpanAtLevel(x, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); + zslSetNodeSpanAtLevel(update[i], i, (rank[0] - rank[i]) + 1); } /* increment span for untouched levels */ for (i = level; i < zsl->level; i++) { - update[i]->level[i].span++; + zslIncrNodeSpanAtLevel(update[i], i, 1); } x->backward = (update[0] == zsl->header) ? 
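/* Why the helpers above special-case level 0: in a classic skiplist the span
 * at level 0 is always 1 (or 0 for the last node), so that slot is repurposed
 * to store the node's height. A worked example for a node of height 3
 * (span values are hypothetical):
 *
 *   x->level[0].span = 3   // repurposed: the height of the node
 *   x->level[1].span = 4   // real span recorded at level 1
 *   x->level[2].span = 9   // real span recorded at level 2
 *
 * zslGetNodeSpanAtLevel(x, 0) reconstructs the logical level-0 span as
 * x->level[0].forward ? 1 : 0, while zslGetNodeHeight(x) reads back the 3.
 * Storing the height is what lets zslGetRankByNode() below start from a
 * node's own top level instead of searching down from the list header. */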
NULL : update[0]; @@ -193,10 +235,10 @@ void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == x) { - update[i]->level[i].span += x->level[i].span - 1; + zslIncrNodeSpanAtLevel(update[i], i, zslGetNodeSpanAtLevel(x, i) - 1); update[i]->level[i].forward = x->level[i].forward; } else { - update[i]->level[i].span -= 1; + zslDecrNodeSpanAtLevel(update[i], i, 1); } } if (x->level[0].forward) { @@ -334,7 +376,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -346,7 +388,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -370,7 +412,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslValueLteMax(x->level[i].forward->score, range)) { /* Count the rank of the last element in range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -462,8 +504,8 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned x = zsl->header; for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) < start) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) < start) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -497,7 +539,7 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) <= 0))) { - rank += x->level[i].span; + rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } @@ -509,6 +551,18 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { return 0; } +/* Find the rank for a specific skiplist node. */ +unsigned long zslGetRankByNode(zskiplist *zsl, zskiplistNode *x) { + int i = zslGetNodeHeight(x) - 1; + unsigned long rank = zslGetNodeSpanAtLevel(x, i); + while (x->level[zslGetNodeHeight(x) - 1].forward) { + x = x->level[zslGetNodeHeight(x) - 1].forward; + rank += zslGetNodeSpanAtLevel(x, zslGetNodeHeight(x) - 1); + } + rank = zsl->length - rank; + return rank; +} + /* Finds an element by its rank from start node. The rank argument needs to be 1-based. 
*/ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { zskiplistNode *x; @@ -517,8 +571,8 @@ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_ x = start_node; for (i = start_level; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) <= rank) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) <= rank) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } if (traversed == rank) { @@ -546,11 +600,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->min = (long)min->ptr; } else { if (((char *)min->ptr)[0] == '(') { - spec->min = strtod((char *)min->ptr + 1, &eptr); + spec->min = valkey_strtod((char *)min->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; spec->minex = 1; } else { - spec->min = strtod((char *)min->ptr, &eptr); + spec->min = valkey_strtod((char *)min->ptr, &eptr); if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; } } @@ -558,11 +612,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->max = (long)max->ptr; } else { if (((char *)max->ptr)[0] == '(') { - spec->max = strtod((char *)max->ptr + 1, &eptr); + spec->max = valkey_strtod((char *)max->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; spec->maxex = 1; } else { - spec->max = strtod((char *)max->ptr, &eptr); + spec->max = valkey_strtod((char *)max->ptr, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; } } @@ -688,7 +742,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -700,7 +754,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -724,7 +778,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslLexValueLteMax(x->level[i].forward->ele, range)) { /* Count the rank of the last element in range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -757,7 +811,7 @@ double zzlStrtod(unsigned char *vstr, unsigned int vlen) { if (vlen > sizeof(buf) - 1) vlen = sizeof(buf) - 1; memcpy(buf, vstr, vlen); buf[vlen] = '\0'; - return strtod(buf, NULL); + return valkey_strtod(buf, NULL); } double zzlGetScore(unsigned char *sptr) { @@ -1171,6 +1225,13 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig * Common sorted set API *----------------------------------------------------------------------------*/ +/* Utility function used for mapping the hashtable entry to the matching skiplist node. + * For example, this is used in case of ZRANK query. 
*/ +static inline zskiplistNode *zsetGetSLNodeByEntry(dictEntry *de) { + char *score_ref = ((char *)dictGetVal(de)); + return (zskiplistNode *)(score_ref - offsetof(zskiplistNode, score)); +} + unsigned long zsetLength(const robj *zobj) { unsigned long length = 0; if (zobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -1601,8 +1662,9 @@ long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { de = dictFind(zs->dict, ele); if (de != NULL) { - score = *(double *)dictGetVal(de); - rank = zslGetRank(zsl, score, ele); + zskiplistNode *n = zsetGetSLNodeByEntry(de); + score = n->score; + rank = zslGetRankByNode(zsl, n); /* Existing elements always have a rank. */ serverAssert(rank != 0); if (output_score) *output_score = score; @@ -1803,7 +1865,7 @@ void zaddGenericCommand(client *c, int flags) { if (zobj == NULL) { if (xx) goto reply_to_client; /* No key + XX option: nothing to do. */ zobj = zsetTypeCreate(elements, maxelelen); - dbAdd(c->db, key, zobj); + dbAdd(c->db, key, &zobj); } else { zsetTypeMaybeConvert(zobj, elements, maxelelen); } @@ -2007,9 +2069,7 @@ typedef struct { int ii; } is; struct { - dict *dict; - dictIterator *di; - dictEntry *de; + hashtableIterator *iter; } ht; struct { unsigned char *lp; @@ -2064,10 +2124,8 @@ void zuiInitIterator(zsetopsrc *op) { if (op->encoding == OBJ_ENCODING_INTSET) { it->is.is = op->subject->ptr; it->is.ii = 0; - } else if (op->encoding == OBJ_ENCODING_HT) { - it->ht.dict = op->subject->ptr; - it->ht.di = dictGetIterator(op->subject->ptr); - it->ht.de = dictNext(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + it->ht.iter = hashtableCreateIterator(op->subject->ptr); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { it->lp.lp = op->subject->ptr; it->lp.p = lpFirst(it->lp.lp); @@ -2104,8 +2162,8 @@ void zuiClearIterator(zsetopsrc *op) { iterset *it = &op->iter.set; if (op->encoding == OBJ_ENCODING_INTSET) { UNUSED(it); /* skip */ - } else if (op->encoding == OBJ_ENCODING_HT) { - dictReleaseIterator(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + hashtableReleaseIterator(it->ht.iter); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { UNUSED(it); } else { @@ -2173,13 +2231,11 @@ int zuiNext(zsetopsrc *op, zsetopval *val) { /* Move to next element. */ it->is.ii++; - } else if (op->encoding == OBJ_ENCODING_HT) { - if (it->ht.de == NULL) return 0; - val->ele = dictGetKey(it->ht.de); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(it->ht.iter, &next)) return 0; + val->ele = next; val->score = 1.0; - - /* Move to next element. */ - it->ht.de = dictNext(it->ht.di); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { if (it->lp.p == NULL) return 0; val->estr = lpGetValue(it->lp.p, &val->elen, &val->ell); @@ -2782,7 +2838,7 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in if (dstkey) { if (dstzset->zsl->length) { zsetConvertToListpackIfNeeded(dstobj, maxelelen, totelelen); - setKey(c, c->db, dstkey, dstobj, 0); + setKey(c, c->db, dstkey, &dstobj, 0); addReplyLongLong(c, zsetLength(dstobj)); notifyKeyspaceEvent( NOTIFY_ZSET, (op == SET_OP_UNION) ? "zunionstore" : (op == SET_OP_INTER ? 
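/* zsetGetSLNodeByEntry() above is a container-of style cast: the zset dict
 * stores, as each entry's value, a pointer to the score field embedded in the
 * matching skiplist node, so the node is recovered by subtracting the field
 * offset. A self-contained sketch of the idiom (struct names hypothetical):
 *
 *   #include <stddef.h>
 *   struct node { long height; double score; };
 *   static struct node *node_of_score(char *score_ref) {
 *       return (struct node *)(score_ref - offsetof(struct node, score));
 *   }
 *
 * This lets zsetRank() jump from the dict entry straight to the node and
 * compute the rank with zslGetRankByNode(), instead of re-searching the
 * skiplist by score and element as the old code did. */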
"zinterstore" : "zdiffstore"), @@ -2795,8 +2851,8 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); server.dirty++; } + decrRefCount(dstobj); } - decrRefCount(dstobj); } else if (cardinality_only) { addReplyLongLong(c, cardinality); } else { @@ -2985,7 +3041,7 @@ static void zrangeResultEmitLongLongForStore(zrange_result_handler *handler, lon static void zrangeResultFinalizeStore(zrange_result_handler *handler, size_t result_count) { if (result_count) { - setKey(handler->client, handler->client->db, handler->dstkey, handler->dstobj, 0); + setKey(handler->client, handler->client->db, handler->dstkey, &handler->dstobj, 0); addReplyLongLong(handler->client, result_count); notifyKeyspaceEvent(NOTIFY_ZSET, "zrangestore", handler->dstkey, handler->client->db->id); server.dirty++; @@ -2996,8 +3052,8 @@ static void zrangeResultFinalizeStore(zrange_result_handler *handler, size_t res notifyKeyspaceEvent(NOTIFY_GENERIC, "del", handler->dstkey, handler->client->db->id); server.dirty++; } + decrRefCount(handler->dstobj); } - decrRefCount(handler->dstobj); } /* Initialize the consumer interface type with the requested type. */ diff --git a/src/tls.c b/src/tls.c index f1c82d35e4..11e6143561 100644 --- a/src/tls.c +++ b/src/tls.c @@ -32,6 +32,7 @@ #include "server.h" #include "connhelpers.h" #include "adlist.h" +#include "io_threads.h" #if (USE_OPENSSL == 1 /* BUILD_YES */) || ((USE_OPENSSL == 2 /* BUILD_MODULE */) && (BUILD_TLS_MODULE == 2)) @@ -437,15 +438,13 @@ static ConnectionType CT_TLS; * */ -typedef enum { - WANT_READ = 1, - WANT_WRITE -} WantIOType; - #define TLS_CONN_FLAG_READ_WANT_WRITE (1 << 0) #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) +#define TLS_CONN_FLAG_HAS_PENDING (1 << 4) +#define TLS_CONN_FLAG_ACCEPT_ERROR (1 << 5) +#define TLS_CONN_FLAG_ACCEPT_SUCCESS (1 << 6) typedef struct tls_connection { connection c; @@ -513,20 +512,26 @@ static connection *connCreateAcceptedTLS(int fd, void *priv) { return (connection *)conn; } +static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler); static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask); static void updateSSLEvent(tls_connection *conn); +static void clearTLSWantFlags(tls_connection *conn) { + conn->flags &= ~(TLS_CONN_FLAG_WRITE_WANT_READ | TLS_CONN_FLAG_READ_WANT_WRITE); +} + /* Process the return code received from OpenSSL> - * Update the want parameter with expected I/O. + * Update the conn flags with the WANT_READ/WANT_WRITE flags. * Update the connection's error state if a real error has occurred. * Returns an SSL error code, or 0 if no further handling is required. 
 */
-static int handleSSLReturnCode(tls_connection *conn, int ret_value, WantIOType *want) {
+static int handleSSLReturnCode(tls_connection *conn, int ret_value) {
+    clearTLSWantFlags(conn);
     if (ret_value <= 0) {
         int ssl_err = SSL_get_error(conn->ssl, ret_value);
         switch (ssl_err) {
-        case SSL_ERROR_WANT_WRITE: *want = WANT_WRITE; return 0;
-        case SSL_ERROR_WANT_READ: *want = WANT_READ; return 0;
+        case SSL_ERROR_WANT_WRITE: conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; return 0;
+        case SSL_ERROR_WANT_READ: conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; return 0;
         case SSL_ERROR_SYSCALL:
             conn->c.last_errno = errno;
             if (conn->ssl_error) zfree(conn->ssl_error);
@@ -562,11 +567,8 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update
     }
     if (ret_value <= 0) {
-        WantIOType want = 0;
         int ssl_err;
-        if (!(ssl_err = handleSSLReturnCode(conn, ret_value, &want))) {
-            if (want == WANT_READ) conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ;
-            if (want == WANT_WRITE) conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE;
+        if (!(ssl_err = handleSSLReturnCode(conn, ret_value))) {
             if (update_event) updateSSLEvent(conn);
             errno = EAGAIN;
             return -1;
@@ -584,19 +586,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update
     return ret_value;
 }
-static void registerSSLEvent(tls_connection *conn, WantIOType want) {
+static void registerSSLEvent(tls_connection *conn) {
     int mask = aeGetFileEvents(server.el, conn->c.fd);
-    switch (want) {
-    case WANT_READ:
+    if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) {
         if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE);
         if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn);
-        break;
-    case WANT_WRITE:
+    } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) {
         if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE);
         if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn);
-        break;
-    default: serverAssert(0); break;
+    } else {
+        serverAssert(0);
     }
 }
@@ -614,7 +614,7 @@ static void updatePendingData(tls_connection *conn) {
    /* If SSL has pending data already read from the socket, we risk never calling the read handler again; make
     * sure to add the connection to a list of pending connections that should be handled anyway.
*/ - if (SSL_pending(conn->ssl) > 0) { + if (conn->flags & TLS_CONN_FLAG_HAS_PENDING) { if (!conn->pending_list_node) { listAddNodeTail(pending_list, conn); conn->pending_list_node = listLast(pending_list); @@ -625,6 +625,14 @@ static void updatePendingData(tls_connection *conn) { } } +void updateSSLPendingFlag(tls_connection *conn) { + if (SSL_pending(conn->ssl) > 0) { + conn->flags |= TLS_CONN_FLAG_HAS_PENDING; + } else { + conn->flags &= ~TLS_CONN_FLAG_HAS_PENDING; + } +} + static void updateSSLEvent(tls_connection *conn) { if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) return; @@ -641,20 +649,53 @@ static void updateSSLEvent(tls_connection *conn) { if (!need_write && (mask & AE_WRITABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); } +static int TLSHandleAcceptResult(tls_connection *conn, int call_handler_on_error) { + serverAssert(conn->c.state == CONN_STATE_ACCEPTING); + if (conn->flags & TLS_CONN_FLAG_ACCEPT_SUCCESS) { + conn->c.state = CONN_STATE_CONNECTED; + } else if (conn->flags & TLS_CONN_FLAG_ACCEPT_ERROR) { + conn->c.state = CONN_STATE_ERROR; + if (!call_handler_on_error) return C_ERR; + } else { + /* Still pending accept */ + registerSSLEvent(conn); + return C_OK; + } + + /* call accept handler */ + if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_ERR; + conn->c.conn_handler = NULL; + return C_OK; +} + static void updateSSLState(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; + + if (conn->c.state == CONN_STATE_ACCEPTING) { + if (TLSHandleAcceptResult(conn, 1) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; + } + updateSSLEvent(conn); updatePendingData(conn); } +static void TLSAccept(void *_conn) { + tls_connection *conn = (tls_connection *)_conn; + ERR_clear_error(); + int ret = SSL_accept(conn->ssl); + if (ret > 0) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_SUCCESS; + } else if (handleSSLReturnCode(conn, ret)) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_ERROR; + } +} + static void tlsHandleEvent(tls_connection *conn, int mask) { int ret, conn_error; TLSCONN_DEBUG("tlsEventHandler(): fd=%d, state=%d, mask=%d, r=%d, w=%d, flags=%d", fd, conn->c.state, mask, conn->c.read_handler != NULL, conn->c.write_handler != NULL, conn->flags); - ERR_clear_error(); - switch (conn->c.state) { case CONN_STATE_CONNECTING: conn_error = anetGetError(conn->c.fd); @@ -662,16 +703,15 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.last_errno = conn_error; conn->c.state = CONN_STATE_ERROR; } else { + ERR_clear_error(); if (!(conn->flags & TLS_CONN_FLAG_FD_SET)) { SSL_set_fd(conn->ssl, conn->c.fd); conn->flags |= TLS_CONN_FLAG_FD_SET; } ret = SSL_connect(conn->ssl); if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); - + if (!handleSSLReturnCode(conn, ret)) { + registerSSLEvent(conn); /* Avoid hitting UpdateSSLEvent, which knows nothing * of what SSL_connect() wants and instead looks at our * R/W handlers. @@ -690,26 +730,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: - ret = SSL_accept(conn->ssl); - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - /* Avoid hitting UpdateSSLEvent, which knows nothing - * of what SSL_connect() wants and instead looks at our - * R/W handlers. 
- */ - registerSSLEvent(conn, want); - return; - } - - /* If not handled, it's an error */ - conn->c.state = CONN_STATE_ERROR; - } else { - conn->c.state = CONN_STATE_CONNECTED; - } - - if (!callHandler((connection *)conn, conn->c.conn_handler)) return; - conn->c.conn_handler = NULL; + if (connTLSAccept((connection *)conn, NULL) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; break; case CONN_STATE_CONNECTED: { int call_read = ((mask & AE_READABLE) && conn->c.read_handler) || @@ -731,26 +752,20 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { int invert = conn->c.flags & CONN_FLAG_WRITE_BARRIER; if (!invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } /* Fire the writable event. */ if (call_write) { - conn->flags &= ~TLS_CONN_FLAG_WRITE_WANT_READ; if (!callHandler((connection *)conn, conn->c.write_handler)) return; } /* If we have to invert the call, fire the readable event now * after the writable one. */ if (invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } - - if (mask & AE_READABLE) { - updatePendingData(conn); - } + updatePendingData(conn); break; } @@ -799,6 +814,10 @@ static int connTLSListen(connListener *listener) { return listenToPort(listener); } +static void connTLSCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static void connTLSShutdown(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; @@ -835,31 +854,25 @@ static void connTLSClose(connection *conn_) { static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler) { tls_connection *conn = (tls_connection *)_conn; - int ret; - if (conn->c.state != CONN_STATE_ACCEPTING) return C_ERR; - ERR_clear_error(); - + int call_handler_on_error = 1; /* Try to accept */ - conn->c.conn_handler = accept_handler; - ret = SSL_accept(conn->ssl); - - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); /* We'll fire back */ - return C_OK; - } else { - conn->c.state = CONN_STATE_ERROR; - return C_ERR; - } + if (accept_handler) { + conn->c.conn_handler = accept_handler; + call_handler_on_error = 0; } - conn->c.state = CONN_STATE_CONNECTED; - if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_OK; - conn->c.conn_handler = NULL; + /* We're in IO thread - just call accept and return, the main thread will handle the rest */ + if (!inMainThread()) { + TLSAccept(conn); + return C_OK; + } - return C_OK; + /* Try to offload accept to IO threads */ + if (trySendAcceptToIOThreads(_conn) == C_OK) return C_OK; + + TLSAccept(conn); + return TLSHandleAcceptResult(conn, call_handler_on_error); } static int connTLSConnect(connection *conn_, @@ -941,6 +954,7 @@ static int connTLSRead(connection *conn_, void *buf, size_t buf_len) { if (conn->c.state != CONN_STATE_CONNECTED) return -1; ERR_clear_error(); ret = SSL_read(conn->ssl, buf, buf_len); + updateSSLPendingFlag(conn); return updateStateAfterSSLIO(conn, ret, 1); } @@ -967,6 +981,10 @@ static int connTLSSetReadHandler(connection *conn, ConnectionCallbackFunc func) return C_OK; } +static int isBlocking(tls_connection *conn) { + return anetIsBlock(NULL, conn->c.fd); +} + static void setBlockingTimeout(tls_connection *conn, long long timeout) { anetBlock(NULL, conn->c.fd); anetSendTimeout(NULL, conn->c.fd, timeout); @@ -992,7 +1010,7 @@ static 
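/* The accept flow after the rework above, summarized from connTLSAccept(),
 * TLSAccept() and TLSHandleAcceptResult():
 *
 *   main thread:  connTLSAccept()
 *                   -> trySendAcceptToIOThreads() succeeded? return; the
 *                      handshake continues asynchronously
 *                   -> otherwise run TLSAccept() inline, then
 *                      TLSHandleAcceptResult()
 *   IO thread:    connTLSAccept() just runs TLSAccept(), which records the
 *                 outcome in TLS_CONN_FLAG_ACCEPT_SUCCESS / _ERROR
 *   main thread:  updateSSLState() sees CONN_STATE_ACCEPTING and finishes
 *                 via TLSHandleAcceptResult()
 *
 * Recording the SSL_accept() outcome in connection flags is what allows the
 * main thread to replay the result without repeating the handshake step. */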
int connTLSBlockingConnect(connection *conn_, const char *addr, int port, * which means the specified timeout will not be enforced accurately. */ SSL_set_fd(conn->ssl, conn->c.fd); setBlockingTimeout(conn, timeout); - + ERR_clear_error(); if ((ret = SSL_connect(conn->ssl)) <= 0) { conn->c.state = CONN_STATE_ERROR; return C_ERR; @@ -1005,26 +1023,31 @@ static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, static ssize_t connTLSSyncWrite(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); SSL_clear_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); ERR_clear_error(); int ret = SSL_write(conn->ssl, ptr, size); ret = updateStateAfterSSLIO(conn, ret, 0); SSL_set_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return ret; } static ssize_t connTLSSyncRead(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); ERR_clear_error(); int ret = SSL_read(conn->ssl, ptr, size); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return ret; } @@ -1033,6 +1056,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l tls_connection *conn = (tls_connection *)conn_; ssize_t nread = 0; + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); size--; @@ -1041,6 +1065,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l ERR_clear_error(); int ret = SSL_read(conn->ssl, &c, 1); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); if (ret <= 0) { nread = -1; @@ -1058,7 +1083,9 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l size--; } exit: - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return nread; } @@ -1127,6 +1154,7 @@ static ConnectionType CT_TLS = { .addr = connTLSAddr, .is_local = connTLSIsLocal, .listen = connTLSListen, + .closeListener = connTLSCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateTLS, diff --git a/src/unit/README.md b/src/unit/README.md index 1ef439eaeb..93ac82f6dc 100644 --- a/src/unit/README.md +++ b/src/unit/README.md @@ -12,6 +12,7 @@ Tests flags: * UNIT_TEST_ACCURATE: Corresponds to the --accurate flag. This flag indicates the test should use extra computation to more accurately validate the tests. * UNIT_TEST_LARGE_MEMORY: Corresponds to the --large-memory flag. This flag indicates whether or not tests should use more than 100mb of memory. * UNIT_TEST_SINGLE: Corresponds to the --single flag. This flag indicates that a single test is being executed. +* UNIT_TEST_VALGRIND: Corresponds to the --valgrind flag. This flag is just a hint passed to the test to indicate that we are running it under valgrind. Tests are allowed to be passed in additional arbitrary argv/argc, which they can access from the argc and argv arguments of the test. 
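/* A minimal sketch of a test following the conventions documented in
 * src/unit/README.md above. The TEST_ASSERT and UNUSED macros and the
 * 0-on-success return convention are assumptions based on the test tables
 * below; the flags are the documented ones, and the test name is hypothetical.
 *
 *   int test_example(int argc, char **argv, int flags) {
 *       UNUSED(argc);
 *       UNUSED(argv);
 *       // Keep the workload small when running under valgrind.
 *       int iterations = (flags & UNIT_TEST_VALGRIND) ? 100 : 100000;
 *       for (int i = 0; i < iterations; i++) TEST_ASSERT(i + 1 > i);
 *       return 0;
 *   }
 */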
diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 87bc031fb4..f25e320452 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -19,6 +19,22 @@ int test_dictDisableResizeReduceTo3(int argc, char **argv, int flags); int test_dictDeleteOneKeyTriggerResizeAgain(int argc, char **argv, int flags); int test_dictBenchmark(int argc, char **argv, int flags); int test_endianconv(int argc, char *argv[], int flags); +int test_cursor(int argc, char **argv, int flags); +int test_set_hash_function_seed(int argc, char **argv, int flags); +int test_add_find_delete(int argc, char **argv, int flags); +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags); +int test_instant_rehashing(int argc, char **argv, int flags); +int test_bucket_chain_length(int argc, char **argv, int flags); +int test_two_phase_insert_and_pop(int argc, char **argv, int flags); +int test_replace_reallocated_entry(int argc, char **argv, int flags); +int test_incremental_find(int argc, char **argv, int flags); +int test_scan(int argc, char **argv, int flags); +int test_iterator(int argc, char **argv, int flags); +int test_safe_iterator(int argc, char **argv, int flags); +int test_compact_bucket_chain(int argc, char **argv, int flags); +int test_random_entry(int argc, char **argv, int flags); +int test_random_entry_with_long_chain(int argc, char **argv, int flags); +int test_all_memory_freed(int argc, char **argv, int flags); int test_intsetValueEncodings(int argc, char **argv, int flags); int test_intsetBasicAdding(int argc, char **argv, int flags); int test_intsetLargeNumberRandomAdd(int argc, char **argv, int flags); @@ -28,10 +44,10 @@ int test_intsetUpgradeFromint32Toint64(int argc, char **argv, int flags); int test_intsetStressLookups(int argc, char **argv, int flags); int test_intsetStressAddDelete(int argc, char **argv, int flags); int test_kvstoreAdd16Keys(int argc, char **argv, int flags); -int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); int test_listpackCreateIntList(int argc, char **argv, int flags); int test_listpackCreateList(int argc, char **argv, int flags); int test_listpackLpPrepend(int argc, char **argv, int flags); @@ -84,6 +100,9 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_object_with_key(int argc, char **argv, int flags); int test_quicklistCreateList(int argc, char **argv, int flags); int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); 
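/* Each test file is then registered through a NULL-terminated unitTest table,
 * as in the generated arrays below. For the hypothetical test_example above,
 * the corresponding entry would be:
 *
 *   unitTest __test_example_c[] = {{"test_example", test_example}, {NULL, NULL}};
 *
 * The string is the test's name (presumably what --single matches against);
 * the function pointer is the entry point declared in this header. */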
int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); @@ -166,6 +185,7 @@ int test_ld2string(int argc, char **argv, int flags); int test_fixedpoint_d2string(int argc, char **argv, int flags); int test_version2num(int argc, char **argv, int flags); int test_reclaimFilePageCache(int argc, char **argv, int flags); +int test_valkey_strtod(int argc, char **argv, int flags); int test_ziplistCreateIntList(int argc, char **argv, int flags); int test_ziplistPop(int argc, char **argv, int flags); int test_ziplistGetElementAtIndex3(int argc, char **argv, int flags); @@ -212,14 +232,18 @@ unitTest __test_crc64_c[] = {{"test_crc64", test_crc64}, {NULL, NULL}}; unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}}; unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}}; unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; +unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_function_seed", test_set_hash_function_seed}, {"test_add_find_delete", test_add_find_delete}, {"test_add_find_delete_avoid_resize", test_add_find_delete_avoid_resize}, {"test_instant_rehashing", test_instant_rehashing}, {"test_bucket_chain_length", test_bucket_chain_length}, {"test_two_phase_insert_and_pop", test_two_phase_insert_and_pop}, {"test_replace_reallocated_entry", test_replace_reallocated_entry}, {"test_incremental_find", test_incremental_find}, {"test_scan", test_scan}, {"test_iterator", test_iterator}, {"test_safe_iterator", test_safe_iterator}, {"test_compact_bucket_chain", test_compact_bucket_chain}, {"test_random_entry", test_random_entry}, {"test_random_entry_with_long_chain", test_random_entry_with_long_chain}, {"test_all_memory_freed", test_all_memory_freed}, {NULL, NULL}}; unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; -unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; +unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", 
test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", 
test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; +unitTest __test_object_c[] = {{"test_object_with_key", test_object_with_key}, {NULL, NULL}}; unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", 
test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; unitTest 
__test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}}; +unitTest __test_valkey_strtod_c[] = {{"test_valkey_strtod", test_valkey_strtod}, {NULL, NULL}}; unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; unitTest __test_zipmap_c[] = {{"test_zipmapIterateWithLargeKey", test_zipmapIterateWithLargeKey}, {"test_zipmapIterateThroughElements", test_zipmapIterateThroughElements}, {NULL, NULL}}; 
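Each generated "__test_<file>_c" array above pairs a test's printable name with its function pointer and ends with a {NULL, NULL} sentinel, so a runner can walk the table without a separate length field. A minimal sketch of such a driver loop follows; the struct layout is inferred from the initializers above, and the field names "name" and "proc" are assumptions, not taken from this diff:

    #include <stdio.h>

    /* Inferred shape of the generated tables: a name plus a function
     * pointer, terminated by a {NULL, NULL} sentinel entry. */
    typedef struct unitTest {
        const char *name;
        int (*proc)(int argc, char **argv, int flags);
    } unitTest;

    /* Run every test in one table; a zero return value means success,
     * matching the test prototypes above. */
    static int runTestArray(const char *file, const unitTest *tests, int flags) {
        int failed = 0;
        for (int i = 0; tests[i].name != NULL; i++) {
            if (tests[i].proc(0, NULL, flags) != 0) {
                printf("[failed] %s:%s\n", file, tests[i].name);
                failed++;
            }
        }
        return failed;
    }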
unitTest __test_zmalloc_c[] = {{"test_zmallocInitialUsedMemory", test_zmallocInitialUsedMemory}, {"test_zmallocAllocReallocCallocAndFree", test_zmallocAllocReallocCallocAndFree}, {"test_zmallocAllocZeroByteAndFree", test_zmallocAllocZeroByteAndFree}, {NULL, NULL}}; @@ -232,14 +256,18 @@ struct unitTestSuite { {"test_crc64combine.c", __test_crc64combine_c}, {"test_dict.c", __test_dict_c}, {"test_endianconv.c", __test_endianconv_c}, + {"test_hashtable.c", __test_hashtable_c}, {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_networking.c", __test_networking_c}, + {"test_object.c", __test_object_c}, {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, {"test_sha1.c", __test_sha1_c}, {"test_util.c", __test_util_c}, + {"test_valkey_strtod.c", __test_valkey_strtod_c}, {"test_ziplist.c", __test_ziplist_c}, {"test_zipmap.c", __test_zipmap_c}, {"test_zmalloc.c", __test_zmalloc_c}, diff --git a/src/unit/test_hashtable.c b/src/unit/test_hashtable.c new file mode 100644 index 0000000000..689440e43d --- /dev/null +++ b/src/unit/test_hashtable.c @@ -0,0 +1,870 @@ +#include "../hashtable.h" +#include "test_help.h" +#include "../mt19937-64.h" +#include "../zmalloc.h" +#include "../monotonic.h" + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* Global variable to test the memory tracking callback. */ +static size_t mem_usage; + +/* From util.c: getRandomBytes to seed hash function. */ +void getRandomBytes(unsigned char *p, size_t len); + +/* Init hash function salt and seed random generator. */ +static void randomSeed(void) { + unsigned long long seed; + getRandomBytes((void *)&seed, sizeof(seed)); + init_genrand64(seed); + srandom((unsigned)seed); +} + +/* An entry holding a string key and a string value in one allocation. */ +typedef struct { + unsigned int keysize; /* Sizes, including null-terminator */ + unsigned int valsize; + char data[]; /* key and value */ +} keyval; + +static keyval *create_keyval(const char *key, const char *val) { + size_t keysize = strlen(key) + 1; + size_t valsize = strlen(val) + 1; + keyval *e = malloc(sizeof(keyval) + keysize + valsize); + e->keysize = keysize; + e->valsize = valsize; + memcpy(e->data, key, keysize); + memcpy(e->data + keysize, val, valsize); + return e; +} + +static const void *getkey(const void *entry) { + const keyval *e = entry; + return e->data; +} + +static const void *getval(const void *entry) { + const keyval *e = entry; + return e->data + e->keysize; +} + +static uint64_t hashfunc(const void *key) { + return hashtableGenHashFunction(key, strlen(key)); +} + +static int keycmp(const void *key1, const void *key2) { + return strcmp(key1, key2); +} + +static void freekeyval(void *keyval) { + free(keyval); +} + +static void trackmemusage(hashtable *ht, ssize_t delta) { + UNUSED(ht); + mem_usage += delta; +} + +/* Hashtable type used for some of the tests. */ +static hashtableType keyval_type = { + .entryGetKey = getkey, + .hashFunction = hashfunc, + .keyCompare = keycmp, + .entryDestructor = freekeyval, + .trackMemUsage = trackmemusage, +}; + +/* Callback for testing hashtableEmpty().
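+ * hashtableEmpty() invokes the callback while it deletes the entries, so the test below simply counts the calls to verify that it fires at least once.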
*/ +static long empty_callback_call_counter; +void emptyCallback(hashtable *ht) { + UNUSED(ht); + empty_callback_call_counter++; +} + +/* Prototypes for debugging */ +void hashtableDump(hashtable *ht); +void hashtableHistogram(hashtable *ht); +int hashtableLongestBucketChain(hashtable *ht); +size_t nextCursor(size_t v, size_t mask); + +int test_cursor(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST_ASSERT(nextCursor(0x0000, 0xffff) == 0x8000); + TEST_ASSERT(nextCursor(0x8000, 0xffff) == 0x4000); + TEST_ASSERT(nextCursor(0x4001, 0xffff) == 0xc001); + TEST_ASSERT(nextCursor(0xffff, 0xffff) == 0x0000); + return 0; +} + +int test_set_hash_function_seed(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + randomSeed(); + return 0; +} + +static int add_find_delete_test_helper(int flags) { + int count = (flags & UNIT_TEST_ACCURATE) ? 1000000 : 200; + TEST_ASSERT(mem_usage == 0); + hashtable *ht = hashtableCreate(&keyval_type); + int j; + + /* Add */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + keyval *e = create_keyval(key, val); + TEST_ASSERT(hashtableAdd(ht, e)); + } + TEST_ASSERT(hashtableMemUsage(ht) == mem_usage); + + if (count < 1000) { + hashtableHistogram(ht); + printf("Mem usage: %zu\n", hashtableMemUsage(ht)); + } + + /* Find */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + /* Delete half of them */ + for (j = 0; j < count / 2; j++) { + char key[32]; + snprintf(key, sizeof(key), "%d", j); + if (j % 3 == 0) { + /* Test hashtablePop */ + char val[32]; + snprintf(val, sizeof(val), "%d", count - j + 42); + void *popped; + TEST_ASSERT(hashtablePop(ht, key, &popped)); + keyval *e = popped; + TEST_ASSERT(!strcmp(val, getval(e))); + free(e); + } else { + TEST_ASSERT(hashtableDelete(ht, key)); + } + } + TEST_ASSERT(hashtableMemUsage(ht) == mem_usage); + + /* Empty, i.e. delete remaining entries, with progress callback. */ + empty_callback_call_counter = 0; + hashtableEmpty(ht, emptyCallback); + TEST_ASSERT(empty_callback_call_counter > 0); + + /* Release memory */ + hashtableRelease(ht); + TEST_ASSERT(mem_usage == 0); + return 0; +} + +int test_add_find_delete(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST_ASSERT(add_find_delete_test_helper(flags) == 0); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + TEST_ASSERT(add_find_delete_test_helper(flags) == 0); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_instant_rehashing(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + long count = 200; + + /* A set of longs, i.e. pointer-sized values. */ + hashtableType type = {.instant_rehashing = 1}; + hashtable *ht = hashtableCreate(&type); + long j; + + /* Populate and check that rehashing is never ongoing. */ + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + TEST_ASSERT(!hashtableIsRehashing(ht)); + } + + /* Delete and check that rehashing is never ongoing. 
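+ * With instant_rehashing set in the type, any needed resize is expected to complete inside the call that triggers it, so the asserts below should never observe the table in a rehashing state.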
*/ + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableDelete(ht, (void *)j)); + TEST_ASSERT(!hashtableIsRehashing(ht)); + } + + hashtableRelease(ht); + return 0; +} + +int test_bucket_chain_length(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + unsigned long count = 1000000; + + /* A set of longs, i.e. pointer-sized integer values. */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + unsigned long j; + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + /* If it's rehashing, add a few more until rehashing is complete. */ + while (hashtableIsRehashing(ht)) { + j++; + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(j < count * 2); + int max_chainlen_not_rehashing = hashtableLongestBucketChain(ht); + TEST_ASSERT(max_chainlen_not_rehashing < 10); + + /* Add more until rehashing starts again. */ + while (!hashtableIsRehashing(ht)) { + j++; + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(j < count * 2); + int max_chainlen_rehashing = hashtableLongestBucketChain(ht); + TEST_ASSERT(max_chainlen_rehashing < 10); + + hashtableRelease(ht); + return 0; +} + +int test_two_phase_insert_and_pop(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int count = (flags & UNIT_TEST_ACCURATE) ? 1000000 : 200; + hashtable *ht = hashtableCreate(&keyval_type); + int j; + + /* hashtableFindPositionForInsert + hashtableInsertAtPosition */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + hashtablePosition position; + int ret = hashtableFindPositionForInsert(ht, key, &position, NULL); + TEST_ASSERT(ret == 1); + keyval *e = create_keyval(key, val); + hashtableInsertAtPosition(ht, e, &position); + } + + if (count < 1000) { + hashtableHistogram(ht); + } + + /* Check that all entries were inserted. */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + /* Test two-phase pop. 
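+ * The find step only returns a reference to the entry and leaves it in the table; removal happens in the separate delete step, which is why the size is asserted to be unchanged in between.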
*/ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + hashtablePosition position; + size_t size_before_find = hashtableSize(ht); + void **ref = hashtableTwoPhasePopFindRef(ht, key, &position); + TEST_ASSERT(ref != NULL); + keyval *e = *ref; + TEST_ASSERT(!strcmp(val, getval(e))); + TEST_ASSERT(hashtableSize(ht) == size_before_find); + hashtableTwoPhasePopDelete(ht, &position); + TEST_ASSERT(hashtableSize(ht) == size_before_find - 1); + free(e); + } + TEST_ASSERT(hashtableSize(ht) == 0); + + hashtableRelease(ht); + return 0; +} + +int test_replace_reallocated_entry(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int count = 100, j; + hashtable *ht = hashtableCreate(&keyval_type); + + /* Add */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + keyval *e = create_keyval(key, val); + TEST_ASSERT(hashtableAdd(ht, e)); + } + + /* Find and replace */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *old = found; + TEST_ASSERT(strcmp(getkey(old), key) == 0); + TEST_ASSERT(strcmp(getval(old), val) == 0); + snprintf(val, sizeof(val), "%d", j + 1234); + keyval *new = create_keyval(key, val); + /* If we free 'old' before the call to hashtableReplaceReallocatedEntry, + * we get a use-after-free warning, so instead we just overwrite it with + * junk. The purpose is to verify that the function doesn't use the + * memory it points to. */ + memset(old->data, 'x', old->keysize + old->valsize); + TEST_ASSERT(hashtableReplaceReallocatedEntry(ht, old, new)); + free(old); + } + + /* Check */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", j + 1234); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + hashtableRelease(ht); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_incremental_find(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 2000000; + uint8_t element_array[count]; + memset(element_array, 0, sizeof element_array); + + /* A set of uint8_t pointers */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, element_array + j)); + } + + monotime timer; + monotonicInit(); + + /* Compare to looking up one by one. */ + elapsedStart(&timer); + for (size_t i = 0; i < count; i++) { + uint8_t *key = &element_array[i]; + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found) == 1); + TEST_ASSERT(found == key); + } + uint64_t us2 = elapsedUs(timer); + TEST_PRINT_INFO("Lookup %zu elements one by one took %lu microseconds.", + count, (unsigned long)us2); + + /* Lookup elements in batches. */ + for (size_t batch_size = 1; batch_size <= 64; batch_size *= 2) { + elapsedStart(&timer); + for (size_t batch = 0; batch < count / batch_size; batch++) { + /* Init batches. 
*/ + hashtableIncrementalFindState states[batch_size]; + for (size_t i = 0; i < batch_size; i++) { + void *key = &element_array[batch * batch_size + i]; + hashtableIncrementalFindInit(&states[i], ht, key); + } + /* Work on batches in round-robin order until all are done. */ + size_t num_left; + do { + num_left = batch_size; + for (size_t i = 0; i < batch_size; i++) { + if (hashtableIncrementalFindStep(&states[i]) == 0) { + num_left--; + } + } + } while (num_left > 0); + + /* Fetch results. */ + for (size_t i = 0; i < batch_size; i++) { + void *found; + TEST_ASSERT(hashtableIncrementalFindGetResult(&states[i], &found) == 1); + TEST_ASSERT(found == &element_array[batch * batch_size + i]); + } + } + uint64_t us1 = elapsedUs(timer); + TEST_PRINT_INFO("Lookup %zu elements in batches of %zu took %lu microseconds.", + count, batch_size, (unsigned long)us1); + } + + hashtableRelease(ht); + return 0; +} + +typedef struct { + long count; + uint8_t entry_seen[]; +} scandata; + +void scanfn(void *privdata, void *entry) { + scandata *data = (scandata *)privdata; + unsigned long j = (unsigned long)entry; + data->entry_seen[j]++; + data->count++; +} + +int test_scan(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + long num_entries = (flags & UNIT_TEST_LARGE_MEMORY) ? 1000000 : 200000; + int num_rounds = (flags & UNIT_TEST_ACCURATE) ? 20 : 5; + + /* A set of longs, i.e. pointer-sized values. */ + hashtableType type = {0}; + long j; + + for (int round = 0; round < num_rounds; round++) { + /* First round count = num_entries, then some more. */ + long count = num_entries * (1 + 2 * (double)round / num_rounds); + + /* Seed, to make sure each round is different. */ + randomSeed(); + + /* Populate */ + hashtable *ht = hashtableCreate(&type); + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + + /* Scan */ + scandata *data = calloc(1, sizeof(scandata) + count); + long max_entries_per_cycle = 0; + unsigned num_cycles = 0; + long scanned_count = 0; + size_t cursor = 0; + do { + data->count = 0; + cursor = hashtableScan(ht, cursor, scanfn, data); + if (data->count > max_entries_per_cycle) { + max_entries_per_cycle = data->count; + } + scanned_count += data->count; + data->count = 0; + num_cycles++; + } while (cursor != 0); + + /* Verify that every entry was returned exactly once. */ + TEST_ASSERT(scanned_count == count); + for (j = 0; j < count; j++) { + TEST_ASSERT(data->entry_seen[j] >= 1); + TEST_ASSERT(data->entry_seen[j] <= 2); + } + + /* Print some information for curious readers. */ + TEST_PRINT_INFO("Scanned %ld; max emitted per call: %ld; avg emitted per call: %.2lf", + count, max_entries_per_cycle, (double)count / num_cycles); + + /* Cleanup */ + hashtableRelease(ht); + free(data); + } + return 0; +} + +typedef struct { + uint64_t value; + uint64_t hash; +} mock_hash_entry; + +static mock_hash_entry *mock_hash_entry_create(uint64_t value, uint64_t hash) { + mock_hash_entry *entry = malloc(sizeof(mock_hash_entry)); + entry->value = value; + entry->hash = hash; + return entry; +} + +static uint64_t mock_hash_entry_get_hash(const void *entry) { + if (entry == NULL) return 0UL; + mock_hash_entry *mock = (mock_hash_entry *)entry; + return (mock->hash != 0) ? 
mock->hash : mock->value; +} + +int test_iterator(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 2000000; + uint8_t entry_array[count]; + memset(entry_array, 0, sizeof entry_array); + + /* A set of uint8_t pointers */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, entry_array + j)); + } + + /* Iterate */ + size_t num_returned = 0; + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, ht); + while (hashtableNext(&iter, &next)) { + uint8_t *entry = next; + num_returned++; + TEST_ASSERT(entry >= entry_array && entry < entry_array + count); + /* increment entry at this position as a counter */ + (*entry)++; + } + hashtableResetIterator(&iter); + + /* Check that all entries were returned exactly once. */ + TEST_ASSERT(num_returned == count); + for (size_t j = 0; j < count; j++) { + if (entry_array[j] != 1) { + printf("Entry %zu returned %d times\n", j, entry_array[j]); + return 0; + } + } + + hashtableRelease(ht); + return 0; +} + +int test_safe_iterator(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 1000; + uint8_t entry_counts[count * 2]; + memset(entry_counts, 0, sizeof entry_counts); + + /* A set of pointers into the uint8_t array. */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, entry_counts + j)); + } + + /* Iterate */ + size_t num_returned = 0; + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, ht); + while (hashtableNext(&iter, &next)) { + uint8_t *entry = next; + size_t index = entry - entry_counts; + num_returned++; + TEST_ASSERT(entry >= entry_counts && entry < entry_counts + count * 2); + /* increment entry at this position as a counter */ + (*entry)++; + if (index % 4 == 0) { + TEST_ASSERT(hashtableDelete(ht, entry)); + } + /* Add new item each time we see one of the original items */ + if (index < count) { + TEST_ASSERT(hashtableAdd(ht, entry + count)); + } + } + hashtableResetIterator(&iter); + + /* Check that all entries present during the whole iteration were returned + * exactly once. (Some are deleted after being returned.) */ + TEST_ASSERT(num_returned >= count); + for (size_t j = 0; j < count; j++) { + if (entry_counts[j] != 1) { + printf("Entry %zu returned %d times\n", j, entry_counts[j]); + return 0; + } + } + /* Check that entries inserted during the iteration were returned at most + * once. */ + unsigned long num_optional_returned = 0; + for (size_t j = count; j < count * 2; j++) { + TEST_ASSERT(entry_counts[j] <= 1); + num_optional_returned += entry_counts[j]; + } + printf("Safe iterator returned %lu of the %zu entries inserted while iterating.\n", num_optional_returned, count); + + hashtableRelease(ht); + return 0; +} + +int test_compact_bucket_chain(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + /* Create a table with only one bucket chain. 
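+ * With resizing disallowed, the table keeps a single top-level bucket, so every entry added below lands in one chain of child buckets.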
*/ + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + unsigned long count = 30; + + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + unsigned long j; + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(hashtableBuckets(ht) == 1); + printf("Populated a single bucket chain, avoiding resize.\n"); + hashtableHistogram(ht); + + /* Delete half of the entries while iterating. */ + size_t num_chained_buckets = hashtableChainedBuckets(ht, 0); + size_t num_returned = 0; + hashtableIterator iter; + hashtableInitSafeIterator(&iter, ht); + void *entry; + while (hashtableNext(&iter, &entry)) { + /* As long as the iterator is still returning entries from the same + * bucket chain, the bucket chain is not compacted, so it still has the + * same number of buckets. */ + TEST_ASSERT(hashtableChainedBuckets(ht, 0) == num_chained_buckets); + num_returned++; + if (num_returned % 2 == 0) { + TEST_ASSERT(hashtableDelete(ht, entry)); + } + if (num_returned == count) { + printf("Last iteration. Half of them have been deleted.\n"); + hashtableHistogram(ht); + } + } + hashtableResetIterator(&iter); + + /* Verify that the bucket chain has been compacted by filling the holes and + * freeing empty child buckets. */ + printf("When the iterator leaves the bucket chain, compaction should happen.\n"); + hashtableHistogram(ht); + TEST_ASSERT(hashtableChainedBuckets(ht, 0) < num_chained_buckets); + + hashtableRelease(ht); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_random_entry(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + randomSeed(); + + size_t count = (flags & UNIT_TEST_LARGE_MEMORY) ? 7000 : 400; + long num_rounds = (flags & UNIT_TEST_ACCURATE) ? 1000000 : 10000; + + /* A set of ints */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + unsigned times_picked[count]; + memset(times_picked, 0, sizeof(times_picked)); + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, times_picked + j)); + } + + /* Pick entries, and count how many times each entry is picked. */ + for (long i = 0; i < num_rounds; i++) { + /* Using void* variable to avoid a cast that violates strict aliasing */ + void *entry; + TEST_ASSERT(hashtableFairRandomEntry(ht, &entry)); + unsigned *picked = entry; + TEST_ASSERT(picked >= times_picked && picked < times_picked + count); + /* increment entry at this position as a counter */ + (*picked)++; + } + hashtableRelease(ht); + + /* Fairness measurement + * -------------------- + * + * Selecting a single random entry: For any entry in the hash table, let + * X=1 if we selected the entry (success) and X=0 otherwise. With m + * entries, our entry is selected with probability p = 1/m, the expected + * value is E(X) = 1/m, E(X^2) = 1/m and the variance: + * + * Var(X) = E(X^2) - (E(X))^2 = 1/m - 1/(m^2) = (1/m) * (1 - 1/m). + * + * Repeating the selection of a random entry: Let's repeat the experiment + * n times and let Y be the number of times our entry was selected. This + * is a binomial distribution. + * + * Y = X_1 + X_2 + ... + X_n + * E(Y) = n/m + * + * The variance of a sum of independent random variables is the sum of the + * variances, so Y has variance np(1−p).
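+ *
+ * (Concretely, with the defaults used here, m = 400 entries and n = 10000
+ * picks, this gives E(Y) = 25 and a standard deviation of about 5.)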
+ * + * Var(Y) = npq = np(1 - p) = (n/m) * (1 - 1/m) = n * (m - 1) / (m * m) + */ + double m = (double)count, n = (double)num_rounds; + double expected = n / m; /* E(Y) */ + double variance = n * (m - 1) / (m * m); /* Var(Y) */ + double std_dev = sqrt(variance); + + /* With large n, the distribution approaches a normal distribution and we + * can use p68 = within 1 std dev, p95 = within 2 std dev, p99.7 = within 3 + * std dev. */ + long p68 = 0, p95 = 0, p99 = 0, p4dev = 0, p5dev = 0; + for (size_t j = 0; j < count; j++) { + double dev = expected - times_picked[j]; + p68 += (dev >= -std_dev && dev <= std_dev); + p95 += (dev >= -std_dev * 2 && dev <= std_dev * 2); + p99 += (dev >= -std_dev * 3 && dev <= std_dev * 3); + p4dev += (dev >= -std_dev * 4 && dev <= std_dev * 4); + p5dev += (dev >= -std_dev * 5 && dev <= std_dev * 5); + } + printf("Random entry fairness test\n"); + printf(" Pick one of %zu entries, %ld times.\n", count, num_rounds); + printf(" Expecting each entry to be picked %.2lf times, std dev %.3lf.\n", expected, std_dev); + printf(" Within 1 std dev (p68) = %.2lf%%\n", 100 * p68 / m); + printf(" Within 2 std dev (p95) = %.2lf%%\n", 100 * p95 / m); + printf(" Within 3 std dev (p99) = %.2lf%%\n", 100 * p99 / m); + printf(" Within 4 std dev = %.2lf%%\n", 100 * p4dev / m); + printf(" Within 5 std dev = %.2lf%%\n", 100 * p5dev / m); + + /* Conclusion? The number of trials (n) relative to the probabilities (p and + * 1 − p) must be sufficiently large (n * p ≥ 5 and n * (1 − p) ≥ 5) to + * approximate a binomial distribution with a normal distribution. */ + if (n / m >= 5 && n * (1 - 1 / m) >= 5) { + TEST_ASSERT_MESSAGE("Too unfair randomness", 100 * p99 / m >= 60.0); + } else { + printf("Too few samples to draw any conclusions about fairness.\n"); + } + return 0; +} + +int test_random_entry_with_long_chain(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + /* We use an estimator of true probability. + * We determine how many samples to take based on how precise a + * measurement we want to take, and how certain we want to be that the + * measurement is correct. + * https://en.wikipedia.org/wiki/Checking_whether_a_coin_is_fair#Estimator_of_true_probability + */ + + /* In a thousand runs the worst deviation seen was 0.018 +/- 0.01. + * This means the true deviation was at least 0.008 or 0.8%. + * Accept a deviation of 5% to be on the safe side so we don't get + * a flaky test case. */ + const double acceptable_probability_deviation = 0.05; + + const size_t num_chained_entries = 64; + const size_t num_random_entries = 448; + const double p_fair = (double)num_chained_entries / (num_chained_entries + num_random_entries); + + /* Precision of our measurement */ + const double precision = (flags & UNIT_TEST_ACCURATE) ? 0.001 : 0.01; + + /* This is the confidence level for our measurement as the Z value of a normal + * distribution. 5 sigma corresponds to 0.00002% probability that our + * measurement is farther than 'precision' from the truth. This value is + * used in particle physics.
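+ *
+ * (With the default precision 0.01 and p_fair = 64/512 = 0.125, the formula
+ * below works out to n = 0.125 * 0.875 * 25 / 0.0001, i.e. about 27344
+ * samples.)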
*/ + const double z = 5; + + const double n = p_fair * (1 - p_fair) * z * z / (precision * precision); + const size_t num_samples = (size_t)n + 1; + + hashtableType type = { + .hashFunction = mock_hash_entry_get_hash, + .entryDestructor = freekeyval, + }; + + hashtable *ht = hashtableCreate(&type); + hashtableExpand(ht, num_random_entries + num_chained_entries); + uint64_t chain_hash = (uint64_t)genrand64_int64(); + if (chain_hash == 0) chain_hash++; + + /* add random entries */ + for (size_t i = 0; i < num_random_entries; i++) { + uint64_t random_hash = (uint64_t)genrand64_int64(); + if (random_hash == chain_hash) random_hash++; + hashtableAdd(ht, mock_hash_entry_create(random_hash, 0)); + } + + /* create long chain */ + for (size_t i = 0; i < num_chained_entries; i++) { + hashtableAdd(ht, mock_hash_entry_create(i, chain_hash)); + } + + TEST_ASSERT(!hashtableIsRehashing(ht)); + + printf("Created a table with a long bucket chain.\n"); + hashtableHistogram(ht); + + printf("Taking %zu random samples\n", num_samples); + size_t count_chain_entry_picked = 0; + for (size_t i = 0; i < num_samples; i++) { + void *entry; + TEST_ASSERT(hashtableFairRandomEntry(ht, &entry)); + mock_hash_entry *mock_entry = entry; + if (mock_entry->hash == chain_hash) { + count_chain_entry_picked++; + } + } + const double measured_probability = (double)count_chain_entry_picked / num_samples; + const double deviation = fabs(measured_probability - p_fair); + printf("Measured probability: %.1f%%\n", measured_probability * 100); + printf("Expected probability: %.1f%%\n", p_fair * 100); + printf("Measured probability deviated %1.1f%% +/- %1.1f%% from expected probability\n", + deviation * 100, precision * 100); + TEST_ASSERT(deviation <= precision + acceptable_probability_deviation); + + hashtableRelease(ht); + return 0; +} + +int test_all_memory_freed(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} diff --git a/src/unit/test_help.h b/src/unit/test_help.h index 804a7e3449..51e77d19d3 100644 --- a/src/unit/test_help.h +++ b/src/unit/test_help.h @@ -18,10 +18,12 @@ /* The flags are the following: * --accurate: Runs tests with more iterations. * --large-memory: Enables tests that consume more than 100mb. - * --single: A flag to indicate a specific test file was executed. */ + * --single: A flag to indicate a specific test file was executed. + * --valgrind: Runs tests with valgrind. 
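+ * Each option sets one bit in the 'flags' argument that is passed to every test; the corresponding masks are defined below.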
*/ #define UNIT_TEST_ACCURATE (1 << 0) #define UNIT_TEST_LARGE_MEMORY (1 << 1) #define UNIT_TEST_SINGLE (1 << 2) +#define UNIT_TEST_VALGRIND (1 << 3) #define KRED "\33[31m" #define KGRN "\33[32m" diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index 062b9f32fc..d4cc91d6d8 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -2,22 +2,26 @@ #include "test_help.h" uint64_t hashTestCallback(const void *key) { - return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); + return hashtableGenHashFunction((char *)key, strlen((char *)key)); +} + +int cmpTestCallback(const void *k1, const void *k2) { + return strcmp(k1, k2); } void freeTestCallback(void *val) { zfree(val); } -dictType KvstoreDictTestType = {hashTestCallback, - NULL, - NULL, - freeTestCallback, - NULL, - NULL, - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize}; +hashtableType KvstoreHashtableTestType = { + .hashFunction = hashTestCallback, + .keyCompare = cmpTestCallback, + .entryDestructor = freeTestCallback, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, +}; char *stringFromInt(int value) { char buf[32]; @@ -37,21 +41,18 @@ int test_kvstoreAdd16Keys(int argc, char **argv, int flags) { UNUSED(flags); int i; - dictEntry *de; int didx = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); - kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 16); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 16); TEST_ASSERT(kvstoreSize(kvs1) == 16); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 16); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 16); TEST_ASSERT(kvstoreSize(kvs2) == 16); kvstoreRelease(kvs1); @@ -59,144 +60,132 @@ int test_kvstoreAdd16Keys(int argc, char **argv, int flags) { return 0; } -int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; kvstoreIterator *kvs_it; int didx = 0; int curr_slot = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); } kvs_it = kvstoreIteratorInit(kvs1); - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); - key = dictGetKey(de); - 
TEST_ASSERT(kvstoreDictDelete(kvs1, curr_slot, key) == DICT_OK); + while (kvstoreIteratorNext(kvs_it, &key)) { + curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); + TEST_ASSERT(kvstoreHashtableDelete(kvs1, curr_slot, key)); } kvstoreIteratorRelease(kvs_it); - dict *d = kvstoreGetDict(kvs1, didx); - TEST_ASSERT(d != NULL); - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs1, didx); + TEST_ASSERT(ht != NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 0); TEST_ASSERT(kvstoreSize(kvs1) == 0); kvstoreRelease(kvs1); return 0; } -int test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; kvstoreIterator *kvs_it; int didx = 0; int curr_slot = 0; - kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } kvs_it = kvstoreIteratorInit(kvs2); - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs2, curr_slot, key) == DICT_OK); + while (kvstoreIteratorNext(kvs_it, &key)) { + curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); + TEST_ASSERT(kvstoreHashtableDelete(kvs2, curr_slot, key)); } kvstoreIteratorRelease(kvs_it); - /* Make sure the dict was removed from the rehashing list. */ + /* Make sure the hashtable was removed from the rehashing list. 
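+ * If the freed empty hashtable were still registered for rehashing, the incremental rehash pass below would touch freed memory.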
*/ while (kvstoreIncrementallyRehash(kvs2, 1000)) { } - dict *d = kvstoreGetDict(kvs2, didx); - TEST_ASSERT(d == NULL); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs2, didx); + TEST_ASSERT(ht == NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 0); TEST_ASSERT(kvstoreSize(kvs2) == 0); kvstoreRelease(kvs2); return 0; } -int test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; - kvstoreDictIterator *kvs_di; + kvstoreHashtableIterator *kvs_di; int didx = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); } - kvs_di = kvstoreGetDictSafeIterator(kvs1, didx); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs1, didx, key) == DICT_OK); + kvs_di = kvstoreGetHashtableSafeIterator(kvs1, didx); + while (kvstoreHashtableIteratorNext(kvs_di, &key)) { + TEST_ASSERT(kvstoreHashtableDelete(kvs1, didx, key)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); - dict *d = kvstoreGetDict(kvs1, didx); - TEST_ASSERT(d != NULL); - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs1, didx); + TEST_ASSERT(ht != NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 0); TEST_ASSERT(kvstoreSize(kvs1) == 0); kvstoreRelease(kvs1); return 0; } -int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; - kvstoreDictIterator *kvs_di; + kvstoreHashtableIterator *kvs_di; int didx = 0; - kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } - kvs_di = kvstoreGetDictSafeIterator(kvs2, didx); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs2, didx, key) == DICT_OK); + kvs_di = kvstoreGetHashtableSafeIterator(kvs2, didx); + while (kvstoreHashtableIteratorNext(kvs_di, &key)) { + TEST_ASSERT(kvstoreHashtableDelete(kvs2, didx, key)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); - dict *d = kvstoreGetDict(kvs2, didx); - TEST_ASSERT(d == NULL); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs2, didx); + TEST_ASSERT(ht == NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 0); TEST_ASSERT(kvstoreSize(kvs2) == 0); kvstoreRelease(kvs2); diff --git a/src/unit/test_listpack.c b/src/unit/test_listpack.c index 4838fc8952..0c71da18db 100644 --- a/src/unit/test_listpack.c 
+++ b/src/unit/test_listpack.c @@ -1184,7 +1184,7 @@ int test_listpackStressWithRandom(int argc, char **argv, int flags) { for (i = 0; i < iteration; i++) { lp = lpNew(0); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ diff --git a/src/unit/test_main.c b/src/unit/test_main.c index 277d1b42c1..1b7cd8c96d 100644 --- a/src/unit/test_main.c +++ b/src/unit/test_main.c @@ -49,6 +49,8 @@ int main(int argc, char **argv) { else if (!strcasecmp(arg, "--single") && (j + 1 < argc)) { flags |= UNIT_TEST_SINGLE; file = argv[j + 1]; + } else if (!strcasecmp(arg, "--valgrind")) { + flags |= UNIT_TEST_VALGRIND; } } diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c new file mode 100644 index 0000000000..566583bcc5 --- /dev/null +++ b/src/unit/test_networking.c @@ -0,0 +1,131 @@ +#include "../networking.c" +#include "../server.c" +#include "test_help.h" + +#include + +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + + /* Test 1: Initial backup of arguments */ + c->argc = 2; + robj **initial_argv = zmalloc(sizeof(robj *) * 2); + c->argv = initial_argv; + c->argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + c->argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + c->original_argv = NULL; + + backupAndUpdateClientArgv(c, 3, NULL); + + TEST_ASSERT(c->argv != initial_argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->argc == 3); + TEST_ASSERT(c->argv_len == 3); + TEST_ASSERT(c->argv[0]->refcount == 2); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv[2] == NULL); + + /* Test 2: Direct argv replacement */ + robj **new_argv = zmalloc(sizeof(robj *) * 2); + new_argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + new_argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + + backupAndUpdateClientArgv(c, 2, new_argv); + + TEST_ASSERT(c->argv == new_argv); + TEST_ASSERT(c->argc == 2); + TEST_ASSERT(c->argv_len == 2); + TEST_ASSERT(c->original_argv != c->argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->original_argv[0]->refcount == 1); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Expanding argc */ + backupAndUpdateClientArgv(c, 4, NULL); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv_len == 4); + TEST_ASSERT(c->argv[0] != NULL); + TEST_ASSERT(c->argv[1] != NULL); + TEST_ASSERT(c->argv[2] == NULL); + TEST_ASSERT(c->argv[3] == NULL); + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->original_argc; i++) { + decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + zfree(c); + + return 0; +} + +int test_rewriteClientCommandArgument(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + c->argc = 3; + robj **initial_argv = zmalloc(sizeof(robj *) * 3); + c->argv = initial_argv; + c->original_argv = NULL; + c->argv_len_sum = 0; + + /* Initialize client with command "SET key value" */ + c->argv[0] = createStringObject("SET", 3); + robj *original_key = createStringObject("key", 3); + c->argv[1] = original_key; + c->argv[2] = 
createStringObject("value", 5); + c->argv_len_sum = 11; // 3 + 3 + 5 + + /* Test 1: Rewrite existing argument */ + robj *newval = createStringObject("newkey", 6); + rewriteClientCommandArgument(c, 1, newval); + + TEST_ASSERT(c->argv[1] == newval); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv_len_sum == 14); // 3 + 6 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argv[1] == original_key); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Extend argument vector */ + robj *extraval = createStringObject("extra", 5); + rewriteClientCommandArgument(c, 3, extraval); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv[3] == extraval); + TEST_ASSERT(c->argv_len_sum == 19); // 3 + 6 + 5 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + + for (int i = 0; i < c->original_argc; i++) { + if (c->original_argv[i]) decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + decrRefCount(newval); + decrRefCount(extraval); + + zfree(c); + + return 0; +} diff --git a/src/unit/test_object.c b/src/unit/test_object.c new file mode 100644 index 0000000000..995400c3d9 --- /dev/null +++ b/src/unit/test_object.c @@ -0,0 +1,50 @@ +#include "../object.c" +#include "test_help.h" + +#include +#include +#include +#include +#include + + +int test_object_with_key(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + sds key = sdsnew("foo"); + robj *val = createStringObject("bar", strlen("bar")); + TEST_ASSERT(val->encoding == OBJ_ENCODING_EMBSTR); + + /* Prevent objectSetKeyAndExpire from freeing the old val when reallocating it. */ + incrRefCount(val); + + /* Create valkey: val with key. */ + robj *valkey = objectSetKeyAndExpire(val, key, -1); + TEST_ASSERT(valkey->encoding == OBJ_ENCODING_EMBSTR); + TEST_ASSERT(objectGetKey(valkey) != NULL); + + /* Check embedded key "foo" */ + TEST_ASSERT(sdslen(objectGetKey(valkey)) == 3); + TEST_ASSERT(sdslen(key) == 3); + TEST_ASSERT(sdscmp(objectGetKey(valkey), key) == 0); + TEST_ASSERT(strcmp(objectGetKey(valkey), "foo") == 0); + + /* Check embedded value "bar" (EMBSTR content) */ + TEST_ASSERT(sdscmp(valkey->ptr, val->ptr) == 0); + TEST_ASSERT(strcmp(valkey->ptr, "bar") == 0); + + /* Either they're two separate objects, or one object with refcount == 2. */ + if (valkey == val) { + TEST_ASSERT(valkey->refcount == 2); + } else { + TEST_ASSERT(valkey->refcount == 1); + TEST_ASSERT(val->refcount == 1); + } + + /* Free them. 
*/ + sdsfree(key); + decrRefCount(val); + decrRefCount(valkey); + return 0; +} diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c index b97d0d9d32..30f25e4f6f 100644 --- a/src/unit/test_sds.c +++ b/src/unit/test_sds.c @@ -259,43 +259,44 @@ int test_typesAndAllocSize(int argc, char **argv, int flags) { sds x = sdsnewlen(NULL, 31); TEST_ASSERT_MESSAGE("len 31 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_5); + TEST_ASSERT_MESSAGE("len 31 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 32); TEST_ASSERT_MESSAGE("len 32 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 252); TEST_ASSERT_MESSAGE("len 252 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 253); TEST_ASSERT_MESSAGE("len 253 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65530); TEST_ASSERT_MESSAGE("len 65530 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65531); TEST_ASSERT_MESSAGE("len 65531 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); #if (LONG_MAX == LLONG_MAX) if (flags & UNIT_TEST_LARGE_MEMORY) { x = sdsnewlen(NULL, 4294967286); TEST_ASSERT_MESSAGE("len 4294967286 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 4294967287); TEST_ASSERT_MESSAGE("len 4294967287 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_64); - TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); } #endif diff --git a/src/unit/test_util.c b/src/unit/test_util.c index 70be0255d8..9858318e06 100644 --- a/src/unit/test_util.c +++ b/src/unit/test_util.c @@ -6,6 +6,11 @@ #include "../util.h" #include "test_help.h" +#if defined(__linux__) +#include <sys/vfs.h> +#include <linux/magic.h> +#endif + int test_string2ll(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); @@ -286,9 +291,20 @@ static int cache_exist(int fd) { int test_reclaimFilePageCache(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); - UNUSED(flags); + + /* The test is incompatible with valgrind, skip it.
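+ * (Presumably because the kernel page-cache eviction this test observes does not behave normally under valgrind's emulation.)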
*/ + if (flags & UNIT_TEST_VALGRIND) return 0; #if defined(__linux__) + struct statfs stats; + + /* Check if /tmp is memory-backed (e.g., tmpfs) */ + if (statfs("/tmp", &stats) == 0) { + if (stats.f_type != TMPFS_MAGIC) { // Not tmpfs, use /tmp + return 0; + } + } + char *tmpfile = "/tmp/redis-reclaim-cache-test"; int fd = open(tmpfile, O_RDWR | O_CREAT, 0644); TEST_ASSERT(fd >= 0); diff --git a/src/unit/test_valkey_strtod.c b/src/unit/test_valkey_strtod.c new file mode 100644 index 0000000000..4796d7a5b6 --- /dev/null +++ b/src/unit/test_valkey_strtod.c @@ -0,0 +1,36 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + + +#include "../valkey_strtod.h" +#include "errno.h" +#include "math.h" +#include "test_help.h" + +int test_valkey_strtod(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + errno = 0; + double value = valkey_strtod("231.2341234", NULL); + TEST_ASSERT(value == 231.2341234); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("+inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("-inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + return 0; +} diff --git a/src/unit/test_ziplist.c b/src/unit/test_ziplist.c index d2f7ebe69c..58687d81fc 100644 --- a/src/unit/test_ziplist.c +++ b/src/unit/test_ziplist.c @@ -645,7 +645,7 @@ int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **arg for (i = 0; i < iteration; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ diff --git a/src/unit/test_zmalloc.c b/src/unit/test_zmalloc.c index 6c1d03e8e1..08444a157e 100644 --- a/src/unit/test_zmalloc.c +++ b/src/unit/test_zmalloc.c @@ -6,6 +6,8 @@ int test_zmallocInitialUsedMemory(int argc, char **argv, int flags) { UNUSED(argv); UNUSED(flags); + /* If this fails, it may be that other tests have failed and the memory has not been released. 
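One change above deserves a note: listSetFreeMethod(ref, (void (*)(void *))sdsfree) was replaced with listSetFreeMethod(ref, sdsfreeVoid). Calling a function through a pointer of an incompatible type is undefined behavior in C, even when the only mismatch is sds versus void *; a wrapper with the exact signature the list expects avoids that. sdsfreeVoid is assumed here to be such an adapter:

void sdsfreeVoid(void *s) { /* assumed shape of the helper in sds.c */
    sdsfree((sds)s);
}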
*/ + TEST_PRINT_INFO("test_zmallocInitialUsedMemory; used: %zu\n", zmalloc_used_memory()); TEST_ASSERT(zmalloc_used_memory() == 0); return 0; diff --git a/src/unix.c b/src/unix.c index 35778779f9..86df05bd52 100644 --- a/src/unix.c +++ b/src/unix.c @@ -74,6 +74,10 @@ static int connUnixListen(connListener *listener) { return C_OK; } +static void connUnixCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static connection *connCreateUnix(void) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Unix; @@ -174,6 +178,7 @@ static ConnectionType CT_Unix = { .addr = connUnixAddr, .is_local = connUnixIsLocal, .listen = connUnixListen, + .closeListener = connUnixCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateUnix, diff --git a/src/util.c b/src/util.c index b1235c2822..6e44392ce1 100644 --- a/src/util.c +++ b/src/util.c @@ -50,6 +50,9 @@ #include "util.h" #include "sha256.h" #include "config.h" +#include "zmalloc.h" + +#include "valkey_strtod.h" #define UNUSED(x) ((void)(x)) @@ -102,23 +105,23 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; - not_op = pattern[0] == '^'; + not_op = patternLen && pattern[0] == '^'; if (not_op) { pattern++; patternLen--; } match = 0; while (1) { - if (pattern[0] == '\\' && patternLen >= 2) { + if (patternLen >= 2 && pattern[0] == '\\') { pattern++; patternLen--; if (pattern[0] == string[0]) match = 1; - } else if (pattern[0] == ']') { - break; } else if (patternLen == 0) { pattern--; patternLen++; break; + } else if (pattern[0] == ']') { + break; } else if (patternLen >= 3 && pattern[1] == '-') { int start = pattern[0]; int end = pattern[2]; @@ -171,7 +174,7 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; if (stringLen == 0) { - while (*pattern == '*') { + while (patternLen && *pattern == '*') { pattern++; patternLen--; } @@ -595,10 +598,12 @@ int string2ld(const char *s, size_t slen, long double *dp) { int string2d(const char *s, size_t slen, double *dp) { errno = 0; char *eptr; - *dp = strtod(s, &eptr); + *dp = valkey_strtod(s, &eptr); if (slen == 0 || isspace(((const char *)s)[0]) || (size_t)(eptr - (char *)s) != slen || - (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp)) + (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp) || errno == EINVAL) { + errno = 0; return 0; + } return 1; } @@ -1376,3 +1381,23 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...) { va_end(args); return result; } + +/* A printf-like function that returns a freshly allocated string. + * + * This function is similar to asprintf function, but it uses zmalloc for + * allocating the string buffer. */ +char *valkey_asprintf(char const *fmt, ...) 
{ + va_list args; + + va_start(args, fmt); + size_t str_len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + char *str = zmalloc(str_len); + + va_start(args, fmt); + vsnprintf(str, str_len, fmt, args); + va_end(args); + + return str; +} diff --git a/src/util.h b/src/util.h index 51eb38f0b4..61095ddb65 100644 --- a/src/util.h +++ b/src/util.h @@ -99,5 +99,6 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...); #endif size_t valkey_strlcpy(char *dst, const char *src, size_t dsize); size_t valkey_strlcat(char *dst, const char *src, size_t dsize); +char *valkey_asprintf(char const *fmt, ...); #endif diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index 57cdd6fc16..1924203ae7 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -77,6 +77,13 @@ struct benchmarkThread; struct clusterNode; struct serverConfig; +/* Read from replica options */ +typedef enum readFromReplica { + FROM_PRIMARY_ONLY = 0, /* default option */ + FROM_REPLICA_ONLY, + FROM_ALL +} readFromReplica; + static struct config { aeEventLoop *el; cliConnInfo conn_info; @@ -112,6 +119,7 @@ static struct config { int num_threads; struct benchmarkThread **threads; int cluster_mode; + readFromReplica read_from_replica; int cluster_node_count; struct clusterNode **cluster_nodes; struct serverConfig *redis_config; @@ -168,12 +176,6 @@ typedef struct clusterNode { int *updated_slots; /* Used by updateClusterSlotsConfiguration */ int updated_slots_count; /* Used by updateClusterSlotsConfiguration */ int replicas_count; - sds *migrating; /* An array of sds where even strings are slots and odd - * strings are the destination node IDs. */ - sds *importing; /* An array of sds where even strings are slots and odd - * strings are the source node IDs. */ - int migrating_count; /* Length of the migrating array (migrating slots*2) */ - int importing_count; /* Length of the importing array (importing slots*2) */ struct serverConfig *redis_config; } clusterNode; @@ -228,6 +230,15 @@ static int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +static dictType dtype = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + static redisContext *getRedisContext(const char *ip, int port, const char *hostsocket) { redisContext *ctx = NULL; redisReply *reply = NULL; @@ -710,6 +721,15 @@ static client createClient(char *cmd, size_t len, client from, int thread_id) { c->prefix_pending++; } + if (config.cluster_mode && (config.read_from_replica == FROM_REPLICA_ONLY || config.read_from_replica == FROM_ALL)) { + char *buf = NULL; + int len; + len = redisFormatCommand(&buf, "READONLY"); + c->obuf = sdscatlen(c->obuf, buf, len); + free(buf); + c->prefix_pending++; + } + c->prefixlen = sdslen(c->obuf); /* Append the request itself. 
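A quick usage note for the valkey_asprintf() helper defined above: the buffer is allocated with zmalloc(), so it must be released with zfree() rather than free(). The values here are illustrative:

char *addr = valkey_asprintf("%s:%d", "127.0.0.1", 6379);
/* ... use addr ... */
zfree(addr);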
*/ if (from) { @@ -835,7 +855,15 @@ static void showLatencyReport(void) { printf(" %d bytes payload\n", config.datasize); printf(" keep alive: %d\n", config.keepalive); if (config.cluster_mode) { - printf(" cluster mode: yes (%d primaries)\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf(" cluster mode: yes (%d %s)\n", config.cluster_node_count, node_roles); int m; for (m = 0; m < config.cluster_node_count; m++) { clusterNode *node = config.cluster_nodes[m]; @@ -1009,26 +1037,13 @@ static clusterNode *createClusterNode(char *ip, int port) { node->slots_count = 0; node->updated_slots = NULL; node->updated_slots_count = 0; - node->migrating = NULL; - node->importing = NULL; - node->migrating_count = 0; - node->importing_count = 0; node->redis_config = NULL; return node; } static void freeClusterNode(clusterNode *node) { - int i; if (node->name) sdsfree(node->name); if (node->replicate) sdsfree(node->replicate); - if (node->migrating != NULL) { - for (i = 0; i < node->migrating_count; i++) sdsfree(node->migrating[i]); - zfree(node->migrating); - } - if (node->importing != NULL) { - for (i = 0; i < node->importing_count; i++) sdsfree(node->importing[i]); - zfree(node->importing); - } /* If the node is not the reference node, that uses the address from * config.conn_info.hostip and config.conn_info.hostport, then the node ip has been * allocated by fetchClusterConfiguration, so it must be freed. */ @@ -1056,157 +1071,85 @@ static clusterNode **addClusterNode(clusterNode *node) { return config.cluster_nodes; } -/* TODO: This should be refactored to use CLUSTER SLOTS, the migrating/importing - * information is anyway not used. 
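The replacement code below parses CLUSTER SLOTS instead of CLUSTER NODES. For orientation, this is the reply layout the new parser relies on (element indices as used in the code that follows; one entry per slot range):

/* CLUSTER SLOTS reply entry:
 *   element[0]   start slot (integer)
 *   element[1]   end slot (integer)
 *   element[2]   primary node:  [ip, port, node-id, ...]
 *   element[3..] replica nodes: [ip, port, node-id, ...]
 */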
- */ static int fetchClusterConfiguration(void) { int success = 1; redisContext *ctx = NULL; redisReply *reply = NULL; + dict *nodes = NULL; + const char *errmsg = "Failed to fetch cluster configuration"; + size_t i, j; ctx = getRedisContext(config.conn_info.hostip, config.conn_info.hostport, config.hostsocket); if (ctx == NULL) { exit(1); } - clusterNode *firstNode = createClusterNode((char *)config.conn_info.hostip, config.conn_info.hostport); - if (!firstNode) { + + reply = redisCommand(ctx, "CLUSTER SLOTS"); + if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { success = 0; + if (reply) fprintf(stderr, "%s\nCLUSTER SLOTS ERROR: %s\n", errmsg, reply->str); goto cleanup; } - reply = redisCommand(ctx, "CLUSTER NODES"); - success = (reply != NULL); - if (!success) goto cleanup; - success = (reply->type != REDIS_REPLY_ERROR); - if (!success) { - if (config.hostsocket == NULL) { - fprintf(stderr, "Cluster node %s:%d replied with error:\n%s\n", config.conn_info.hostip, - config.conn_info.hostport, reply->str); - } else { - fprintf(stderr, "Cluster node %s replied with error:\n%s\n", config.hostsocket, reply->str); - } - goto cleanup; - } - char *lines = reply->str, *p, *line; - while ((p = strstr(lines, "\n")) != NULL) { - *p = '\0'; - line = lines; - lines = p + 1; - char *name = NULL, *addr = NULL, *flags = NULL, *primary_id = NULL; - int i = 0; - while ((p = strchr(line, ' ')) != NULL) { - *p = '\0'; - char *token = line; - line = p + 1; - switch (i++) { - case 0: name = token; break; - case 1: addr = token; break; - case 2: flags = token; break; - case 3: primary_id = token; break; - } - if (i == 8) break; // Slots - } - if (!flags) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing flags.\n"); - success = 0; - goto cleanup; - } - int myself = (strstr(flags, "myself") != NULL); - int is_replica = (strstr(flags, "slave") != NULL || (primary_id != NULL && primary_id[0] != '-')); - if (is_replica) continue; - if (addr == NULL) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing addr.\n"); - success = 0; - goto cleanup; - } - clusterNode *node = NULL; - char *ip = NULL; - int port = 0; - char *paddr = strrchr(addr, ':'); - if (paddr != NULL) { - *paddr = '\0'; - ip = addr; - addr = paddr + 1; - /* If internal bus is specified, then just drop it. 
*/ - if ((paddr = strchr(addr, '@')) != NULL) *paddr = '\0'; - port = atoi(addr); - } - if (myself) { - node = firstNode; - if (ip != NULL && strcmp(node->ip, ip) != 0) { - node->ip = sdsnew(ip); - node->port = port; + assert(reply->type == REDIS_REPLY_ARRAY); + nodes = dictCreate(&dtype); + for (i = 0; i < reply->elements; i++) { + redisReply *r = reply->element[i]; + assert(r->type == REDIS_REPLY_ARRAY); + assert(r->elements >= 3); + int from = r->element[0]->integer; + int to = r->element[1]->integer; + sds primary = NULL; + for (j = 2; j < r->elements; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[0]->str != NULL); + assert(nr->element[2]->str != NULL); + + int is_primary = (j == 2); + if (is_primary) primary = sdsnew(nr->element[2]->str); + int is_cluster_option_only = (config.read_from_replica == FROM_PRIMARY_ONLY); + if ((config.read_from_replica == FROM_REPLICA_ONLY && is_primary) || (is_cluster_option_only && !is_primary)) continue; + + sds ip = sdsnew(nr->element[0]->str); + sds name = sdsnew(nr->element[2]->str); + int port = nr->element[1]->integer; + int slot_start = from; + int slot_end = to; + + clusterNode *node = NULL; + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + node = createClusterNode(sdsnew(ip), port); + if (node == NULL) { + success = 0; + goto cleanup; + } else { + node->name = name; + if (!is_primary) node->replicate = sdsdup(primary); + } + } else { + node = dictGetVal(entry); } - } else { - node = createClusterNode(sdsnew(ip), port); - } - if (node == NULL) { - success = 0; - goto cleanup; - } - if (name != NULL) node->name = sdsnew(name); - if (i == 8) { - int remaining = strlen(line); - while (remaining > 0) { - p = strchr(line, ' '); - if (p == NULL) p = line + remaining; - remaining -= (p - line); - - char *slotsdef = line; - *p = '\0'; - if (remaining) { - line = p + 1; - remaining--; - } else - line = p; - char *dash = NULL; - if (slotsdef[0] == '[') { - slotsdef++; - if ((p = strstr(slotsdef, "->-"))) { // Migrating - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds dst = sdsnew(p); - node->migrating_count += 2; - node->migrating = zrealloc(node->migrating, (node->migrating_count * sizeof(sds))); - node->migrating[node->migrating_count - 2] = slot; - node->migrating[node->migrating_count - 1] = dst; - } else if ((p = strstr(slotsdef, "-<-"))) { // Importing - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds src = sdsnew(p); - node->importing_count += 2; - node->importing = zrealloc(node->importing, (node->importing_count * sizeof(sds))); - node->importing[node->importing_count - 2] = slot; - node->importing[node->importing_count - 1] = src; - } - } else if ((dash = strchr(slotsdef, '-')) != NULL) { - p = dash; - int start, stop; - *p = '\0'; - start = atoi(slotsdef); - stop = atoi(p + 1); - while (start <= stop) { - int slot = start++; - node->slots[node->slots_count++] = slot; - } - } else if (p > slotsdef) { - int slot = atoi(slotsdef); + if (slot_start == slot_end) { + node->slots[node->slots_count++] = slot_start; + } else { + while (slot_start <= slot_end) { + int slot = slot_start++; node->slots[node->slots_count++] = slot; } } + if (node->slots_count == 0) { + fprintf(stderr, "WARNING: Node %s:%d has no slots, skipping...\n", node->ip, node->port); + 
continue; + } + if (entry == NULL) { + dictReplace(nodes, node->name, node); + if (!addClusterNode(node)) { + success = 0; + goto cleanup; + } + } } - if (node->slots_count == 0) { - fprintf(stderr, "WARNING: Primary node %s:%d has no slots, skipping...\n", node->ip, node->port); - continue; - } - if (!addClusterNode(node)) { - success = 0; - goto cleanup; - } + sdsfree(primary); } cleanup: if (ctx) redisFree(ctx); @@ -1214,6 +1157,7 @@ static int fetchClusterConfiguration(void) { if (config.cluster_nodes) freeClusterNodes(); } if (reply) freeReplyObject(reply); + if (nodes) dictRelease(nodes); return success; } @@ -1222,7 +1166,7 @@ static int fetchClusterConfiguration(void) { static int fetchClusterSlotsConfiguration(client c) { UNUSED(c); int success = 1, is_fetching_slots = 0, last_update = 0; - size_t i; + size_t i, j; last_update = atomic_load_explicit(&config.slots_last_update, memory_order_relaxed); if (c->slots_last_update < last_update) { @@ -1236,16 +1180,9 @@ static int fetchClusterSlotsConfiguration(client c) { atomic_store_explicit(&config.is_fetching_slots, 1, memory_order_relaxed); fprintf(stderr, "WARNING: Cluster slots configuration changed, fetching new one...\n"); const char *errmsg = "Failed to update cluster slots configuration"; - static dictType dtype = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ - }; + /* printf("[%d] fetchClusterSlotsConfiguration\n", c->thread_id); */ - dict *primaries = dictCreate(&dtype); + dict *nodes = dictCreate(&dtype); redisContext *ctx = NULL; for (i = 0; i < (size_t)config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1263,7 +1200,7 @@ static int fetchClusterSlotsConfiguration(client c) { if (node->updated_slots != NULL) zfree(node->updated_slots); node->updated_slots = NULL; node->updated_slots_count = 0; - dictReplace(primaries, node->name, node); + dictReplace(nodes, node->name, node); } reply = redisCommand(ctx, "CLUSTER SLOTS"); if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { @@ -1279,30 +1216,44 @@ static int fetchClusterSlotsConfiguration(client c) { int from, to, slot; from = r->element[0]->integer; to = r->element[1]->integer; - redisReply *nr = r->element[2]; - assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); - assert(nr->element[2]->str != NULL); - sds name = sdsnew(nr->element[2]->str); - dictEntry *entry = dictFind(primaries, name); - if (entry == NULL) { - success = 0; - fprintf(stderr, - "%s: could not find node with ID %s in current " - "configuration.\n", - errmsg, name); - if (name) sdsfree(name); - goto cleanup; + size_t start, end; + if (config.read_from_replica == FROM_ALL) { + start = 2; + end = r->elements; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + start = 3; + end = r->elements; + } else { + start = 2; + end = 3; + } + + for (j = start; j < end; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[2]->str != NULL); + sds name = sdsnew(nr->element[2]->str); + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + success = 0; + fprintf(stderr, + "%s: could not find node with ID %s in current " + "configuration.\n", + errmsg, name); + if (name) sdsfree(name); + goto cleanup; + } + sdsfree(name); + clusterNode *node = dictGetVal(entry); + if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * 
sizeof(int)); + for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } - sdsfree(name); - clusterNode *node = dictGetVal(entry); - if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * sizeof(int)); - for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } updateClusterSlotsConfiguration(); cleanup: freeReplyObject(reply); redisFree(ctx); - dictRelease(primaries); + dictRelease(nodes); atomic_store_explicit(&config.is_fetching_slots, 0, memory_order_relaxed); return success; } @@ -1460,6 +1411,19 @@ int parseOptions(int argc, char **argv) { config.num_threads = 0; } else if (!strcmp(argv[i], "--cluster")) { config.cluster_mode = 1; + } else if (!strcmp(argv[i], "--rfr")) { + if (argv[++i]) { + if (!strcmp(argv[i], "all")) { + config.read_from_replica = FROM_ALL; + } else if (!strcmp(argv[i], "yes")) { + config.read_from_replica = FROM_REPLICA_ONLY; + } else if (!strcmp(argv[i], "no")) { + config.read_from_replica = FROM_PRIMARY_ONLY; + } else { + goto invalid; + } + } else + goto invalid; } else if (!strcmp(argv[i], "--enable-tracking")) { config.enable_tracking = 1; } else if (!strcmp(argv[i], "--help")) { @@ -1557,6 +1521,14 @@ int parseOptions(int argc, char **argv) { " If the command is supplied on the command line in cluster\n" " mode, the key must contain \"{tag}\". Otherwise, the\n" " command will not be sent to the right cluster node.\n" + " --rfr Enable read from replicas in cluster mode.\n" + " This command must be used with the --cluster option.\n" + " There are three modes for reading from replicas:\n" + " 'no' - sends read requests to primaries only (default) \n" + " 'yes' - sends read requests to replicas only.\n" + " 'all' - sends read requests to all nodes.\n" + " Since write commands will not be accepted by replicas,\n" + " it is recommended to enable read from replicas only for read command tests.\n" " --enable-tracking Send CLIENT TRACKING on before starting benchmark.\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD,\n" @@ -1698,6 +1670,7 @@ int main(int argc, char **argv) { config.num_threads = 0; config.threads = NULL; config.cluster_mode = 0; + config.read_from_replica = FROM_PRIMARY_ONLY; config.cluster_node_count = 0; config.cluster_nodes = NULL; config.redis_config = NULL; @@ -1742,7 +1715,15 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster: %d node(s).\n", config.cluster_node_count); exit(1); } - printf("Cluster has %d primary nodes:\n\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf("Cluster has %d %s nodes:\n\n", config.cluster_node_count, node_roles); int i = 0; for (; i < config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1750,7 +1731,8 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster node #%d\n", i); exit(1); } - printf("Primary %d: ", i); + const char *node_type = (node->replicate == NULL ? 
"Primary" : "Replica"); + printf("Node %d(%s): ", i, node_type); if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getServerConfig(node->ip, node->port, NULL); diff --git a/src/valkey-cli.c b/src/valkey-cli.c index 0ba03dc6ba..4416e09431 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -65,6 +65,8 @@ #include "mt19937-64.h" #include "cli_commands.h" +#include "valkey_strtod.h" + #define UNUSED(V) ((void)V) #define OUTPUT_STANDARD 0 @@ -2537,9 +2539,10 @@ static int parseOptions(int argc, char **argv) { exit(1); } } else if (!strcmp(argv[i], "-t") && !lastarg) { + errno = 0; char *eptr; - double seconds = strtod(argv[++i], &eptr); - if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0) { + double seconds = valkey_strtod(argv[++i], &eptr); + if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0 || errno == EINVAL || errno == ERANGE) { fprintf(stderr, "Invalid connection timeout for -t.\n"); exit(1); } @@ -4391,7 +4394,7 @@ static sds clusterManagerNodeInfo(clusterManagerNode *node, int indent) { if (node->replicate != NULL) info = sdscatfmt(info, "\n%s replicates %S", spaces, node->replicate); else if (node->replicas_count) - info = sdscatfmt(info, "\n%s %U additional replica(s)", spaces, node->replicas_count); + info = sdscatfmt(info, "\n%s %i additional replica(s)", spaces, node->replicas_count); sdsfree(spaces); return info; } diff --git a/src/valkey_strtod.h b/src/valkey_strtod.h new file mode 100644 index 0000000000..037a3f3cec --- /dev/null +++ b/src/valkey_strtod.h @@ -0,0 +1,42 @@ +#ifndef FAST_FLOAT_STRTOD_H +#define FAST_FLOAT_STRTOD_H + +#ifdef USE_FAST_FLOAT + +#include "errno.h" + +/** + * Converts a null-terminated byte string to a double using the fast_float library. + * + * This function provides a C-compatible wrapper around the fast_float library's string-to-double + * conversion functionality. It aims to offer a faster alternative to the standard strtod function. + * + * str: A pointer to the null-terminated byte string to be converted. + * eptr: On success, stores char pointer pointing to '\0' at the end of the string. + * On failure, stores char pointer pointing to first invalid character in the string. + * returns: On success, the function returns the converted double value. + * On failure, it returns 0.0 and stores error code in errno to ERANGE or EINVAL. + * + * note: This function uses the fast_float library (https://github.com/fastfloat/fast_float) for + * the actual conversion, which can be significantly faster than standard library functions. + * Refer to "../deps/fast_float_c_interface" for more details. + * Refer to https://github.com/fastfloat/fast_float for more information on the underlying library. + */ +double fast_float_strtod(const char *str, char **endptr); + +static inline double valkey_strtod(const char *str, char **endptr) { + errno = 0; + return fast_float_strtod(str, endptr); +} + +#else + +#include + +static inline double valkey_strtod(const char *str, char **endptr) { + return strtod(str, endptr); +} + +#endif + +#endif // FAST_FLOAT_STRTOD_H diff --git a/src/valkeymodule.h b/src/valkeymodule.h index c2cdb2f0e7..1d99d2ff7a 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -783,6 +783,7 @@ typedef enum { } ValkeyModuleACLLogEntryReason; /* Incomplete structures needed by both the core and modules. 
*/
+typedef struct ValkeyModuleCtx ValkeyModuleCtx;
 typedef struct ValkeyModuleIO ValkeyModuleIO;
 typedef struct ValkeyModuleDigest ValkeyModuleDigest;
 typedef struct ValkeyModuleInfoCtx ValkeyModuleInfoCtx;
@@ -794,6 +795,93 @@ typedef void (*ValkeyModuleInfoFunc)(ValkeyModuleInfoCtx *ctx, int for_crash_rep
 typedef void (*ValkeyModuleDefragFunc)(ValkeyModuleDefragCtx *ctx);
 typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata);
+/* Current ABI version for scripting engine modules. */
+#define VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION 1UL
+
+/* Type definitions for implementing scripting engine modules. */
+typedef void ValkeyModuleScriptingEngineCtx;
+typedef void ValkeyModuleScriptingEngineFunctionCtx;
+
+/* This struct represents a scripting engine function that results from the
+ * compilation of a script by the engine implementation.
+ *
+ * IMPORTANT: If we ever need to add/remove fields from this struct, we need
+ * to bump the version number defined in the
+ * `VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION` constant.
+ */
+typedef struct ValkeyModuleScriptingEngineCompiledFunction {
+    ValkeyModuleString *name; /* Function name */
+    void *function;           /* Opaque object representing a function, usually it's
+                                 the function's compiled code. */
+    ValkeyModuleString *desc; /* Function description */
+    uint64_t f_flags;         /* Function flags */
+} ValkeyModuleScriptingEngineCompiledFunction;
+
+/* This struct is used to return the memory information of the scripting
+ * engine. */
+typedef struct ValkeyModuleScriptingEngineMemoryInfo {
+    /* The memory used by the scripting engine runtime. */
+    size_t used_memory;
+    /* The memory used by the scripting engine data structures. */
+    size_t engine_memory_overhead;
+} ValkeyModuleScriptingEngineMemoryInfo;
+
+typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    const char *code,
+    size_t timeout,
+    size_t *out_num_compiled_functions,
+    char **err);
+
+typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
+    void *compiled_function,
+    ValkeyModuleString **keys,
+    size_t nkeys,
+    ValkeyModuleString **args,
+    size_t nargs);
+
+typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
+    ValkeyModuleCtx *module_ctx,
+    void *compiled_function);
+
+typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    void *compiled_function);
+
+typedef ValkeyModuleScriptingEngineMemoryInfo (*ValkeyModuleScriptingEngineGetMemoryInfoFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx);
+
+typedef struct ValkeyModuleScriptingEngineMethodsV1 {
+    uint64_t version; /* Version of this structure for ABI compat. */
+
+    /* Library create function callback. When a new script is loaded, this
+     * callback will be called with the script code, and returns a list of
+     * ValkeyModuleScriptingEngineCompiledFunction objects. */
+    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;
+
+    /* Function callback to free the memory of a registered engine function. */
+    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
+
+    /* The callback function called when the `FCALL` command is called on a function
+     * registered in this engine.
*/ + ValkeyModuleScriptingEngineCallFunctionFunc call_function; + + /* Function callback to return memory overhead for a given function. */ + ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead; + + /* Function callback to get the used memory by the engine. */ + ValkeyModuleScriptingEngineGetMemoryInfoFunc get_memory_info; + +} ValkeyModuleScriptingEngineMethodsV1; + +#define ValkeyModuleScriptingEngineMethods ValkeyModuleScriptingEngineMethodsV1 + /* ------------------------- End of common defines ------------------------ */ /* ----------- The rest of the defines are only for modules ----------------- */ @@ -826,7 +914,6 @@ typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata); #endif /* Incomplete structures for compiler checks but opaque access. */ -typedef struct ValkeyModuleCtx ValkeyModuleCtx; typedef struct ValkeyModuleCommand ValkeyModuleCommand; typedef struct ValkeyModuleCallReply ValkeyModuleCallReply; typedef struct ValkeyModuleType ValkeyModuleType; @@ -967,6 +1054,7 @@ VALKEYMODULE_API void (*ValkeyModule_SetModuleAttribs)(ValkeyModuleCtx *ctx, con VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_IsModuleNameBusy)(const char *name) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_WrongArity)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; +VALKEYMODULE_API int (*ValkeyModule_UpdateRuntimeArgs)(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_ReplyWithLongLong)(ValkeyModuleCtx *ctx, long long ll) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_GetSelectedDb)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_SelectDb)(ValkeyModuleCtx *ctx, int newid) VALKEYMODULE_ATTR; @@ -1649,6 +1737,14 @@ VALKEYMODULE_API int (*ValkeyModule_RdbSave)(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) VALKEYMODULE_ATTR; +VALKEYMODULE_API int (*ValkeyModule_RegisterScriptingEngine)(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) VALKEYMODULE_ATTR; + +VALKEYMODULE_API int (*ValkeyModule_UnregisterScriptingEngine)(ValkeyModuleCtx *module_ctx, + const char *engine_name) VALKEYMODULE_ATTR; + #define ValkeyModule_IsAOFClient(id) ((id) == UINT64_MAX) /* This is included inline inside each Valkey module. 
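To see how the declarations above fit together, here is a hypothetical skeleton of a module registering itself as a scripting engine. The engine name "myengine", the engine_state pointer, and the callback identifiers are invented for illustration (their definitions are omitted); only the struct fields, the ABI constant, and the Register/Unregister entry points come from the API above:

static void *engine_state; /* engine-private context, passed back to every callback */

int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
    VALKEYMODULE_NOT_USED(argv);
    VALKEYMODULE_NOT_USED(argc);
    if (ValkeyModule_Init(ctx, "myengine", 1, VALKEYMODULE_APIVER_1) == VALKEYMODULE_ERR)
        return VALKEYMODULE_ERR;

    ValkeyModuleScriptingEngineMethods methods = {
        .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION,
        .create_functions_library = createLibrary, /* hypothetical callbacks */
        .free_function = freeFunction,
        .call_function = callFunction,
        .get_function_memory_overhead = functionMemoryOverhead,
        .get_memory_info = engineMemoryInfo,
    };
    return ValkeyModule_RegisterScriptingEngine(ctx, "myengine", engine_state, &methods);
}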
*/
+
@@ -1673,6 +1769,7 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
 VALKEYMODULE_GET_API(SetModuleAttribs);
 VALKEYMODULE_GET_API(IsModuleNameBusy);
 VALKEYMODULE_GET_API(WrongArity);
+ VALKEYMODULE_GET_API(UpdateRuntimeArgs);
 VALKEYMODULE_GET_API(ReplyWithLongLong);
 VALKEYMODULE_GET_API(ReplyWithError);
 VALKEYMODULE_GET_API(ReplyWithErrorFormat);
@@ -2015,6 +2112,8 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
 VALKEYMODULE_GET_API(RdbStreamFree);
 VALKEYMODULE_GET_API(RdbLoad);
 VALKEYMODULE_GET_API(RdbSave);
+ VALKEYMODULE_GET_API(RegisterScriptingEngine);
+ VALKEYMODULE_GET_API(UnregisterScriptingEngine);
 if (ValkeyModule_IsModuleNameBusy && ValkeyModule_IsModuleNameBusy(name)) return VALKEYMODULE_ERR;
 ValkeyModule_SetModuleAttribs(ctx, name, ver, apiver);
diff --git a/src/zmalloc.c b/src/zmalloc.c
index e18fa8bac2..3abf9a31a0 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -84,8 +84,6 @@ void zlibc_free(void *ptr) {
 #define calloc(count, size) je_calloc(count, size)
 #define realloc(ptr, size) je_realloc(ptr, size)
 #define free(ptr) je_free(ptr)
-#define mallocx(size, flags) je_mallocx(size, flags)
-#define dallocx(ptr, flags) je_dallocx(ptr, flags)
 #endif
 #define thread_local _Thread_local
@@ -207,25 +205,6 @@ void *zmalloc_usable(size_t size, size_t *usable) {
 return ptr;
 }
-/* Allocation and free functions that bypass the thread cache
- * and go straight to the allocator arena bins.
- * Currently implemented only for jemalloc. Used for online defragmentation. */
-#ifdef HAVE_DEFRAG
-void *zmalloc_no_tcache(size_t size) {
- if (size >= SIZE_MAX / 2) zmalloc_oom_handler(size);
- void *ptr = mallocx(size + PREFIX_SIZE, MALLOCX_TCACHE_NONE);
- if (!ptr) zmalloc_oom_handler(size);
- update_zmalloc_stat_alloc(zmalloc_size(ptr));
- return ptr;
-}
-
-void zfree_no_tcache(void *ptr) {
- if (ptr == NULL) return;
- update_zmalloc_stat_free(zmalloc_size(ptr));
- dallocx(ptr, MALLOCX_TCACHE_NONE);
-}
-#endif
-
 /* Try allocating memory and zero it, and return NULL if failed.
 * '*usable' is set to the usable size if non NULL. */
 static inline void *ztrycalloc_usable_internal(size_t size, size_t *usable) {
@@ -472,15 +451,25 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) {
 zmalloc_oom_handler = oom_handler;
 }
-/* Use 'MADV_DONTNEED' to release memory to operating system quickly.
- * We do that in a fork child process to avoid CoW when the parent modifies
- * these shared pages. */
-void zmadvise_dontneed(void *ptr) {
+/* Try to release pages back to the OS directly using 'MADV_DONTNEED' (bypassing
+ * the allocator) in a fork child process to avoid CoW when the parent modifies
+ * those shared pages. For small allocations, we can't release any full page,
+ * so in an effort to avoid getting the size of the allocation from the
+ * allocator (malloc_size) when we already know it's small, we check the
+ * size_hint. If the size is not already known, passing a size_hint of 0 will
+ * lead to checking the real size of the allocation.
+ * Also please note that the size may not be accurate, so in order to make this
+ * solution effective, the judgement for releasing memory pages should not be
+ * too strict.
*/ +void zmadvise_dontneed(void *ptr, size_t size_hint) { #if defined(USE_JEMALLOC) && defined(__linux__) + if (ptr == NULL) return; + static size_t page_size = 0; if (page_size == 0) page_size = sysconf(_SC_PAGESIZE); size_t page_size_mask = page_size - 1; + if (size_hint && size_hint / 2 < page_size) return; size_t real_size = zmalloc_size(ptr); if (real_size < page_size) return; @@ -494,6 +483,7 @@ void zmadvise_dontneed(void *ptr) { } #else (void)(ptr); + (void)(size_hint); #endif } @@ -683,52 +673,7 @@ size_t zmalloc_get_rss(void) { #define STRINGIFY_(x) #x #define STRINGIFY(x) STRINGIFY_(x) -/* Compute the total memory wasted in fragmentation of inside small arena bins. - * Done by summing the memory in unused regs in all slabs of all small bins. */ -size_t zmalloc_get_frag_smallbins(void) { - unsigned nbins; - size_t sz, frag = 0; - char buf[100]; - - sz = sizeof(unsigned); - assert(!je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0)); - for (unsigned j = 0; j < nbins; j++) { - size_t curregs, curslabs, reg_size; - uint32_t nregs; - - /* The size of the current bin */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.size", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, ®_size, &sz, NULL, 0)); - - /* Number of used regions in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curregs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curregs, &sz, NULL, 0)); - - /* Number of regions per slab */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.nregs", j); - sz = sizeof(uint32_t); - assert(!je_mallctl(buf, &nregs, &sz, NULL, 0)); - - /* Number of current slabs in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curslabs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curslabs, &sz, NULL, 0)); - - /* Calculate the fragmentation bytes for the current bin and add it to the total. */ - frag += ((nregs * curslabs) - curregs) * reg_size; - } - - return frag; -} - -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { uint64_t epoch = 1; size_t sz; *allocated = *resident = *active = 0; @@ -763,8 +708,6 @@ int zmalloc_get_allocator_info(size_t *allocated, *muzzy = pmuzzy * page; } - /* Total size of consumed meomry in unused regs in small bins (AKA external fragmentation). */ - *frag_smallbins_bytes = zmalloc_get_frag_smallbins(); return 1; } @@ -789,13 +732,8 @@ int jemalloc_purge(void) { #else -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { - *allocated = *resident = *active = *frag_smallbins_bytes = 0; +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { + *allocated = *resident = *active = 0; if (retained) *retained = 0; if (muzzy) *muzzy = 0; return 1; @@ -824,7 +762,7 @@ void zlibc_trim(void) { /* For proc_pidinfo() used later in zmalloc_get_smap_bytes_by_field(). * Note that this file cannot be included in zmalloc.h because it includes * a Darwin queue.h file where there is a "LIST_HEAD" macro (!) defined - * conficting with user code. */ + * conflicting with user code. 
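To make the new size_hint parameter concrete, a usage sketch (the callers are illustrative; the point is that a known size lets small allocations skip the zmalloc_size() lookup, while 0 means "size unknown"):

/* In a fork child: */
zmadvise_dontneed(sdsAllocPtr(s), sdsAllocSize(s)); /* sds: allocation start, size known from its header */
zmadvise_dontneed(ptr, 0);                          /* size unknown: falls back to zmalloc_size(ptr) */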
*/ #include #endif diff --git a/src/zmalloc.h b/src/zmalloc.h index 9b51f4c866..68b4df63aa 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -100,13 +100,6 @@ #include #endif -/* We can enable the server defrag capabilities only if we are using Jemalloc - * and the version used is our special version modified for the server having - * the ability to return per-allocation fragmentation hints. */ -#if defined(USE_JEMALLOC) && defined(JEMALLOC_FRAG_HINT) -#define HAVE_DEFRAG -#endif - /* The zcalloc symbol is a symbol name already used by zlib, which is defining * other names using the "z" prefix specific to zlib. In practice, linking * valkey with a static openssl, which itself might depend on a static libz @@ -138,12 +131,7 @@ __attribute__((malloc)) char *zstrdup(const char *s); size_t zmalloc_used_memory(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes); +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy); void set_jemalloc_bg_thread(int enable); int jemalloc_purge(void); size_t zmalloc_get_private_dirty(long pid); @@ -151,12 +139,7 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid); size_t zmalloc_get_memory_size(void); void zlibc_free(void *ptr); void zlibc_trim(void); -void zmadvise_dontneed(void *ptr); - -#ifdef HAVE_DEFRAG -void zfree_no_tcache(void *ptr); -__attribute__((malloc)) void *zmalloc_no_tcache(size_t size); -#endif +void zmadvise_dontneed(void *ptr, size_t size_hint); #ifndef HAVE_MALLOC_SIZE size_t zmalloc_size(void *ptr); diff --git a/tests/assets/test_cli_hint_suite.txt b/tests/assets/test_cli_hint_suite.txt index 3cebf5229c..b8cfb0fdf1 100644 --- a/tests/assets/test_cli_hint_suite.txt +++ b/tests/assets/test_cli_hint_suite.txt @@ -68,17 +68,17 @@ "ZRANGE k 1 2 WITHSCORES " "[BYSCORE|BYLEX] [REV] [LIMIT offset count]" # Optional one-of args with parameters: SET key value [NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL] -"SET key value " "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EX" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EX " "seconds [NX|XX] [GET]" -"SET key value EX 23 " "[NX|XX] [GET]" -"SET key value EXAT" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EXAT " "unix-time-seconds [NX|XX] [GET]" -"SET key value PX" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value PX " "milliseconds [NX|XX] [GET]" -"SET key value PXAT" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value PXAT " "unix-time-milliseconds [NX|XX] [GET]" -"SET key value KEEPTTL " "[NX|XX] [GET]" +"SET key value " "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EX" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EX " "seconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value EX 23 " "[NX|XX|IFEQ comparison-value] [GET]" +"SET key 
value EXAT" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EXAT " "unix-time-seconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value PX" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value PX " "milliseconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value PXAT" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value PXAT " "unix-time-milliseconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value KEEPTTL " "[NX|XX|IFEQ comparison-value] [GET]" "SET key value XX " "[GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" # If an input word can't be matched, stop hinting. diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl index 5c4f24b7d4..9a23031c08 100644 --- a/tests/integration/aof-multi-part.tcl +++ b/tests/integration/aof-multi-part.tcl @@ -4,11 +4,11 @@ set server_path [tmpdir server.multi.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_sufix$::aof_format_suffix" -set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" -set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_sufix$::aof_format_suffix" -set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_sufix$::aof_format_suffix" +set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_suffix$::aof_format_suffix" +set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" +set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_suffix$::aof_format_suffix" +set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/${aof_basename}$::manifest_suffix" set aof_old_name_old_path "$server_path/$aof_basename" set aof_old_name_new_path "$aof_dirpath/$aof_basename" @@ -705,7 +705,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.rdb seq 1 type b} @@ -728,7 +728,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -750,7 +750,7 @@ tags {"external:skip"} { start_server_aof [list dir $server_path aof-use-rdb-preamble no] { wait_done_loading r - assert_equal 1 [check_file_exist 
$aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -827,8 +827,8 @@ tags {"external:skip"} { # Check we really have these files assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] r bgrewriteaof waitForBgrewriteaof r @@ -842,13 +842,13 @@ tags {"external:skip"} { assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -901,11 +901,11 @@ tags {"external:skip"} { {file appendonly.aof.5.incr.aof seq 5 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -936,17 +936,17 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath 
"${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] set d1 [r debug digest] r debug loadaof @@ -965,10 +965,10 @@ tags {"external:skip"} { {file appendonly.aof.4.base.rdb seq 4 type b} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -990,13 +990,13 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] } test "AOF enable/disable auto gc" { @@ -1018,10 +1018,10 @@ tags {"external:skip"} { {file appendonly.aof.3.incr.aof seq 3 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath 
"${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] r config set aof-disable-auto-gc no @@ -1033,10 +1033,10 @@ tags {"external:skip"} { # wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -1192,7 +1192,7 @@ tags {"external:skip"} { waitForBgrewriteaof r # Can create New INCR AOF - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.11.base.rdb seq 11 type b} @@ -1248,7 +1248,7 @@ tags {"external:skip"} { # Make sure manifest file is not created assert_equal 0 [check_file_exist $aof_dirpath $aof_manifest_name] # Make sure BASE AOF is not created - assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] # Make sure the next AOFRW has started wait_for_condition 1000 50 { diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 33c7c12d4b..3a666bbd15 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -4,8 +4,8 @@ set server_path [tmpdir server.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" +set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/$aof_basename$::manifest_suffix" tags {"aof external:skip"} { diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 05bdc130c1..8191b9f699 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -355,8 +355,8 @@ start_server {tags 
{"dual-channel-replication external:skip"}} { verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key1" - wait_for_value_to_propegate_to_replica $primary $replica2 "key1" + wait_for_value_to_propagate_to_replica $primary $replica1 "key1" + wait_for_value_to_propagate_to_replica $primary $replica2 "key1" assert {[s 0 total_forks] eq "1" } } @@ -374,8 +374,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica2 replicaof $primary_host $primary_port verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key2" - wait_for_value_to_propegate_to_replica $primary $replica2 "key2" + wait_for_value_to_propagate_to_replica $primary $replica1 "key2" + wait_for_value_to_propagate_to_replica $primary $replica2 "key2" wait_for_condition 50 1000 { [status $replica1 master_link_status] == "up" } else { @@ -444,7 +444,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Replica is not synced" } - wait_for_value_to_propegate_to_replica $primary $replica1 "key3" + wait_for_value_to_propagate_to_replica $primary $replica1 "key3" # Verify that we did not use dual-channel-replication sync assert {[status $primary sync_partial_ok] == $cur_psync} @@ -483,9 +483,9 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Replica is not synced" } - wait_for_value_to_propegate_to_replica $primary $replica "key1" + wait_for_value_to_propagate_to_replica $primary $replica "key1" # Confirm the occurrence of a race condition. - wait_for_log_messages -1 {"*Dual channel sync - psync established after rdb load*"} 0 2000 1 + wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1 } } } @@ -775,7 +775,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 + $replica config set repl-timeout 60 $primary config set repl-backlog-size 1mb test "Test dual-channel-replication primary gets cob overrun before established psync" { @@ -815,6 +815,37 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary should abort sync" } + stop_write_load $load_handle0 + stop_write_load $load_handle1 + stop_write_load $load_handle2 + } +} + +start_server {tags {"dual-channel-replication external:skip"}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + set loglines [count_log_lines 0] + + $primary config set repl-diskless-sync yes + $primary config set dual-channel-replication-enabled yes + $primary config set client-output-buffer-limit "replica 1100k 0 0" + $primary config set loglevel debug + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + set replica_log [srv 0 stdout] + set replica_pid [srv 0 pid] + + set load_handle0 [start_write_load $primary_host $primary_port 60] + set load_handle1 [start_write_load $primary_host $primary_port 60] + set load_handle2 [start_write_load $primary_host $primary_port 60] + + $replica config set dual-channel-replication-enabled yes + $replica config set loglevel debug + $replica config set repl-timeout 60 + $primary config set repl-backlog-size 1mb $replica debug pause-after-fork 1 $primary debug populate 1000 primary 100000 diff --git a/tests/integration/valkey-cli.tcl 
b/tests/integration/valkey-cli.tcl index 6344215a25..0c15af74f9 100644
--- a/tests/integration/valkey-cli.tcl
+++ b/tests/integration/valkey-cli.tcl
@@ -499,10 +499,10 @@ if {!$::tls} { ;# fake_redis_node doesn't support TLS
        assert_equal 1000 [llength [split [run_cli --scan]]]

        # pattern
-       assert_equal {key:2} [run_cli --scan --pattern "*:2"]
+       assert_equal {key:2} [split [run_cli --scan --pattern "*:2"]]

        # pattern matching with a quoted string
-       assert_equal {key:2} [run_cli --scan --quoted-pattern {"*:\x32"}]
+       assert_equal {key:2} [split [run_cli --scan --quoted-pattern {"*:\x32"}]]
    }

    proc test_valkey_cli_repl {} {
diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt
index 0cac0c4cb6..e98a878c9d 100644
--- a/tests/modules/CMakeLists.txt
+++ b/tests/modules/CMakeLists.txt
@@ -40,6 +40,7 @@ list(APPEND MODULES_LIST "moduleauthtwo")
 list(APPEND MODULES_LIST "rdbloadsave")
 list(APPEND MODULES_LIST "crash")
 list(APPEND MODULES_LIST "cluster")
+list(APPEND MODULES_LIST "helloscripting")

 foreach (MODULE_NAME ${MODULES_LIST})
     message(STATUS "Building test module: ${MODULE_NAME}")
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index 1690b9b627..963546a9ff 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -58,13 +58,15 @@ TEST_MODULES = \
     eventloop.so \
     moduleconfigs.so \
     moduleconfigstwo.so \
+    moduleparameter.so \
     publish.so \
     usercall.so \
     postnotifications.so \
     moduleauthtwo.so \
     rdbloadsave.so \
     crash.so \
-    cluster.so
+    cluster.so \
+    helloscripting.so

 .PHONY: all

diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
new file mode 100644
index 0000000000..c912164bda
--- /dev/null
+++ b/tests/modules/helloscripting.c
@@ -0,0 +1,385 @@
+#include "valkeymodule.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+/*
+ * This module implements a very simple stack-based scripting language.
+ * Its purpose is only to test the valkey module API to implement scripting
+ * engines.
+ *
+ * The language is called HELLO, and a program in this language is formed by
+ * a list of function definitions.
+ * The language only supports 32-bit integers, and it only allows returning an
+ * integer constant, or returning the value passed as the first argument to
+ * the function.
+ *
+ * Example of a program:
+ *
+ * ```
+ * FUNCTION foo # declaration of function 'foo'
+ * ARGS 0 # pushes the value in the first argument to the top of the
+ *        # stack
+ * RETURN # returns the current value on the top of the stack and marks
+ *        # the end of the function declaration
+ *
+ * FUNCTION bar # declaration of function 'bar'
+ * CONSTI 432 # pushes the value 432 to the top of the stack
+ * RETURN # returns the current value on the top of the stack and marks
+ *        # the end of the function declaration.
+ * ```
+ */
+
+/*
+ * List of instructions of the HELLO language.
+ */
+typedef enum HelloInstKind {
+    FUNCTION = 0,
+    CONSTI,
+    ARGS,
+    RETURN,
+    _NUM_INSTRUCTIONS, // Not a real instruction.
+} HelloInstKind;
+
+/*
+ * String representations of the instructions above.
+ */
+const char *HelloInstKindStr[] = {
+    "FUNCTION",
+    "CONSTI",
+    "ARGS",
+    "RETURN",
+};
+
+/*
+ * Struct that represents an instance of an instruction.
+ * Instructions may have at most one parameter.
+ */
+typedef struct HelloInst {
+    HelloInstKind kind;
+    union {
+        uint32_t integer;
+        const char *string;
+    } param;
+} HelloInst;
+
+/*
+ * Struct that represents an instance of a function.
+ * A function is just a list of instruction instances.
+ */
+typedef struct HelloFunc {
+    char *name;
+    HelloInst instructions[256];
+    uint32_t num_instructions;
+} HelloFunc;
+
+/*
+ * Struct that represents an instance of a HELLO program.
+ * A program is just a list of function instances.
+ */
+typedef struct HelloProgram {
+    HelloFunc *functions[16];
+    uint32_t num_functions;
+} HelloProgram;
+
+/*
+ * Struct that represents the runtime context of a HELLO program.
+ */
+typedef struct HelloLangCtx {
+    HelloProgram *program;
+} HelloLangCtx;
+
+
+static HelloLangCtx *hello_ctx = NULL;
+
+
+static uint32_t str2int(const char *str) {
+    char *end;
+    errno = 0;
+    uint32_t val = (uint32_t)strtoul(str, &end, 10);
+    ValkeyModule_Assert(errno == 0);
+    return val;
+}
+
+/*
+ * Parses the kind of instruction that the current token points to.
+ */
+static HelloInstKind helloLangParseInstruction(const char *token) {
+    for (HelloInstKind i = 0; i < _NUM_INSTRUCTIONS; i++) {
+        if (strcmp(HelloInstKindStr[i], token) == 0) {
+            return i;
+        }
+    }
+    return _NUM_INSTRUCTIONS;
+}
+
+/*
+ * Parses the function name parameter.
+ */
+static void helloLangParseFunction(HelloFunc *func) {
+    char *token = strtok(NULL, " \n");
+    ValkeyModule_Assert(token != NULL);
+    func->name = ValkeyModule_Alloc(sizeof(char) * strlen(token) + 1);
+    strcpy(func->name, token);
+}
+
+/*
+ * Parses an integer parameter.
+ */
+static void helloLangParseIntegerParam(HelloFunc *func) {
+    char *token = strtok(NULL, " \n");
+    func->instructions[func->num_instructions].param.integer = str2int(token);
+}
+
+/*
+ * Parses the CONSTI instruction parameter.
+ */
+static void helloLangParseConstI(HelloFunc *func) {
+    helloLangParseIntegerParam(func);
+    func->num_instructions++;
+}
+
+/*
+ * Parses the ARGS instruction parameter.
+ */
+static void helloLangParseArgs(HelloFunc *func) {
+    helloLangParseIntegerParam(func);
+    func->num_instructions++;
+}
+
+/*
+ * Parses the source code of a HELLO program.
+ */
+static HelloProgram *helloLangParseCode(const char *code,
+                                        HelloProgram *program) {
+    char *_code = ValkeyModule_Alloc(sizeof(char) * strlen(code) + 1);
+    strcpy(_code, code);
+
+    HelloFunc *currentFunc = NULL;
+
+    char *token = strtok(_code, " \n");
+    while (token != NULL) {
+        HelloInstKind kind = helloLangParseInstruction(token);
+
+        if (currentFunc != NULL) {
+            currentFunc->instructions[currentFunc->num_instructions].kind = kind;
+        }
+
+        switch (kind) {
+        case FUNCTION:
+            ValkeyModule_Assert(currentFunc == NULL);
+            currentFunc = ValkeyModule_Alloc(sizeof(HelloFunc));
+            memset(currentFunc, 0, sizeof(HelloFunc));
+            program->functions[program->num_functions++] = currentFunc;
+            helloLangParseFunction(currentFunc);
+            break;
+        case CONSTI:
+            ValkeyModule_Assert(currentFunc != NULL);
+            helloLangParseConstI(currentFunc);
+            break;
+        case ARGS:
+            ValkeyModule_Assert(currentFunc != NULL);
+            helloLangParseArgs(currentFunc);
+            break;
+        case RETURN:
+            ValkeyModule_Assert(currentFunc != NULL);
+            currentFunc->num_instructions++;
+            currentFunc = NULL;
+            break;
+        default:
+            ValkeyModule_Assert(0);
+        }
+
+        token = strtok(NULL, " \n");
+    }
+
+    ValkeyModule_Free(_code);
+
+    return program;
+}
+
+/*
+ * Executes a HELLO function.
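+ *
+ * As a worked trace (this only restates what the interpreter below does):
+ * running the 'bar' function from the header comment executes
+ *
+ *   pc=0  CONSTI 432  -> push 432 (stack: [432], sp=1)
+ *   pc=1  RETURN      -> pop 432, assert the stack is now empty, return 432
+ *
+ * so calling 'bar' through the function API replies with the integer 432.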
+ */ +static uint32_t executeHelloLangFunction(HelloFunc *func, + ValkeyModuleString **args, int nargs) { + uint32_t stack[64]; + int sp = 0; + + for (uint32_t pc = 0; pc < func->num_instructions; pc++) { + HelloInst instr = func->instructions[pc]; + switch (instr.kind) { + case CONSTI: + stack[sp++] = instr.param.integer; + break; + case ARGS: { + uint32_t idx = instr.param.integer; + ValkeyModule_Assert(idx < (uint32_t)nargs); + size_t len; + const char *argStr = ValkeyModule_StringPtrLen(args[idx], &len); + uint32_t arg = str2int(argStr); + stack[sp++] = arg; + break; + } + case RETURN: { + uint32_t val = stack[--sp]; + ValkeyModule_Assert(sp == 0); + return val; + } + case FUNCTION: + default: + ValkeyModule_Assert(0); + } + } + + ValkeyModule_Assert(0); + return 0; +} + +static ValkeyModuleScriptingEngineMemoryInfo engineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx) { + VALKEYMODULE_NOT_USED(module_ctx); + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + ValkeyModuleScriptingEngineMemoryInfo mem_info = {0}; + + if (ctx->program != NULL) { + mem_info.used_memory += ValkeyModule_MallocSize(ctx->program); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + mem_info.used_memory += ValkeyModule_MallocSize(func); + mem_info.used_memory += ValkeyModule_MallocSize(func->name); + } + } + + mem_info.engine_memory_overhead = ValkeyModule_MallocSize(ctx); + if (ctx->program != NULL) { + mem_info.engine_memory_overhead += ValkeyModule_MallocSize(ctx->program); + } + + return mem_info; +} + +static size_t engineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + VALKEYMODULE_NOT_USED(module_ctx); + HelloFunc *func = (HelloFunc *)compiled_function; + return ValkeyModule_MallocSize(func->name); +} + +static void engineFreeFunction(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + void *compiled_function) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(engine_ctx); + HelloFunc *func = (HelloFunc *)compiled_function; + ValkeyModule_Free(func->name); + func->name = NULL; + ValkeyModule_Free(func); +} + +static ValkeyModuleScriptingEngineCompiledFunction **createHelloLangEngine(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(timeout); + VALKEYMODULE_NOT_USED(err); + + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + + if (ctx->program == NULL) { + ctx->program = ValkeyModule_Alloc(sizeof(HelloProgram)); + memset(ctx->program, 0, sizeof(HelloProgram)); + } else { + ctx->program->num_functions = 0; + } + + ctx->program = helloLangParseCode(code, ctx->program); + + ValkeyModuleScriptingEngineCompiledFunction **compiled_functions = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction *) * ctx->program->num_functions); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + + ValkeyModuleScriptingEngineCompiledFunction *cfunc = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction)); + *cfunc = (ValkeyModuleScriptingEngineCompiledFunction) { + .name = ValkeyModule_CreateString(NULL, func->name, strlen(func->name)), + .function = func, + .desc = NULL, + .f_flags = 0, + }; + + compiled_functions[i] = cfunc; + } + + *out_num_compiled_functions = 
ctx->program->num_functions;
+
+    return compiled_functions;
+}
+
+static void
+callHelloLangFunction(ValkeyModuleCtx *module_ctx,
+                      ValkeyModuleScriptingEngineCtx *engine_ctx,
+                      ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
+                      void *compiled_function,
+                      ValkeyModuleString **keys, size_t nkeys,
+                      ValkeyModuleString **args, size_t nargs) {
+    VALKEYMODULE_NOT_USED(engine_ctx);
+    VALKEYMODULE_NOT_USED(func_ctx);
+    VALKEYMODULE_NOT_USED(keys);
+    VALKEYMODULE_NOT_USED(nkeys);
+
+    HelloFunc *func = (HelloFunc *)compiled_function;
+    uint32_t result = executeHelloLangFunction(func, args, nargs);
+
+    ValkeyModule_ReplyWithLongLong(module_ctx, result);
+}
+
+int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv,
+                        int argc) {
+    VALKEYMODULE_NOT_USED(argv);
+    VALKEYMODULE_NOT_USED(argc);
+
+    if (ValkeyModule_Init(ctx, "helloengine", 1, VALKEYMODULE_APIVER_1) ==
+        VALKEYMODULE_ERR)
+        return VALKEYMODULE_ERR;
+
+    hello_ctx = ValkeyModule_Alloc(sizeof(HelloLangCtx));
+    hello_ctx->program = NULL;
+
+    ValkeyModuleScriptingEngineMethods methods = {
+        .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION,
+        .create_functions_library = createHelloLangEngine,
+        .call_function = callHelloLangFunction,
+        .get_function_memory_overhead = engineFunctionMemoryOverhead,
+        .free_function = engineFreeFunction,
+        .get_memory_info = engineGetMemoryInfo,
+    };
+
+    ValkeyModule_RegisterScriptingEngine(ctx,
+                                         "HELLO",
+                                         hello_ctx,
+                                         &methods);
+
+    return VALKEYMODULE_OK;
+}
+
+int ValkeyModule_OnUnload(ValkeyModuleCtx *ctx) {
+    if (ValkeyModule_UnregisterScriptingEngine(ctx, "HELLO") != VALKEYMODULE_OK) {
+        ValkeyModule_Log(ctx, "error", "Failed to unregister engine");
+        return VALKEYMODULE_ERR;
+    }
+
+    ValkeyModule_Free(hello_ctx->program);
+    hello_ctx->program = NULL;
+    ValkeyModule_Free(hello_ctx);
+    hello_ctx = NULL;
+
+    return VALKEYMODULE_OK;
+}
diff --git a/tests/modules/moduleparameter.c b/tests/modules/moduleparameter.c
new file mode 100644
index 0000000000..6c110f2cfb
--- /dev/null
+++ b/tests/modules/moduleparameter.c
@@ -0,0 +1,28 @@
+#include "valkeymodule.h"
+
+int test_module_update_parameter(ValkeyModuleCtx *ctx,
+                                 ValkeyModuleString **argv, int argc) {
+
+    ValkeyModule_UpdateRuntimeArgs(ctx, argv, argc);
+    return ValkeyModule_ReplyWithSimpleString(ctx, "OK");
+}
+
+int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
+    VALKEYMODULE_NOT_USED(argv);
+    VALKEYMODULE_NOT_USED(argc);
+
+    if (ValkeyModule_Init(ctx, "moduleparameter", 1, VALKEYMODULE_APIVER_1) ==
+        VALKEYMODULE_ERR)
+        return VALKEYMODULE_ERR;
+
+    if (ValkeyModule_CreateCommand(ctx, "testmoduleparameter.update.parameter",
+                                   test_module_update_parameter, "fast", 0, 0,
+                                   0) == VALKEYMODULE_ERR)
+        return VALKEYMODULE_ERR;
+
+    return VALKEYMODULE_OK;
+}
diff --git a/tests/rdma/run.py b/tests/rdma/run.py
index 0724c27adc..77e0f285fe 100755
--- a/tests/rdma/run.py
+++ b/tests/rdma/run.py
@@ -60,10 +60,9 @@ def test_rdma(ipaddr):

     # step 2, start server
     svrpath = valkeydir + "/src/valkey-server"
-    rdmapath = valkeydir + "/src/valkey-rdma.so"
     svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes",
               "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp",
-              "--loadmodule", rdmapath, "port=6379", "bind=" + ipaddr]
+              "--rdma-port", "6379", "--rdma-bind", ipaddr]

     svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE)
     try:
diff --git a/tests/support/aofmanifest.tcl b/tests/support/aofmanifest.tcl index
308d1172aa..fc20bacc99 100644 --- a/tests/support/aofmanifest.tcl +++ b/tests/support/aofmanifest.tcl @@ -1,5 +1,5 @@ -set ::base_aof_sufix ".base" -set ::incr_aof_sufix ".incr" +set ::base_aof_suffix ".base" +set ::incr_aof_suffix ".incr" set ::manifest_suffix ".manifest" set ::aof_format_suffix ".aof" set ::rdb_format_suffix ".rdb" diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 4b399214b9..4f641c5e96 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue wait_for_condition 1000 50 { [CI $j cluster_state] eq $state } else { @@ -322,6 +323,15 @@ proc get_cluster_nodes {id {status "*"}} { return $nodes } +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n myself]} {return $n} + } + return {} +} + # Returns 1 if no node knows node_id, 0 if any node knows it. proc node_is_forgotten {node_id} { for {set j 0} {$j < [llength $::servers]} {incr j} { diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 7257339042..bd3135e9d9 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -221,6 +221,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::debug_defrag && [lsearch $tags "debug_defrag:skip"] >= 0} { + set err "Not supported on server compiled with DEBUG_FORCE_DEFRAG option" + return 0 + } + if {$::singledb && [lsearch $tags "singledb:skip"] >= 0} { set err "Not supported on singledb" return 0 @@ -309,7 +314,7 @@ proc spawn_server {config_file stdout stderr args} { } # Tell the test server about this new instance. - send_data_packet $::test_server_fd server-spawned $pid + send_data_packet $::test_server_fd server-spawned "$pid - $::curfile" return $pid } diff --git a/tests/support/test.tcl b/tests/support/test.tcl index 262dc66041..3fd74d0387 100644 --- a/tests/support/test.tcl +++ b/tests/support/test.tcl @@ -160,12 +160,12 @@ proc verify_replica_online {master replica_idx max_retry} { } } -proc wait_for_value_to_propegate_to_replica {master replica key} { +proc wait_for_value_to_propagate_to_replica {master replica key} { set val [$master get $key] wait_for_condition 50 500 { ([$replica get $key] eq $val) } else { - error "Key $key did not propegate. Expected $val but got [$replica get $key]" + error "Key $key did not propagate. Expected $val but got [$replica get $key]" } } diff --git a/tests/support/util.tcl b/tests/support/util.tcl index e53cda3071..8f3cda3a4c 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -931,6 +931,30 @@ proc debug_digest {{level 0}} { r $level debug digest } +proc main_hash_table_size {{level 0}} { + set dbnum [expr {$::singledb ? 0 : 9}] + append re \ + {^\[Dictionary HT\]\n} \ + {Hash table 0 stats \(main hash table\):\n} \ + { table size: (\d+)} + regexp $re [r $level DEBUG HTSTATS $dbnum] -> table_size + return $table_size +} + +# Returns the number of keys that can be added before rehashing starts. Insert +# this number of keys and no rehashing happens. Insert one more key and +# rehashing can be triggered by the cron function. Insert two more keys and +# rehashing is triggered immediately. 
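+# As a concrete illustration (the numbers are made up): with a table size of
+# 1024, a dbsize of 900, and MAX_FILL_PERCENT_SOFT at 100, the proc below
+# returns 1024 * 100 / 100 - 900 - 1 = 123 keys of remaining headroom.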
+proc main_hash_table_keys_before_rehashing_starts {{level 0}} {
+    # This fill factor is defined internally in hashtable.c and duplicated here.
+    # If we change the fill factor, this needs to be updated accordingly.
+    set MAX_FILL_PERCENT_SOFT 100
+    set table_size [main_hash_table_size $level]
+    set dbsize [r $level dbsize]
+    set free_space [expr {$table_size * $MAX_FILL_PERCENT_SOFT / 100 - $dbsize - 1}]
+    return $free_space
+}
+
 proc wait_for_blocked_client {{idx 0}} {
     wait_for_condition 50 100 {
         [s $idx blocked_clients] ne 0
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 7c15413806..54bb923674 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -35,12 +35,12 @@ foreach test_dir $test_dirs {
 set cluster_test_dir unit/cluster
 foreach file [glob -nocomplain $dir/tests/$cluster_test_dir/*.tcl] {
-    lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]]
+    lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]]
 }

 set moduleapi_test_dir unit/moduleapi
 foreach file [glob -nocomplain $dir/tests/$moduleapi_test_dir/*.tcl] {
-    lappend ::module_api_all_tests $moduleapi_test_dir/[file root [file tail $file]]
+    lappend ::module_api_all_tests $moduleapi_test_dir/[file root [file tail $file]]
 }

 # Index to the next test to run in the ::all_tests list.
@@ -92,6 +92,7 @@ set ::large_memory 0
 set ::log_req_res 0
 set ::force_resp3 0
 set ::solo_tests_count 0
+set ::debug_defrag 0

 # Set to 1 when we are running in client mode. The server test uses a
 # server-client model to run tests simultaneously. The server instance
@@ -420,7 +421,8 @@ proc read_from_test_client fd {
     } elseif {$status eq {server-spawning}} {
         set ::active_clients_task($fd) "(SPAWNING SERVER) $data"
     } elseif {$status eq {server-spawned}} {
-        lappend ::active_servers $data
+        set pid [string trim [lindex [split $data "-"] 0]]
+        lappend ::active_servers $pid
         set ::active_clients_task($fd) "(SPAWNED SERVER) pid:$data"
     } elseif {$status eq {server-killing}} {
         set ::active_clients_task($fd) "(KILLING SERVER) pid:$data"
@@ -607,6 +609,7 @@ proc print_help_screen {} {
        "--ignore-encoding Don't validate object encoding."
        "--ignore-digest Don't use debug digest validations."
        "--large-memory Run tests using over 100mb."
+       "--debug-defrag Indicate the tests are running against a server compiled with the DEBUG_FORCE_DEFRAG option."
        "--help Print this help screen."
} "\n"] } @@ -654,7 +657,7 @@ for {set j 0} {$j < [llength $argv]} {incr j} { } } elseif {$opt eq {--quiet}} { set ::quiet 1 - } elseif {$opt eq {--io-threads}} { + } elseif {$opt eq {--io-threads}} { set ::io_threads 1 } elseif {$opt eq {--tls} || $opt eq {--tls-module}} { package require tls 1.6 @@ -748,6 +751,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::ignoreencoding 1 } elseif {$opt eq {--ignore-digest}} { set ::ignoredigest 1 + } elseif {$opt eq {--debug-defrag}} { + set ::debug_defrag 1 } elseif {$opt eq {--help}} { print_help_screen exit 0 diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl index 059f03fbe4..0b5f769930 100644 --- a/tests/unit/cluster/cluster-multiple-meets.tcl +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -58,7 +58,7 @@ tags {tls:skip external:skip cluster} { } else { fail "Node 1 recognizes node 0 even though it drops PONGs from node 0" } - assert {[llength [get_cluster_nodes 0 connected]] == 2} + assert {[llength [get_cluster_nodes 0]] == 2} # Drop incoming and outgoing links from/to 1 R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID] @@ -77,6 +77,8 @@ tags {tls:skip external:skip cluster} { # Both a and b will turn to cluster state ok wait_for_condition 1000 50 { [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [llength [get_cluster_nodes 0 connected]] == 2 && + [llength [get_cluster_nodes 1 connected]] == 2 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index 45f5a6dc89..f189e96d5b 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -3,6 +3,12 @@ set old_singledb $::singledb set ::singledb 1 tags {tls:skip external:skip cluster} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + set base_conf [list cluster-enabled yes] start_multiple_servers 2 [list overrides $base_conf] { test "Cluster nodes are reachable" { @@ -22,9 +28,6 @@ tags {tls:skip external:skip cluster} { wait_for_cluster_state fail } - set CLUSTER_PACKET_TYPE_MEET 2 - set CLUSTER_PACKET_TYPE_NONE -1 - test "Cluster nodes haven't met each other" { assert {[llength [get_cluster_nodes 1]] == 1} assert {[llength [get_cluster_nodes 0]] == 1} @@ -75,3 +78,202 @@ tags {tls:skip external:skip cluster} { set ::singledb $old_singledb +proc cluster_get_first_node_in_handshake id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n handshake]} { + return [dict get $n id] + } + } + return {} +} + +proc cluster_nodes_all_know_each_other {num_nodes} { + # Collect node IDs dynamically + set node_ids {} + for {set i 0} {$i < $num_nodes} {incr i} { + lappend node_ids [dict get [get_myself $i] id] + } + + # Check if all nodes know each other + foreach node_id $node_ids { + foreach check_node_id $node_ids { + for {set node_index 0} {$node_index < $num_nodes} {incr node_index} { + if {[cluster_get_node_by_id $node_index $check_node_id] == {}} { + return 0 + } + } + } + } + + # Verify cluster link counts for each node + set expected_links [expr {2 * ($num_nodes - 1)}] + for {set i 0} {$i < $num_nodes} {incr i} { + if {[llength [R $i CLUSTER LINKS]] != $expected_links} { + return 0 + } 
+    }
+
+    return 1
+}
+
+start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} {
+    set CLUSTER_PACKET_TYPE_PING 0
+    set CLUSTER_PACKET_TYPE_PONG 1
+    set CLUSTER_PACKET_TYPE_MEET 2
+    set CLUSTER_PACKET_TYPE_NONE -1
+    set CLUSTER_PACKET_TYPE_ALL -2
+
+    test "Handshake eventually succeeds after node handshake timeout on both sides with inconsistent view of the cluster" {
+        set cluster_port [find_available_port $::baseport $::portcount]
+        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
+            # In this test we will trigger a handshake timeout on both sides of the handshake.
+            # Node 1 and 2 already know each other, then we make node 1 meet node 0:
+            #
+            # Node 1 -- MEET -> Node 0 [Node 0 might learn about Node 2 from the gossip section of the msg]
+            # Node 1 <- PONG -- Node 0 [we drop this message, so Node 1 will eventually mark the handshake as timed out]
+            # Node 1 <- PING -- Node 0 [we drop this message, so Node 1 will never send a PONG and Node 0 will eventually mark the handshake as timed out]
+            #
+            # After the handshake is timed out, we allow all cluster bus messages to go through.
+            # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake.
+
+            set node0_id [dict get [get_myself 0] id]
+            set node1_id [dict get [get_myself 1] id]
+            set node2_id [dict get [get_myself 2] id]
+
+            # Drop all cluster bus messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL
+            # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2.
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET
+
+            R 1 CLUSTER MEET [srv 0 host] [srv 0 port] $cluster_port
+
+            # Wait for Node 0 to be in handshake
+            wait_for_condition 10 400 {
+                [cluster_get_first_node_in_handshake 0] != {}
+            } else {
+                fail "Node 0 never entered handshake state"
+            }
+
+            # We want Node 0 to learn about Node 2 through the gossip section of the MEET message
+            set meet_retry 0
+            while {[cluster_get_node_by_id 0 $node2_id] eq {}} {
+                if {$meet_retry == 10} {
+                    error "assertion: Retried to meet Node 0 too many times"
+                }
+                # If Node 0 doesn't know about Node 1 & 2, it means Node 1 did not gossip about node 2 in its MEET message.
+                # So we kill the outbound link from Node 1 to Node 0, to force a reconnect and a re-send of the MEET message.
+                after 100
+                # While in handshake, the peer is listed under a randomly generated node ID that we have to look up.
+                R 1 DEBUG CLUSTERLINK KILL ALL [cluster_get_first_node_in_handshake 1]
+                incr meet_retry 1
+            }
+
+            # Wait for Node 1's handshake to time out
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 1] eq {}
+            } else {
+                fail "Node 1 never exited handshake state"
+            }
+
+            # Wait for Node 0's handshake to time out
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 0] eq {}
+            } else {
+                fail "Node 0 never exited handshake state"
+            }
+
+            # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0.
+            wait_for_condition 50 100 {
+                [cluster_get_node_by_id 0 $node1_id] != {} &&
+                [cluster_get_node_by_id 0 $node2_id] != {} &&
+                [cluster_get_node_by_id 1 $node0_id] eq {} &&
+                [cluster_get_node_by_id 2 $node0_id] eq {}
+            } else {
+                fail "Unexpected CLUSTER NODES output, nodes 1 & 2 should not know node 0."
+            }
+
+            # Allow all messages to go through again
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+
+            # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link.
+            # Handshake should now complete successfully.
+            wait_for_condition 50 200 {
+                [cluster_nodes_all_know_each_other 3]
+            } else {
+                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
+            }
+        } ;# stop Node 0
+    } ;# test
+} ;# stop cluster
+
+start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} {
+    set CLUSTER_PACKET_TYPE_PING 0
+    set CLUSTER_PACKET_TYPE_PONG 1
+    set CLUSTER_PACKET_TYPE_MEET 2
+    set CLUSTER_PACKET_TYPE_NONE -1
+    set CLUSTER_PACKET_TYPE_ALL -2
+
+    test "Handshake eventually succeeds after node handshake timeout on one side with inconsistent view of the cluster" {
+        set cluster_port [find_available_port $::baseport $::portcount]
+        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
+            # In this test we will trigger a handshake timeout on one side of the handshake.
+            # Node 1 and 2 already know each other, then we make node 0 meet node 1:
+            #
+            # Node 0 -- MEET -> Node 1
+            # Node 0 <- PONG -- Node 1
+            # Node 0 <- PING -- Node 1 [Node 0 will mark the handshake as successful]
+            # Node 0 -- PONG -> Node 1 [we drop this message, so node 1 will eventually mark the handshake as timed out]
+            #
+            # After the handshake is timed out, we allow all cluster bus messages to go through.
+            # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake.
+
+            set node0_id [dict get [get_myself 0] id]
+            set node1_id [dict get [get_myself 1] id]
+            set node2_id [dict get [get_myself 2] id]
+
+            # Drop PONG messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG
+            # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2.
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET
+
+            # Node 0 meets node 1
+            R 0 CLUSTER MEET [srv -1 host] [srv -1 port]
+
+            # Wait for node 0 to know about the other nodes in the cluster
+            wait_for_condition 50 100 {
+                [cluster_get_node_by_id 0 $node1_id] != {}
+            } else {
+                fail "Node 0 never learned about node 1"
+            }
+            # At this point, node 0 knows about node 1 and might know node 2 if node 1 gossiped about it.
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 0] eq {}
+            } else {
+                fail "Node 0 never exited handshake state"
+            }
+            # At this point, from node 0's point of view, the handshake with node 1 succeeded.
+
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 1] eq {}
+            } else {
+                fail "Node 1 never exited handshake state"
+            }
+            assert {[cluster_get_node_by_id 1 $node0_id] eq {}}
+            # At this point, from node 1's point of view, the handshake with node 0 timed out.
+
+            # Allow all messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+
+            # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link.
+            # Handshake should now complete successfully.
+            wait_for_condition 50 200 {
+                [cluster_nodes_all_know_each_other 3]
+            } else {
+                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
+            }
+        } ;# stop Node 0
+    } ;# test
+} ;# stop cluster
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 21c4f4a678..9262049e4e 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -86,6 +86,11 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval
         fail "No failover detected"
     }

+    # Make sure there is no false epoch 0.
+    verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0
+    verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0
+    verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0
+
     # Make sure there is no failover timeout.
     verify_no_log_message -7 "*Failover attempt expired*" 0
     verify_no_log_message -8 "*Failover attempt expired*" 0
diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl
index 0d7b249899..f882378172 100644
--- a/tests/unit/cluster/info.tcl
+++ b/tests/unit/cluster/info.tcl
@@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
 }

 } ;# start_cluster
+
+start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
+    test "fail reason changed" {
+        # Kill one primary, so the cluster fails with not-full-coverage.
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {fail} &&
+            [CI 2 cluster_state] eq {fail}
+        } else {
+            fail "Cluster doesn't fail"
+        }
+        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
+        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0
+
+        # Kill one more primary, so the cluster fails with minority-partition.
+        pause_process [srv -1 pid]
+        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50
+
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        wait_for_cluster_state ok
+    }
+}
diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl
index 2a9dff934b..dbcbb26380 100644
--- a/tests/unit/cluster/manual-failover.tcl
+++ b/tests/unit/cluster/manual-failover.tcl
@@ -183,3 +183,215 @@ test "Wait for instance #0 to return back alive" {
 }

 } ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" {
+        set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6
+        set CLUSTER_PACKET_TYPE_NONE -1
+
+        # Let the replica drop FAILOVER_AUTH_ACK so that the election won't
+        # get enough votes and the election will time out.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK
+
+        # The first manual failover will time out.
+        R 3 cluster failover
+        wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
+        wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50
+
+        # Undo packet drop, so that replica can win the next election.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+
+        # Make sure the second manual failover will work.
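+        # (This is the scenario under test: the primaries granted a vote to
+        # the election that just timed out, yet a new manual failover may be
+        # granted a vote immediately, without the usual cooldown of two times
+        # the node timeout.)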
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - mixed failover" {
+        # Make sure the failover is triggered by us.
+        R 1 config set cluster-replica-validity-factor 0
+        R 3 config set cluster-replica-no-failover yes
+        R 3 config set cluster-replica-validity-factor 0
+
+        # Pause the primary.
+        pause_process [srv 0 pid]
+        wait_for_cluster_state fail
+
+        # R 3 performs an automatic failover and it will work.
+        R 3 config set cluster-replica-no-failover no
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+
+        # Resume the primary and wait for it to become a replica.
+        resume_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave}
+        } else {
+            fail "Old primary not converted into replica"
+        }
+        wait_for_cluster_propagation
+
+        # The old primary does a manual failover; wait for it.
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # R 3 performs a manual failover and it will work.
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The third failover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Automatic failover vote is not limited by two times the node timeout - mixed failover" {
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # Let R 3 trigger the automatic failover
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The third failover does not happen"
+        }
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
+    test "Manual failover will reset the on-going election" {
+        set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5
+        set CLUSTER_PACKET_TYPE_NONE -1
+
+        # Let the other primaries drop FAILOVER_AUTH_REQUEST so that the election
+        # won't get enough votes and will time out.
+        R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST
+        R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST
+
+        # The replica does the manual failover.
+        R 3 cluster failover
+
+        # Wait for the primary and the replica to confirm the manual failover timeout.
+        wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
+        wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50
+        set loglines1 [count_log_lines 0]
+        set loglines2 [count_log_lines -3]
+
+        # Undo packet drop, so that replica can win the next election.
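+        # (Setting the filter back to NONE (-1) stops dropping packets, so the
+        # auth requests can reach the other primaries again.)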
+        R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+        R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+
+        # The replica does the manual failover again.
+        R 3 cluster failover
+
+        # Make sure the election is reset.
+        wait_for_log_messages -3 {"*Failover election in progress*Resetting the election*"} $loglines2 1000 50
+
+        # Wait for failover.
+        wait_for_condition 1000 50 {
+            [s -3 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+
+        # Make sure that the second manual failover does not time out.
+        verify_no_log_message 0 "*Manual failover timed out*" $loglines1
+        verify_no_log_message -3 "*Manual failover timed out*" $loglines2
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 1000}} {
+    test "Broadcast PONG to the cluster when the node role changes" {
+        # R0 is a primary and R3 is a replica; we will do multiple cluster
+        # failovers and then check their roles and flags.
+        set R0_nodeid [R 0 cluster myid]
+        set R3_nodeid [R 3 cluster myid]
+
+        # Make sure we don't send PINGs for a short period of time.
+        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            R $j debug disable-cluster-random-ping 0
+            R $j config set cluster-ping-interval 300000
+        }
+
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "Failover did not happen"
+        }
+
+        # Get the node information of R0 and R3 in each node's view from CLUSTER NODES.
+        # R0 should be a replica and R3 should be a primary in all views.
+        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            wait_for_condition 1000 50 {
+                [check_cluster_node_mark slave $j $R0_nodeid] &&
+                [check_cluster_node_mark master $j $R3_nodeid]
+            } else {
+                puts "R0_nodeid: $R0_nodeid"
+                puts "R3_nodeid: $R3_nodeid"
+                puts "R $j cluster nodes:"
+                puts [R $j cluster nodes]
+                fail "Node role did not change in the first failover"
+            }
+        }
+
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover did not happen"
+        }
+
+        # Get the node information of R0 and R3 in each node's view from CLUSTER NODES.
+        # R0 should be a primary and R3 should be a replica in all views.
+        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            wait_for_condition 1000 50 {
+                [check_cluster_node_mark master $j $R0_nodeid] &&
+                [check_cluster_node_mark slave $j $R3_nodeid]
+            } else {
+                puts "R0_nodeid: $R0_nodeid"
+                puts "R3_nodeid: $R3_nodeid"
+                puts "R $j cluster nodes:"
+                puts [R $j cluster nodes]
+                fail "Node role did not change in the second failover"
+            }
+        }
+    }
+} ;# start_cluster
diff --git a/tests/unit/cluster/slot-stats.tcl b/tests/unit/cluster/slot-stats.tcl
index 3e3487a612..99f9c1c03a 100644
--- a/tests/unit/cluster/slot-stats.tcl
+++ b/tests/unit/cluster/slot-stats.tcl
@@ -228,11 +228,11 @@ start_cluster 1 0 {tags {external:skip cluster} overrides {cluster-slot-stats-en
     R 0 FLUSHALL

     test "CLUSTER SLOT-STATS cpu-usec for blocking commands, unblocked on timeout." {
-        # Blocking command with 1 second timeout.
+        # Blocking command with a 0.5 second timeout.
         set rd [valkey_deferring_client]
-        $rd BLPOP $key 1
+        $rd BLPOP $key 0.5

-        # Confirm that the client is blocked, then unblocked after 1 second timeout.
+        # Confirm that the client is blocked, then unblocked within 1 second.
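+        # (These helpers presumably poll the blocked_clients stat from INFO
+        # clients until it reaches the given value, the same field that
+        # wait_for_blocked_client in tests/support/util.tcl polls.)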
wait_for_blocked_clients_count 1 wait_for_blocked_clients_count 0 @@ -971,4 +971,4 @@ start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-slot-stats-en } R 0 CONFIG RESETSTAT R 1 CONFIG RESETSTAT -} \ No newline at end of file +} diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index d85ce7ee68..e1dcc9203b 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -832,6 +832,98 @@ start_server {tags {"expire"}} { close_replication_stream $repl assert_equal [r debug set-active-expire 1] {OK} } {} {needs:debug} + + test {import-source can be closed when import-mode is off} { + r config set import-mode no + assert_error "ERR Server is not in import mode" {r client import-source on} + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + assert_match {*flags=I*} [r client list id [r client id]] + + r config set import-mode no + assert_equal [r client import-source off] {OK} + assert_match {*flags=N*} [r client list id [r client id]] + } + + test {Import mode should forbid active expiration} { + r flushall + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 bar PX 1 + r set foo2 bar PX 1 + after 10 + + assert_equal [r dbsize] {2} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." + } + } + + test {Import mode should forbid lazy expiration} { + r flushall + r debug set-active-expire 0 + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 1 PX 1 + after 10 + + r get foo1 + assert_equal [r dbsize] {1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + r get foo1 + + assert_equal [r dbsize] {0} + + assert_equal [r debug set-active-expire 1] {OK} + } {} {needs:debug} + + test {Client can visit expired key in import-source state} { + r flushall + + r config set import-mode yes + + r set foo1 1 PX 1 + after 10 + + # Normal clients cannot visit expired key. + assert_equal [r get foo1] {} + assert_equal [r ttl foo1] {-2} + assert_equal [r dbsize] 1 + + # Client can visit expired key when in import-source state. + assert_equal [r client import-source on] {OK} + assert_equal [r ttl foo1] {0} + assert_equal [r get foo1] {1} + assert_equal [r incr foo1] {2} + assert_equal [r randomkey] {foo1} + assert_equal [r scan 0 match * count 10000] {0 foo1} + assert_equal [r keys *] {foo1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." 
+        }
+    }
 }

 start_cluster 1 0 {tags {"expire external:skip cluster"}} {
@@ -847,7 +939,7 @@ start_cluster 1 0 {tags {"expire external:skip cluster"}} {

     # hashslot(foo) is 12182
     # fill data across different slots with expiration
-    for {set j 1} {$j <= 100} {incr j} {
+    for {set j 1} {$j <= 1000} {incr j} {
         r psetex "{foo}$j" 500 a
     }
     # hashslot(key) is 12539
@@ -858,7 +950,7 @@
     r debug dict-resizing 0

     # delete data to have lots (99%) of empty buckets (slot 12182 should be skipped)
-    for {set j 1} {$j <= 99} {incr j} {
+    for {set j 1} {$j <= 999} {incr j} {
         r del "{foo}$j"
     }
@@ -884,7 +976,9 @@
     r debug dict-resizing 1

     # put some data into slot 12182 and trigger the resize
+    # by deleting it to trigger shrink
     r psetex "{foo}0" 500 a
+    r del "{foo}0"

     # Verify all keys have expired
     wait_for_condition 400 100 {
diff --git a/tests/unit/functions.tcl b/tests/unit/functions.tcl
index 7ddd36dd7d..1636baaf6d 100644
--- a/tests/unit/functions.tcl
+++ b/tests/unit/functions.tcl
@@ -604,7 +604,7 @@ start_server {tags {"scripting"}} {
             }
         } e
         set _ $e
-    } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*}
+    } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character long*}

     test {LIBRARIES - test registration with empty name} {
         catch {
@@ -613,7 +613,7 @@
             }
         } e
         set _ $e
-    } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*}
+    } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character long*}

     test {LIBRARIES - math.random from function load} {
         catch {
diff --git a/tests/unit/hyperloglog.tcl b/tests/unit/hyperloglog.tcl
index c1b3b3a79f..765d5e0bdd 100644
--- a/tests/unit/hyperloglog.tcl
+++ b/tests/unit/hyperloglog.tcl
@@ -222,6 +222,46 @@ start_server {tags {"hll"}} {
         assert_equal 3 [r pfcount destkey]
     }

+    test {PFMERGE results with simd} {
+        r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t}
+        for {set x 1} {$x < 2000} {incr x} {
+            r pfadd hll1{t} [expr rand()]
+        }
+        for {set x 1} {$x < 4000} {incr x} {
+            r pfadd hll2{t} [expr rand()]
+        }
+        for {set x 1} {$x < 8000} {incr x} {
+            r pfadd hll3{t} [expr rand()]
+        }
+        assert {[r pfcount hll1{t}] > 0}
+        assert {[r pfcount hll2{t}] > 0}
+        assert {[r pfcount hll3{t}] > 0}
+
+        r pfdebug simd off
+        set scalar [r pfcount hll1{t} hll2{t} hll3{t}]
+        r pfdebug simd on
+        set simd [r pfcount hll1{t} hll2{t} hll3{t}]
+        assert {$scalar > 0}
+        assert {$simd > 0}
+        assert_equal $scalar $simd
+
+        r pfdebug simd off
+        r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t}
+        r pfdebug simd on
+        r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t}
+
+        set scalar [r pfcount hllscalar{t}]
+        set simd [r pfcount hllsimd{t}]
+        assert {$scalar > 0}
+        assert {$simd > 0}
+        assert_equal $scalar $simd
+
+        set scalar [r get hllscalar{t}]
+        set simd [r get hllsimd{t}]
+        assert_equal $scalar $simd
+
+    } {} {needs:pfdebug}
+
     test {PFCOUNT multiple-keys merge returns cardinality of union #1} {
         r del hll1{t} hll2{t} hll3{t}
         for {set x 1} {$x < 10000} {incr x} {
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 278a1d8e33..3295c5e31a 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -10,7 +10,7 @@ proc latency_percentiles_usec {cmd} {
     return [latencyrstat_percentiles $cmd r]
 }

-start_server {tags {"info" "external:skip"}} {
+start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { start_server {} { test {latencystats: disable/enable} { @@ -391,7 +391,13 @@ start_server {tags {"info" "external:skip"}} { # set qbuf limit to minimum to test stat set org_qbuf_limit [lindex [r config get client-query-buffer-limit] 1] r config set client-query-buffer-limit 1048576 - catch {r set key [string repeat a 1048576]} + catch {r set key [string repeat a 2048576]} e + # We might get an error on the write path of the previous command, which won't be + # an I/O error based on how the client is designed. We will need to manually consume + # the secondary I/O error. + if {![string match "I/O error*" $e]} { + catch {r read} + } set info [r info stats] assert_equal [getInfoProperty $info client_query_buffer_limit_disconnections] {1} r config set client-query-buffer-limit $org_qbuf_limit @@ -400,10 +406,10 @@ start_server {tags {"info" "external:skip"}} { r config set client-output-buffer-limit "normal 10 0 0" r set key [string repeat a 100000] ;# to trigger output buffer limit check this needs to be big catch {r get key} + r config set client-output-buffer-limit $org_outbuf_limit set info [r info stats] assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1} - r config set client-output-buffer-limit $org_outbuf_limit - } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres + } {} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres test {clients: pubsub clients} { set info [r info clients] @@ -515,23 +521,43 @@ start_server {tags {"info" "external:skip"}} { set info_mem [r info memory] set mem_stats [r memory stats] assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0} - assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 64 + # overhead.db.hashtable.lut = memory overhead of hashtable including hashtable struct and tables + set hashtable_overhead [dict get $mem_stats overhead.db.hashtable.lut] + if {$hashtable_overhead < 140} { + # 32-bit version (hashtable struct + 1 bucket of 64 bytes) + set bits 32 + } else { + set bits 64 + } + assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 256 assert_equal [dict get $mem_stats overhead.db.hashtable.rehashing] {0} assert_equal [dict get $mem_stats db.dict.rehashing.count] {0} - # set 4 more keys to trigger rehashing + # set 7 more keys to trigger rehashing # get the info within a transaction to make sure the rehashing is not completed - r multi + r multi r set b c r set c d r set d e r set e f + r set f g + r set g h + r set h i + if {$bits == 32} { + # In 32-bit mode, we have 12 elements per bucket. Insert five more + # to trigger rehashing. 
+            r set aa aa
+            r set bb bb
+            r set cc cc
+            r set dd dd
+            r set ee ee
+        }
         r info memory
         r memory stats
         set res [r exec]
-        set info_mem [lindex $res 4]
-        set mem_stats [lindex $res 5]
+        set info_mem [lindex $res end-1]
+        set mem_stats [lindex $res end]
         assert_range [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] 1 64
-        assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 192
+        assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 300
         assert_range [dict get $mem_stats overhead.db.hashtable.rehashing] 1 64
         assert_equal [dict get $mem_stats db.dict.rehashing.count] {1}
     }
diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl
index 352f5f183e..bafc46d4b7 100644
--- a/tests/unit/introspection.tcl
+++ b/tests/unit/introspection.tcl
@@ -376,6 +376,32 @@ start_server {tags {"introspection"}} {
         $rd close
     }

+    # This test verifies that MONITOR correctly records overwritten commands
+    # when executed within a MULTI-EXEC block. Specifically, it checks that even if
+    # the original SET-EX command arguments are overwritten for replica propagation, the MONITOR output
+    # still shows the original command.
+    test {MONITOR correctly records SET EX in MULTI-EXEC} {
+        # Start monitoring client
+        set rd [valkey_deferring_client]
+        $rd monitor
+        $rd read ; # Discard the OK
+
+        # Execute multi-exec block with SET EX commands
+        r multi
+        r set "{slot}key1" value1 ex 3600
+        r set "{slot}key2" value2 ex 1800
+        r exec
+
+        # Verify monitor output shows the original commands:
+        assert_match {*"multi"*} [$rd read]
+        assert_match {*"set"*"{slot}key1"*"value1"*"ex"*"3600"*} [$rd read]
+        assert_match {*"set"*"{slot}key2"*"value2"*"ex"*"1800"*} [$rd read]
+        assert_match {*"exec"*} [$rd read]
+
+        # Clean up monitoring client
+        $rd close
+    }
+
     test {MONITOR log blocked command only once} {
         # need to reconnect in order to reset the clients state
         reconnect
@@ -558,6 +584,10 @@ start_server {tags {"introspection"}} {
         req-res-logfile
         client-default-resp
         dual-channel-replication-enabled
+        rdma-completion-vector
+        rdma-rx-size
+        rdma-bind
+        rdma-port
     }

     if {!$::tls} {
@@ -950,6 +980,13 @@ start_server {tags {"introspection"}} {
         }
     } {} {external:skip}

+    test {valkey-server command line arguments - dir multiple times} {
+        start_server {config "default.conf" args {--dir "./" --dir "./"}} {
+            r config get dir
+            assert_equal {PONG} [r ping]
+        }
+    } {} {external:skip}
+
    # Config file at this point is in a weird state, and includes all
    # known keywords. Might be a good idea to avoid adding tests here.
 }
@@ -1005,6 +1042,49 @@ test {config during loading} {
     }
 } {} {external:skip}

+test {MEMORY commands during loading} {
+    start_server [list overrides [list key-load-delay 50 loading-process-events-interval-bytes 1024]] {
+        # Set up some initial data
+        r debug populate 100000 key 1000
+
+        # Save and restart
+        r save
+        restart_server 0 false false
+
+        # At this point, keys are loaded one at a time, busy looping 50usec
+        # between each. Further, other events are processed every 1024 bytes
+        # of RDB. We're sending all our commands deferred, so they have a
+        # chance to be processed all at once between loading two keys.
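+        # (Back-of-envelope: 100k keys at roughly 50 usec each keeps the
+        # server in the loading state for several seconds, which leaves
+        # plenty of loading-time windows for the deferred commands below.)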
+ + set rd [valkey_deferring_client] + + # Allowed during loading + $rd memory help + $rd memory malloc-stats + $rd memory purge + + # Disallowed during loading (because directly dependent on the dataset) + $rd memory doctor + $rd memory stats + $rd memory usage key:1 + + # memory help + assert_match {{MEMORY *}} [$rd read] + # memory malloc-stats + assert_match {*alloc*} [$rd read] + # memory purge + assert_match OK [$rd read] + # memory doctor + assert_error {*LOADING*} {$rd read} + # memory stats + assert_error {*LOADING*} {$rd read} + # memory usage key:1 + assert_error {*LOADING*} {$rd read} + + $rd close + } +} {} {external:skip} + test {CONFIG REWRITE handles rename-command properly} { start_server {tags {"introspection"} overrides {rename-command {flushdb badger}}} { assert_error {ERR unknown command*} {r flushdb} diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index d4e62246f1..5b76f44645 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -4,7 +4,7 @@ start_server {tags {"maxmemory" "external:skip"}} { set server_pid [s process_id] proc init_test {client_eviction} { - r flushdb + r flushdb sync set prev_maxmemory_clients [r config get maxmemory-clients] if $client_eviction { @@ -145,45 +145,6 @@ start_server {tags {"maxmemory" "external:skip"}} { } start_server {tags {"maxmemory external:skip"}} { - test "Without maxmemory small integers are shared" { - r config set maxmemory 0 - r set a 1 - assert_refcount_morethan a 1 - } - - test "With maxmemory and non-LRU policy integers are still shared" { - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-random - r set a 1 - assert_refcount_morethan a 1 - } - - test "With maxmemory and LRU policy integers are not shared" { - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-lru - r set a 1 - r config set maxmemory-policy volatile-lru - r set b 1 - assert_refcount 1 a - assert_refcount 1 b - r config set maxmemory 0 - } - - test "Shared integers are unshared with maxmemory and LRU policy" { - r set a 1 - r set b 1 - assert_refcount_morethan a 1 - assert_refcount_morethan b 1 - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-lru - r get a - assert_refcount 1 a - r config set maxmemory-policy volatile-lru - r get b - assert_refcount 1 b - r config set maxmemory 0 - } - foreach policy { allkeys-random allkeys-lru allkeys-lfu volatile-lru volatile-lfu volatile-random volatile-ttl } { @@ -265,10 +226,10 @@ start_server {tags {"maxmemory external:skip"}} { # make sure to start with a blank instance r flushall # Get the current memory limit and calculate a new limit. - # We just add 100k to the current memory size so that it is + # We just add 400KiB to the current memory size so that it is # fast for us to reach that limit. set used [s used_memory] - set limit [expr {$used+100*1024}] + set limit [expr {$used+400*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy # Now add keys until the limit is almost reached. @@ -435,25 +396,37 @@ start_server {tags {"maxmemory external:skip"}} { r config set latency-tracking no r config set maxmemory 0 r config set maxmemory-policy allkeys-random + set dbnum [expr {$::singledb ? 0 : 9}] + + # Populate some, then check table size and populate more up to one less + # than the soft maximum fill factor. Adding some more elements after + # this does not trigger rehashing, because rehashing would eat some + # kilobytes of memory. 
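+        # (For scale: growing into an 8192-bucket table costs about 64k, 8192
+        # entries at 8 bytes each, which dwarfs the 10k of maxmemory headroom
+        # this test configures below.)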
+        populate 2000 a 1
+        set table_size [main_hash_table_size]
+        populate [main_hash_table_keys_before_rehashing_starts] b 1

-        # Next rehash size is 8192, that will eat 64k memory
-        populate 4095 "" 1
+        # Now we are close to resizing. Check that rehashing didn't start.
+        assert_equal $table_size [main_hash_table_size]
+        assert_no_match "*Hash table 1 stats*" [r debug htstats $dbnum]
+
+        set dbsize_before [r dbsize]
         set used [s used_memory]
         set limit [expr {$used + 10*1024}]
         r config set maxmemory $limit
         # Adding a key to meet the 1:1 ratio.
         r set k0 v0
-        # The dict has reached 4096, it can be resized in tryResizeHashTables in cron,
+        # The table has reached the soft max fill factor.
+        # It can be resized in tryResizeHashTables in cron,
         # or we add a key to let it check whether it can be resized.
         r set k1 v1
         # Next writing command will trigger evicting some keys if the last
         # command triggered a DB dict rehash
         r set k2 v2
-        # There must be 4098 keys because the server doesn't evict keys.
-        r dbsize
-    } {4098}
+        # There must be three more keys because the server doesn't evict keys.
+        assert_equal [r dbsize] [expr {$dbsize_before + 3}]
+    }
 }

 # Skip the following test when running with IO threads
@@ -611,3 +584,21 @@ start_server {tags {"maxmemory" "external:skip"}} {
         assert {[r object freq foo] == 5}
     }
 }
+
+start_server {tags {"maxmemory" "external:skip"}} {
+    test {Import mode should forbid eviction} {
+        r set key val
+        r config set import-mode yes
+        assert_equal [r client import-source on] {OK}
+        r config set maxmemory-policy allkeys-lru
+        r config set maxmemory 1
+
+        assert_equal [r dbsize] {1}
+        assert_error {OOM command not allowed*} {r set key1 val1}
+
+        assert_equal [r client import-source off] {OK}
+        r config set import-mode no
+
+        assert_equal [r dbsize] {0}
+    }
+}
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index d5a6a6efe2..8f6e5e8dd3 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -40,7 +40,6 @@ run_solo {defrag} {
     proc test_active_defrag {type} {
         if {[string match {*jemalloc*} [s mem_allocator]] && [r debug mallctl arenas.page] <= 8192} {
             test "Active defrag main dictionary: $type" {
-                r config set hz 100
                 r config set activedefrag no
                 r config set active-defrag-threshold-lower 5
                 r config set active-defrag-cycle-min 65
@@ -48,6 +47,8 @@
                 r config set active-defrag-ignore-bytes 2mb
                 r config set maxmemory 100mb
                 r config set maxmemory-policy allkeys-lru
+                r config set lazyfree-lazy-user-del no
+                r config set lazyfree-lazy-user-flush no

                 populate 700000 asdf1 150
                 populate 100 asdf1 150 0 false 1000
@@ -89,6 +90,8 @@
                 r config set active-defrag-cycle-min 65
                 r config set active-defrag-cycle-max 75

+                after 1000 ;# Give defrag time to work (might be multiple cycles)
+
                 # Wait for the active defrag to stop working.
                 wait_for_condition 2000 100 {
                     [s active_defrag_running] eq 0
@@ -137,13 +140,18 @@
                 # reset stats and load the AOF file
                 r config resetstat
                 r config set key-load-delay -25 ;# sleep on average 1/25 usec
+                # Note: This test is checking if defrag is working DURING AOF loading (while
+                # timers are not active). So we don't give any extra time, and we deactivate
+                # defrag immediately after the AOF loading is complete. During loading,
+                # defrag will get invoked less often, causing its starvation prevention to
+                # kick in. We should expect longer latency measurements.
r debug loadaof r config set activedefrag no + # measure hits and misses right after aof loading set misses [s active_defrag_misses] set hits [s active_defrag_hits] - after 120 ;# serverCron only updates the info once in 100ms set frag [s allocator_frag_ratio] set max_latency 0 foreach event [r latency latest] { @@ -166,10 +174,12 @@ run_solo {defrag} { # make sure the defragger did enough work to keep the fragmentation low during loading. # we cannot check that it went all the way down, since we don't wait for full defrag cycle to complete. assert {$frag < 1.4} - # since the AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands), - # it'll still not block the loading for long periods of time. + # The AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands). + # Even so, defrag can get starved for periods exceeding 100ms. Using 200ms for test stability, and + # a 75% CPU requirement (as set above), we should allow up to 600ms latency + # (as total time = 200 non duty + 600 duty = 800ms, and 75% of 800ms is 600ms). if {!$::no_latency} { - assert {$max_latency <= 40} + assert {$max_latency <= 600} } } } ;# Active defrag - AOF loading @@ -181,7 +191,6 @@ run_solo {defrag} { r flushdb sync r script flush sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -203,7 +212,7 @@ run_solo {defrag} { $rd read ; # Discard script load replies $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms + after 1000 ;# give defrag some time to work if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -239,6 +248,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag time to work (might be multiple cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -266,7 +277,6 @@ run_solo {defrag} { test "Active defrag big keys: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -361,6 +371,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -407,7 +419,6 @@ run_solo {defrag} { test "Active defrag pubsub: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -430,7 +441,6 @@ run_solo {defrag} { $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -466,6 +476,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -475,6 +487,7 @@ run_solo {defrag} { puts [r memory malloc-stats] fail "defrag didn't stop." 
} + r config set activedefrag no ;# disable before we accidentally create more frag # test the fragmentation is lower after 120 ;# serverCron only updates the info once in 100ms @@ -507,7 +520,6 @@ run_solo {defrag} { test "Active defrag big list: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -561,6 +573,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -619,7 +633,6 @@ run_solo {defrag} { start_server {tags {"defrag"} overrides {save ""}} { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -685,6 +698,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -720,11 +735,11 @@ run_solo {defrag} { } } - start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "cluster" } - start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "standalone" } } ;# run_solo diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl index e169f8de9b..6d8f55bd06 100644 --- a/tests/unit/moduleapi/defrag.tcl +++ b/tests/unit/moduleapi/defrag.tcl @@ -2,7 +2,6 @@ set testmodule [file normalize tests/modules/defragtest.so] start_server {tags {"modules"} overrides {{save ""}}} { r module load $testmodule 10000 - r config set hz 100 r config set active-defrag-ignore-bytes 1 r config set active-defrag-threshold-lower 0 r config set active-defrag-cycle-min 99 diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 44f994d2d0..2474ad3567 100644 --- a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -1,5 +1,15 @@ set testmodule [file normalize tests/modules/moduleconfigs.so] set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so] +set testmoduleparameter [file normalize tests/modules/moduleparameter.so] + +proc module_get_args {mod} { + foreach line [r module list] { + if {[dict get $line name] eq $mod} { + return [dict get $line args] + } + } + throw error {module not found} +} start_server {tags {"modules"}} { r module load $testmodule @@ -243,5 +253,14 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.memory_numeric] "moduleconfigs.memory_numeric 1024" } } -} + test {Module Update Args} { + r module load $testmoduleparameter 10 20 30 + set t [r module list] + set modulename [lmap x [r module list] {dict get $x name}] + assert_not_equal [lsearch $modulename moduleparameter] -1 + assert_equal {10 20 30} [module_get_args moduleparameter] + assert_equal OK [r 
testmoduleparameter.update.parameter 40 50 60 70] + assert_equal {40 50 60 70} [module_get_args moduleparameter] + } +} diff --git a/tests/unit/moduleapi/scriptingengine.tcl b/tests/unit/moduleapi/scriptingengine.tcl new file mode 100644 index 0000000000..c350633dd8 --- /dev/null +++ b/tests/unit/moduleapi/scriptingengine.tcl @@ -0,0 +1,126 @@ +set testmodule [file normalize tests/modules/helloscripting.so] + +set HELLO_PROGRAM "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 432\nRETURN" + +start_server {tags {"modules"}} { + r module load $testmodule + + r function load $HELLO_PROGRAM + + test {Load script with invalid library name} { + assert_error {ERR Library names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=my-lib\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with existing library} { + assert_error {ERR Library 'mylib' already exists} {r function load $HELLO_PROGRAM} + } + + test {Load script with invalid engine} { + assert_error {ERR Engine 'wasm' not found} {r function load "#!wasm name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no functions} { + assert_error {ERR No functions registered} {r function load "#!hello name=mylib2\n"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function foo already exists} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no metadata header} { + assert_error {ERR Missing library metadata} {r function load "FUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with header without lib name} { + assert_error {ERR Library name was not given} {r function load "#!hello \n"} + } + + test {Load script with header with unknown param} { + assert_error {ERR Invalid metadata value given: nme=mylib} {r function load "#!hello nme=mylib\n"} + } + + test {Load script with header with lib name passed twice} { + assert_error {ERR Invalid metadata value, name argument was given multiple times} {r function load "#!hello name=mylib2 name=mylib3\n"} + } + + test {Load script with invalid function name} { + assert_error {ERR Function names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=mylib2\nFUNCTION foo-bar\nARGS 0\nRETURN"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function already exists in the library} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Call scripting engine function: calling foo works} { + r fcall foo 0 134 + } {134} + + test {Call scripting engine function: calling bar works} { + r fcall bar 0 + } {432} + + test {Replace function library and call functions} { + set result [r function load replace "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 500\nRETURN"] + assert_equal $result "mylib" + + set result [r fcall foo 0 132] + assert_equal $result 132 + + set result [r fcall bar 0] + assert_equal $result 500 + } + + test {List scripting engine functions} { + r function load replace "#!hello name=mylib\nFUNCTION foobar\nARGS 0\nRETURN" + r function list + } {{library_name mylib engine HELLO functions {{name foobar description {} flags {}}}}} + + test {Load a second library and call a function} { + r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN" + set result [r fcall getarg 0 456] + assert_equal $result 456 + } + + test 
{Delete all libraries and functions} { + set result [r function flush] + assert_equal $result {OK} + r function list + } {} + + test {Test the deletion of a single library} { + r function load $HELLO_PROGRAM + r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN" + + set result [r function delete mylib] + assert_equal $result {OK} + + set result [r fcall getarg 0 446] + assert_equal $result 446 + } + + test {Test dump and restore function library} { + r function load $HELLO_PROGRAM + + set result [r fcall bar 0] + assert_equal $result 432 + + set dump [r function dump] + + set result [r function flush] + assert_equal $result {OK} + + set result [r function restore $dump] + assert_equal $result {OK} + + set result [r fcall getarg 0 436] + assert_equal $result 436 + + set result [r fcall bar 0] + assert_equal $result 432 + } + + test {Unload scripting engine module} { + set result [r module unload helloengine] + assert_equal $result "OK" + } +} diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 6e6230fc19..bb08c67471 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -396,7 +396,16 @@ start_server {tags {"other external:skip"}} { r config set save "" r config set rdb-key-save-delay 1000000 - populate 4095 "" 1 + # Populate some, then check table size and populate more up to one less + # than the soft maximum fill factor. + populate 2000 a 1 + set table_size [main_hash_table_size] + populate [main_hash_table_keys_before_rehashing_starts] b 1 + + # Now we are close to resizing. Check that rehashing didn't start. + assert_equal $table_size [main_hash_table_size] + assert_no_match "*Hash table 1 stats*" [r debug htstats 9] + r bgsave wait_for_condition 10 100 { [s rdb_bgsave_in_progress] eq 1 @@ -406,14 +415,15 @@ start_server {tags {"other external:skip"}} { r mset k1 v1 k2 v2 # Hash table should not rehash - assert_no_match "*table size: 8192*" [r debug HTSTATS 9] + assert_equal $table_size [main_hash_table_size] + assert_no_match "*Hash table 1 stats*" [r debug htstats 9] exec kill -9 [get_child_pid 0] waitForBgsave r # Hash table should rehash since there is no child process, - # size is power of two and over 4096, so it is 8192 + # so the resize limit is restored. wait_for_condition 50 100 { - [string match "*table size: 8192*" [r debug HTSTATS 9]] + [main_hash_table_size] > $table_size } else { fail "hash table did not rehash after child process killed" } @@ -472,7 +482,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 128} {incr j} { r set "{foo}$j" a } - assert_match "*table size: 128*" [r debug HTSTATS 0] + set table_size [main_hash_table_size] # disable resizing, the reason for not using slow bgsave is because # it will hit the dict_force_resize_ratio. @@ -482,14 +492,14 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 123} {incr j} { r del "{foo}$j" } - assert_match "*table size: 128*" [r debug HTSTATS 0] + assert_equal $table_size [main_hash_table_size] # enable resizing r debug dict-resizing 1 # waiting for serverCron to resize the tables wait_for_condition 1000 10 { - [string match {*table size: 8*} [r debug HTSTATS 0]] + [main_hash_table_size] < $table_size } else { puts [r debug HTSTATS 0] fail "hash tables weren't resize." 
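The other.tcl hunks above replace brittle "table size: N" string matching with two checks: comparing main_hash_table_size snapshots, and asserting that no "Hash table 1 stats" section appears in the DEBUG HTSTATS output. Reading the hunks, a second table is only reported while an incremental rehash is in flight, so its absence implies the table is not rehashing. A minimal sketch of that assertion as a reusable helper (the helper name is hypothetical; the match pattern and DEBUG subcommands are taken from the hunks above):

```tcl
# Hypothetical wrapper around the assertion pattern used in these tests.
proc assert_not_rehashing {dbnum} {
    # "Hash table 1 stats" is only printed while the db dict is rehashing
    # into a second table; table 0 is always present.
    assert_no_match "*Hash table 1 stats*" [r debug htstats $dbnum]
}

# Usage, mirroring the checks above: with resizing disabled, deletes and
# writes must not kick off a rehash (db 9 is the suite's default db).
r debug dict-resizing 0
r mset k1 v1 k2 v2
assert_not_rehashing 9
r debug dict-resizing 1
```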
@@ -503,6 +513,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 128} {incr j} { r set "{alice}$j" a } + set table_size [main_hash_table_size] # disable resizing, the reason for not using slow bgsave is because # it will hit the dict_force_resize_ratio. @@ -517,7 +528,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { # waiting for serverCron to resize the tables wait_for_condition 1000 10 { - [string match {*table size: 16*} [r debug HTSTATS 0]] + [main_hash_table_size] < $table_size } else { puts [r debug HTSTATS 0] fail "hash tables weren't resize." @@ -537,8 +548,9 @@ start_server {tags {"other external:skip"}} { } # The dict containing 128 keys must have expanded, # its hash table itself takes a lot more than 400 bytes + set dbnum [expr {$::singledb ? 0 : 9}] wait_for_condition 100 50 { - [dict get [r memory stats] db.9 overhead.hashtable.main] < 400 + [dict get [r memory stats] db.$dbnum overhead.hashtable.main] < 400 } else { fail "dict did not resize in time" } diff --git a/tests/unit/type/incr.tcl b/tests/unit/type/incr.tcl index 4bc130bcb1..fd0a8d02d8 100644 --- a/tests/unit/type/incr.tcl +++ b/tests/unit/type/incr.tcl @@ -75,17 +75,6 @@ start_server {tags {"incr"}} { assert_equal {-1} [r decrby key_not_exist 1] } - test {INCR uses shared objects in the 0-9999 range} { - r set foo -1 - r incr foo - assert_refcount_morethan foo 1 - r set foo 9998 - r incr foo - assert_refcount_morethan foo 1 - r incr foo - assert_refcount 1 foo - } - test {INCR can modify objects in-place} { r set foo 20000 r incr foo diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl index 944c3d3d98..1871ec9b4d 100644 --- a/tests/unit/type/set.tcl +++ b/tests/unit/type/set.tcl @@ -33,6 +33,7 @@ start_server { assert_equal {0 1} [r smismember myset bla foo] assert_equal {0} [r smismember myset bla] assert_equal "bar $initelems($type)" [lsort [r smembers myset]] + r memory usage myset } } @@ -51,6 +52,7 @@ start_server { assert_equal {0 1} [r smismember myset 18 16] assert_equal {0} [r smismember myset 18] assert_equal {16 17} [lsort [r smembers myset]] + r memory usage myset } test {SMISMEMBER SMEMBERS SCARD against non set} { @@ -1029,111 +1031,6 @@ foreach type {single multiple single_multiple} { r srem $myset {*}$members } - proc verify_rehashing_completed_key {myset table_size keys} { - set htstats [r debug HTSTATS-KEY $myset] - assert {![string match {*rehashing target*} $htstats]} - return {[string match {*table size: $table_size*number of elements: $keys*} $htstats]} - } - - test "SRANDMEMBER with a dict containing long chain" { - set origin_save [config_get_set save ""] - set origin_max_lp [config_get_set set-max-listpack-entries 0] - set origin_save_delay [config_get_set rdb-key-save-delay 2147483647] - - # 1) Create a hash set with 100000 members. - set members {} - for {set i 0} {$i < 100000} {incr i} { - lappend members [format "m:%d" $i] - } - create_set myset $members - - # 2) Wait for the hash set rehashing to finish. - while {[is_rehashing myset]} { - r srandmember myset 100 - } - - # 3) Turn off the rehashing of this set, and remove the members to 500. - r bgsave - rem_hash_set_top_N myset [expr {[r scard myset] - 500}] - assert_equal [r scard myset] 500 - - # 4) Kill RDB child process to restart rehashing. 
- set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 5) Let the set hash to start rehashing - r spop myset 1 - assert [is_rehashing myset] - - # 6) Verify that when rdb saving is in progress, rehashing will still be performed (because - # the ratio is extreme) by waiting for it to finish during an active bgsave. - r bgsave - - while {[is_rehashing myset]} { - r srandmember myset 1 - } - if {$::verbose} { - puts [r debug HTSTATS-KEY myset full] - } - - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 7) Check that eventually, SRANDMEMBER returns all elements. - array set allmyset {} - foreach ele [r smembers myset] { - set allmyset($ele) 1 - } - unset -nocomplain auxset - set iterations 1000 - while {$iterations != 0} { - incr iterations -1 - set res [r srandmember myset -10] - foreach ele $res { - set auxset($ele) 1 - } - if {[lsort [array names allmyset]] eq - [lsort [array names auxset]]} { - break; - } - } - assert {$iterations != 0} - - # 8) Remove the members to 30 in order to calculate the value of Chi-Square Distribution, - # otherwise we would need more iterations. - rem_hash_set_top_N myset [expr {[r scard myset] - 30}] - assert_equal [r scard myset] 30 - - # Hash set rehashing would be completed while removing members from the `myset` - # We also check the size and members in the hash table. - verify_rehashing_completed_key myset 64 30 - - # Now that we have a hash set with only one long chain bucket. - set htstats [r debug HTSTATS-KEY myset full] - assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]} - assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]} - assert {$different_slots == 1 && $max_chain_length == 30} - - # 9) Use positive count (PATH 4) to get 10 elements (out of 30) each time. 
- unset -nocomplain allkey
- set iterations 1000
- while {$iterations != 0} {
- incr iterations -1
- set res [r srandmember myset 10]
- foreach ele $res {
- lappend allkey $ele
- }
- }
- # validate even distribution of random sampling (df = 29, 73 means 0.00001 probability)
- assert_lessthan [chi_square_value $allkey] 73
-
- r config set save $origin_save
- r config set set-max-listpack-entries $origin_max_lp
- r config set rdb-key-save-delay $origin_save_delay
- } {OK} {needs:debug slow}
-
 proc setup_move {} {
 r del myset3{t} myset4{t}
 create_set myset1{t} {1 a b}
diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl
index d934e48140..d736b9cdb7 100644
--- a/tests/unit/type/stream-cgroups.tcl
+++ b/tests/unit/type/stream-cgroups.tcl
@@ -944,7 +944,7 @@ start_server {
 # Simulate loading from RDB

- set reply [r XINFO STREAM x FULL]
+ set reply [r XINFO STREAM mystream FULL]
 set group [lindex [dict get $reply groups] 0]
 set consumer [lindex [dict get $group consumers] 0]
 set prev_seen [dict get $consumer seen-time]
@@ -954,7 +954,7 @@
 r DEL mystream
 r RESTORE mystream 0 $dump

- set reply [r XINFO STREAM x FULL]
+ set reply [r XINFO STREAM mystream FULL]
 set group [lindex [dict get $reply groups] 0]
 set consumer [lindex [dict get $group consumers] 0]
 assert_equal $prev_seen [dict get $consumer seen-time]
diff --git a/tests/unit/type/string.tcl b/tests/unit/type/string.tcl
index d7969b5b3e..bbfb30b60d 100644
--- a/tests/unit/type/string.tcl
+++ b/tests/unit/type/string.tcl
@@ -582,6 +582,56 @@ if {[string match {*jemalloc*} [s mem_allocator]]} {
 set err1
 } {*WRONGTYPE*}

+ test "SET with IFEQ conditional" {
+ r del foo
+
+ r set foo "initial_value"
+
+ assert_equal {OK} [r set foo "new_value" ifeq "initial_value"]
+ assert_equal "new_value" [r get foo]
+
+ assert_equal {} [r set foo "should_not_set" ifeq "wrong_value"]
+ assert_equal "new_value" [r get foo]
+ }
+
+ test "SET with IFEQ conditional - non-string current value" {
+ r del foo
+
+ r sadd foo "some_set_value"
+ assert_error {WRONGTYPE Operation against a key holding the wrong kind of value} {r set foo "new_value" ifeq "some_set_value"}
+ }
+
+ test "SET with IFEQ conditional - with get" {
+ r del foo
+
+ assert_equal {} [r set foo "new_value" ifeq "initial_value" get]
+ assert_equal {} [r get foo]
+
+ r set foo "initial_value"
+
+ assert_equal "initial_value" [r set foo "new_value" ifeq "initial_value" get]
+ assert_equal "new_value" [r get foo]
+ }
+
+ test "SET with IFEQ conditional - non-string current value with get" {
+ r del foo
+
+ r sadd foo "some_set_value"
+
+ assert_error {WRONGTYPE Operation against a key holding the wrong kind of value} {r set foo "new_value" ifeq "initial_value" get}
+ }
+
+ test "SET with IFEQ conditional - with xx" {
+ r del foo
+ assert_error {ERR syntax error} {r set foo "new_value" ifeq "initial_value" xx}
+ }
+
+ test "SET with IFEQ conditional - with nx" {
+ r del foo
+ assert_error {ERR syntax error} {r set foo "new_value" ifeq "initial_value" nx}
+ }
+
 test {Extended SET EX option} {
 r del foo
 r set foo bar ex 10
diff --git a/valkey.conf b/valkey.conf
index 7c7b9da43e..e23aea39de 100644
--- a/valkey.conf
+++ b/valkey.conf
@@ -300,6 +300,54 @@ tcp-keepalive 300
 #
 # tls-session-cache-timeout 60

+################################### RDMA ######################################
+
+# Valkey Over RDMA is experimental; it may be changed or removed in any minor or major version.
+# By default, RDMA is disabled.
To enable it, the "rdma-port" configuration
+# directive can be used to define RDMA-listening ports.
+#
+# rdma-port 6379
+# rdma-bind 192.168.1.100
+
+# The RDMA receive transfer buffer is 1M by default. It can be set between 64K and 16M.
+# Note that a page-size-aligned buffer size is preferred.
+#
+# rdma-rx-size 1048576
+
+# The RDMA completion queue will use the completion vector to signal completion events
+# via hardware interrupts. A large number of hardware interrupts can affect CPU performance.
+# It is possible to tune the performance using rdma-completion-vector.
+#
+# Example 1. a) Pin hardware interrupt vectors [0, 3] to CPU [0, 3].
+#            b) Set CPU affinity for valkey to CPU [4, X].
+#            c) Any valkey server uses a random RDMA completion vector [-1].
+#               The valkey servers will not affect each other and will be isolated from kernel interrupts.
+#
+# SYS SYS SYS SYS VALKEY VALKEY VALKEY
+# | | | | | | |
+# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX
+# | | | |
+# INTR0 INTR1 INTR2 INTR3
+
+# Example 2. a) 1:1 pin hardware interrupt vectors [0, X] to CPU [0, X].
+#            b) Set CPU affinity for valkey [M] to CPU [M].
+#            c) Valkey server [M] uses RDMA completion vector [M].
+#               A single CPU [M] handles hardware interrupts, the RDMA completion vector [M],
+#               and the valkey server [M] within its context only.
+#               This avoids overhead and function calls across multiple CPUs, fully isolating
+#               each valkey server from one another.
+#
+# VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY
+# | | | | | | |
+# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX
+# | | | | | | |
+# INTR0 INTR1 INTR2 INTR3 INTR4 INTR5 INTRX
+
+# Use 0 and positive numbers to specify the RDMA completion vector, or specify -1 to allow
+# the server to use a random vector for a new connection. The default vector is -1.
+#
+# rdma-completion-vector 0
+
 ################################# GENERAL #####################################

 # By default the server does not run as a daemon. Use 'yes' if you need it.
@@ -534,6 +582,9 @@ rdb-del-sync-files no

 # The working directory.
 #
+# The server log is written relative to this directory, if the 'logfile'
+# configuration directive is a relative path.
+#
 # The DB will be written inside this directory, with the filename specified
 # above using the 'dbfilename' configuration directive.
 #
@@ -543,6 +594,9 @@ rdb-del-sync-files no
 # 'cluster-config-file' configuration directive is a relative path.
 #
 # Note that you must specify a directory here, not a file name.
+# Note that modifying 'dir' at runtime may have unexpected behavior: for
+# example, if a child process is running, related file operations may have
+# unexpected effects.
 dir ./

 ################################# REPLICATION #################################
@@ -818,6 +872,13 @@ replica-priority 100
 #
 # replica-ignore-disk-write-errors no

+# Make the primary forbid expiration and eviction.
+# This is useful for sync tools, because expiration and eviction may cause data corruption.
+# Sync tools can mark their connections as an import source with CLIENT IMPORT-SOURCE.
+# NOTICE: Clients should avoid writing the same key on both the source and destination servers.
+#
+# import-mode no
+
 # -----------------------------------------------------------------------------
 # By default, Sentinel includes all replicas in its reports. A replica
 # can be excluded from Sentinel's announcements.
An unannounced replica
@@ -2326,9 +2387,8 @@ rdb-save-incremental-fsync yes
 # Fragmentation is a natural process that happens with every allocator (but
 # less so with Jemalloc, fortunately) and certain workloads. Normally a server
 # restart is needed in order to lower the fragmentation, or at least to flush
-# away all the data and create it again. However thanks to this feature
-# implemented by Oran Agra, this process can happen at runtime
-# in a "hot" way, while the server is running.
+# away all the data and create it again. However, thanks to this feature, this
+# process can happen at runtime in a "hot" way, while the server is running.
 #
 # Basically when the fragmentation is over a certain level (see the
 # configuration options below) the server will start to create new copies of the
@@ -2366,18 +2426,23 @@ rdb-save-incremental-fsync yes
 # Maximum percentage of fragmentation at which we use maximum effort
 # active-defrag-threshold-upper 100

-# Minimal effort for defrag in CPU percentage, to be used when the lower
-# threshold is reached
+# Minimal effort for defrag in CPU percentage, not cycle time as the name might
+# suggest, to be used when the lower threshold is reached.
 #
 # active-defrag-cycle-min 1

-# Maximal effort for defrag in CPU percentage, to be used when the upper
-# threshold is reached
+# Maximal effort for defrag in CPU percentage, not cycle time as the name might
+# suggest, to be used when the upper threshold is reached.
 #
 # active-defrag-cycle-max 25

 # Maximum number of set/hash/zset/list fields that will be processed from
 # the main dictionary scan
 # active-defrag-max-scan-fields 1000

+# The time (in microseconds) spent in each periodic active defrag cycle. This
+# affects the latency impact of active defrag on client commands. Smaller numbers
+# will result in less latency impact at the cost of increased defrag overhead.
+# active-defrag-cycle-us 500
+
 # Jemalloc background thread for purging will be enabled by default
 jemalloc-bg-thread yes
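The effort knobs above compose: the threshold percentages decide when defrag runs, active-defrag-cycle-min/max bound its CPU duty cycle, and active-defrag-cycle-us caps how long each defrag step runs before yielding back to clients. A hedged sketch of wiring them up from a test client, in the style of the defrag tests earlier in this patch (the values are illustrative, and it assumes these directives are settable at runtime like the others):

```tcl
# Illustrative values only; see the comments above for exact semantics.
r config set active-defrag-threshold-lower 5 ;# start defragging at 5% fragmentation
r config set active-defrag-cycle-min 25      ;# spend at least 25% CPU while active
r config set active-defrag-cycle-max 75      ;# cap the effort at 75% CPU
r config set active-defrag-cycle-us 500      ;# yield to clients every 500 microseconds
r config set activedefrag yes
```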