From 721b9ee0a885ce3fc9669e7147e9a113a3773d24 Mon Sep 17 00:00:00 2001 From: Dave Friedel Date: Sun, 21 Dec 2025 05:51:30 -0500 Subject: [PATCH] Initial commit - Tutus Bolt database --- .gitattributes | 4 + .github/dependabot.yml | 11 + .github/workflows/benchmark-pr.yaml | 9 + .github/workflows/benchmark-releases.yaml | 13 + .github/workflows/benchmark-template.yaml | 57 + .github/workflows/failpoint_test.yaml | 20 + .github/workflows/gh-workflow-approve.yaml | 42 + .github/workflows/robustness_nightly.yaml | 17 + .github/workflows/robustness_template.yaml | 50 + .github/workflows/robustness_test.yaml | 16 + .github/workflows/stale.yaml | 19 + .github/workflows/tests-template.yml | 55 + .github/workflows/tests_amd64.yaml | 26 + .github/workflows/tests_arm64.yaml | 26 + .github/workflows/tests_windows.yml | 57 + .gitignore | 12 + .go-version | 1 + .golangci.yaml | 34 + CHANGELOG/CHANGELOG-1.3.md | 90 + CHANGELOG/CHANGELOG-1.4.md | 76 + LICENSE | 20 + Makefile | 108 + OWNERS | 10 + README.md | 21 + allocate_test.go | 39 + bolt_386.go | 7 + bolt_aix.go | 90 + bolt_amd64.go | 7 + bolt_android.go | 90 + bolt_arm.go | 7 + bolt_arm64.go | 9 + bolt_linux.go | 10 + bolt_loong64.go | 9 + bolt_mips64x.go | 9 + bolt_mipsx.go | 9 + bolt_openbsd.go | 16 + bolt_ppc.go | 9 + bolt_ppc64.go | 9 + bolt_ppc64le.go | 9 + bolt_riscv64.go | 9 + bolt_s390x.go | 9 + bolt_solaris.go | 88 + bolt_unix.go | 88 + bolt_windows.go | 132 ++ boltsync_unix.go | 8 + bucket.go | 1005 +++++++++ bucket_test.go | 2170 ++++++++++++++++++++ cmd/bbolt/OWNERS | 12 + cmd/bbolt/README.md | 453 ++++ cmd/bbolt/command_check.go | 73 + cmd/bbolt/command_check_test.go | 66 + cmd/bbolt/command_inspect.go | 46 + cmd/bbolt/command_inspect_test.go | 27 + cmd/bbolt/command_root.go | 27 + cmd/bbolt/command_surgery.go | 300 +++ cmd/bbolt/command_surgery_freelist.go | 111 + cmd/bbolt/command_surgery_freelist_test.go | 103 + cmd/bbolt/command_surgery_meta.go | 275 +++ cmd/bbolt/command_surgery_meta_test.go | 126 ++ 
cmd/bbolt/command_surgery_test.go | 636 ++++++ cmd/bbolt/command_version.go | 25 + cmd/bbolt/main.go | 1795 ++++++++++++++++ cmd/bbolt/main_test.go | 754 +++++++ cmd/bbolt/page_command.go | 290 +++ cmd/bbolt/utils.go | 16 + cmd/bbolt/utils_test.go | 46 + code-of-conduct.md | 3 + compact.go | 119 ++ concurrent_test.go | 956 +++++++++ cursor.go | 432 ++++ cursor_test.go | 986 +++++++++ db.go | 1417 +++++++++++++ db_test.go | 1904 +++++++++++++++++ db_whitebox_test.go | 126 ++ doc.go | 40 + errors.go | 108 + errors/errors.go | 87 + go.mod | 21 + go.sum | 24 + internal/btesting/btesting.go | 230 +++ internal/common/bucket.go | 54 + internal/common/inode.go | 115 ++ internal/common/meta.go | 161 ++ internal/common/page.go | 391 ++++ internal/common/page_test.go | 72 + internal/common/types.go | 40 + internal/common/unsafe.go | 27 + internal/common/utils.go | 64 + internal/common/verify.go | 67 + internal/freelist/array.go | 108 + internal/freelist/array_test.go | 91 + internal/freelist/freelist.go | 82 + internal/freelist/freelist_test.go | 622 ++++++ internal/freelist/hashmap.go | 292 +++ internal/freelist/hashmap_test.go | 187 ++ internal/freelist/shared.go | 310 +++ internal/guts_cli/guts_cli.go | 141 ++ internal/surgeon/surgeon.go | 156 ++ internal/surgeon/surgeon_test.go | 57 + internal/surgeon/xray.go | 102 + internal/surgeon/xray_test.go | 66 + internal/tests/tx_check_test.go | 91 + logger.go | 113 + manydbs_test.go | 73 + mlock_unix.go | 36 + mlock_windows.go | 11 + movebucket_test.go | 398 ++++ node.go | 538 +++++ node_test.go | 169 ++ quick_test.go | 90 + scripts/compare_benchmarks.sh | 70 + scripts/fix.sh | 13 + simulation_no_freelist_sync_test.go | 47 + simulation_test.go | 362 ++++ tests/dmflakey/dmflakey.go | 350 ++++ tests/dmflakey/dmflakey_test.go | 188 ++ tests/dmflakey/dmsetup.go | 105 + tests/dmflakey/loopback.go | 91 + tests/failpoint/db_failpoint_test.go | 368 ++++ tests/robustness/main_test.go | 17 + tests/robustness/powerfailure_test.go | 326 +++ 
tests/utils/helpers.go | 26 + tx.go | 858 ++++++++ tx_check.go | 290 +++ tx_check_test.go | 166 ++ tx_stats_test.go | 54 + tx_test.go | 1056 ++++++++++ unix_test.go | 115 ++ utils_test.go | 47 + version/version.go | 6 + 130 files changed, 25545 insertions(+) create mode 100644 .gitattributes create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/benchmark-pr.yaml create mode 100644 .github/workflows/benchmark-releases.yaml create mode 100644 .github/workflows/benchmark-template.yaml create mode 100644 .github/workflows/failpoint_test.yaml create mode 100644 .github/workflows/gh-workflow-approve.yaml create mode 100644 .github/workflows/robustness_nightly.yaml create mode 100644 .github/workflows/robustness_template.yaml create mode 100644 .github/workflows/robustness_test.yaml create mode 100644 .github/workflows/stale.yaml create mode 100644 .github/workflows/tests-template.yml create mode 100644 .github/workflows/tests_amd64.yaml create mode 100644 .github/workflows/tests_arm64.yaml create mode 100644 .github/workflows/tests_windows.yml create mode 100644 .gitignore create mode 100644 .go-version create mode 100644 .golangci.yaml create mode 100644 CHANGELOG/CHANGELOG-1.3.md create mode 100644 CHANGELOG/CHANGELOG-1.4.md create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 OWNERS create mode 100644 README.md create mode 100644 allocate_test.go create mode 100644 bolt_386.go create mode 100644 bolt_aix.go create mode 100644 bolt_amd64.go create mode 100644 bolt_android.go create mode 100644 bolt_arm.go create mode 100644 bolt_arm64.go create mode 100644 bolt_linux.go create mode 100644 bolt_loong64.go create mode 100644 bolt_mips64x.go create mode 100644 bolt_mipsx.go create mode 100644 bolt_openbsd.go create mode 100644 bolt_ppc.go create mode 100644 bolt_ppc64.go create mode 100644 bolt_ppc64le.go create mode 100644 bolt_riscv64.go create mode 100644 bolt_s390x.go create mode 100644 bolt_solaris.go create mode 100644 
bolt_unix.go create mode 100644 bolt_windows.go create mode 100644 boltsync_unix.go create mode 100644 bucket.go create mode 100644 bucket_test.go create mode 100644 cmd/bbolt/OWNERS create mode 100644 cmd/bbolt/README.md create mode 100644 cmd/bbolt/command_check.go create mode 100644 cmd/bbolt/command_check_test.go create mode 100644 cmd/bbolt/command_inspect.go create mode 100644 cmd/bbolt/command_inspect_test.go create mode 100644 cmd/bbolt/command_root.go create mode 100644 cmd/bbolt/command_surgery.go create mode 100644 cmd/bbolt/command_surgery_freelist.go create mode 100644 cmd/bbolt/command_surgery_freelist_test.go create mode 100644 cmd/bbolt/command_surgery_meta.go create mode 100644 cmd/bbolt/command_surgery_meta_test.go create mode 100644 cmd/bbolt/command_surgery_test.go create mode 100644 cmd/bbolt/command_version.go create mode 100644 cmd/bbolt/main.go create mode 100644 cmd/bbolt/main_test.go create mode 100644 cmd/bbolt/page_command.go create mode 100644 cmd/bbolt/utils.go create mode 100644 cmd/bbolt/utils_test.go create mode 100644 code-of-conduct.md create mode 100644 compact.go create mode 100644 concurrent_test.go create mode 100644 cursor.go create mode 100644 cursor_test.go create mode 100644 db.go create mode 100644 db_test.go create mode 100644 db_whitebox_test.go create mode 100644 doc.go create mode 100644 errors.go create mode 100644 errors/errors.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/btesting/btesting.go create mode 100644 internal/common/bucket.go create mode 100644 internal/common/inode.go create mode 100644 internal/common/meta.go create mode 100644 internal/common/page.go create mode 100644 internal/common/page_test.go create mode 100644 internal/common/types.go create mode 100644 internal/common/unsafe.go create mode 100644 internal/common/utils.go create mode 100644 internal/common/verify.go create mode 100644 internal/freelist/array.go create mode 100644 
internal/freelist/array_test.go create mode 100644 internal/freelist/freelist.go create mode 100644 internal/freelist/freelist_test.go create mode 100644 internal/freelist/hashmap.go create mode 100644 internal/freelist/hashmap_test.go create mode 100644 internal/freelist/shared.go create mode 100644 internal/guts_cli/guts_cli.go create mode 100644 internal/surgeon/surgeon.go create mode 100644 internal/surgeon/surgeon_test.go create mode 100644 internal/surgeon/xray.go create mode 100644 internal/surgeon/xray_test.go create mode 100644 internal/tests/tx_check_test.go create mode 100644 logger.go create mode 100644 manydbs_test.go create mode 100644 mlock_unix.go create mode 100644 mlock_windows.go create mode 100644 movebucket_test.go create mode 100644 node.go create mode 100644 node_test.go create mode 100644 quick_test.go create mode 100644 scripts/compare_benchmarks.sh create mode 100644 scripts/fix.sh create mode 100644 simulation_no_freelist_sync_test.go create mode 100644 simulation_test.go create mode 100644 tests/dmflakey/dmflakey.go create mode 100644 tests/dmflakey/dmflakey_test.go create mode 100644 tests/dmflakey/dmsetup.go create mode 100644 tests/dmflakey/loopback.go create mode 100644 tests/failpoint/db_failpoint_test.go create mode 100644 tests/robustness/main_test.go create mode 100644 tests/robustness/powerfailure_test.go create mode 100644 tests/utils/helpers.go create mode 100644 tx.go create mode 100644 tx_check.go create mode 100644 tx_check_test.go create mode 100644 tx_stats_test.go create mode 100644 tx_test.go create mode 100644 unix_test.go create mode 100644 utils_test.go create mode 100644 version/version.go diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a681ce3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +# ensure that line endings for Windows builds are properly formatted +# see https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#how-to-use +# at "Multiple OS Example" section 
+*.go text eol=lf diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..aafb8a2 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + + - package-ecosystem: gomod + directory: / + schedule: + interval: weekly diff --git a/.github/workflows/benchmark-pr.yaml b/.github/workflows/benchmark-pr.yaml new file mode 100644 index 0000000..95de955 --- /dev/null +++ b/.github/workflows/benchmark-pr.yaml @@ -0,0 +1,9 @@ +--- +name: Benchmarks on PRs (AMD64) +permissions: read-all +on: [pull_request] +jobs: + amd64: + uses: ./.github/workflows/benchmark-template.yaml + with: + benchGitRef: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/benchmark-releases.yaml b/.github/workflows/benchmark-releases.yaml new file mode 100644 index 0000000..6cc1c1f --- /dev/null +++ b/.github/workflows/benchmark-releases.yaml @@ -0,0 +1,13 @@ +--- +name: Nightly Benchmarks against last release (AMD64) +permissions: read-all +on: + schedule: + - cron: '10 5 * * *' # runs every day at 05:10 UTC + # workflow_dispatch enables manual testing of this job by maintainers + workflow_dispatch: +jobs: + amd64: + uses: ./.github/workflows/benchmark-template.yaml + with: + benchGitRef: release-1.3 diff --git a/.github/workflows/benchmark-template.yaml b/.github/workflows/benchmark-template.yaml new file mode 100644 index 0000000..057286b --- /dev/null +++ b/.github/workflows/benchmark-template.yaml @@ -0,0 +1,57 @@ +--- +name: Reusable Benchmark Template +on: + workflow_call: + inputs: + # which git reference to benchmark against + benchGitRef: + required: true + type: string + maxAcceptableDifferencePercent: + required: false + type: number + default: 5 + runs-on: + required: false + type: string + default: "['ubuntu-latest']" +permissions: read-all + +jobs: + benchmark: + runs-on: ${{ fromJson(inputs.runs-on) }} + steps: + - 
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - name: Run Benchmarks + run: | + BENCHSTAT_OUTPUT_FILE=result.txt make test-benchmark-compare REF=${{ inputs.benchGitRef }} + - run: | + echo "\`\`\`" >> "$GITHUB_STEP_SUMMARY" + cat result.txt >> "$GITHUB_STEP_SUMMARY" + echo "\`\`\`" >> "$GITHUB_STEP_SUMMARY" + cat <<EOL >> "$GITHUB_STEP_SUMMARY" + <hr />
+ The table shows the median and 90% confidence interval (CI) summaries for each benchmark comparing the HEAD and the BASE, and an A/B comparison under "vs base". The last column shows the statistical p-value with ten runs (n=10). + The last row has the Geometric Mean (geomean) for the given rows in the table. + Refer to [benchstat's documentation](https://pkg.go.dev/golang.org/x/perf/cmd/benchstat) for more help. + EOL + - name: Validate results under acceptable limit + run: | + export MAX_ACCEPTABLE_DIFFERENCE=${{ inputs.maxAcceptableDifferencePercent }} + while IFS= read -r line; do + # Get fourth value, which is the comparison with the base. + value="$(echo "$line" | awk '{print $4}')" + if [[ "$value" = +* ]] || [[ "$value" = -* ]]; then + if (( $(echo "${value//[^0-9.]/}"'>'"$MAX_ACCEPTABLE_DIFFERENCE" | bc -l) )); then + echo "::error::$value is above the maximum acceptable difference ($MAX_ACCEPTABLE_DIFFERENCE)" + exit 1 + fi + fi + done < <(grep geomean result.txt) diff --git a/.github/workflows/failpoint_test.yaml b/.github/workflows/failpoint_test.yaml new file mode 100644 index 0000000..ce626ca --- /dev/null +++ b/.github/workflows/failpoint_test.yaml @@ -0,0 +1,20 @@ +--- +name: Failpoint test +on: [push, pull_request] +permissions: read-all +jobs: + test: + strategy: + matrix: + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: | + make gofail-enable + make test-failpoint diff --git a/.github/workflows/gh-workflow-approve.yaml b/.github/workflows/gh-workflow-approve.yaml new file mode 100644 index 0000000..4a51970 --- /dev/null +++ b/.github/workflows/gh-workflow-approve.yaml @@ -0,0 +1,42 @@ +--- +name: Approve GitHub Workflows +permissions: 
read-all +on: + pull_request_target: + types: + - labeled + - synchronize + branches: + - main + - release-1.3 + +jobs: + approve: + name: Approve ok-to-test + if: contains(github.event.pull_request.labels.*.name, 'ok-to-test') + runs-on: ubuntu-latest + permissions: + actions: write + steps: + - name: Update PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + continue-on-error: true + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + debug: ${{ secrets.ACTIONS_RUNNER_DEBUG == 'true' }} + script: | + const result = await github.rest.actions.listWorkflowRunsForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + event: "pull_request", + status: "action_required", + head_sha: context.payload.pull_request.head.sha, + per_page: 100 + }); + for (var run of result.data.workflow_runs) { + await github.rest.actions.approveWorkflowRun({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + } diff --git a/.github/workflows/robustness_nightly.yaml b/.github/workflows/robustness_nightly.yaml new file mode 100644 index 0000000..df04e78 --- /dev/null +++ b/.github/workflows/robustness_nightly.yaml @@ -0,0 +1,17 @@ +--- +name: Robustness Nightly +permissions: read-all +on: + schedule: + - cron: '25 9 * * *' # runs every day at 09:25 UTC + # workflow_dispatch enables manual testing of this job by maintainers + workflow_dispatch: + +jobs: + amd64: + # GHA has a maximum amount of 6h execution time, we try to get done within 3h + uses: ./.github/workflows/robustness_template.yaml + with: + count: 100 + testTimeout: 200m + runs-on: "['ubuntu-latest']" diff --git a/.github/workflows/robustness_template.yaml b/.github/workflows/robustness_template.yaml new file mode 100644 index 0000000..befe7df --- /dev/null +++ b/.github/workflows/robustness_template.yaml @@ -0,0 +1,50 @@ +--- +name: Reusable Robustness Workflow +on: + workflow_call: + inputs: + count: + required: true + type: number + testTimeout: + required: 
false + type: string + default: '30m' + runs-on: + required: false + type: string + default: "['ubuntu-latest']" +permissions: read-all + +jobs: + test: + # this is to prevent the job to run at forked projects + if: github.repository == 'etcd-io/bbolt' + timeout-minutes: 210 + runs-on: ${{ fromJson(inputs.runs-on) }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - name: test-robustness + run: | + set -euo pipefail + sudo apt-get install -y dmsetup xfsprogs + + ROBUSTNESS_TESTFLAGS="--count ${{ inputs.count }} --timeout ${{ inputs.testTimeout }} -failfast" make test-robustness + + - name: Host Status + if: always() + run: | + set -x + mount + df + losetup -l + - name: Kernel Message + if: failure() + run: | + sudo lsmod + sudo dmesg -T -f kern diff --git a/.github/workflows/robustness_test.yaml b/.github/workflows/robustness_test.yaml new file mode 100644 index 0000000..635d4e8 --- /dev/null +++ b/.github/workflows/robustness_test.yaml @@ -0,0 +1,16 @@ +name: Robustness Test +on: [push, pull_request] +permissions: read-all +jobs: + amd64: + uses: ./.github/workflows/robustness_template.yaml + with: + count: 10 + testTimeout: 30m + runs-on: "['ubuntu-latest']" + arm64: + uses: ./.github/workflows/robustness_template.yaml + with: + count: 10 + testTimeout: 30m + runs-on: "['ubuntu-24.04-arm']" diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 0000000..1abb63a --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,19 @@ +name: 'Close stale issues and PRs' +on: + schedule: + - cron: '0 0 * * *' # every day at 00:00 UTC + +permissions: + issues: write + pull-requests: write + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: 
actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 + with: + days-before-stale: 90 + days-before-close: 21 + stale-issue-label: stale + stale-pr-label: stale diff --git a/.github/workflows/tests-template.yml b/.github/workflows/tests-template.yml new file mode 100644 index 0000000..ad92c8c --- /dev/null +++ b/.github/workflows/tests-template.yml @@ -0,0 +1,55 @@ +--- +name: Reusable unit test Workflow +on: + workflow_call: + inputs: + runs-on: + required: false + type: string + default: ubuntu-latest + targets: + required: false + type: string + default: "['linux-unit-test-1-cpu','linux-unit-test-2-cpu','linux-unit-test-4-cpu']" +permissions: read-all + +jobs: + test-linux: + strategy: + fail-fast: false + matrix: + target: ${{ fromJSON(inputs.targets) }} + runs-on: ${{ inputs.runs-on }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: make fmt + - env: + TARGET: ${{ matrix.target }} + run: | + case "${TARGET}" in + linux-unit-test-1-cpu) + CPU=1 make test + ;; + linux-unit-test-2-cpu) + CPU=2 make test + ;; + linux-unit-test-4-cpu) + CPU=4 make test + ;; + linux-unit-test-4-cpu-race) + CPU=4 ENABLE_RACE=true make test + ;; + *) + echo "Failed to find target" + exit 1 + ;; + esac + - name: golangci-lint + uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v8.0.0 + with: + version: v2.1.6 diff --git a/.github/workflows/tests_amd64.yaml b/.github/workflows/tests_amd64.yaml new file mode 100644 index 0000000..7372dd7 --- /dev/null +++ b/.github/workflows/tests_amd64.yaml @@ -0,0 +1,26 @@ +--- +name: Tests AMD64 +permissions: read-all +on: [push, pull_request] +jobs: + test-linux-amd64: + uses: ./.github/workflows/tests-template.yml + test-linux-amd64-race: + uses: 
./.github/workflows/tests-template.yml + with: + runs-on: ubuntu-latest + targets: "['linux-unit-test-4-cpu-race']" + + coverage: + needs: + - test-linux-amd64 + - test-linux-amd64-race + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: make coverage diff --git a/.github/workflows/tests_arm64.yaml b/.github/workflows/tests_arm64.yaml new file mode 100644 index 0000000..c89b322 --- /dev/null +++ b/.github/workflows/tests_arm64.yaml @@ -0,0 +1,26 @@ +--- +name: Tests ARM64 +permissions: read-all +on: [push, pull_request] +jobs: + test-linux-arm64: + uses: ./.github/workflows/tests-template.yml + test-linux-arm64-race: + uses: ./.github/workflows/tests-template.yml + with: + runs-on: ubuntu-24.04-arm + targets: "['linux-unit-test-4-cpu-race']" + + coverage: + needs: + - test-linux-arm64 + - test-linux-arm64-race + runs-on: ubuntu-24.04-arm + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: make coverage diff --git a/.github/workflows/tests_windows.yml b/.github/workflows/tests_windows.yml new file mode 100644 index 0000000..54546e1 --- /dev/null +++ b/.github/workflows/tests_windows.yml @@ -0,0 +1,57 @@ +--- +name: Tests +on: [push, pull_request] +permissions: read-all +jobs: + test-windows: + strategy: + fail-fast: false + matrix: + target: + - windows-amd64-unit-test-4-cpu + # FIXME(fuweid): + # + # Windows throws the following error when the race detector is enabled. + # We skip it until we have a solution.
+ # + # ThreadSanitizer failed to allocate 0x000200000000 (8589934592) bytes at 0x0400c0000000 (error code: 1455) + # + # - windows-amd64-unit-test-4-cpu-race + runs-on: windows-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: make fmt + - env: + TARGET: ${{ matrix.target }} + run: | + case "${TARGET}" in + windows-amd64-unit-test-4-cpu) + CPU=4 make test + ;; + *) + echo "Failed to find target" + exit 1 + ;; + esac + shell: bash + - name: golangci-lint + uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v8.0.0 + with: + version: v2.1.6 + + coverage: + needs: ["test-windows"] + runs-on: windows-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - id: goversion + run: echo "goversion=$(cat .go-version)" >> "$GITHUB_OUTPUT" + - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version: ${{ steps.goversion.outputs.goversion }} + - run: make coverage diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed4d259 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +*.prof +*.test +*.swp +/bin/ +cover.out +cover-*.out +/.idea +*.iml +/bbolt +/cmd/bbolt/bbolt +.DS_Store + diff --git a/.go-version b/.go-version new file mode 100644 index 0000000..ae96cc7 --- /dev/null +++ b/.go-version @@ -0,0 +1 @@ +1.24.3 diff --git a/.golangci.yaml b/.golangci.yaml new file mode 100644 index 0000000..68fc131 --- /dev/null +++ b/.golangci.yaml @@ -0,0 +1,34 @@ +formatters: + enable: + - gofmt + - goimports + settings: # please keep this alphabetized + goimports: + local-prefixes: + - go.etcd.io # Put imports beginning with prefix after 3rd-party packages. 
+issues: + max-same-issues: 0 +linters: + default: none + enable: # please keep this alphabetized + - errcheck + - govet + - ineffassign + - staticcheck + - unused + exclusions: + presets: + - comments + - common-false-positives + - legacy + - std-error-handling + settings: # please keep this alphabetized + staticcheck: + checks: + - all + - -QF1003 # Convert if/else-if chain to tagged switch + - -QF1010 # Convert slice of bytes to string when printing it + - -ST1003 # Poorly chosen identifier + - -ST1005 # Incorrectly formatted error string + - -ST1012 # Poorly chosen name for error variable +version: "2" diff --git a/CHANGELOG/CHANGELOG-1.3.md b/CHANGELOG/CHANGELOG-1.3.md new file mode 100644 index 0000000..23009eb --- /dev/null +++ b/CHANGELOG/CHANGELOG-1.3.md @@ -0,0 +1,90 @@ +Note that we start to track changes starting from v1.3.7. + +
+ +## v1.3.11(2024-08-21) + +### BoltDB +- Fix [the `freelist.allocs` isn't rolled back when a tx is rolled back](https://github.com/etcd-io/bbolt/pull/823). + +### CMD +- Add [`-gobench-output` option for bench command to adapt to benchstat](https://github.com/etcd-io/bbolt/pull/802). + +### Other +- [Bump go version to 1.22.x](https://github.com/etcd-io/bbolt/pull/822). +- This patch also added `dmflakey` package, which can be reused by other projects. See https://github.com/etcd-io/bbolt/pull/812. + +
+ +## v1.3.10(2024-05-06) + +### BoltDB +- [Remove deprecated `UnsafeSlice` and use `unsafe.Slice`](https://github.com/etcd-io/bbolt/pull/717) +- [Stabilize the behaviour of Prev when the cursor already points to the first element](https://github.com/etcd-io/bbolt/pull/744) + +### Other +- [Bump go version to 1.21.9](https://github.com/etcd-io/bbolt/pull/713) + +
+ +## v1.3.9(2024-02-24) + +### BoltDB +- [Clone the key before operating data in bucket against the key](https://github.com/etcd-io/bbolt/pull/639) + +### CMD +- [Fix `bbolt keys` and `bbolt get` to prevent them from panicking when no parameter provided](https://github.com/etcd-io/bbolt/pull/683) + +
+ +## v1.3.8(2023-10-26) + +### BoltDB +- Fix [db.close() doesn't unlock the db file if db.munmap() fails](https://github.com/etcd-io/bbolt/pull/439). +- [Avoid syscall.Syscall use on OpenBSD](https://github.com/etcd-io/bbolt/pull/406). +- Fix [rollback panicking after mlock failed or both meta pages corrupted](https://github.com/etcd-io/bbolt/pull/444). +- Fix [bbolt panicking due to 64bit unaligned on arm32](https://github.com/etcd-io/bbolt/pull/584). + +### CMD +- [Update the usage of surgery command](https://github.com/etcd-io/bbolt/pull/411). + +
+ +## v1.3.7(2023-01-31) + +### BoltDB +- Add [recursive checker to confirm database consistency](https://github.com/etcd-io/bbolt/pull/225). +- Add [support to get the page size from the second meta page if the first one is invalid](https://github.com/etcd-io/bbolt/pull/294). +- Add [support for loong64 arch](https://github.com/etcd-io/bbolt/pull/303). +- Add [internal iterator to Bucket that goes over buckets](https://github.com/etcd-io/bbolt/pull/356). +- Add [validation on page read and write](https://github.com/etcd-io/bbolt/pull/358). +- Add [PreLoadFreelist option to support loading free pages in readonly mode](https://github.com/etcd-io/bbolt/pull/381). +- Add [(*Tx) CheckWithOption to support generating human-readable diagnostic messages](https://github.com/etcd-io/bbolt/pull/395). +- Fix [Use `golang.org/x/sys/windows` for `FileLockEx`/`UnlockFileEx`](https://github.com/etcd-io/bbolt/pull/283). +- Fix [readonly file mapping on windows](https://github.com/etcd-io/bbolt/pull/307). +- Fix [the "Last" method might return no data due to not skipping the empty pages](https://github.com/etcd-io/bbolt/pull/341). +- Fix [panic on db.meta when rollback](https://github.com/etcd-io/bbolt/pull/362). + +### CMD +- Add [support for get keys in sub buckets in `bbolt get` command](https://github.com/etcd-io/bbolt/pull/295). +- Add [support for `--format` flag for `bbolt keys` command](https://github.com/etcd-io/bbolt/pull/306). +- Add [safeguards to bbolt CLI commands](https://github.com/etcd-io/bbolt/pull/354). +- Add [`bbolt page` supports --all and --value-format=redacted formats](https://github.com/etcd-io/bbolt/pull/359). +- Add [`bbolt surgery` commands](https://github.com/etcd-io/bbolt/issues/370). +- Fix [open db file readonly mode for commands which shouldn't update the db file](https://github.com/etcd-io/bbolt/pull/365), see also [pull/292](https://github.com/etcd-io/bbolt/pull/292). 
+ +### Other +- [Build bbolt CLI tool, test and format the source code using golang 1.17.13](https://github.com/etcd-io/bbolt/pull/297). +- [Bump golang.org/x/sys to v0.4.0](https://github.com/etcd-io/bbolt/pull/397). + +### Summary +Release v1.3.7 contains following critical fixes: +- fix to problem that `Last` method might return incorrect value ([#341](https://github.com/etcd-io/bbolt/pull/341)) +- fix of potential panic when performing transaction's rollback ([#362](https://github.com/etcd-io/bbolt/pull/362)) + +Other changes focused on defense-in-depth ([#358](https://github.com/etcd-io/bbolt/pull/358), [#294](https://github.com/etcd-io/bbolt/pull/294), [#225](https://github.com/etcd-io/bbolt/pull/225), [#395](https://github.com/etcd-io/bbolt/pull/395)) + +`bbolt` command line tool was expanded to: +- allow fixing simple corruptions by `bbolt surgery` ([#370](https://github.com/etcd-io/bbolt/pull/370)) +- be flexible about output formatting ([#306](https://github.com/etcd-io/bbolt/pull/306), [#359](https://github.com/etcd-io/bbolt/pull/359)) +- allow accessing data in subbuckets ([#295](https://github.com/etcd-io/bbolt/pull/295)) diff --git a/CHANGELOG/CHANGELOG-1.4.md b/CHANGELOG/CHANGELOG-1.4.md new file mode 100644 index 0000000..f5cb940 --- /dev/null +++ b/CHANGELOG/CHANGELOG-1.4.md @@ -0,0 +1,76 @@ + +
+ +## v1.4.0(2025-02-05) +There aren't any production code changes since v1.4.0-beta.0. Only some dependencies +were bumped, some typos in comments and the readme were fixed, and the legacy +build tag `// +build` was removed in https://github.com/etcd-io/bbolt/pull/879. + +
+ +## v1.4.0-beta.0(2024-11-04) + +### BoltDB +- Reorganized the directory structure of freelist source code + - [Move array related freelist source code into a separate file](https://github.com/etcd-io/bbolt/pull/777) + - [Move method `freePages` into freelist.go](https://github.com/etcd-io/bbolt/pull/783) + - [Add an interface for freelist](https://github.com/etcd-io/bbolt/pull/775) +- [Rollback alloc map when a transaction is rolled back](https://github.com/etcd-io/bbolt/pull/819) +- [No handling freelist as a special case when freeing a page](https://github.com/etcd-io/bbolt/pull/788) +- [Ensure hashmap init method clears the data structures](https://github.com/etcd-io/bbolt/pull/794) +- [Panicking when a write transaction tries to free a page allocated by itself](https://github.com/etcd-io/bbolt/pull/792) + +### CMD +- [Add `-gobench-output` flag for `bbolt bench` command](https://github.com/etcd-io/bbolt/pull/765) + +### Other +- [Bump go version to 1.23.x](https://github.com/etcd-io/bbolt/pull/821) + +
+ +## v1.4.0-alpha.1(2024-05-06) + +### BoltDB +- [Enhance check functionality to support checking starting from a pageId](https://github.com/etcd-io/bbolt/pull/659) +- [Optimize the logger performance for frequently called methods](https://github.com/etcd-io/bbolt/pull/741) +- [Stabilize the behaviour of Prev when the cursor already points to the first element](https://github.com/etcd-io/bbolt/pull/734) + +### CMD +- [Fix `bbolt keys` and `bbolt get` to prevent them from panicking when no parameter provided](https://github.com/etcd-io/bbolt/pull/682) +- [Fix surgery freelist command in info logs](https://github.com/etcd-io/bbolt/pull/700) +- [Remove txid references in surgery meta command's comment and description](https://github.com/etcd-io/bbolt/pull/703) +- [Add random read capabilities to bbolt bench](https://github.com/etcd-io/bbolt/pull/711) +- [Use `cobra.ExactArgs` to simplify the argument number check](https://github.com/etcd-io/bbolt/pull/728) +- [Migrate `bbolt check` command to cobra style](https://github.com/etcd-io/bbolt/pull/723) +- [Simplify the naming of cobra commands](https://github.com/etcd-io/bbolt/pull/732) +- [Aggregate adding completed ops for read test of the `bbolt bench` command](https://github.com/etcd-io/bbolt/pull/721) +- [Add `--from-page` flag to `bbolt check` command](https://github.com/etcd-io/bbolt/pull/737) + +### Document +- [Add documentation for a known issue when writing a value with a length of 0](https://github.com/etcd-io/bbolt/pull/730) + +### Test +- [Enhance robustness test to cover XFS](https://github.com/etcd-io/bbolt/pull/707) + +### Other +- [Bump go toolchain version to 1.22.2](https://github.com/etcd-io/bbolt/pull/712) + +
+ +## v1.4.0-alpha.0(2024-01-12) + +### BoltDB +- [Improve the performance of hashmapGetFreePageIDs](https://github.com/etcd-io/bbolt/pull/419) +- [Improve CreateBucketIfNotExists to avoid double searching the same key](https://github.com/etcd-io/bbolt/pull/532) +- [Support Android platform](https://github.com/etcd-io/bbolt/pull/571) +- [Record the count of free page to improve the performance of hashmapFreeCount](https://github.com/etcd-io/bbolt/pull/585) +- [Add logger to bbolt](https://github.com/etcd-io/bbolt/issues/509) +- [Support moving bucket inside the same db](https://github.com/etcd-io/bbolt/pull/635) +- [Support inspecting database structure](https://github.com/etcd-io/bbolt/pull/674) + +### CMD +- [Add `surgery clear-page-elements` command](https://github.com/etcd-io/bbolt/pull/417) +- [Add `surgery abandon-freelist` command](https://github.com/etcd-io/bbolt/pull/443) +- [Add `bbolt version` command](https://github.com/etcd-io/bbolt/pull/552) +- [Add `bbolt inspect` command](https://github.com/etcd-io/bbolt/pull/674) +- [Add `--no-sync` option to `bbolt compact` command](https://github.com/etcd-io/bbolt/pull/290) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..004e77f --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Ben Johnson + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f5a6703 --- /dev/null +++ b/Makefile @@ -0,0 +1,108 @@ +BRANCH=`git rev-parse --abbrev-ref HEAD` +COMMIT=`git rev-parse --short HEAD` +GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)" +GOFILES = $(shell find . -name \*.go) + +TESTFLAGS_RACE=-race=false +ifdef ENABLE_RACE + TESTFLAGS_RACE=-race=true +endif + +TESTFLAGS_CPU= +ifdef CPU + TESTFLAGS_CPU=-cpu=$(CPU) +endif +TESTFLAGS = $(TESTFLAGS_RACE) $(TESTFLAGS_CPU) $(EXTRA_TESTFLAGS) + +TESTFLAGS_TIMEOUT=30m +ifdef TIMEOUT + TESTFLAGS_TIMEOUT=$(TIMEOUT) +endif + +TESTFLAGS_ENABLE_STRICT_MODE=false +ifdef ENABLE_STRICT_MODE + TESTFLAGS_ENABLE_STRICT_MODE=$(ENABLE_STRICT_MODE) +endif + +.EXPORT_ALL_VARIABLES: +TEST_ENABLE_STRICT_MODE=${TESTFLAGS_ENABLE_STRICT_MODE} + +.PHONY: fmt +fmt: + @echo "Verifying gofmt, failures can be fixed with ./scripts/fix.sh" + @!(gofmt -l -s -d ${GOFILES} | grep '[a-z]') + + @echo "Verifying goimports, failures can be fixed with ./scripts/fix.sh" + @!(go run golang.org/x/tools/cmd/goimports@latest -l -d ${GOFILES} | grep '[a-z]') + +.PHONY: lint +lint: + golangci-lint run ./... + +.PHONY: test +test: + @echo "hashmap freelist test" + BBOLT_VERIFY=all TEST_FREELIST_TYPE=hashmap go test -v ${TESTFLAGS} -timeout ${TESTFLAGS_TIMEOUT} + BBOLT_VERIFY=all TEST_FREELIST_TYPE=hashmap go test -v ${TESTFLAGS} ./internal/... 
+ BBOLT_VERIFY=all TEST_FREELIST_TYPE=hashmap go test -v ${TESTFLAGS} ./cmd/bbolt + + @echo "array freelist test" + BBOLT_VERIFY=all TEST_FREELIST_TYPE=array go test -v ${TESTFLAGS} -timeout ${TESTFLAGS_TIMEOUT} + BBOLT_VERIFY=all TEST_FREELIST_TYPE=array go test -v ${TESTFLAGS} ./internal/... + BBOLT_VERIFY=all TEST_FREELIST_TYPE=array go test -v ${TESTFLAGS} ./cmd/bbolt + +.PHONY: coverage +coverage: + @echo "hashmap freelist test" + TEST_FREELIST_TYPE=hashmap go test -v -timeout ${TESTFLAGS_TIMEOUT} \ + -coverprofile cover-freelist-hashmap.out -covermode atomic + + @echo "array freelist test" + TEST_FREELIST_TYPE=array go test -v -timeout ${TESTFLAGS_TIMEOUT} \ + -coverprofile cover-freelist-array.out -covermode atomic + +BOLT_CMD=bbolt + +build: + go build -o bin/${BOLT_CMD} ./cmd/${BOLT_CMD} + +.PHONY: clean +clean: # Clean binaries + rm -f ./bin/${BOLT_CMD} + +.PHONY: gofail-enable +gofail-enable: install-gofail + gofail enable . + +.PHONY: gofail-disable +gofail-disable: install-gofail + gofail disable . + +.PHONY: install-gofail +install-gofail: + go install go.etcd.io/gofail + +.PHONY: test-failpoint +test-failpoint: + @echo "[failpoint] hashmap freelist test" + BBOLT_VERIFY=all TEST_FREELIST_TYPE=hashmap go test -v ${TESTFLAGS} -timeout 30m ./tests/failpoint + + @echo "[failpoint] array freelist test" + BBOLT_VERIFY=all TEST_FREELIST_TYPE=array go test -v ${TESTFLAGS} -timeout 30m ./tests/failpoint + +.PHONY: test-robustness # Running robustness tests requires root permission for now +# TODO: Remove sudo once we fully migrate to the prow infrastructure +test-robustness: gofail-enable build + sudo env PATH=$$PATH go test -v ${TESTFLAGS} ./tests/dmflakey -test.root + sudo env PATH=$(PWD)/bin:$$PATH go test -v ${TESTFLAGS} ${ROBUSTNESS_TESTFLAGS} ./tests/robustness -test.root + +.PHONY: test-benchmark-compare +# Runs benchmark tests on the current git ref and the given REF, and compares +# the two. 
+test-benchmark-compare: install-benchstat + @git fetch + ./scripts/compare_benchmarks.sh $(REF) + +.PHONY: install-benchstat +install-benchstat: + go install golang.org/x/perf/cmd/benchstat@latest diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000..91f168a --- /dev/null +++ b/OWNERS @@ -0,0 +1,10 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +approvers: + - ahrtr # Benjamin Wang + - serathius # Marek Siarkowicz + - ptabor # Piotr Tabor + - spzala # Sahdev Zala +reviewers: + - fuweid # Wei Fu + - tjungblu # Thomas Jungblut diff --git a/README.md b/README.md new file mode 100644 index 0000000..3cecf47 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# Tutus Bolt + +Embedded key/value database for the Tutus blockchain. + +## Overview + +Tutus Bolt is a pure Go embedded database providing ACID transactions with serializable isolation. + +## Installation + +```go +import "github.com/tutus-one/tutus-bolt" +``` + +## License + +MIT License + +--- + +Part of the [Tutus](https://github.com/tutus-one/tutus-chain) blockchain infrastructure. 
diff --git a/allocate_test.go b/allocate_test.go new file mode 100644 index 0000000..4b41f9e --- /dev/null +++ b/allocate_test.go @@ -0,0 +1,39 @@ +package bbolt + +import ( + "testing" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/freelist" +) + +func TestTx_allocatePageStats(t *testing.T) { + for n, f := range map[string]freelist.Interface{"hashmap": freelist.NewHashMapFreelist(), "array": freelist.NewArrayFreelist()} { + t.Run(n, func(t *testing.T) { + ids := []common.Pgid{2, 3} + f.Init(ids) + + tx := &Tx{ + db: &DB{ + freelist: f, + pageSize: common.DefaultPageSize, + }, + meta: &common.Meta{}, + pages: make(map[common.Pgid]*common.Page), + } + + txStats := tx.Stats() + prePageCnt := txStats.GetPageCount() + allocateCnt := f.FreeCount() + + if _, err := tx.allocate(allocateCnt); err != nil { + t.Fatal(err) + } + + txStats = tx.Stats() + if txStats.GetPageCount() != prePageCnt+int64(allocateCnt) { + t.Errorf("Allocated %d but got %d page in stats", allocateCnt, txStats.GetPageCount()) + } + }) + } +} diff --git a/bolt_386.go b/bolt_386.go new file mode 100644 index 0000000..aee2596 --- /dev/null +++ b/bolt_386.go @@ -0,0 +1,7 @@ +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x7FFFFFFF // 2GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0xFFFFFFF diff --git a/bolt_aix.go b/bolt_aix.go new file mode 100644 index 0000000..4b424ed --- /dev/null +++ b/bolt_aix.go @@ -0,0 +1,90 @@ +//go:build aix + +package bbolt + +import ( + "fmt" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/unix" +) + +// flock acquires an advisory lock on a file descriptor. 
+func flock(db *DB, exclusive bool, timeout time.Duration) error { + var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + var lockType int16 + if exclusive { + lockType = syscall.F_WRLCK + } else { + lockType = syscall.F_RDLCK + } + for { + // Attempt to obtain an exclusive lock. + lock := syscall.Flock_t{Type: lockType} + err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock) + if err == nil { + return nil + } else if err != syscall.EAGAIN { + return err + } + + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + + // Wait for a bit and try again. + time.Sleep(flockRetryTimeout) + } +} + +// funlock releases an advisory lock on a file descriptor. +func funlock(db *DB) error { + var lock syscall.Flock_t + lock.Start = 0 + lock.Len = 0 + lock.Type = syscall.F_UNLCK + lock.Whence = 0 + return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock) +} + +// mmap memory maps a DB's data file. +func mmap(db *DB, sz int) error { + // Map the data file to memory. + b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) + if err != nil { + return err + } + + // Advise the kernel that the mmap is accessed randomly. + if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil { + return fmt.Errorf("madvise: %s", err) + } + + // Save the original byte slice and convert to a byte array pointer. + db.dataref = b + db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) + db.datasz = sz + return nil +} + +// munmap unmaps a DB's data file from memory. +func munmap(db *DB) error { + // Ignore the unmap if we have no mapped data. + if db.dataref == nil { + return nil + } + + // Unmap using the original byte slice. 
+ err := unix.Munmap(db.dataref) + db.dataref = nil + db.data = nil + db.datasz = 0 + return err +} diff --git a/bolt_amd64.go b/bolt_amd64.go new file mode 100644 index 0000000..5dd8f3f --- /dev/null +++ b/bolt_amd64.go @@ -0,0 +1,7 @@ +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_android.go b/bolt_android.go new file mode 100644 index 0000000..11890f0 --- /dev/null +++ b/bolt_android.go @@ -0,0 +1,90 @@ +package bbolt + +import ( + "fmt" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/unix" +) + +// flock acquires an advisory lock on a file descriptor. +func flock(db *DB, exclusive bool, timeout time.Duration) error { + var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + var lockType int16 + if exclusive { + lockType = syscall.F_WRLCK + } else { + lockType = syscall.F_RDLCK + } + for { + // Attempt to obtain an exclusive lock. + lock := syscall.Flock_t{Type: lockType} + err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock) + if err == nil { + return nil + } else if err != syscall.EAGAIN { + return err + } + + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + + // Wait for a bit and try again. + time.Sleep(flockRetryTimeout) + } +} + +// funlock releases an advisory lock on a file descriptor. +func funlock(db *DB) error { + var lock syscall.Flock_t + lock.Start = 0 + lock.Len = 0 + lock.Type = syscall.F_UNLCK + lock.Whence = 0 + return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock) +} + +// mmap memory maps a DB's data file. +func mmap(db *DB, sz int) error { + // Map the data file to memory. 
+ b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) + if err != nil { + return err + } + + // Advise the kernel that the mmap is accessed randomly. + err = unix.Madvise(b, syscall.MADV_RANDOM) + if err != nil && err != syscall.ENOSYS { + // Ignore not implemented error in kernel because it still works. + return fmt.Errorf("madvise: %s", err) + } + + // Save the original byte slice and convert to a byte array pointer. + db.dataref = b + db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) + db.datasz = sz + return nil +} + +// munmap unmaps a DB's data file from memory. +func munmap(db *DB) error { + // Ignore the unmap if we have no mapped data. + if db.dataref == nil { + return nil + } + + // Unmap using the original byte slice. + err := unix.Munmap(db.dataref) + db.dataref = nil + db.data = nil + db.datasz = 0 + return err +} diff --git a/bolt_arm.go b/bolt_arm.go new file mode 100644 index 0000000..aee2596 --- /dev/null +++ b/bolt_arm.go @@ -0,0 +1,7 @@ +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x7FFFFFFF // 2GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0xFFFFFFF diff --git a/bolt_arm64.go b/bolt_arm64.go new file mode 100644 index 0000000..2c67ab1 --- /dev/null +++ b/bolt_arm64.go @@ -0,0 +1,9 @@ +//go:build arm64 + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_linux.go b/bolt_linux.go new file mode 100644 index 0000000..7707bca --- /dev/null +++ b/bolt_linux.go @@ -0,0 +1,10 @@ +package bbolt + +import ( + "syscall" +) + +// fdatasync flushes written data to a file descriptor. 
+func fdatasync(db *DB) error { + return syscall.Fdatasync(int(db.file.Fd())) +} diff --git a/bolt_loong64.go b/bolt_loong64.go new file mode 100644 index 0000000..1ef2145 --- /dev/null +++ b/bolt_loong64.go @@ -0,0 +1,9 @@ +//go:build loong64 + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_mips64x.go b/bolt_mips64x.go new file mode 100644 index 0000000..f28a051 --- /dev/null +++ b/bolt_mips64x.go @@ -0,0 +1,9 @@ +//go:build mips64 || mips64le + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x8000000000 // 512GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_mipsx.go b/bolt_mipsx.go new file mode 100644 index 0000000..708fccd --- /dev/null +++ b/bolt_mipsx.go @@ -0,0 +1,9 @@ +//go:build mips || mipsle + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x40000000 // 1GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0xFFFFFFF diff --git a/bolt_openbsd.go b/bolt_openbsd.go new file mode 100644 index 0000000..bf47aa1 --- /dev/null +++ b/bolt_openbsd.go @@ -0,0 +1,16 @@ +package bbolt + +import ( + "golang.org/x/sys/unix" +) + +func msync(db *DB) error { + return unix.Msync(db.data[:db.datasz], unix.MS_INVALIDATE) +} + +func fdatasync(db *DB) error { + if db.data != nil { + return msync(db) + } + return db.file.Sync() +} diff --git a/bolt_ppc.go b/bolt_ppc.go new file mode 100644 index 0000000..6a21cf3 --- /dev/null +++ b/bolt_ppc.go @@ -0,0 +1,9 @@ +//go:build ppc + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x7FFFFFFF // 2GB + +// maxAllocSize is the size used when creating array pointers. 
+const maxAllocSize = 0xFFFFFFF diff --git a/bolt_ppc64.go b/bolt_ppc64.go new file mode 100644 index 0000000..a32f246 --- /dev/null +++ b/bolt_ppc64.go @@ -0,0 +1,9 @@ +//go:build ppc64 + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_ppc64le.go b/bolt_ppc64le.go new file mode 100644 index 0000000..8fb60dd --- /dev/null +++ b/bolt_ppc64le.go @@ -0,0 +1,9 @@ +//go:build ppc64le + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_riscv64.go b/bolt_riscv64.go new file mode 100644 index 0000000..a63d26a --- /dev/null +++ b/bolt_riscv64.go @@ -0,0 +1,9 @@ +//go:build riscv64 + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_s390x.go b/bolt_s390x.go new file mode 100644 index 0000000..749ea97 --- /dev/null +++ b/bolt_s390x.go @@ -0,0 +1,9 @@ +//go:build s390x + +package bbolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/bolt_solaris.go b/bolt_solaris.go new file mode 100644 index 0000000..babad65 --- /dev/null +++ b/bolt_solaris.go @@ -0,0 +1,88 @@ +package bbolt + +import ( + "fmt" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/unix" +) + +// flock acquires an advisory lock on a file descriptor. 
+func flock(db *DB, exclusive bool, timeout time.Duration) error { + var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + var lockType int16 + if exclusive { + lockType = syscall.F_WRLCK + } else { + lockType = syscall.F_RDLCK + } + for { + // Attempt to obtain an exclusive lock. + lock := syscall.Flock_t{Type: lockType} + err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock) + if err == nil { + return nil + } else if err != syscall.EAGAIN { + return err + } + + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return ErrTimeout + } + + // Wait for a bit and try again. + time.Sleep(flockRetryTimeout) + } +} + +// funlock releases an advisory lock on a file descriptor. +func funlock(db *DB) error { + var lock syscall.Flock_t + lock.Start = 0 + lock.Len = 0 + lock.Type = syscall.F_UNLCK + lock.Whence = 0 + return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock) +} + +// mmap memory maps a DB's data file. +func mmap(db *DB, sz int) error { + // Map the data file to memory. + b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) + if err != nil { + return err + } + + // Advise the kernel that the mmap is accessed randomly. + if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil { + return fmt.Errorf("madvise: %s", err) + } + + // Save the original byte slice and convert to a byte array pointer. + db.dataref = b + db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) + db.datasz = sz + return nil +} + +// munmap unmaps a DB's data file from memory. +func munmap(db *DB) error { + // Ignore the unmap if we have no mapped data. + if db.dataref == nil { + return nil + } + + // Unmap using the original byte slice. 
+ err := unix.Munmap(db.dataref) + db.dataref = nil + db.data = nil + db.datasz = 0 + return err +} diff --git a/bolt_unix.go b/bolt_unix.go new file mode 100644 index 0000000..d4552b4 --- /dev/null +++ b/bolt_unix.go @@ -0,0 +1,88 @@ +//go:build !windows && !plan9 && !solaris && !aix && !android + +package bbolt + +import ( + "fmt" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/unix" + + "github.com/tutus-one/tutus-bolt/errors" +) + +// flock acquires an advisory lock on a file descriptor. +func flock(db *DB, exclusive bool, timeout time.Duration) error { + var t time.Time + if timeout != 0 { + t = time.Now() + } + fd := db.file.Fd() + flag := syscall.LOCK_NB + if exclusive { + flag |= syscall.LOCK_EX + } else { + flag |= syscall.LOCK_SH + } + for { + // Attempt to obtain an exclusive lock. + err := syscall.Flock(int(fd), flag) + if err == nil { + return nil + } else if err != syscall.EWOULDBLOCK { + return err + } + + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return errors.ErrTimeout + } + + // Wait for a bit and try again. + time.Sleep(flockRetryTimeout) + } +} + +// funlock releases an advisory lock on a file descriptor. +func funlock(db *DB) error { + return syscall.Flock(int(db.file.Fd()), syscall.LOCK_UN) +} + +// mmap memory maps a DB's data file. +func mmap(db *DB, sz int) error { + // Map the data file to memory. + b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags) + if err != nil { + return err + } + + // Advise the kernel that the mmap is accessed randomly. + err = unix.Madvise(b, syscall.MADV_RANDOM) + if err != nil && err != syscall.ENOSYS { + // Ignore not implemented error in kernel because it still works. + return fmt.Errorf("madvise: %s", err) + } + + // Save the original byte slice and convert to a byte array pointer. 
+ db.dataref = b + db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0])) + db.datasz = sz + return nil +} + +// munmap unmaps a DB's data file from memory. +func munmap(db *DB) error { + // Ignore the unmap if we have no mapped data. + if db.dataref == nil { + return nil + } + + // Unmap using the original byte slice. + err := unix.Munmap(db.dataref) + db.dataref = nil + db.data = nil + db.datasz = 0 + return err +} diff --git a/bolt_windows.go b/bolt_windows.go new file mode 100644 index 0000000..641e6e6 --- /dev/null +++ b/bolt_windows.go @@ -0,0 +1,132 @@ +package bbolt + +import ( + "fmt" + "os" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/windows" + + "github.com/tutus-one/tutus-bolt/errors" +) + +// fdatasync flushes written data to a file descriptor. +func fdatasync(db *DB) error { + return db.file.Sync() +} + +// flock acquires an advisory lock on a file descriptor. +func flock(db *DB, exclusive bool, timeout time.Duration) error { + var t time.Time + if timeout != 0 { + t = time.Now() + } + var flags uint32 = windows.LOCKFILE_FAIL_IMMEDIATELY + if exclusive { + flags |= windows.LOCKFILE_EXCLUSIVE_LOCK + } + for { + // Fix for https://github.com/etcd-io/bbolt/issues/121. Use byte-range + // -1..0 as the lock on the database file. + var m1 uint32 = (1 << 32) - 1 // -1 in a uint32 + err := windows.LockFileEx(windows.Handle(db.file.Fd()), flags, 0, 1, 0, &windows.Overlapped{ + Offset: m1, + OffsetHigh: m1, + }) + + if err == nil { + return nil + } else if err != windows.ERROR_LOCK_VIOLATION { + return err + } + + // If we timed out then return an error. + if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout { + return errors.ErrTimeout + } + + // Wait for a bit and try again. + time.Sleep(flockRetryTimeout) + } +} + +// funlock releases an advisory lock on a file descriptor.
+func funlock(db *DB) error { + var m1 uint32 = (1 << 32) - 1 // -1 in a uint32 + return windows.UnlockFileEx(windows.Handle(db.file.Fd()), 0, 1, 0, &windows.Overlapped{ + Offset: m1, + OffsetHigh: m1, + }) +} + +// mmap memory maps a DB's data file. +// Based on: https://github.com/edsrzf/mmap-go +func mmap(db *DB, sz int) error { + var sizelo, sizehi uint32 + + if !db.readOnly { + if db.MaxSize > 0 && sz > db.MaxSize { + // The max size only limits future writes; however, we don’t block opening + // and mapping the database if it already exceeds the limit. + fileSize, err := db.fileSize() + if err != nil { + return fmt.Errorf("could not check existing db file size: %s", err) + } + + if sz > fileSize { + return errors.ErrMaxSizeReached + } + } + + // Truncate the database to the size of the mmap. + if err := db.file.Truncate(int64(sz)); err != nil { + return fmt.Errorf("truncate: %s", err) + } + sizehi = uint32(sz >> 32) + sizelo = uint32(sz) + } + + // Open a file mapping handle. + h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizehi, sizelo, nil) + if h == 0 { + return os.NewSyscallError("CreateFileMapping", errno) + } + + // Create the memory map. + addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, 0) + if addr == 0 { + // Do our best and report error returned from MapViewOfFile. + _ = syscall.CloseHandle(h) + return os.NewSyscallError("MapViewOfFile", errno) + } + + // Close mapping handle. + if err := syscall.CloseHandle(syscall.Handle(h)); err != nil { + return os.NewSyscallError("CloseHandle", err) + } + + // Convert to a byte array. + db.data = (*[maxMapSize]byte)(unsafe.Pointer(addr)) + db.datasz = sz + + return nil +} + +// munmap unmaps a pointer from a file. 
+// Based on: https://github.com/edsrzf/mmap-go +func munmap(db *DB) error { + if db.data == nil { + return nil + } + + addr := (uintptr)(unsafe.Pointer(&db.data[0])) + var err1 error + if err := syscall.UnmapViewOfFile(addr); err != nil { + err1 = os.NewSyscallError("UnmapViewOfFile", err) + } + db.data = nil + db.datasz = 0 + return err1 +} diff --git a/boltsync_unix.go b/boltsync_unix.go new file mode 100644 index 0000000..27face7 --- /dev/null +++ b/boltsync_unix.go @@ -0,0 +1,8 @@ +//go:build !windows && !plan9 && !linux && !openbsd + +package bbolt + +// fdatasync flushes written data to a file descriptor. +func fdatasync(db *DB) error { + return db.file.Sync() +} diff --git a/bucket.go b/bucket.go new file mode 100644 index 0000000..ab38902 --- /dev/null +++ b/bucket.go @@ -0,0 +1,1005 @@ +package bbolt + +import ( + "bytes" + "fmt" + "unsafe" + + "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +const ( + // MaxKeySize is the maximum length of a key, in bytes. + MaxKeySize = 32768 + + // MaxValueSize is the maximum length of a value, in bytes. + MaxValueSize = (1 << 31) - 2 +) + +const ( + minFillPercent = 0.1 + maxFillPercent = 1.0 +) + +// DefaultFillPercent is the percentage that split pages are filled. +// This value can be changed by setting Bucket.FillPercent. +const DefaultFillPercent = 0.5 + +// Bucket represents a collection of key/value pairs inside the database. +type Bucket struct { + *common.InBucket + tx *Tx // the associated transaction + buckets map[string]*Bucket // subbucket cache + page *common.Page // inline page reference + rootNode *node // materialized node for the root page. + nodes map[common.Pgid]*node // node cache + + // Sets the threshold for filling nodes when they split. By default, + // the bucket will fill to 50% but it can be useful to increase this + // amount if you know that your write workloads are mostly append-only. 
+ // + // This is non-persisted across transactions so it must be set in every Tx. + FillPercent float64 +} + +// newBucket returns a new bucket associated with a transaction. +func newBucket(tx *Tx) Bucket { + var b = Bucket{tx: tx, FillPercent: DefaultFillPercent} + if tx.writable { + b.buckets = make(map[string]*Bucket) + b.nodes = make(map[common.Pgid]*node) + } + return b +} + +// Tx returns the tx of the bucket. +func (b *Bucket) Tx() *Tx { + return b.tx +} + +// Root returns the root of the bucket. +func (b *Bucket) Root() common.Pgid { + return b.RootPage() +} + +// Writable returns whether the bucket is writable. +func (b *Bucket) Writable() bool { + return b.tx.writable +} + +// Cursor creates a cursor associated with the bucket. +// The cursor is only valid as long as the transaction is open. +// Do not use a cursor after the transaction is closed. +func (b *Bucket) Cursor() *Cursor { + // Update transaction statistics. + b.tx.stats.IncCursorCount(1) + + // Allocate and return a cursor. + return &Cursor{ + bucket: b, + stack: make([]elemRef, 0), + } +} + +// Bucket retrieves a nested bucket by name. +// Returns nil if the bucket does not exist. +// The bucket instance is only valid for the lifetime of the transaction. +func (b *Bucket) Bucket(name []byte) *Bucket { + if b.buckets != nil { + if child := b.buckets[string(name)]; child != nil { + return child + } + } + + // Move cursor to key. + c := b.Cursor() + k, v, flags := c.seek(name) + + // Return nil if the key doesn't exist or it is not a bucket. + if !bytes.Equal(name, k) || (flags&common.BucketLeafFlag) == 0 { + return nil + } + + // Otherwise create a bucket and cache it. 
+ var child = b.openBucket(v) + if b.buckets != nil { + b.buckets[string(name)] = child + } + + return child +} + +// Helper method that re-interprets a sub-bucket value +// from a parent into a Bucket +func (b *Bucket) openBucket(value []byte) *Bucket { + var child = newBucket(b.tx) + + // Unaligned access requires a copy to be made. + const unalignedMask = unsafe.Alignof(struct { + common.InBucket + common.Page + }{}) - 1 + unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0 + if unaligned { + value = cloneBytes(value) + } + + // If this is a writable transaction then we need to copy the bucket entry. + // Read-only transactions can point directly at the mmap entry. + if b.tx.writable && !unaligned { + child.InBucket = &common.InBucket{} + *child.InBucket = *(*common.InBucket)(unsafe.Pointer(&value[0])) + } else { + child.InBucket = (*common.InBucket)(unsafe.Pointer(&value[0])) + } + + // Save a reference to the inline page if the bucket is inline. + if child.RootPage() == 0 { + child.page = (*common.Page)(unsafe.Pointer(&value[common.BucketHeaderSize])) + } + + return &child +} + +// CreateBucket creates a new bucket at the given key and returns the new bucket. +// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. +func (b *Bucket) CreateBucket(key []byte) (rb *Bucket, err error) { + if lg := b.tx.db.Logger(); lg != discardLogger { + lg.Debugf("Creating bucket %q", key) + defer func() { + if err != nil { + lg.Errorf("Creating bucket %q failed: %v", key, err) + } else { + lg.Debugf("Creating bucket %q successfully", key) + } + }() + } + if b.tx.db == nil { + return nil, errors.ErrTxClosed + } else if !b.tx.writable { + return nil, errors.ErrTxNotWritable + } else if len(key) == 0 { + return nil, errors.ErrBucketNameRequired + } + + // Insert into node. 
+ // Tip: Use a new variable `newKey` instead of reusing the existing `key` to prevent + // it from being marked as leaking, and accordingly cannot be allocated on stack. + newKey := cloneBytes(key) + + // Move cursor to correct position. + c := b.Cursor() + k, _, flags := c.seek(newKey) + + // Return an error if there is an existing key. + if bytes.Equal(newKey, k) { + if (flags & common.BucketLeafFlag) != 0 { + return nil, errors.ErrBucketExists + } + return nil, errors.ErrIncompatibleValue + } + + // Create empty, inline bucket. + var bucket = Bucket{ + InBucket: &common.InBucket{}, + rootNode: &node{isLeaf: true}, + FillPercent: DefaultFillPercent, + } + var value = bucket.write() + + c.node().put(newKey, newKey, value, 0, common.BucketLeafFlag) + + // Since subbuckets are not allowed on inline buckets, we need to + // dereference the inline page, if it exists. This will cause the bucket + // to be treated as a regular, non-inline bucket for the rest of the tx. + b.page = nil + + return b.Bucket(newKey), nil +} + +// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it. +// Returns an error if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. +func (b *Bucket) CreateBucketIfNotExists(key []byte) (rb *Bucket, err error) { + if lg := b.tx.db.Logger(); lg != discardLogger { + lg.Debugf("Creating bucket if not exist %q", key) + defer func() { + if err != nil { + lg.Errorf("Creating bucket if not exist %q failed: %v", key, err) + } else { + lg.Debugf("Creating bucket if not exist %q successfully", key) + } + }() + } + + if b.tx.db == nil { + return nil, errors.ErrTxClosed + } else if !b.tx.writable { + return nil, errors.ErrTxNotWritable + } else if len(key) == 0 { + return nil, errors.ErrBucketNameRequired + } + + // Insert into node. 
+ // Tip: Use a new variable `newKey` instead of reusing the existing `key` to prevent + // it from being marked as leaking, and accordingly cannot be allocated on stack. + newKey := cloneBytes(key) + + if b.buckets != nil { + if child := b.buckets[string(newKey)]; child != nil { + return child, nil + } + } + + // Move cursor to correct position. + c := b.Cursor() + k, v, flags := c.seek(newKey) + + // Return an error if there is an existing non-bucket key. + if bytes.Equal(newKey, k) { + if (flags & common.BucketLeafFlag) != 0 { + var child = b.openBucket(v) + if b.buckets != nil { + b.buckets[string(newKey)] = child + } + + return child, nil + } + return nil, errors.ErrIncompatibleValue + } + + // Create empty, inline bucket. + var bucket = Bucket{ + InBucket: &common.InBucket{}, + rootNode: &node{isLeaf: true}, + FillPercent: DefaultFillPercent, + } + var value = bucket.write() + + c.node().put(newKey, newKey, value, 0, common.BucketLeafFlag) + + // Since subbuckets are not allowed on inline buckets, we need to + // dereference the inline page, if it exists. This will cause the bucket + // to be treated as a regular, non-inline bucket for the rest of the tx. + b.page = nil + + return b.Bucket(newKey), nil +} + +// DeleteBucket deletes a bucket at the given key. +// Returns an error if the bucket does not exist, or if the key represents a non-bucket value. +func (b *Bucket) DeleteBucket(key []byte) (err error) { + if lg := b.tx.db.Logger(); lg != discardLogger { + lg.Debugf("Deleting bucket %q", key) + defer func() { + if err != nil { + lg.Errorf("Deleting bucket %q failed: %v", key, err) + } else { + lg.Debugf("Deleting bucket %q successfully", key) + } + }() + } + + if b.tx.db == nil { + return errors.ErrTxClosed + } else if !b.Writable() { + return errors.ErrTxNotWritable + } + + newKey := cloneBytes(key) + + // Move cursor to correct position. + c := b.Cursor() + k, _, flags := c.seek(newKey) + + // Return an error if bucket doesn't exist or is not a bucket. 
+ if !bytes.Equal(newKey, k) { + return errors.ErrBucketNotFound + } else if (flags & common.BucketLeafFlag) == 0 { + return errors.ErrIncompatibleValue + } + + // Recursively delete all child buckets. + child := b.Bucket(newKey) + err = child.ForEachBucket(func(k []byte) error { + if err := child.DeleteBucket(k); err != nil { + return fmt.Errorf("delete bucket: %s", err) + } + return nil + }) + if err != nil { + return err + } + + // Remove cached copy. + delete(b.buckets, string(newKey)) + + // Release all bucket pages to freelist. + child.nodes = nil + child.rootNode = nil + child.free() + + // Delete the node if we have a matching key. + c.node().del(newKey) + + return nil +} + +// MoveBucket moves a sub-bucket from the source bucket to the destination bucket. +// Returns an error if +// 1. the sub-bucket cannot be found in the source bucket; +// 2. or the key already exists in the destination bucket; +// 3. or the key represents a non-bucket value; +// 4. the source and destination buckets are the same. +func (b *Bucket) MoveBucket(key []byte, dstBucket *Bucket) (err error) { + lg := b.tx.db.Logger() + if lg != discardLogger { + lg.Debugf("Moving bucket %q", key) + defer func() { + if err != nil { + lg.Errorf("Moving bucket %q failed: %v", key, err) + } else { + lg.Debugf("Moving bucket %q successfully", key) + } + }() + } + + if b.tx.db == nil || dstBucket.tx.db == nil { + return errors.ErrTxClosed + } else if !b.Writable() || !dstBucket.Writable() { + return errors.ErrTxNotWritable + } + + if b.tx.db.Path() != dstBucket.tx.db.Path() || b.tx != dstBucket.tx { + lg.Errorf("The source and target buckets are not in the same db file, source bucket in %s and target bucket in %s", b.tx.db.Path(), dstBucket.tx.db.Path()) + return errors.ErrDifferentDB + } + + newKey := cloneBytes(key) + + // Move cursor to correct position. + c := b.Cursor() + k, v, flags := c.seek(newKey) + + // Return an error if bucket doesn't exist or is not a bucket. 
+ if !bytes.Equal(newKey, k) { + return errors.ErrBucketNotFound + } else if (flags & common.BucketLeafFlag) == 0 { + lg.Errorf("An incompatible key %s exists in the source bucket", newKey) + return errors.ErrIncompatibleValue + } + + // Do nothing (return true directly) if the source bucket and the + // destination bucket are actually the same bucket. + if b == dstBucket || (b.RootPage() == dstBucket.RootPage() && b.RootPage() != 0) { + lg.Errorf("The source bucket (%s) and the target bucket (%s) are the same bucket", b, dstBucket) + return errors.ErrSameBuckets + } + + // check whether the key already exists in the destination bucket + curDst := dstBucket.Cursor() + k, _, flags = curDst.seek(newKey) + + // Return an error if there is an existing key in the destination bucket. + if bytes.Equal(newKey, k) { + if (flags & common.BucketLeafFlag) != 0 { + return errors.ErrBucketExists + } + lg.Errorf("An incompatible key %s exists in the target bucket", newKey) + return errors.ErrIncompatibleValue + } + + // remove the sub-bucket from the source bucket + delete(b.buckets, string(newKey)) + c.node().del(newKey) + + // add te sub-bucket to the destination bucket + newValue := cloneBytes(v) + curDst.node().put(newKey, newKey, newValue, 0, common.BucketLeafFlag) + + return nil +} + +// Inspect returns the structure of the bucket. +func (b *Bucket) Inspect() BucketStructure { + return b.recursivelyInspect([]byte("root")) +} + +func (b *Bucket) recursivelyInspect(name []byte) BucketStructure { + bs := BucketStructure{Name: string(name)} + + keyN := 0 + c := b.Cursor() + for k, _, flags := c.first(); k != nil; k, _, flags = c.next() { + if flags&common.BucketLeafFlag != 0 { + childBucket := b.Bucket(k) + childBS := childBucket.recursivelyInspect(k) + bs.Children = append(bs.Children, childBS) + } else { + keyN++ + } + } + bs.KeyN = keyN + + return bs +} + +// Get retrieves the value for a key in the bucket. 
+// Returns a nil value if the key does not exist or if the key is a nested bucket. +// The returned value is only valid for the life of the transaction. +// The returned memory is owned by bbolt and must never be modified; writing to this memory might corrupt the database. +func (b *Bucket) Get(key []byte) []byte { + k, v, flags := b.Cursor().seek(key) + + // Return nil if this is a bucket. + if (flags & common.BucketLeafFlag) != 0 { + return nil + } + + // If our target node isn't the same key as what's passed in then return nil. + if !bytes.Equal(key, k) { + return nil + } + return v +} + +// Put sets the value for a key in the bucket. +// If the key exist then its previous value will be overwritten. +// Supplied value must remain valid for the life of the transaction. +// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large. +func (b *Bucket) Put(key []byte, value []byte) (err error) { + if lg := b.tx.db.Logger(); lg != discardLogger { + lg.Debugf("Putting key %q", key) + defer func() { + if err != nil { + lg.Errorf("Putting key %q failed: %v", key, err) + } else { + lg.Debugf("Putting key %q successfully", key) + } + }() + } + if b.tx.db == nil { + return errors.ErrTxClosed + } else if !b.Writable() { + return errors.ErrTxNotWritable + } else if len(key) == 0 { + return errors.ErrKeyRequired + } else if len(key) > MaxKeySize { + return errors.ErrKeyTooLarge + } else if int64(len(value)) > MaxValueSize { + return errors.ErrValueTooLarge + } + + // Insert into node. + // Tip: Use a new variable `newKey` instead of reusing the existing `key` to prevent + // it from being marked as leaking, and accordingly cannot be allocated on stack. + newKey := cloneBytes(key) + + // Move cursor to correct position. + c := b.Cursor() + k, _, flags := c.seek(newKey) + + // Return an error if there is an existing key with a bucket value. 
+ if bytes.Equal(newKey, k) && (flags&common.BucketLeafFlag) != 0 { + return errors.ErrIncompatibleValue + } + + // gofail: var beforeBucketPut struct{} + + c.node().put(newKey, newKey, value, 0, 0) + + return nil +} + +// Delete removes a key from the bucket. +// If the key does not exist then nothing is done and a nil error is returned. +// Returns an error if the bucket was created from a read-only transaction. +func (b *Bucket) Delete(key []byte) (err error) { + if lg := b.tx.db.Logger(); lg != discardLogger { + lg.Debugf("Deleting key %q", key) + defer func() { + if err != nil { + lg.Errorf("Deleting key %q failed: %v", key, err) + } else { + lg.Debugf("Deleting key %q successfully", key) + } + }() + } + + if b.tx.db == nil { + return errors.ErrTxClosed + } else if !b.Writable() { + return errors.ErrTxNotWritable + } + + // Move cursor to correct position. + c := b.Cursor() + k, _, flags := c.seek(key) + + // Return nil if the key doesn't exist. + if !bytes.Equal(key, k) { + return nil + } + + // Return an error if there is already existing bucket value. + if (flags & common.BucketLeafFlag) != 0 { + return errors.ErrIncompatibleValue + } + + // Delete the node if we have a matching key. + c.node().del(key) + + return nil +} + +// Sequence returns the current integer for the bucket without incrementing it. +func (b *Bucket) Sequence() uint64 { + return b.InSequence() +} + +// SetSequence updates the sequence number for the bucket. +func (b *Bucket) SetSequence(v uint64) error { + if b.tx.db == nil { + return errors.ErrTxClosed + } else if !b.Writable() { + return errors.ErrTxNotWritable + } + + // Materialize the root node if it hasn't been already so that the + // bucket will be saved during commit. + if b.rootNode == nil { + _ = b.node(b.RootPage(), nil) + } + + // Set the sequence. + b.SetInSequence(v) + return nil +} + +// NextSequence returns an autoincrementing integer for the bucket. 
+func (b *Bucket) NextSequence() (uint64, error) { + if b.tx.db == nil { + return 0, errors.ErrTxClosed + } else if !b.Writable() { + return 0, errors.ErrTxNotWritable + } + + // Materialize the root node if it hasn't been already so that the + // bucket will be saved during commit. + if b.rootNode == nil { + _ = b.node(b.RootPage(), nil) + } + + // Increment and return the sequence. + b.IncSequence() + return b.Sequence(), nil +} + +// ForEach executes a function for each key/value pair in a bucket. +// Because ForEach uses a Cursor, the iteration over keys is in lexicographical order. +// If the provided function returns an error then the iteration is stopped and +// the error is returned to the caller. The provided function must not modify +// the bucket; this will result in undefined behavior. +func (b *Bucket) ForEach(fn func(k, v []byte) error) error { + if b.tx.db == nil { + return errors.ErrTxClosed + } + c := b.Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + if err := fn(k, v); err != nil { + return err + } + } + return nil +} + +func (b *Bucket) ForEachBucket(fn func(k []byte) error) error { + if b.tx.db == nil { + return errors.ErrTxClosed + } + c := b.Cursor() + for k, _, flags := c.first(); k != nil; k, _, flags = c.next() { + if flags&common.BucketLeafFlag != 0 { + if err := fn(k); err != nil { + return err + } + } + } + return nil +} + +// Stats returns stats on a bucket. +func (b *Bucket) Stats() BucketStats { + var s, subStats BucketStats + pageSize := b.tx.db.pageSize + s.BucketN += 1 + if b.RootPage() == 0 { + s.InlineBucketN += 1 + } + b.forEachPage(func(p *common.Page, depth int, pgstack []common.Pgid) { + if p.IsLeafPage() { + s.KeyN += int(p.Count()) + + // used totals the used bytes for the page + used := common.PageHeaderSize + + if p.Count() != 0 { + // If page has any elements, add all element headers. + used += common.LeafPageElementSize * uintptr(p.Count()-1) + + // Add all element key, value sizes. 
+ // The computation takes advantage of the fact that the position + // of the last element's key/value equals to the total of the sizes + // of all previous elements' keys and values. + // It also includes the last element's header. + lastElement := p.LeafPageElement(p.Count() - 1) + used += uintptr(lastElement.Pos() + lastElement.Ksize() + lastElement.Vsize()) + } + + if b.RootPage() == 0 { + // For inlined bucket just update the inline stats + s.InlineBucketInuse += int(used) + } else { + // For non-inlined bucket update all the leaf stats + s.LeafPageN++ + s.LeafInuse += int(used) + s.LeafOverflowN += int(p.Overflow()) + + // Collect stats from sub-buckets. + // Do that by iterating over all element headers + // looking for the ones with the bucketLeafFlag. + for i := uint16(0); i < p.Count(); i++ { + e := p.LeafPageElement(i) + if (e.Flags() & common.BucketLeafFlag) != 0 { + // For any bucket element, open the element value + // and recursively call Stats on the contained bucket. + subStats.Add(b.openBucket(e.Value()).Stats()) + } + } + } + } else if p.IsBranchPage() { + s.BranchPageN++ + lastElement := p.BranchPageElement(p.Count() - 1) + + // used totals the used bytes for the page + // Add header and all element headers. + used := common.PageHeaderSize + (common.BranchPageElementSize * uintptr(p.Count()-1)) + + // Add size of all keys and values. + // Again, use the fact that last element's position equals to + // the total of key, value sizes of all previous elements. + used += uintptr(lastElement.Pos() + lastElement.Ksize()) + s.BranchInuse += int(used) + s.BranchOverflowN += int(p.Overflow()) + } + + // Keep track of maximum page depth. + if depth+1 > s.Depth { + s.Depth = depth + 1 + } + }) + + // Alloc stats can be computed from page counts and pageSize. + s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize + s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize + + // Add the max depth of sub-buckets to get total nested depth. 
+ s.Depth += subStats.Depth + // Add the stats for all sub-buckets + s.Add(subStats) + return s +} + +// forEachPage iterates over every page in a bucket, including inline pages. +func (b *Bucket) forEachPage(fn func(*common.Page, int, []common.Pgid)) { + // If we have an inline page then just use that. + if b.page != nil { + fn(b.page, 0, []common.Pgid{b.RootPage()}) + return + } + + // Otherwise traverse the page hierarchy. + b.tx.forEachPage(b.RootPage(), fn) +} + +// forEachPageNode iterates over every page (or node) in a bucket. +// This also includes inline pages. +func (b *Bucket) forEachPageNode(fn func(*common.Page, *node, int)) { + // If we have an inline page or root node then just use that. + if b.page != nil { + fn(b.page, nil, 0) + return + } + b._forEachPageNode(b.RootPage(), 0, fn) +} + +func (b *Bucket) _forEachPageNode(pgId common.Pgid, depth int, fn func(*common.Page, *node, int)) { + var p, n = b.pageNode(pgId) + + // Execute function. + fn(p, n, depth) + + // Recursively loop over children. + if p != nil { + if p.IsBranchPage() { + for i := 0; i < int(p.Count()); i++ { + elem := p.BranchPageElement(uint16(i)) + b._forEachPageNode(elem.Pgid(), depth+1, fn) + } + } + } else { + if !n.isLeaf { + for _, inode := range n.inodes { + b._forEachPageNode(inode.Pgid(), depth+1, fn) + } + } + } +} + +// spill writes all the nodes for this bucket to dirty pages. +func (b *Bucket) spill() error { + // Spill all child buckets first. + for name, child := range b.buckets { + // If the child bucket is small enough and it has no child buckets then + // write it inline into the parent bucket's page. Otherwise spill it + // like a normal bucket and make the parent value a pointer to the page. + var value []byte + if child.inlineable() { + child.free() + value = child.write() + } else { + if err := child.spill(); err != nil { + return err + } + + // Update the child bucket header in this bucket. 
+ value = make([]byte, unsafe.Sizeof(common.InBucket{})) + var bucket = (*common.InBucket)(unsafe.Pointer(&value[0])) + *bucket = *child.InBucket + } + + // Skip writing the bucket if there are no materialized nodes. + if child.rootNode == nil { + continue + } + + // Update parent node. + var c = b.Cursor() + k, _, flags := c.seek([]byte(name)) + if !bytes.Equal([]byte(name), k) { + panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k)) + } + if flags&common.BucketLeafFlag == 0 { + panic(fmt.Sprintf("unexpected bucket header flag: %x", flags)) + } + c.node().put([]byte(name), []byte(name), value, 0, common.BucketLeafFlag) + } + + // Ignore if there's not a materialized root node. + if b.rootNode == nil { + return nil + } + + // Spill nodes. + if err := b.rootNode.spill(); err != nil { + return err + } + b.rootNode = b.rootNode.root() + + // Update the root node for this bucket. + if b.rootNode.pgid >= b.tx.meta.Pgid() { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.Pgid())) + } + b.SetRootPage(b.rootNode.pgid) + + return nil +} + +// inlineable returns true if a bucket is small enough to be written inline +// and if it contains no subbuckets. Otherwise, returns false. +func (b *Bucket) inlineable() bool { + var n = b.rootNode + + // Bucket must only contain a single leaf node. + if n == nil || !n.isLeaf { + return false + } + + // Bucket is not inlineable if it contains subbuckets or if it goes beyond + // our threshold for inline bucket size. + var size = common.PageHeaderSize + for _, inode := range n.inodes { + size += common.LeafPageElementSize + uintptr(len(inode.Key())) + uintptr(len(inode.Value())) + + if inode.Flags()&common.BucketLeafFlag != 0 { + return false + } else if size > b.maxInlineBucketSize() { + return false + } + } + + return true +} + +// Returns the maximum total size of a bucket to make it a candidate for inlining. 
+func (b *Bucket) maxInlineBucketSize() uintptr { + return uintptr(b.tx.db.pageSize / 4) +} + +// write allocates and writes a bucket to a byte slice. +func (b *Bucket) write() []byte { + // Allocate the appropriate size. + var n = b.rootNode + var value = make([]byte, common.BucketHeaderSize+n.size()) + + // Write a bucket header. + var bucket = (*common.InBucket)(unsafe.Pointer(&value[0])) + *bucket = *b.InBucket + + // Convert byte slice to a fake page and write the root node. + var p = (*common.Page)(unsafe.Pointer(&value[common.BucketHeaderSize])) + n.write(p) + + return value +} + +// rebalance attempts to balance all nodes. +func (b *Bucket) rebalance() { + for _, n := range b.nodes { + n.rebalance() + } + for _, child := range b.buckets { + child.rebalance() + } +} + +// node creates a node from a page and associates it with a given parent. +func (b *Bucket) node(pgId common.Pgid, parent *node) *node { + common.Assert(b.nodes != nil, "nodes map expected") + + // Retrieve node if it's already been created. + if n := b.nodes[pgId]; n != nil { + return n + } + + // Otherwise create a node and cache it. + n := &node{bucket: b, parent: parent} + if parent == nil { + b.rootNode = n + } else { + parent.children = append(parent.children, n) + } + + // Use the inline page if this is an inline bucket. + var p = b.page + if p == nil { + p = b.tx.page(pgId) + } else { + // if p isn't nil, then it's an inline bucket. + // The pgId must be 0 in this case. + common.Verify(func() { + common.Assert(pgId == 0, "The page ID (%d) isn't 0 for an inline bucket", pgId) + }) + } + + // Read the page into the node and cache it. + n.read(p) + b.nodes[pgId] = n + + // Update statistics. + b.tx.stats.IncNodeCount(1) + + return n +} + +// free recursively frees all pages in the bucket. 
+func (b *Bucket) free() { + if b.RootPage() == 0 { + return + } + + var tx = b.tx + b.forEachPageNode(func(p *common.Page, n *node, _ int) { + if p != nil { + tx.db.freelist.Free(tx.meta.Txid(), p) + } else { + n.free() + } + }) + b.SetRootPage(0) +} + +// dereference removes all references to the old mmap. +func (b *Bucket) dereference() { + if b.rootNode != nil { + b.rootNode.root().dereference() + } + + for _, child := range b.buckets { + child.dereference() + } +} + +// pageNode returns the in-memory node, if it exists. +// Otherwise, returns the underlying page. +func (b *Bucket) pageNode(id common.Pgid) (*common.Page, *node) { + // Inline buckets have a fake page embedded in their value so treat them + // differently. We'll return the rootNode (if available) or the fake page. + if b.RootPage() == 0 { + if id != 0 { + panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id)) + } + if b.rootNode != nil { + return nil, b.rootNode + } + return b.page, nil + } + + // Check the node cache for non-inline buckets. + if b.nodes != nil { + if n := b.nodes[id]; n != nil { + return nil, n + } + } + + // Finally lookup the page from the transaction if no node is materialized. + return b.tx.page(id), nil +} + +// BucketStats records statistics about resources used by a bucket. +type BucketStats struct { + // Page count statistics. + BranchPageN int // number of logical branch pages + BranchOverflowN int // number of physical branch overflow pages + LeafPageN int // number of logical leaf pages + LeafOverflowN int // number of physical leaf overflow pages + + // Tree statistics. + KeyN int // number of keys/value pairs + Depth int // number of levels in B+tree + + // Page size utilization. 
+ BranchAlloc int // bytes allocated for physical branch pages + BranchInuse int // bytes actually used for branch data + LeafAlloc int // bytes allocated for physical leaf pages + LeafInuse int // bytes actually used for leaf data + + // Bucket statistics + BucketN int // total number of buckets including the top bucket + InlineBucketN int // total number on inlined buckets + InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse) +} + +func (s *BucketStats) Add(other BucketStats) { + s.BranchPageN += other.BranchPageN + s.BranchOverflowN += other.BranchOverflowN + s.LeafPageN += other.LeafPageN + s.LeafOverflowN += other.LeafOverflowN + s.KeyN += other.KeyN + if s.Depth < other.Depth { + s.Depth = other.Depth + } + s.BranchAlloc += other.BranchAlloc + s.BranchInuse += other.BranchInuse + s.LeafAlloc += other.LeafAlloc + s.LeafInuse += other.LeafInuse + + s.BucketN += other.BucketN + s.InlineBucketN += other.InlineBucketN + s.InlineBucketInuse += other.InlineBucketInuse +} + +// cloneBytes returns a copy of a given slice. +func cloneBytes(v []byte) []byte { + var clone = make([]byte, len(v)) + copy(clone, v) + return clone +} + +type BucketStructure struct { + Name string `json:"name"` // name of the bucket + KeyN int `json:"keyN"` // number of key/value pairs + Children []BucketStructure `json:"buckets,omitempty"` // child buckets +} diff --git a/bucket_test.go b/bucket_test.go new file mode 100644 index 0000000..cfcdc76 --- /dev/null +++ b/bucket_test.go @@ -0,0 +1,2170 @@ +package bbolt_test + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "log" + "math/rand" + "os" + "strconv" + "strings" + "testing" + "testing/quick" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + berrors "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +// Ensure that a bucket that gets a non-existent key returns nil. 
+func TestBucket_Get_NonExistent(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if v := b.Get([]byte("foo")); v != nil { + t.Fatal("expected nil value") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can read a value that is not flushed yet. +func TestBucket_Get_FromNode(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if v := b.Get([]byte("foo")); !bytes.Equal(v, []byte("bar")) { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket retrieved via Get() returns a nil. +func TestBucket_Get_IncompatibleValue(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + if _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")); err != nil { + t.Fatal(err) + } + + if tx.Bucket([]byte("widgets")).Get([]byte("foo")) != nil { + t.Fatal("expected nil value") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a slice returned from a bucket has a capacity equal to its length. +// This also allows slices to be appended to since it will require a realloc by Go. +// +// https://github.com/boltdb/bolt/issues/544 +func TestBucket_Get_Capacity(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Write key to a bucket. 
+ if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("bucket")) + if err != nil { + return err + } + return b.Put([]byte("key"), []byte("val")) + }); err != nil { + t.Fatal(err) + } + + // Retrieve value and attempt to append to it. + if err := db.Update(func(tx *bolt.Tx) error { + k, v := tx.Bucket([]byte("bucket")).Cursor().First() + + // Verify capacity. + if len(k) != cap(k) { + t.Fatalf("unexpected key slice capacity: %d", cap(k)) + } else if len(v) != cap(v) { + t.Fatalf("unexpected value slice capacity: %d", cap(v)) + } + + // Ensure slice can be appended to without a segfault. + k = append(k, []byte("123")...) + v = append(v, []byte("123")...) + _, _ = k, v // to pass ineffassign + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can write a key/value. +func TestBucket_Put(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + + v := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + if !bytes.Equal([]byte("bar"), v) { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can rewrite a key in the same transaction. 
+func TestBucket_Put_Repeat(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("baz")); err != nil { + t.Fatal(err) + } + + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + if !bytes.Equal([]byte("baz"), value) { + t.Fatalf("unexpected value: %v", value) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can write a bunch of large values. +func TestBucket_Put_Large(t *testing.T) { + db := btesting.MustCreateDB(t) + + count, factor := 100, 200 + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + for i := 1; i < count; i++ { + if err := b.Put([]byte(strings.Repeat("0", i*factor)), []byte(strings.Repeat("X", (count-i)*factor))); err != nil { + t.Fatal(err) + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i < count; i++ { + value := b.Get([]byte(strings.Repeat("0", i*factor))) + if !bytes.Equal(value, []byte(strings.Repeat("X", (count-i)*factor))) { + t.Fatalf("unexpected value: %v", value) + } + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a database can perform multiple large appends safely. 
+func TestDB_Put_VeryLarge(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + n, batchN := 400000, 200000 + ksize, vsize := 8, 500 + + db := btesting.MustCreateDB(t) + + for i := 0; i < n; i += batchN { + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + for j := 0; j < batchN; j++ { + k, v := make([]byte, ksize), make([]byte, vsize) + binary.BigEndian.PutUint32(k, uint32(i+j)) + if err := b.Put(k, v); err != nil { + t.Fatal(err) + } + } + return nil + }); err != nil { + t.Fatal(err) + } + } +} + +// Ensure that a setting a value on a key with a bucket value returns an error. +func TestBucket_Put_IncompatibleValue(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b0, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + if _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")); err != nil { + t.Fatal(err) + } + if err := b0.Put([]byte("foo"), []byte("bar")); err != berrors.ErrIncompatibleValue { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a setting a value while the transaction is closed returns an error. +func TestBucket_Put_Closed(t *testing.T) { + db := btesting.MustCreateDB(t) + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + + if err := b.Put([]byte("foo"), []byte("bar")); err != berrors.ErrTxClosed { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that setting a value on a read-only bucket returns an error. 
+func TestBucket_Put_ReadOnly(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + if err := b.Put([]byte("foo"), []byte("bar")); err != berrors.ErrTxNotWritable { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can delete an existing key. +func TestBucket_Delete(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Delete([]byte("foo")); err != nil { + t.Fatal(err) + } + if v := b.Get([]byte("foo")); v != nil { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that deleting a large set of keys will work correctly. 
+func TestBucket_Delete_Large(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Populate the bucket with 100 keys, each carrying a 1024-byte value.
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for n := 0; n < 100; n++ {
+			putErr := widgets.Put([]byte(strconv.Itoa(n)), []byte(strings.Repeat("*", 1024)))
+			if putErr != nil {
+				t.Fatal(putErr)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Remove every key again in a second write transaction.
+	err = db.Update(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+		for n := 0; n < 100; n++ {
+			delErr := widgets.Delete([]byte(strconv.Itoa(n)))
+			if delErr != nil {
+				t.Fatal(delErr)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// None of the keys may be visible to a subsequent reader.
+	err = db.View(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+		for n := 0; n < 100; n++ {
+			got := widgets.Get([]byte(strconv.Itoa(n)))
+			if got != nil {
+				t.Fatalf("unexpected value: %v, i=%d", got, n)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Deleting a very large list of keys will cause the freelist to use overflow.
+// The test also checks that the free page count is unchanged after a reopen.
+func TestBucket_Delete_FreelistOverflow(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	db := btesting.MustCreateDB(t)
+
+	// A single 16-byte scratch key is reused for every Put: the first 8 bytes
+	// hold the outer counter, the last 8 bytes the inner one.
+	key := make([]byte, 16)
+	// The bigger the pages - the more values we need to write.
+	for outer := uint64(0); outer < 2*uint64(db.Info().PageSize); outer++ {
+		err := db.Update(func(tx *bolt.Tx) error {
+			bkt, err := tx.CreateBucketIfNotExists([]byte("0"))
+			if err != nil {
+				t.Fatalf("bucket error: %s", err)
+			}
+			for inner := uint64(0); inner < 1000; inner++ {
+				binary.BigEndian.PutUint64(key[:8], outer)
+				binary.BigEndian.PutUint64(key[8:], inner)
+				if putErr := bkt.Put(key, nil); putErr != nil {
+					t.Fatalf("put error: %s", putErr)
+				}
+			}
+			return nil
+		})
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Delete all of them in one large transaction
+	err := db.Update(func(tx *bolt.Tx) error {
+		cur := tx.Bucket([]byte("0")).Cursor()
+		for item, _ := cur.First(); item != nil; item, _ = cur.Next() {
+			if delErr := cur.Delete(); delErr != nil {
+				t.Fatal(delErr)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Check more than an overflow's worth of pages are freed.
+	afterDelete := db.Stats()
+	freePages := afterDelete.FreePageN + afterDelete.PendingPageN
+	if freePages <= 0xFFFF {
+		t.Fatalf("expected more than 0xFFFF free pages, got %v", freePages)
+	}
+
+	// Free page count should be preserved on reopen.
+	db.MustClose()
+	db.MustReopen()
+	if db.Stats().FreePageN != freePages {
+		t.Fatalf("expected %d free pages, got %+v", freePages, db.Stats())
+	}
+	if db.Stats().PendingPageN != 0 {
+		t.Fatalf("expected no pending pages, got %+v", db.Stats())
+	}
+}
+
+// Ensure that deleting of non-existing key is a no-op.
+func TestBucket_Delete_NonExisting(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Create widgets with a nested bucket but without any plain keys.
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		_, err = widgets.CreateBucket([]byte("nested"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Deleting an absent key must succeed and leave the nested bucket alone.
+	err = db.Update(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+		if delErr := widgets.Delete([]byte("foo")); delErr != nil {
+			t.Fatal(delErr)
+		}
+		if nested := widgets.Bucket([]byte("nested")); nested == nil {
+			t.Fatal("nested bucket has been deleted")
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that nested buckets can be read and modified across several
+// transactions, with a consistency check after every write.
+func TestBucket_Nested(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Create widgets, the widgets/foo bucket and the widgets/bar key.
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		if _, err = widgets.CreateBucket([]byte("foo")); err != nil {
+			t.Fatal(err)
+		}
+		err = widgets.Put([]byte("bar"), []byte("0000"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	db.MustCheck()
+
+	// Overwrite widgets/bar.
+	err = db.Update(func(tx *bolt.Tx) error {
+		putErr := tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte("xxxx"))
+		if putErr != nil {
+			t.Fatal(putErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	db.MustCheck()
+
+	// Insert 10000 keys to cause a split.
+	err = db.Update(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+		for n := 0; n < 10000; n++ {
+			putErr := widgets.Put([]byte(strconv.Itoa(n)), []byte(strconv.Itoa(n)))
+			if putErr != nil {
+				t.Fatal(putErr)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	db.MustCheck()
+
+	// Insert into widgets/foo/baz.
+	err = db.Update(func(tx *bolt.Tx) error {
+		foo := tx.Bucket([]byte("widgets")).Bucket([]byte("foo"))
+		if putErr := foo.Put([]byte("baz"), []byte("yyyy")); putErr != nil {
+			t.Fatal(putErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	db.MustCheck()
+
+	// Read everything back in a read-only transaction.
+	err = db.View(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+
+		got := widgets.Bucket([]byte("foo")).Get([]byte("baz"))
+		if !bytes.Equal(got, []byte("yyyy")) {
+			t.Fatalf("unexpected value: %v", got)
+		}
+
+		got = widgets.Get([]byte("bar"))
+		if !bytes.Equal(got, []byte("xxxx")) {
+			t.Fatalf("unexpected value: %v", got)
+		}
+
+		for n := 0; n < 10000; n++ {
+			got = widgets.Get([]byte(strconv.Itoa(n)))
+			if !bytes.Equal(got, []byte(strconv.Itoa(n))) {
+				t.Fatalf("unexpected value: %v", got)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that calling Delete() on a key that holds a nested bucket returns
+// ErrIncompatibleValue.
+func TestBucket_Delete_Bucket(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		_, err = widgets.CreateBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		delErr := widgets.Delete([]byte("foo"))
+		if delErr != berrors.ErrIncompatibleValue {
+			t.Fatalf("unexpected error: %s", delErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that deleting a key on a read-only bucket returns an error.
+func TestBucket_Delete_ReadOnly(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Create the bucket in a writable transaction.
+	err := db.Update(func(tx *bolt.Tx) error {
+		_, createErr := tx.CreateBucket([]byte("widgets"))
+		if createErr != nil {
+			t.Fatal(createErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// A Delete issued through a read-only transaction must be refused.
+	err = db.View(func(tx *bolt.Tx) error {
+		delErr := tx.Bucket([]byte("widgets")).Delete([]byte("foo"))
+		if delErr != berrors.ErrTxNotWritable {
+			t.Fatalf("unexpected error: %s", delErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that deleting a value after the transaction has been closed returns an error.
+func TestBucket_Delete_Closed(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	tx, err := db.Begin(true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	widgets, err := tx.CreateBucket([]byte("widgets"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// End the transaction, then use the bucket handle obtained from it.
+	err = tx.Rollback()
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = widgets.Delete([]byte("foo"))
+	if err != berrors.ErrTxClosed {
+		t.Fatalf("unexpected error: %s", err)
+	}
+}
+
+// Ensure that a bucket containing nested buckets can be deleted in the same
+// transaction that created it.
+func TestBucket_DeleteBucket_Nested(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		// Build widgets/foo/bar and store one key in the innermost bucket.
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		foo, err := widgets.CreateBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		bar, err := foo.CreateBucket([]byte("bar"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = bar.Put([]byte("baz"), []byte("bat"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Drop widgets/foo through a freshly looked-up widgets handle.
+		err = tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that deleting a bucket causes nested buckets to be deleted after they have been committed.
+func TestBucket_DeleteBucket_Nested2(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Commit widgets/foo/bar with a single key in the innermost bucket.
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		foo, err := widgets.CreateBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		bar, err := foo.CreateBucket([]byte("bar"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = bar.Put([]byte("baz"), []byte("bat"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Walk down the committed hierarchy, then delete it from the top.
+	err = db.Update(func(tx *bolt.Tx) error {
+		widgets := tx.Bucket([]byte("widgets"))
+		if widgets == nil {
+			t.Fatal("expected widgets bucket")
+		}
+		foo := widgets.Bucket([]byte("foo"))
+		if foo == nil {
+			t.Fatal("expected foo bucket")
+		}
+		bar := foo.Bucket([]byte("bar"))
+		if bar == nil {
+			t.Fatal("expected bar bucket")
+		}
+
+		got := bar.Get([]byte("baz"))
+		if !bytes.Equal(got, []byte("bat")) {
+			t.Fatalf("unexpected value: %v", got)
+		}
+		delErr := tx.DeleteBucket([]byte("widgets"))
+		if delErr != nil {
+			t.Fatal(delErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// The top-level bucket must be gone for later readers.
+	err = db.View(func(tx *bolt.Tx) error {
+		if widgets := tx.Bucket([]byte("widgets")); widgets != nil {
+			t.Fatal("expected bucket to be deleted")
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that deleting a child bucket with multiple pages causes all pages to get collected.
+// NOTE: Consistency check in bolt_test.DB.Close() will panic if pages not freed properly.
+func TestBucket_DeleteBucket_Large(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// Fill widgets/foo with 1000 keys whose values are 100-digit zero-padded numbers.
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		foo, err := widgets.CreateBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for n := 0; n < 1000; n++ {
+			putErr := foo.Put([]byte(strconv.Itoa(n)), []byte(fmt.Sprintf("%0100d", n)))
+			if putErr != nil {
+				t.Fatal(putErr)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Delete the parent bucket in a separate transaction.
+	err = db.Update(func(tx *bolt.Tx) error {
+		delErr := tx.DeleteBucket([]byte("widgets"))
+		if delErr != nil {
+			t.Fatal(delErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that Bucket() returns nil when the key holds a plain value rather
+// than a nested bucket.
+func TestBucket_Bucket_IncompatibleValue(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = widgets.Put([]byte("foo"), []byte("bar"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		nested := tx.Bucket([]byte("widgets")).Bucket([]byte("foo"))
+		if nested != nil {
+			t.Fatal("expected nil bucket")
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that creating a bucket under a key that already holds a plain value
+// returns ErrIncompatibleValue.
+func TestBucket_CreateBucket_IncompatibleValue(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = widgets.Put([]byte("foo"), []byte("bar"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		_, createErr := widgets.CreateBucket([]byte("foo"))
+		if createErr != berrors.ErrIncompatibleValue {
+			t.Fatalf("unexpected error: %s", createErr)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that deleting a bucket on an existing non-bucket key returns an error.
+func TestBucket_DeleteBucket_IncompatibleValue(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		// "foo" is stored as a plain key/value pair, not as a bucket.
+		if err := widgets.Put([]byte("foo"), []byte("bar")); err != nil {
+			t.Fatal(err)
+		}
+		if err := tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo")); err != berrors.ErrIncompatibleValue {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure bucket can set and update its sequence number.
+func TestBucket_Sequence(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		bkt, err := tx.CreateBucket([]byte("0"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Retrieve sequence.
+		if v := bkt.Sequence(); v != 0 {
+			t.Fatalf("unexpected sequence: %d", v)
+		}
+
+		// Update sequence.
+		if err := bkt.SetSequence(1000); err != nil {
+			t.Fatal(err)
+		}
+
+		// Read sequence again.
+		if v := bkt.Sequence(); v != 1000 {
+			t.Fatalf("unexpected sequence: %d", v)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify sequence in separate transaction.
+	if err := db.View(func(tx *bolt.Tx) error {
+		if v := tx.Bucket([]byte("0")).Sequence(); v != 1000 {
+			t.Fatalf("unexpected sequence: %d", v)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that a bucket can return an autoincrementing sequence and that each
+// bucket keeps its own counter.
+func TestBucket_NextSequence(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		woojits, err := tx.CreateBucket([]byte("woojits"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Make sure sequence increments.
+		if seq, err := widgets.NextSequence(); err != nil {
+			t.Fatal(err)
+		} else if seq != 1 {
+			t.Fatalf("unexpected sequence: %d", seq)
+		}
+
+		if seq, err := widgets.NextSequence(); err != nil {
+			t.Fatal(err)
+		} else if seq != 2 {
+			t.Fatalf("unexpected sequence: %d", seq)
+		}
+
+		// Buckets should be separate.
+		if seq, err := woojits.NextSequence(); err != nil {
+			t.Fatal(err)
+		} else if seq != 1 {
+			// Report the value actually observed, not the expected constant.
+			t.Fatalf("unexpected sequence: %d", seq)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that a bucket will persist an autoincrementing sequence even if it's
+// the only thing updated on the bucket.
+// https://github.com/boltdb/bolt/issues/296
+func TestBucket_NextSequence_Persist(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		if _, err := tx.CreateBucket([]byte("widgets")); err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// This transaction changes nothing except the sequence.
+	if err := db.Update(func(tx *bolt.Tx) error {
+		if _, err := tx.Bucket([]byte("widgets")).NextSequence(); err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// The next transaction must continue from the committed value.
+	if err := db.Update(func(tx *bolt.Tx) error {
+		seq, err := tx.Bucket([]byte("widgets")).NextSequence()
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		} else if seq != 2 {
+			t.Fatalf("unexpected sequence: %d", seq)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that retrieving the next sequence on a read-only bucket returns an error.
+func TestBucket_NextSequence_ReadOnly(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		if _, err := tx.CreateBucket([]byte("widgets")); err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// NextSequence through a read-only transaction must be refused.
+	if err := db.View(func(tx *bolt.Tx) error {
+		_, err := tx.Bucket([]byte("widgets")).NextSequence()
+		if err != berrors.ErrTxNotWritable {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that retrieving the next sequence for a bucket after its transaction
+// has been closed returns an error.
+func TestBucket_NextSequence_Closed(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+	tx, err := db.Begin(true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	b, err := tx.CreateBucket([]byte("widgets"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := tx.Rollback(); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := b.NextSequence(); err != berrors.ErrTxClosed {
+		// Use the same message as the sibling tests so that a missing error
+		// is not reported as a bare "<nil>".
+		t.Fatalf("unexpected error: %s", err)
+	}
+}
+
+// Ensure a user can loop over all key/value pairs in a bucket.
+func TestBucket_ForEach(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	type kv struct {
+		k []byte
+		v []byte
+	}
+
+	// Expected iteration result: keys in sorted order, with the nested bucket
+	// reported with a nil value.
+	expectedItems := []kv{
+		{k: []byte("bar"), v: []byte("0002")},
+		{k: []byte("baz"), v: []byte("0001")},
+		{k: []byte("csubbucket"), v: nil},
+		{k: []byte("foo"), v: []byte("0000")},
+	}
+
+	// verifyReads collects everything ForEach yields and compares it with expectedItems.
+	verifyReads := func(b *bolt.Bucket) {
+		var items []kv
+		err := b.ForEach(func(k, v []byte) error {
+			items = append(items, kv{k: k, v: v})
+			return nil
+		})
+		assert.NoErrorf(t, err, "b.ForEach failed")
+		assert.Equal(t, expectedItems, items, "what we iterated (ForEach) is not what we put")
+	}
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		require.NoError(t, err, "bucket creation failed")
+
+		require.NoErrorf(t, b.Put([]byte("foo"), []byte("0000")), "put 'foo' failed")
+		require.NoErrorf(t, b.Put([]byte("baz"), []byte("0001")), "put 'baz' failed")
+		require.NoErrorf(t, b.Put([]byte("bar"), []byte("0002")), "put 'bar' failed")
+		_, err = b.CreateBucket([]byte("csubbucket"))
+		require.NoErrorf(t, err, "creation of subbucket failed")
+
+		// Check inside the writing transaction, before commit.
+		verifyReads(b)
+
+		return nil
+	})
+	require.NoErrorf(t, err, "db.Update failed")
+	// Check again from a read-only transaction after commit.
+	err = db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		require.NotNil(t, b, "bucket opening failed")
+		verifyReads(b)
+		return nil
+	})
+	assert.NoErrorf(t, err, "db.View failed")
+}
+
+// Ensure a user can loop over only the nested buckets of a bucket; plain
+// key/value pairs must not be reported.
+func TestBucket_ForEachBucket(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	expectedItems := [][]byte{
+		[]byte("csubbucket"),
+		[]byte("zsubbucket"),
+	}
+
+	// verifyReads collects the names ForEachBucket yields and compares them with expectedItems.
+	verifyReads := func(b *bolt.Bucket) {
+		var items [][]byte
+		err := b.ForEachBucket(func(k []byte) error {
+			items = append(items, k)
+			return nil
+		})
+		assert.NoErrorf(t, err, "b.ForEachBucket failed")
+		assert.Equal(t, expectedItems, items, "what we iterated (ForEachBucket) is not what we put")
+	}
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		require.NoError(t, err, "bucket creation failed")
+
+		require.NoErrorf(t, b.Put([]byte("foo"), []byte("0000")), "put 'foo' failed")
+		_, err = b.CreateBucket([]byte("zsubbucket"))
+		require.NoErrorf(t, err, "creation of subbucket failed")
+		require.NoErrorf(t, b.Put([]byte("baz"), []byte("0001")), "put 'baz' failed")
+		require.NoErrorf(t, b.Put([]byte("bar"), []byte("0002")), "put 'bar' failed")
+		_, err = b.CreateBucket([]byte("csubbucket"))
+		require.NoErrorf(t, err, "creation of subbucket failed")
+
+		verifyReads(b)
+
+		return nil
+	})
+	// Stop here if the setup transaction failed, as TestBucket_ForEach does.
+	require.NoErrorf(t, err, "db.Update failed")
+	err = db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		require.NotNil(t, b, "bucket opening failed")
+		verifyReads(b)
+		return nil
+	})
+	assert.NoErrorf(t, err, "db.View failed")
+}
+
+// Ensure that ForEachBucket yields nothing for a bucket that contains only
+// plain key/value pairs.
+func TestBucket_ForEachBucket_NoBuckets(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	// verifyReads asserts that ForEachBucket succeeds without invoking the callback.
+	verifyReads := func(b *bolt.Bucket) {
+		var items [][]byte
+		err := b.ForEachBucket(func(k []byte) error {
+			items = append(items, k)
+			return nil
+		})
+		assert.NoErrorf(t, err, "b.ForEachBucket failed")
+		assert.Emptyf(t, items, "ForEachBucket yielded buckets although none were created")
+	}
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		require.NoError(t, err, "bucket creation failed")
+
+		// Only plain keys are stored; no sub-bucket is created in this test.
+		require.NoErrorf(t, b.Put([]byte("foo"), []byte("0000")), "put 'foo' failed")
+		require.NoErrorf(t, b.Put([]byte("baz"), []byte("0001")), "put 'baz' failed")
+
+		verifyReads(b)
+
+		return nil
+	})
+	require.NoErrorf(t, err, "db.Update failed")
+
+	err = db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		require.NotNil(t, b, "bucket opening failed")
+		verifyReads(b)
+		return nil
+	})
+	assert.NoErrorf(t, err, "db.View failed")
+}
+
+// Ensure a database can stop iteration early.
+func TestBucket_ForEach_ShortCircuit(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	err := db.Update(func(tx *bolt.Tx) error {
+		widgets, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for _, name := range []string{"bar", "baz", "foo"} {
+			if putErr := widgets.Put([]byte(name), []byte("0000")); putErr != nil {
+				t.Fatal(putErr)
+			}
+		}
+
+		// Abort the iteration with a marker error as soon as "baz" is seen.
+		visited := 0
+		iterErr := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error {
+			visited++
+			if bytes.Equal(k, []byte("baz")) {
+				return errors.New("marker")
+			}
+			return nil
+		})
+		if iterErr == nil || iterErr.Error() != "marker" {
+			t.Fatalf("unexpected error: %s", iterErr)
+		}
+		// The callback must have run exactly twice before the iteration stopped.
+		if visited != 2 {
+			t.Fatalf("unexpected index: %d", visited)
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that looping over a bucket after its transaction has been closed returns an error.
+func TestBucket_ForEach_Closed(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	tx, err := db.Begin(true)
+	if err != nil {
+		t.Fatal(err)
+	}
+	widgets, err := tx.CreateBucket([]byte("widgets"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// End the transaction, then iterate through the bucket handle obtained from it.
+	err = tx.Rollback()
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = widgets.ForEach(func(k, v []byte) error { return nil })
+	if err != berrors.ErrTxClosed {
+		t.Fatalf("unexpected error: %s", err)
+	}
+}
+
+// Ensure that an error is returned when inserting with an empty key.
+func TestBucket_Put_EmptyKey(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Both a zero-length key and a nil key must be rejected.
+		if err := b.Put([]byte(""), []byte("bar")); err != berrors.ErrKeyRequired {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		if err := b.Put(nil, []byte("bar")); err != berrors.ErrKeyRequired {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that an error is returned when inserting with a key that's too large.
+func TestBucket_Put_KeyTooLarge(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+	if err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		// NOTE(review): 32769 is presumably one byte above the maximum key size - confirm.
+		if err := b.Put(make([]byte, 32769), []byte("bar")); err != berrors.ErrKeyTooLarge {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that an error is returned when inserting a value that's too large.
+func TestBucket_Put_ValueTooLarge(t *testing.T) {
+	// Skip this test on DroneCI because the machine is resource constrained.
+	if os.Getenv("DRONE") == "true" {
+		t.Skip("not enough RAM for test")
+	}
+
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("widgets"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		// The value is one byte longer than bolt.MaxValueSize.
+		if err := b.Put([]byte("foo"), make([]byte, bolt.MaxValueSize+1)); err != berrors.ErrValueTooLarge {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure a bucket can calculate stats. Expectations are tabulated per page
+// size; the test is skipped for page sizes without an entry.
+func TestBucket_Stats(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode")
+	}
+
+	db := btesting.MustCreateDB(t)
+
+	// Add bucket with fewer keys but one big value.
+	bigKey := []byte("really-big-value")
+	for i := 0; i < 500; i++ {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			b, err := tx.CreateBucketIfNotExists([]byte("woojits"))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if err := b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i))); err != nil {
+				t.Fatal(err)
+			}
+			return nil
+		}); err != nil {
+			t.Fatal(err)
+		}
+	}
+	// Length of the single big value stored under bigKey: ten pages plus 17 bytes.
+	bigValueLength := 10*db.Info().PageSize + 17
+	if err := db.Update(func(tx *bolt.Tx) error {
+		if err := tx.Bucket([]byte("woojits")).Put(bigKey, []byte(strings.Repeat("*", bigValueLength))); err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	db.MustCheck()
+
+	// Expected stats keyed by database page size.
+	pageSize2stats := map[int]bolt.BucketStats{
+		4096: {
+			BranchPageN:     1,
+			BranchOverflowN: 0,
+			LeafPageN:       7,
+			LeafOverflowN:   10,
+			KeyN:            501,
+			Depth:           2,
+			BranchAlloc:     4096,
+			BranchInuse:     149,
+			LeafAlloc:       69632,
+			LeafInuse: 0 +
+				7*16 + // leaf page header (x LeafPageN)
+				501*16 + // leaf elements
+				500*3 + len(bigKey) + // leaf keys
+				1*10 + 2*90 + 3*400 + bigValueLength, // leaf values: 10 * 1digit, 90*2digits, ...
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+		16384: {
+			BranchPageN:     1,
+			BranchOverflowN: 0,
+			LeafPageN:       3,
+			LeafOverflowN:   10,
+			KeyN:            501,
+			Depth:           2,
+			BranchAlloc:     16384,
+			BranchInuse:     73,
+			LeafAlloc:       212992,
+			LeafInuse: 0 +
+				3*16 + // leaf page header (x LeafPageN)
+				501*16 + // leaf elements
+				500*3 + len(bigKey) + // leaf keys
+				1*10 + 2*90 + 3*400 + bigValueLength, // leaf values: 10 * 1digit, 90*2digits, ...
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+		65536: {
+			BranchPageN:     1,
+			BranchOverflowN: 0,
+			LeafPageN:       2,
+			LeafOverflowN:   10,
+			KeyN:            501,
+			Depth:           2,
+			BranchAlloc:     65536,
+			BranchInuse:     54,
+			LeafAlloc:       786432,
+			LeafInuse: 0 +
+				2*16 + // leaf page header (x LeafPageN)
+				501*16 + // leaf elements
+				500*3 + len(bigKey) + // leaf keys
+				1*10 + 2*90 + 3*400 + bigValueLength, // leaf values: 10 * 1digit, 90*2digits, ...
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+	}
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		stats := tx.Bucket([]byte("woojits")).Stats()
+		t.Logf("Stats: %#v", stats)
+		if expected, ok := pageSize2stats[db.Info().PageSize]; ok {
+			assert.EqualValues(t, expected, stats, "stats differs from expectations")
+		} else {
+			t.Skipf("No expectations for page size: %d", db.Info().PageSize)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure a bucket with random insertion utilizes fill percentage correctly.
+func TestBucket_Stats_RandomFill(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	} else if os.Getpagesize() != 4096 {
+		t.Skip("invalid page size for test")
+	}
+
+	db := btesting.MustCreateDB(t)
+
+	// Add a set of values in random order. It will be the same random
+	// order so we can maintain consistency between test runs.
+	// The generator is named rng so that it does not shadow package rand.
+	rng := rand.New(rand.NewSource(42))
+	for _, i := range rng.Perm(1000) {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			b, err := tx.CreateBucketIfNotExists([]byte("woojits"))
+			if err != nil {
+				t.Fatal(err)
+			}
+			b.FillPercent = 0.9
+			for _, j := range rng.Perm(100) {
+				index := (j * 10000) + i
+				if err := b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000")); err != nil {
+					t.Fatal(err)
+				}
+			}
+			return nil
+		}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	db.MustCheck()
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		stats := tx.Bucket([]byte("woojits")).Stats()
+		if stats.KeyN != 100000 {
+			t.Fatalf("unexpected KeyN: %d", stats.KeyN)
+		}
+
+		if stats.BranchPageN != 98 {
+			t.Fatalf("unexpected BranchPageN: %d", stats.BranchPageN)
+		} else if stats.BranchOverflowN != 0 {
+			t.Fatalf("unexpected BranchOverflowN: %d", stats.BranchOverflowN)
+		} else if stats.BranchInuse != 130984 {
+			t.Fatalf("unexpected BranchInuse: %d", stats.BranchInuse)
+		} else if stats.BranchAlloc != 401408 {
+			t.Fatalf("unexpected BranchAlloc: %d", stats.BranchAlloc)
+		}
+
+		if stats.LeafPageN != 3412 {
+			t.Fatalf("unexpected LeafPageN: %d", stats.LeafPageN)
+		} else if stats.LeafOverflowN != 0 {
+			t.Fatalf("unexpected LeafOverflowN: %d", stats.LeafOverflowN)
+		} else if stats.LeafInuse != 4742482 {
+			t.Fatalf("unexpected LeafInuse: %d", stats.LeafInuse)
+		} else if stats.LeafAlloc != 13975552 {
+			t.Fatalf("unexpected LeafAlloc: %d", stats.LeafAlloc)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure stats are calculated for a bucket holding a single small key, which
+// is expected to be reported as an inline bucket.
+func TestBucket_Stats_Small(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		// Add a bucket that fits on a single root leaf.
+		b, err := tx.CreateBucket([]byte("whozawhats"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := b.Put([]byte("foo"), []byte("bar")); err != nil {
+			t.Fatal(err)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	db.MustCheck()
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("whozawhats"))
+		stats := b.Stats()
+		if stats.BranchPageN != 0 {
+			t.Fatalf("unexpected BranchPageN: %d", stats.BranchPageN)
+		} else if stats.BranchOverflowN != 0 {
+			t.Fatalf("unexpected BranchOverflowN: %d", stats.BranchOverflowN)
+		} else if stats.LeafPageN != 0 {
+			t.Fatalf("unexpected LeafPageN: %d", stats.LeafPageN)
+		} else if stats.LeafOverflowN != 0 {
+			t.Fatalf("unexpected LeafOverflowN: %d", stats.LeafOverflowN)
+		} else if stats.KeyN != 1 {
+			t.Fatalf("unexpected KeyN: %d", stats.KeyN)
+		} else if stats.Depth != 1 {
+			t.Fatalf("unexpected Depth: %d", stats.Depth)
+		} else if stats.BranchInuse != 0 {
+			t.Fatalf("unexpected BranchInuse: %d", stats.BranchInuse)
+		} else if stats.LeafInuse != 0 {
+			t.Fatalf("unexpected LeafInuse: %d", stats.LeafInuse)
+		}
+
+		// Allocation figures are only checked for 4096-byte pages.
+		if db.Info().PageSize == 4096 {
+			if stats.BranchAlloc != 0 {
+				t.Fatalf("unexpected BranchAlloc: %d", stats.BranchAlloc)
+			} else if stats.LeafAlloc != 0 {
+				t.Fatalf("unexpected LeafAlloc: %d", stats.LeafAlloc)
+			}
+		}
+
+		if stats.BucketN != 1 {
+			t.Fatalf("unexpected BucketN: %d", stats.BucketN)
+		} else if stats.InlineBucketN != 1 {
+			t.Fatalf("unexpected InlineBucketN: %d", stats.InlineBucketN)
+		} else if stats.InlineBucketInuse != 16+16+6 {
+			t.Fatalf("unexpected InlineBucketInuse: %d", stats.InlineBucketInuse)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure stats are calculated for a bucket that contains no keys at all.
+func TestBucket_Stats_EmptyBucket(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		// Add a bucket without putting any keys into it.
+		if _, err := tx.CreateBucket([]byte("whozawhats")); err != nil {
+			t.Fatal(err)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	db.MustCheck()
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("whozawhats"))
+		stats := b.Stats()
+		if stats.BranchPageN != 0 {
+			t.Fatalf("unexpected BranchPageN: %d", stats.BranchPageN)
+		} else if stats.BranchOverflowN != 0 {
+			t.Fatalf("unexpected BranchOverflowN: %d", stats.BranchOverflowN)
+		} else if stats.LeafPageN != 0 {
+			t.Fatalf("unexpected LeafPageN: %d", stats.LeafPageN)
+		} else if stats.LeafOverflowN != 0 {
+			t.Fatalf("unexpected LeafOverflowN: %d", stats.LeafOverflowN)
+		} else if stats.KeyN != 0 {
+			t.Fatalf("unexpected KeyN: %d", stats.KeyN)
+		} else if stats.Depth != 1 {
+			t.Fatalf("unexpected Depth: %d", stats.Depth)
+		} else if stats.BranchInuse != 0 {
+			t.Fatalf("unexpected BranchInuse: %d", stats.BranchInuse)
+		} else if stats.LeafInuse != 0 {
+			t.Fatalf("unexpected LeafInuse: %d", stats.LeafInuse)
+		}
+
+		// Allocation figures are only checked for 4096-byte pages.
+		if db.Info().PageSize == 4096 {
+			if stats.BranchAlloc != 0 {
+				t.Fatalf("unexpected BranchAlloc: %d", stats.BranchAlloc)
+			} else if stats.LeafAlloc != 0 {
+				t.Fatalf("unexpected LeafAlloc: %d", stats.LeafAlloc)
+			}
+		}
+
+		if stats.BucketN != 1 {
+			t.Fatalf("unexpected BucketN: %d", stats.BucketN)
+		} else if stats.InlineBucketN != 1 {
+			t.Fatalf("unexpected InlineBucketN: %d", stats.InlineBucketN)
+		} else if stats.InlineBucketInuse != 16 {
+			t.Fatalf("unexpected InlineBucketInuse: %d", stats.InlineBucketInuse)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure stats of a bucket include its nested buckets (foo -> bar -> baz).
+func TestBucket_Stats_Nested(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		b, err := tx.CreateBucket([]byte("foo"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for i := 0; i < 100; i++ {
+			if err := b.Put([]byte(fmt.Sprintf("%02d", i)), []byte(fmt.Sprintf("%02d", i))); err != nil {
+				t.Fatal(err)
+			}
+		}
+
+		bar, err := b.CreateBucket([]byte("bar"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for i := 0; i < 10; i++ {
+			if err := bar.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				t.Fatal(err)
+			}
+		}
+
+		baz, err := bar.CreateBucket([]byte("baz"))
+		if err != nil {
+			t.Fatal(err)
+		}
+		for i := 0; i < 10; i++ {
+			if err := baz.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				t.Fatal(err)
+			}
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	db.MustCheck()
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("foo"))
+		stats := b.Stats()
+		if stats.BranchPageN != 0 {
+			t.Fatalf("unexpected BranchPageN: %d", stats.BranchPageN)
+		} else if stats.BranchOverflowN != 0 {
+			t.Fatalf("unexpected BranchOverflowN: %d", stats.BranchOverflowN)
+		} else if stats.LeafPageN != 2 {
+			t.Fatalf("unexpected LeafPageN: %d", stats.LeafPageN)
+		} else if stats.LeafOverflowN != 0 {
+			t.Fatalf("unexpected LeafOverflowN: %d", stats.LeafOverflowN)
+		} else if stats.KeyN != 122 {
+			t.Fatalf("unexpected KeyN: %d", stats.KeyN)
+		} else if stats.Depth != 3 {
+			t.Fatalf("unexpected Depth: %d", stats.Depth)
+		} else if stats.BranchInuse != 0 {
+			t.Fatalf("unexpected BranchInuse: %d", stats.BranchInuse)
+		}
+
+		// Expected bytes in use, built up per bucket.
+		foo := 16            // foo (pghdr)
+		foo += 101 * 16      // foo leaf elements
+		foo += 100*2 + 100*2 // foo leaf key/values
+		foo += 3 + 16        // foo -> bar key/value
+
+		bar := 16      // bar (pghdr)
+		bar += 11 * 16 // bar leaf elements
+		bar += 10 + 10 // bar leaf key/values
+		bar += 3 + 16  // bar -> baz key/value
+
+		baz := 16      // baz (inline) (pghdr)
+		baz += 10 * 16 // baz leaf elements
+		baz += 10 + 10 // baz leaf key/values
+
+		if stats.LeafInuse != foo+bar+baz {
+			t.Fatalf("unexpected LeafInuse: %d", stats.LeafInuse)
+		}
+
+		// Allocation figures are only checked for 4096-byte pages.
+		if db.Info().PageSize == 4096 {
+			if stats.BranchAlloc != 0 {
+				t.Fatalf("unexpected BranchAlloc: %d", stats.BranchAlloc)
+			} else if stats.LeafAlloc != 8192 {
+				t.Fatalf("unexpected LeafAlloc: %d", stats.LeafAlloc)
+			}
+		}
+
+		if stats.BucketN != 3 {
+			t.Fatalf("unexpected BucketN: %d", stats.BucketN)
+		} else if stats.InlineBucketN != 1 {
+			t.Fatalf("unexpected InlineBucketN: %d", stats.InlineBucketN)
+		} else if stats.InlineBucketInuse != baz {
+			t.Fatalf("unexpected InlineBucketInuse: %d", stats.InlineBucketInuse)
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that Tx.Inspect reports the bucket hierarchy and the per-bucket key
+// counts that were written to the database.
+func TestBucket_Inspect(t *testing.T) {
+	db := btesting.MustCreateDB(t)
+
+	expectedStructure := bolt.BucketStructure{
+		Name: "root",
+		KeyN: 0,
+		Children: []bolt.BucketStructure{
+			{
+				Name: "b1",
+				KeyN: 3,
+				Children: []bolt.BucketStructure{
+					{
+						Name: "b1_1",
+						KeyN: 6,
+					},
+					{
+						Name: "b1_2",
+						KeyN: 7,
+					},
+					{
+						Name: "b1_3",
+						KeyN: 8,
+					},
+				},
+			},
+			{
+				Name: "b2",
+				KeyN: 4,
+				Children: []bolt.BucketStructure{
+					{
+						Name: "b2_1",
+						KeyN: 10,
+					},
+					{
+						Name: "b2_2",
+						KeyN: 12,
+						Children: []bolt.BucketStructure{
+							{
+								Name: "b2_2_1",
+								KeyN: 2,
+							},
+							{
+								Name: "b2_2_2",
+								KeyN: 3,
+							},
+						},
+					},
+					{
+						Name: "b2_3",
+						KeyN: 11,
+					},
+				},
+			},
+		},
+	}
+
+	// bucketItem pairs a created bucket with the structure it has to be filled
+	// from; b is nil for the root entry, whose children are created on the tx.
+	type bucketItem struct {
+		b  *bolt.Bucket
+		bs bolt.BucketStructure
+	}
+
+	t.Log("Populating the database")
+	err := db.Update(func(tx *bolt.Tx) error {
+		// Breadth-first walk over expectedStructure.
+		queue := []bucketItem{
+			{
+				b:  nil,
+				bs: expectedStructure,
+			},
+		}
+
+		for len(queue) > 0 {
+			item := queue[0]
+			queue = queue[1:]
+
+			if item.b != nil {
+				for i := 0; i < item.bs.KeyN; i++ {
+					err := item.b.Put([]byte(fmt.Sprintf("%02d", i)), []byte(fmt.Sprintf("%02d", i)))
+					require.NoError(t, err)
+				}
+
+				for _, child := range item.bs.Children {
+					childBucket, err := item.b.CreateBucket([]byte(child.Name))
+					require.NoError(t, err)
+					queue = append(queue, bucketItem{b: childBucket, bs: child})
+				}
+			} else {
+				for _, child := range item.bs.Children {
+					childBucket, err := tx.CreateBucket([]byte(child.Name))
+					require.NoError(t, err)
+					queue = append(queue, bucketItem{b: childBucket, bs: child})
+				}
+			}
+		}
+		return nil
+	})
+	require.NoError(t, err)
+
+	t.Log("Inspecting the database")
+	err = db.View(func(tx *bolt.Tx) error {
+		actualStructure := tx.Inspect()
+		assert.Equal(t, expectedStructure, actualStructure)
+		return nil
+	})
+	// A failing read transaction must fail the test instead of being discarded.
+	require.NoError(t, err)
+}
+
+// Ensure a large bucket can calculate stats.
+func TestBucket_Stats_Large(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	db := btesting.MustCreateDB(t)
+
+	// index runs across all transactions so that every key is unique.
+	var index int
+	for i := 0; i < 100; i++ {
+		// Add bucket with lots of keys.
+		if err := db.Update(func(tx *bolt.Tx) error {
+			b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
+			if err != nil {
+				t.Fatal(err)
+			}
+			for j := 0; j < 1000; j++ {
+				if err := b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index))); err != nil {
+					t.Fatal(err)
+				}
+				index++
+			}
+			return nil
+		}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	db.MustCheck()
+
+	// Expected stats keyed by database page size.
+	pageSize2stats := map[int]bolt.BucketStats{
+		4096: {
+			BranchPageN:       13,
+			BranchOverflowN:   0,
+			LeafPageN:         1196,
+			LeafOverflowN:     0,
+			KeyN:              100000,
+			Depth:             3,
+			BranchAlloc:       53248,
+			BranchInuse:       25257,
+			LeafAlloc:         4898816,
+			LeafInuse:         2596916,
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+		16384: {
+			BranchPageN:       1,
+			BranchOverflowN:   0,
+			LeafPageN:         292,
+			LeafOverflowN:     0,
+			KeyN:              100000,
+			Depth:             2,
+			BranchAlloc:       16384,
+			BranchInuse:       6094,
+			LeafAlloc:         4784128,
+			LeafInuse:         2582452,
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+		65536: {
+			BranchPageN:       1,
+			BranchOverflowN:   0,
+			LeafPageN:         73,
+			LeafOverflowN:     0,
+			KeyN:              100000,
+			Depth:             2,
+			BranchAlloc:       65536,
+			BranchInuse:       1534,
+			LeafAlloc:         4784128,
+			LeafInuse:         2578948,
+			BucketN:           1,
+			InlineBucketN:     0,
+			InlineBucketInuse: 0},
+	}
+
+	if err := db.View(func(tx *bolt.Tx) error {
+		stats := tx.Bucket([]byte("widgets")).Stats()
+		t.Logf("Stats: %#v", stats)
+		if expected, ok := pageSize2stats[db.Info().PageSize]; ok {
+			assert.EqualValues(t, expected, stats, "stats differs from expectations")
+		} else {
+			t.Skipf("No expectations for page size: %d", db.Info().PageSize)
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that a bucket can write random keys and values across multiple transactions.
+func TestBucket_Put_Single(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + index := 0 + if err := quick.Check(func(items testdata) bool { + db := btesting.MustCreateDB(t) + defer db.MustClose() + + m := make(map[string][]byte) + + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + for _, item := range items { + if err := db.Update(func(tx *bolt.Tx) error { + if err := tx.Bucket([]byte("widgets")).Put(item.Key, item.Value); err != nil { + panic("put error: " + err.Error()) + } + m[string(item.Key)] = item.Value + return nil + }); err != nil { + t.Fatal(err) + } + + // Verify all key/values so far. + if err := db.View(func(tx *bolt.Tx) error { + i := 0 + for k, v := range m { + value := tx.Bucket([]byte("widgets")).Get([]byte(k)) + if !bytes.Equal(value, v) { + t.Logf("value mismatch [run %d] (%d of %d):\nkey: %x\ngot: %x\nexp: %x", index, i, len(m), []byte(k), value, v) + db.CopyTempFile() + t.FailNow() + } + i++ + } + return nil + }); err != nil { + t.Fatal(err) + } + } + + index++ + return true + }, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can insert multiple key/value pairs at once. +func TestBucket_Put_Multiple(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + if err := quick.Check(func(items testdata) bool { + db := btesting.MustCreateDB(t) + defer db.MustClose() + + // Bulk insert all values. 
+ if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + if err := b.Put(item.Key, item.Value); err != nil { + t.Fatal(err) + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Verify all items exist. + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + value := b.Get(item.Key) + if !bytes.Equal(item.Value, value) { + db.CopyTempFile() + t.Fatalf("exp=%x; got=%x", item.Value, value) + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + return true + }, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can delete all key/value pairs and return to a single leaf page. +func TestBucket_Delete_Quick(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + if err := quick.Check(func(items testdata) bool { + db := btesting.MustCreateDB(t) + defer db.MustClose() + + // Bulk insert all values. + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + if err := b.Put(item.Key, item.Value); err != nil { + t.Fatal(err) + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Remove items one at a time and check consistency. + for _, item := range items { + if err := db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Delete(item.Key) + }); err != nil { + t.Fatal(err) + } + } + + // Anything before our deletion index should be nil. 
+ if err := db.View(func(tx *bolt.Tx) error { + if err := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error { + t.Fatalf("bucket should be empty; found: %06x", trunc(k, 3)) + return nil + }); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + return true + }, qconfig()); err != nil { + t.Error(err) + } +} + +func BenchmarkBucket_CreateBucketIfNotExists(b *testing.B) { + db := btesting.MustCreateDB(b) + defer db.MustClose() + + const bucketCount = 1_000_000 + + err := db.Update(func(tx *bolt.Tx) error { + for i := 0; i < bucketCount; i++ { + bucketName := fmt.Sprintf("bucket_%d", i) + _, berr := tx.CreateBucket([]byte(bucketName)) + require.NoError(b, berr) + } + return nil + }) + require.NoError(b, err) + + b.ResetTimer() + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + err := db.Update(func(tx *bolt.Tx) error { + _, berr := tx.CreateBucketIfNotExists([]byte("bucket_100")) + return berr + }) + require.NoError(b, err) + } +} + +func ExampleBucket_Put() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Start a write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + // Create a bucket. + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + return err + } + + // Set the value "bar" for the key "foo". + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + return err + } + return nil + }); err != nil { + log.Fatal(err) + } + + // Read value back in a different read-only transaction. + if err := db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value of 'foo' is: %s\n", value) + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // The value of 'foo' is: bar +} + +func ExampleBucket_Delete() { + // Open the database. 
+ db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Start a write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + // Create a bucket. + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + return err + } + + // Set the value "bar" for the key "foo". + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + return err + } + + // Retrieve the key back from the database and verify it. + value := b.Get([]byte("foo")) + fmt.Printf("The value of 'foo' was: %s\n", value) + + return nil + }); err != nil { + log.Fatal(err) + } + + // Delete the key in a different write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Delete([]byte("foo")) + }); err != nil { + log.Fatal(err) + } + + // Retrieve the key again. + if err := db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + if value == nil { + fmt.Printf("The value of 'foo' is now: nil\n") + } + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // The value of 'foo' was: bar + // The value of 'foo' is now: nil +} + +func ExampleBucket_ForEach() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Insert data into a bucket. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("animals")) + if err != nil { + return err + } + + if err := b.Put([]byte("dog"), []byte("fun")); err != nil { + return err + } + if err := b.Put([]byte("cat"), []byte("lame")); err != nil { + return err + } + if err := b.Put([]byte("liger"), []byte("awesome")); err != nil { + return err + } + + // Iterate over items in sorted key order. 
+ if err := b.ForEach(func(k, v []byte) error { + fmt.Printf("A %s is %s.\n", k, v) + return nil + }); err != nil { + return err + } + + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // A cat is lame. + // A dog is fun. + // A liger is awesome. +} diff --git a/cmd/bbolt/OWNERS b/cmd/bbolt/OWNERS new file mode 100644 index 0000000..d4d42d4 --- /dev/null +++ b/cmd/bbolt/OWNERS @@ -0,0 +1,12 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +approvers: + - ahrtr # Benjamin Wang + - fuweid # Wei Fu + - serathius # Marek Siarkowicz + - ptabor # Piotr Tabor + - spzala # Sahdev Zala + - tjungblu # Thomas Jungblut +reviewers: + - elbehery # Mustafa Elbehery + - ivanvc # Ivan Valdes diff --git a/cmd/bbolt/README.md b/cmd/bbolt/README.md new file mode 100644 index 0000000..a075849 --- /dev/null +++ b/cmd/bbolt/README.md @@ -0,0 +1,453 @@ +# Introduction to bbolt command line + +`bbolt` provides a command line utility for inspecting and manipulating bbolt database files. To install bbolt command-line please refer [here](https://github.com/etcd-io/bbolt#installing) + +**Note**: [etcd](https://github.com/etcd-io/etcd) uses bbolt as its backend storage engine. In this document, we take etcd as an example to demonstrate the usage of bbolt commands. Refer to [install etcd](https://etcd.io/docs/v3.5/install/) for installing etcd. + +1. Start a single member etcd cluster with this command below: + + ```bash + $etcd + ``` + + It will create a directory `default.etcd` by default under current working directory, and the directory structure will look like this: + + ```bash + $tree default.etcd + default.etcd + └── member + ├── snap + │   └── db // this is bbolt database file + └── wal + └── 0000000000000000-0000000000000000.wal + + 3 directories, 2 files + ``` + +2. Put some dummy data using [etcdctl](https://github.com/etcd-io/etcd/tree/main/etcdctl). +3. 
Stop the etcd instance. Note that a bbolt database file can only be opened by one read-write process, because it is exclusively locked when opened. + +## Usage + +- `bbolt command [arguments]` + +### help + +- `help` prints information about the available commands + + ```bash + $bbolt help + + The commands are: + + version prints the current version of bbolt + bench run synthetic benchmark against bbolt + buckets print a list of buckets + check verifies integrity of bbolt database + compact copies a bbolt database, compacting it in the process + dump print a hexadecimal dump of a single page + get print the value of a key in a bucket + info print basic info + keys print a list of keys in a bucket + help print this screen + page print one or more pages in human readable format + pages print list of pages with their types + page-item print the key and value of a page item. + stats iterate over all pages and generate usage stats + surgery perform surgery on bbolt database + ``` + +- You can get more information about any command with `bbolt [command] -h`. + +## Analyse bbolt database with bbolt command line + +### version + +- `version` prints the current version information of the bbolt command-line tool. +- usage: + `bbolt version` + + Example: + + ```bash + $bbolt version + bbolt version: 1.3.7 + Go Version: go1.21.6 + Go OS/Arch: darwin/arm64 + ``` + +### info + +- `info` prints basic information about the given bbolt database. +- usage: + `bbolt info [path to the bbolt database]` + + Example: + + ```bash + $bbolt info ~/default.etcd/member/snap/db + Page Size: 4096 + ``` + + - **note**: page size is given in bytes + - The bbolt database is using a page size of 4KB + +### buckets + +- `buckets` prints a list of the buckets that the bbolt database currently has.
Find more information on buckets [here](https://github.com/etcd-io/bbolt#using-buckets) +- usage: + `bbolt buckets [path to the bbolt database]` + + Example: + + ```bash + $bbolt buckets ~/default.etcd/member/snap/db + alarm + auth + authRoles + authUsers + cluster + key + lease + members + members_removed + meta + ``` + + - This means that when etcd starts, it creates these `10` buckets in the bbolt database. + +### check + +- `check` opens a database at a given `[PATH]` and runs an exhaustive check to verify that all pages are accessible or are marked as freed. It also verifies that no pages are double referenced. +- usage: + `bbolt check [path to the bbolt database]` + + Example: + + ```bash + $bbolt check ~/default.etcd/member/snap/db + OK + ``` + + - It prints `OK` because our database file `db` is not corrupted. + +### stats + +- `stats` gathers essential statistics about the bbolt database. It performs an extensive search of the database to track every page reference. It starts at the current meta page and recursively iterates through every accessible bucket. +- usage: + `bbolt stats [path to the bbolt database]` + + Example: + + ```bash + $bbolt stats ~/default.etcd/member/snap/db + Aggregate statistics for 10 buckets + + Page count statistics + Number of logical branch pages: 0 + Number of physical branch overflow pages: 0 + Number of logical leaf pages: 0 + Number of physical leaf overflow pages: 0 + Tree statistics + Number of keys/value pairs: 11 + Number of levels in B+tree: 1 + Page size utilization + Bytes allocated for physical branch pages: 0 + Bytes actually used for branch data: 0 (0%) + Bytes allocated for physical leaf pages: 0 + Bytes actually used for leaf data: 0 (0%) + Bucket statistics + Total number of buckets: 10 + Total number on inlined buckets: 10 (100%) + Bytes used for inlined buckets: 780 (0%) + ``` + +### inspect +- `inspect` inspects the structure of the database.
+- Usage: `bbolt inspect [path to the bbolt database]` + + Example: +```bash +$ ./bbolt inspect ~/default.etcd/member/snap/db +{ + "name": "root", + "keyN": 0, + "buckets": [ + { + "name": "alarm", + "keyN": 0 + }, + { + "name": "auth", + "keyN": 2 + }, + { + "name": "authRoles", + "keyN": 1 + }, + { + "name": "authUsers", + "keyN": 1 + }, + { + "name": "cluster", + "keyN": 1 + }, + { + "name": "key", + "keyN": 1285 + }, + { + "name": "lease", + "keyN": 2 + }, + { + "name": "members", + "keyN": 1 + }, + { + "name": "members_removed", + "keyN": 0 + }, + { + "name": "meta", + "keyN": 3 + } + ] +} +``` + +### pages + +- Pages prints a table of pages with their type (meta, leaf, branch, freelist). +- The `meta` pages store the metadata of the database. +- The `leaf` and `branch` pages show a key count in the `items` column. +- The `freelist` page shows the number of free pages, which can be reused for writing. +- The `overflow` column shows the number of blocks that the page spills over into. +- usage: + `bbolt pages [path to the bbolt database]` + + Example: + + ```bash + $bbolt pages ~/default.etcd/member/snap/db + ID TYPE ITEMS OVRFLW + ======== ========== ====== ====== + 0 meta 0 + 1 meta 0 + 2 free + 3 leaf 10 + 4 freelist 2 + 5 free + ``` + +### page + +- Page prints one or more pages in human readable format. +- usage: + + ```bash + bbolt page [path to the bbolt database] pageid [pageid...]
+ or: bbolt page --all [path to the bbolt database] + + Additional options include: + + --all + prints all pages (only skips pages that were considered successful overflow pages) + --format-value=auto|ascii-encoded|hex|bytes|redacted (default: auto) + prints values (on the leaf page) using the given format + ``` + + Example: + + ```bash + $bbolt page ~/default.etcd/member/snap/db 3 + Page ID: 3 + Page Type: leaf + Total Size: 4096 bytes + Overflow pages: 0 + Item Count: 10 + + "alarm": + "auth": + "authRoles": + "authUsers": + "cluster": + "key": + "lease": + "members": + "members_removed": + "meta": + ``` + + - It prints information about the page with `page ID: 3`. + +### page-item + +- page-item prints a page item's key and value. +- usage: + + ```bash + bbolt page-item [options] [path to the bbolt database] pageid itemid + Additional options include: + + --key-only + Print only the key + --value-only + Print only the value + --format + Output format. One of: auto|ascii-encoded|hex|bytes|redacted (default=auto) + ``` + + Example: + + ```bash + $bbolt page-item --key-only ~/default.etcd/member/snap/db 3 7 + "members" + ``` + + - It returns only the key of the item at `pageID: 3` and `itemID: 7`, because the `--key-only` flag is passed. + +### dump + +- Dump prints a hexadecimal dump of one or more given pages. +- usage: + `bbolt dump [path to the bbolt database] [pageid...]` + +### keys + +- Print a list of keys in the given bucket. +- usage: + + ```bash + bbolt keys [path to the bbolt database] [BucketName] + + Additional options include: + --format + Output format. One of: auto|ascii-encoded|hex|bytes|redacted (default=auto) + ``` + + Example 1: + + ```bash + $bbolt keys ~/default.etcd/member/snap/db meta + confState + consistent_index + term + ``` + + - It lists all the keys in the bucket `meta`. + + Example 2: + + ```bash + $bbolt keys ~/default.etcd/member/snap/db members + 8e9e05c52164694d + ``` + + - It lists all the keys in the `members` bucket; each key is the `memberId` of an etcd cluster member.
+ - In this case we are running a single-member etcd cluster, hence only one `memberId` is present. If we had run a `3`-member etcd cluster, it would return `3` `memberId`s, as `3` cluster members would be present in the `members` bucket. + +### get + +- Print the value of the given key in the given bucket. +- usage: + + ```bash + bbolt get [path to the bbolt database] [BucketName] [Key] + + Additional options include: + --format + Output format. One of: auto|ascii-encoded|hex|bytes|redacted (default=auto) + --parse-format + Input format (of key). One of: ascii-encoded|hex (default=ascii-encoded) + ``` + + Example 1: + + ```bash + $bbolt get --format=hex ~/default.etcd/member/snap/db meta term + 0000000000000004 + ``` + + - It returns the value present in bucket: `meta` for key: `term` in hexadecimal format. + + Example 2: + + ```bash + $bbolt get ~/default.etcd/member/snap/db members 8e9e05c52164694d + {"id":10276657743932975437,"peerURLs":["http://localhost:2380"],"name":"default","clientURLs":["http://localhost:2379"]} + ``` + + - It returns the value present in bucket: `members` for key: `8e9e05c52164694d`. + +### compact + +- Compact opens a database at the given `[Source Path]` and walks it recursively, copying keys as they are found from all buckets, to a newly created database at `[Destination Path]`. The original database is left untouched. +- usage: + + ```bash + bbolt compact [options] -o [Destination Path] [Source Path] + + Additional options include: + + -tx-max-size NUM + Specifies the maximum size of individual transactions. + Defaults to 64KB + ``` + + Example: + + ```bash + $bbolt compact -o ~/db.compact ~/default.etcd/member/snap/db + 16805888 -> 32768 bytes (gain=512.88x) + ``` + + - It will create a compacted database file: `db.compact` at the given path. + +### bench + +- Runs a synthetic benchmark against the bbolt database.
+- usage: + + ```bash + Usage: + -batch-size int + + -blockprofile string + + -count int + (default 1000) + -cpuprofile string + + -fill-percent float + (default 0.5) + -key-size int + (default 8) + -memprofile string + + -no-sync + + -path string + + -profile-mode string + (default "rw") + -read-mode string + (default "seq") + -value-size int + (default 32) + -work + + -write-mode string + (default "seq") + ``` + + Example: + + ```bash + $bbolt bench ~/default.etcd/member/snap/db -batch-size 400 -key-size 16 + # Write 68.523572ms (68.523µs/op) (14593 op/sec) + # Read 1.000015152s (11ns/op) (90909090 op/sec) + ``` + + - It runs a benchmark with batch size of `400` and with key size of `16` while for others parameters default value is taken. diff --git a/cmd/bbolt/command_check.go b/cmd/bbolt/command_check.go new file mode 100644 index 0000000..f0b5091 --- /dev/null +++ b/cmd/bbolt/command_check.go @@ -0,0 +1,73 @@ +package main + +import ( + "fmt" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +type checkOptions struct { + fromPageID uint64 +} + +func (o *checkOptions) AddFlags(fs *pflag.FlagSet) { + fs.Uint64VarP(&o.fromPageID, "from-page", "", o.fromPageID, "check db integrity starting from the given page ID") +} + +func newCheckCommand() *cobra.Command { + var o checkOptions + checkCmd := &cobra.Command{ + Use: "check ", + Short: "verify integrity of bbolt database data", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return checkFunc(cmd, args[0], o) + }, + } + + o.AddFlags(checkCmd.Flags()) + return checkCmd +} + +func checkFunc(cmd *cobra.Command, dbPath string, cfg checkOptions) error { + if _, err := checkSourceDBPath(dbPath); err != nil { + return err + } + + // Open database. 
+ db, err := bolt.Open(dbPath, 0600, &bolt.Options{ + ReadOnly: true, + PreLoadFreelist: true, + }) + if err != nil { + return err + } + defer db.Close() + + opts := []bolt.CheckOption{bolt.WithKVStringer(CmdKvStringer())} + if cfg.fromPageID != 0 { + opts = append(opts, bolt.WithPageId(cfg.fromPageID)) + } + // Perform consistency check. + return db.View(func(tx *bolt.Tx) error { + var count int + for err := range tx.Check(opts...) { + fmt.Fprintln(cmd.OutOrStdout(), err) + count++ + } + + // Print summary of errors. + if count > 0 { + fmt.Fprintf(cmd.OutOrStdout(), "%d errors found\n", count) + return guts_cli.ErrCorrupt + } + + // Notify user that database is valid. + fmt.Fprintln(cmd.OutOrStdout(), "OK") + return nil + }) +} diff --git a/cmd/bbolt/command_check_test.go b/cmd/bbolt/command_check_test.go new file mode 100644 index 0000000..2eb7702 --- /dev/null +++ b/cmd/bbolt/command_check_test.go @@ -0,0 +1,66 @@ +package main_test + +import ( + "bytes" + "io" + "testing" + + "github.com/stretchr/testify/require" + + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +func TestCheckCommand_Run(t *testing.T) { + testCases := []struct { + name string + args []string + expErr error + expOutput string + }{ + { + name: "check whole db", + args: []string{"check", "path"}, + expErr: nil, + expOutput: "OK\n", + }, + { + name: "check valid pageId", + args: []string{"check", "path", "--from-page", "3"}, + expErr: nil, + expOutput: "OK\n", + }, + { + name: "check invalid pageId", + args: []string{"check", "path", "--from-page", "1"}, + expErr: guts_cli.ErrCorrupt, + expOutput: "page ID (1) out of range [2, 4)", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + + t.Log("Creating sample DB") + db := btesting.MustCreateDB(t) + db.Close() + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + t.Log("Running check cmd") + 
rootCmd := main.NewRootCommand() + outputBuf := bytes.NewBufferString("") // capture output for assertion + rootCmd.SetOut(outputBuf) + + tc.args[1] = db.Path() // path to be replaced with db.Path() + rootCmd.SetArgs(tc.args) + err := rootCmd.Execute() + require.Equal(t, tc.expErr, err) + + t.Log("Checking output") + output, err := io.ReadAll(outputBuf) + require.NoError(t, err) + require.Containsf(t, string(output), tc.expOutput, "unexpected stdout:\n\n%s", string(output)) + }) + } +} diff --git a/cmd/bbolt/command_inspect.go b/cmd/bbolt/command_inspect.go new file mode 100644 index 0000000..4242edc --- /dev/null +++ b/cmd/bbolt/command_inspect.go @@ -0,0 +1,46 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/spf13/cobra" + + bolt "github.com/tutus-one/tutus-bolt" +) + +func newInspectCommand() *cobra.Command { + inspectCmd := &cobra.Command{ + Use: "inspect ", + Short: "inspect the structure of the database", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return inspectFunc(args[0]) + }, + } + + return inspectCmd +} + +func inspectFunc(srcDBPath string) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + db, err := bolt.Open(srcDBPath, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + return db.View(func(tx *bolt.Tx) error { + bs := tx.Inspect() + out, err := json.MarshalIndent(bs, "", " ") + if err != nil { + return err + } + fmt.Fprintln(os.Stdout, string(out)) + return nil + }) +} diff --git a/cmd/bbolt/command_inspect_test.go b/cmd/bbolt/command_inspect_test.go new file mode 100644 index 0000000..929e763 --- /dev/null +++ b/cmd/bbolt/command_inspect_test.go @@ -0,0 +1,27 @@ +package main_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +func 
TestInspect(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + rootCmd := main.NewRootCommand() + rootCmd.SetArgs([]string{ + "inspect", srcPath, + }) + err := rootCmd.Execute() + require.NoError(t, err) +} diff --git a/cmd/bbolt/command_root.go b/cmd/bbolt/command_root.go new file mode 100644 index 0000000..0336ea3 --- /dev/null +++ b/cmd/bbolt/command_root.go @@ -0,0 +1,27 @@ +package main + +import ( + "github.com/spf13/cobra" +) + +const ( + cliName = "bbolt" + cliDescription = "A simple command line tool for inspecting bbolt databases" +) + +func NewRootCommand() *cobra.Command { + rootCmd := &cobra.Command{ + Use: cliName, + Short: cliDescription, + Version: "dev", + } + + rootCmd.AddCommand( + newVersionCommand(), + newSurgeryCommand(), + newInspectCommand(), + newCheckCommand(), + ) + + return rootCmd +} diff --git a/cmd/bbolt/command_surgery.go b/cmd/bbolt/command_surgery.go new file mode 100644 index 0000000..9626641 --- /dev/null +++ b/cmd/bbolt/command_surgery.go @@ -0,0 +1,300 @@ +package main + +import ( + "errors" + "fmt" + "os" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" + "github.com/tutus-one/tutus-bolt/internal/surgeon" +) + +var ( + ErrSurgeryFreelistAlreadyExist = errors.New("the file already has freelist, please consider to abandon the freelist to forcibly rebuild it") +) + +func newSurgeryCommand() *cobra.Command { + surgeryCmd := &cobra.Command{ + Use: "surgery ", + Short: "surgery related commands", + } + + surgeryCmd.AddCommand(newSurgeryRevertMetaPageCommand()) + surgeryCmd.AddCommand(newSurgeryCopyPageCommand()) + surgeryCmd.AddCommand(newSurgeryClearPageCommand()) + surgeryCmd.AddCommand(newSurgeryClearPageElementsCommand()) + 
surgeryCmd.AddCommand(newSurgeryFreelistCommand()) + surgeryCmd.AddCommand(newSurgeryMetaCommand()) + + return surgeryCmd +} + +type surgeryBaseOptions struct { + outputDBFilePath string +} + +func (o *surgeryBaseOptions) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&o.outputDBFilePath, "output", o.outputDBFilePath, "path to the filePath db file") + _ = cobra.MarkFlagRequired(fs, "output") +} + +func (o *surgeryBaseOptions) Validate() error { + if o.outputDBFilePath == "" { + return errors.New("output database path wasn't given, specify output database file path with --output option") + } + return nil +} + +func newSurgeryRevertMetaPageCommand() *cobra.Command { + var o surgeryBaseOptions + revertMetaPageCmd := &cobra.Command{ + Use: "revert-meta-page ", + Short: "Revert the meta page to revert the changes performed by the latest transaction", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryRevertMetaPageFunc(args[0], o) + }, + } + o.AddFlags(revertMetaPageCmd.Flags()) + return revertMetaPageCmd +} + +func surgeryRevertMetaPageFunc(srcDBPath string, cfg surgeryBaseOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[revert-meta-page] copy file failed: %w", err) + } + + if err := surgeon.RevertMetaPage(cfg.outputDBFilePath); err != nil { + return fmt.Errorf("revert-meta-page command failed: %w", err) + } + + fmt.Fprintln(os.Stdout, "The meta page is reverted.") + + return nil +} + +type surgeryCopyPageOptions struct { + surgeryBaseOptions + sourcePageId uint64 + destinationPageId uint64 +} + +func (o *surgeryCopyPageOptions) AddFlags(fs *pflag.FlagSet) { + o.surgeryBaseOptions.AddFlags(fs) + fs.Uint64VarP(&o.sourcePageId, "from-page", "", o.sourcePageId, "source page Id") + fs.Uint64VarP(&o.destinationPageId, "to-page", "", 
o.destinationPageId, "destination page Id") + _ = cobra.MarkFlagRequired(fs, "from-page") + _ = cobra.MarkFlagRequired(fs, "to-page") +} + +func (o *surgeryCopyPageOptions) Validate() error { + if err := o.surgeryBaseOptions.Validate(); err != nil { + return err + } + if o.sourcePageId == o.destinationPageId { + return fmt.Errorf("'--from-page' and '--to-page' have the same value: %d", o.sourcePageId) + } + return nil +} + +func newSurgeryCopyPageCommand() *cobra.Command { + var o surgeryCopyPageOptions + copyPageCmd := &cobra.Command{ + Use: "copy-page ", + Short: "Copy page from the source page Id to the destination page Id", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryCopyPageFunc(args[0], o) + }, + } + o.AddFlags(copyPageCmd.Flags()) + return copyPageCmd +} + +func surgeryCopyPageFunc(srcDBPath string, cfg surgeryCopyPageOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[copy-page] copy file failed: %w", err) + } + + if err := surgeon.CopyPage(cfg.outputDBFilePath, common.Pgid(cfg.sourcePageId), common.Pgid(cfg.destinationPageId)); err != nil { + return fmt.Errorf("copy-page command failed: %w", err) + } + + meta, err := readMetaPage(srcDBPath) + if err != nil { + return err + } + if meta.IsFreelistPersisted() { + fmt.Fprintf(os.Stdout, "WARNING: the free list might have changed.\n") + fmt.Fprintf(os.Stdout, "Please consider executing `./bbolt surgery freelist abandon ...`\n") + } + + fmt.Fprintf(os.Stdout, "The page %d was successfully copied to page %d\n", cfg.sourcePageId, cfg.destinationPageId) + return nil +} + +type surgeryClearPageOptions struct { + surgeryBaseOptions + pageId uint64 +} + +func (o *surgeryClearPageOptions) AddFlags(fs *pflag.FlagSet) { + o.surgeryBaseOptions.AddFlags(fs) + 
fs.Uint64VarP(&o.pageId, "pageId", "", o.pageId, "page Id") + _ = cobra.MarkFlagRequired(fs, "pageId") +} + +func (o *surgeryClearPageOptions) Validate() error { + if err := o.surgeryBaseOptions.Validate(); err != nil { + return err + } + if o.pageId < 2 { + return fmt.Errorf("the pageId must be at least 2, but got %d", o.pageId) + } + return nil +} + +func newSurgeryClearPageCommand() *cobra.Command { + var o surgeryClearPageOptions + clearPageCmd := &cobra.Command{ + Use: "clear-page ", + Short: "Clears all elements from the given page, which can be a branch or leaf page", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryClearPageFunc(args[0], o) + }, + } + o.AddFlags(clearPageCmd.Flags()) + return clearPageCmd +} + +func surgeryClearPageFunc(srcDBPath string, cfg surgeryClearPageOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[clear-page] copy file failed: %w", err) + } + + needAbandonFreelist, err := surgeon.ClearPage(cfg.outputDBFilePath, common.Pgid(cfg.pageId)) + if err != nil { + return fmt.Errorf("clear-page command failed: %w", err) + } + + if needAbandonFreelist { + fmt.Fprintf(os.Stdout, "WARNING: The clearing has abandoned some pages that are not yet referenced from free list.\n") + fmt.Fprintf(os.Stdout, "Please consider executing `./bbolt surgery freelist abandon ...`\n") + } + + fmt.Fprintf(os.Stdout, "The page (%d) was cleared\n", cfg.pageId) + return nil +} + +type surgeryClearPageElementsOptions struct { + surgeryBaseOptions + pageId uint64 + startElementIdx int + endElementIdx int +} + +func (o *surgeryClearPageElementsOptions) AddFlags(fs *pflag.FlagSet) { + o.surgeryBaseOptions.AddFlags(fs) + fs.Uint64VarP(&o.pageId, "pageId", "", o.pageId, "page id") + fs.IntVarP(&o.startElementIdx, "from-index", 
"", o.startElementIdx, "start element index (included) to clear, starting from 0") + fs.IntVarP(&o.endElementIdx, "to-index", "", o.endElementIdx, "end element index (excluded) to clear, starting from 0, -1 means to the end of page") + _ = cobra.MarkFlagRequired(fs, "pageId") + _ = cobra.MarkFlagRequired(fs, "from-index") + _ = cobra.MarkFlagRequired(fs, "to-index") +} + +func (o *surgeryClearPageElementsOptions) Validate() error { + if err := o.surgeryBaseOptions.Validate(); err != nil { + return err + } + if o.pageId < 2 { + return fmt.Errorf("the pageId must be at least 2, but got %d", o.pageId) + } + return nil +} + +func newSurgeryClearPageElementsCommand() *cobra.Command { + var o surgeryClearPageElementsOptions + clearElementCmd := &cobra.Command{ + Use: "clear-page-elements ", + Short: "Clears elements from the given page, which can be a branch or leaf page", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryClearPageElementFunc(args[0], o) + }, + } + o.AddFlags(clearElementCmd.Flags()) + return clearElementCmd +} + +func surgeryClearPageElementFunc(srcDBPath string, cfg surgeryClearPageElementsOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[clear-page-element] copy file failed: %w", err) + } + + needAbandonFreelist, err := surgeon.ClearPageElements(cfg.outputDBFilePath, common.Pgid(cfg.pageId), cfg.startElementIdx, cfg.endElementIdx, false) + if err != nil { + return fmt.Errorf("clear-page-element command failed: %w", err) + } + + if needAbandonFreelist { + fmt.Fprintf(os.Stdout, "WARNING: The clearing has abandoned some pages that are not yet referenced from free list.\n") + fmt.Fprintf(os.Stdout, "Please consider executing `./bbolt surgery freelist abandon ...`\n") + } + + fmt.Fprintf(os.Stdout, "All elements 
in [%d, %d) in page %d were cleared\n", cfg.startElementIdx, cfg.endElementIdx, cfg.pageId) + return nil +} + +func readMetaPage(path string) (*common.Meta, error) { + pageSize, _, err := guts_cli.ReadPageAndHWMSize(path) + if err != nil { + return nil, fmt.Errorf("read Page size failed: %w", err) + } + + m := make([]*common.Meta, 2) + for i := 0; i < 2; i++ { + m[i], _, err = ReadMetaPageAt(path, uint32(i), uint32(pageSize)) + if err != nil { + return nil, fmt.Errorf("read meta page %d failed: %w", i, err) + } + } + + if m[0].Txid() > m[1].Txid() { + return m[0], nil + } + return m[1], nil +} diff --git a/cmd/bbolt/command_surgery_freelist.go b/cmd/bbolt/command_surgery_freelist.go new file mode 100644 index 0000000..b96e65d --- /dev/null +++ b/cmd/bbolt/command_surgery_freelist.go @@ -0,0 +1,111 @@ +package main + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/surgeon" +) + +func newSurgeryFreelistCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "freelist ", + Short: "freelist related surgery commands", + } + + cmd.AddCommand(newSurgeryFreelistAbandonCommand()) + cmd.AddCommand(newSurgeryFreelistRebuildCommand()) + + return cmd +} + +func newSurgeryFreelistAbandonCommand() *cobra.Command { + var o surgeryBaseOptions + abandonFreelistCmd := &cobra.Command{ + Use: "abandon ", + Short: "Abandon the freelist from both meta pages", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryFreelistAbandonFunc(args[0], o) + }, + } + o.AddFlags(abandonFreelistCmd.Flags()) + + return abandonFreelistCmd +} + +func surgeryFreelistAbandonFunc(srcDBPath string, cfg surgeryBaseOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + if err := common.CopyFile(srcDBPath, 
cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[freelist abandon] copy file failed: %w", err) + } + + if err := surgeon.ClearFreelist(cfg.outputDBFilePath); err != nil { + return fmt.Errorf("abandon-freelist command failed: %w", err) + } + + fmt.Fprintf(os.Stdout, "The freelist was abandoned in both meta pages.\nIt may cause some delay on next startup because bbolt needs to scan the whole db to reconstruct the free list.\n") + return nil +} + +// newSurgeryFreelistRebuildCommand builds the `rebuild` subcommand; it takes exactly one positional argument and registers the flags of surgeryBaseOptions. +func newSurgeryFreelistRebuildCommand() *cobra.Command { + var o surgeryBaseOptions + rebuildFreelistCmd := &cobra.Command{ + Use: "rebuild ", + Short: "Rebuild the freelist", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryFreelistRebuildFunc(args[0], o) + }, + } + o.AddFlags(rebuildFreelistCmd.Flags()) + + return rebuildFreelistCmd +} + +// surgeryFreelistRebuildFunc copies srcDBPath to cfg.outputDBFilePath and rebuilds the freelist in the copy. +// It returns ErrSurgeryFreelistAlreadyExist when the source meta page reports a persisted freelist. +func surgeryFreelistRebuildFunc(srcDBPath string, cfg surgeryBaseOptions) error { + // Ensure source file exists. + fi, err := checkSourceDBPath(srcDBPath) + if err != nil { + return err + } + + // make sure the freelist isn't present in the file. + meta, err := readMetaPage(srcDBPath) + if err != nil { + return err + } + if meta.IsFreelistPersisted() { + return ErrSurgeryFreelistAlreadyExist + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[freelist rebuild] copy file failed: %w", err) + } + + // bboltDB automatically reconstruct & sync freelist in write mode. 
+ db, err := bolt.Open(cfg.outputDBFilePath, fi.Mode(), &bolt.Options{NoFreelistSync: false}) + if err != nil { + return fmt.Errorf("[freelist rebuild] open db file failed: %w", err) + } + err = db.Close() + if err != nil { + return fmt.Errorf("[freelist rebuild] close db file failed: %w", err) + } + + fmt.Fprintf(os.Stdout, "The freelist was successfully rebuilt.\n") + return nil +} diff --git a/cmd/bbolt/command_surgery_freelist_test.go b/cmd/bbolt/command_surgery_freelist_test.go new file mode 100644 index 0000000..9b29e73 --- /dev/null +++ b/cmd/bbolt/command_surgery_freelist_test.go @@ -0,0 +1,103 @@ +package main_test + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +func TestSurgery_Freelist_Abandon(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "freelist", "abandon", srcPath, + "--output", output, + }) + err := rootCmd.Execute() + require.NoError(t, err) + + meta0 := loadMetaPage(t, output, 0) + assert.Equal(t, common.PgidNoFreelist, meta0.Freelist()) + meta1 := loadMetaPage(t, output, 1) + assert.Equal(t, common.PgidNoFreelist, meta1.Freelist()) +} + +func TestSurgery_Freelist_Rebuild(t *testing.T) { + testCases := []struct { + name string + hasFreelist bool + expectedError error + }{ + { + name: "normal operation", + hasFreelist: false, + expectedError: nil, + }, + { + name: "already has freelist", + hasFreelist: true, + expectedError: main.ErrSurgeryFreelistAlreadyExist, + }, + } + + for _, tc := range testCases { 
+ tc := tc + t.Run(tc.name, func(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{ + PageSize: pageSize, + NoFreelistSync: !tc.hasFreelist, + }) + srcPath := db.Path() + + err := db.Update(func(tx *bolt.Tx) error { + // do nothing + return nil + }) + require.NoError(t, err) + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + // Verify the freelist isn't synced in the beginning + meta := readMetaPage(t, srcPath) + if tc.hasFreelist { + if meta.Freelist() <= 1 || meta.Freelist() >= meta.Pgid() { + t.Fatalf("freelist (%d) isn't in the valid range (1, %d)", meta.Freelist(), meta.Pgid()) + } + } else { + require.Equal(t, common.PgidNoFreelist, meta.Freelist()) + } + + // Execute `surgery freelist rebuild` command + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "freelist", "rebuild", srcPath, + "--output", output, + }) + err = rootCmd.Execute() + require.Equal(t, tc.expectedError, err) + + if tc.expectedError == nil { + // Verify the freelist has already been rebuilt. 
+ meta = readMetaPage(t, output) + if meta.Freelist() <= 1 || meta.Freelist() >= meta.Pgid() { + t.Fatalf("freelist (%d) isn't in the valid range (1, %d)", meta.Freelist(), meta.Pgid()) + } + } + }) + } +} diff --git a/cmd/bbolt/command_surgery_meta.go b/cmd/bbolt/command_surgery_meta.go new file mode 100644 index 0000000..7ce950a --- /dev/null +++ b/cmd/bbolt/command_surgery_meta.go @@ -0,0 +1,275 @@ +package main + +import ( + "fmt" + "io" + "os" + "strconv" + "strings" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +const ( + metaFieldPageSize = "pageSize" + metaFieldRoot = "root" + metaFieldFreelist = "freelist" + metaFieldPgid = "pgid" +) + +func newSurgeryMetaCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "meta ", + Short: "meta page related surgery commands", + } + + cmd.AddCommand(newSurgeryMetaValidateCommand()) + cmd.AddCommand(newSurgeryMetaUpdateCommand()) + + return cmd +} + +func newSurgeryMetaValidateCommand() *cobra.Command { + metaValidateCmd := &cobra.Command{ + Use: "validate ", + Short: "Validate both meta pages", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + return surgeryMetaValidateFunc(args[0]) + }, + } + return metaValidateCmd +} + +func surgeryMetaValidateFunc(srcDBPath string) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + return err + } + + var pageSize uint32 + + for i := 0; i <= 1; i++ { + m, _, err := ReadMetaPageAt(srcDBPath, uint32(i), pageSize) + if err != nil { + return fmt.Errorf("read meta page %d failed: %w", i, err) + } + if mValidateErr := m.Validate(); mValidateErr != nil { + fmt.Fprintf(os.Stdout, "WARNING: The meta page %d isn't valid: %v!\n", i, mValidateErr) + } else { + fmt.Fprintf(os.Stdout, "The meta page %d is valid!\n", i) + } + + pageSize = m.PageSize() + } + + return nil +} + +type surgeryMetaUpdateOptions struct { + surgeryBaseOptions + fields []string + metaPageId uint32 +} 
+ +var allowedMetaUpdateFields = map[string]struct{}{ + metaFieldPageSize: {}, + metaFieldRoot: {}, + metaFieldFreelist: {}, + metaFieldPgid: {}, +} + +// AddFlags sets the flags for `meta update` command. +// Example: --fields root:16,freelist:8 --fields pgid:128 +// Result: []string{"root:16", "freelist:8", "pgid:128"} +func (o *surgeryMetaUpdateOptions) AddFlags(fs *pflag.FlagSet) { + o.surgeryBaseOptions.AddFlags(fs) + fs.StringSliceVarP(&o.fields, "fields", "", o.fields, "comma separated list of fields (supported fields: pageSize, root, freelist and pgid) to be updated, and each item is a colon-separated key-value pair") + fs.Uint32VarP(&o.metaPageId, "meta-page", "", o.metaPageId, "the meta page ID to operate on, valid values are 0 and 1") +} + +func (o *surgeryMetaUpdateOptions) Validate() error { + if err := o.surgeryBaseOptions.Validate(); err != nil { + return err + } + + if o.metaPageId > 1 { + return fmt.Errorf("invalid meta page id: %d", o.metaPageId) + } + + for _, field := range o.fields { + kv := strings.Split(field, ":") + if len(kv) != 2 { + return fmt.Errorf("invalid key-value pair: %s", field) + } + + if _, ok := allowedMetaUpdateFields[kv[0]]; !ok { + return fmt.Errorf("field %q isn't allowed to be updated", kv[0]) + } + + if _, err := strconv.ParseUint(kv[1], 10, 64); err != nil { + return fmt.Errorf("invalid value %q for field %q", kv[1], kv[0]) + } + } + + return nil +} + +func newSurgeryMetaUpdateCommand() *cobra.Command { + var o surgeryMetaUpdateOptions + metaUpdateCmd := &cobra.Command{ + Use: "update ", + Short: "Update fields in meta pages", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.Validate(); err != nil { + return err + } + return surgeryMetaUpdateFunc(args[0], o) + }, + } + o.AddFlags(metaUpdateCmd.Flags()) + return metaUpdateCmd +} + +func surgeryMetaUpdateFunc(srcDBPath string, cfg surgeryMetaUpdateOptions) error { + if _, err := checkSourceDBPath(srcDBPath); err != nil { + 
return err + } + + if err := common.CopyFile(srcDBPath, cfg.outputDBFilePath); err != nil { + return fmt.Errorf("[meta update] copy file failed: %w", err) + } + + // read the page size from the first meta page if we want to edit the second meta page. + var pageSize uint32 + if cfg.metaPageId == 1 { + m0, _, err := ReadMetaPageAt(cfg.outputDBFilePath, 0, pageSize) + if err != nil { + return fmt.Errorf("read the first meta page failed: %w", err) + } + pageSize = m0.PageSize() + } + + // update the specified meta page + m, buf, err := ReadMetaPageAt(cfg.outputDBFilePath, cfg.metaPageId, pageSize) + if err != nil { + return fmt.Errorf("read meta page %d failed: %w", cfg.metaPageId, err) + } + mChanged := updateMetaField(m, parseFields(cfg.fields)) + if mChanged { + if err := writeMetaPageAt(cfg.outputDBFilePath, buf, cfg.metaPageId, pageSize); err != nil { + return fmt.Errorf("[meta update] write meta page %d failed: %w", cfg.metaPageId, err) + } + } + + if cfg.metaPageId == 1 && pageSize != m.PageSize() { + fmt.Fprintf(os.Stdout, "WARNING: The page size (%d) in the first meta page doesn't match the second meta page (%d)\n", pageSize, m.PageSize()) + } + + // Display results + if !mChanged { + fmt.Fprintln(os.Stdout, "Nothing changed!") + } + + if mChanged { + fmt.Fprintf(os.Stdout, "The meta page %d has been updated!\n", cfg.metaPageId) + } + + return nil +} + +func parseFields(fields []string) map[string]uint64 { + fieldsMap := make(map[string]uint64) + for _, field := range fields { + kv := strings.SplitN(field, ":", 2) + val, _ := strconv.ParseUint(kv[1], 10, 64) + fieldsMap[kv[0]] = val + } + return fieldsMap +} + +func updateMetaField(m *common.Meta, fields map[string]uint64) bool { + changed := false + for key, val := range fields { + switch key { + case metaFieldPageSize: + m.SetPageSize(uint32(val)) + case metaFieldRoot: + m.SetRootBucket(common.NewInBucket(common.Pgid(val), 0)) + case metaFieldFreelist: + m.SetFreelist(common.Pgid(val)) + case metaFieldPgid: 
+ m.SetPgid(common.Pgid(val)) + } + + changed = true + } + + if m.Magic() != common.Magic { + m.SetMagic(common.Magic) + changed = true + } + if m.Version() != common.Version { + m.SetVersion(common.Version) + changed = true + } + if m.Flags() != common.MetaPageFlag { + m.SetFlags(common.MetaPageFlag) + changed = true + } + + newChecksum := m.Sum64() + if m.Checksum() != newChecksum { + m.SetChecksum(newChecksum) + changed = true + } + + return changed +} + +func ReadMetaPageAt(dbPath string, metaPageId uint32, pageSize uint32) (*common.Meta, []byte, error) { + if metaPageId > 1 { + return nil, nil, fmt.Errorf("invalid metaPageId: %d", metaPageId) + } + + f, err := os.OpenFile(dbPath, os.O_RDONLY, 0444) + if err != nil { + return nil, nil, err + } + defer f.Close() + + // The meta page is just 64 bytes, and definitely less than 1024 bytes, + // so it's fine to only read 1024 bytes. Note we don't care about the + // pageSize when reading the first meta page, because we always read the + // file starting from offset 0. Actually the passed pageSize is 0 when + // reading the first meta page in the `surgery meta update` command. 
+ buf := make([]byte, 1024) + n, err := f.ReadAt(buf, int64(metaPageId*pageSize)) + if n == len(buf) && (err == nil || err == io.EOF) { + return common.LoadPageMeta(buf), buf, nil + } + + return nil, nil, err +} + +func writeMetaPageAt(dbPath string, buf []byte, metaPageId uint32, pageSize uint32) error { + if metaPageId > 1 { + return fmt.Errorf("invalid metaPageId: %d", metaPageId) + } + + f, err := os.OpenFile(dbPath, os.O_RDWR, 0666) + if err != nil { + return err + } + defer f.Close() + + n, err := f.WriteAt(buf, int64(metaPageId*pageSize)) + if n == len(buf) && (err == nil || err == io.EOF) { + return nil + } + + return err +} diff --git a/cmd/bbolt/command_surgery_meta_test.go b/cmd/bbolt/command_surgery_meta_test.go new file mode 100644 index 0000000..6413653 --- /dev/null +++ b/cmd/bbolt/command_surgery_meta_test.go @@ -0,0 +1,126 @@ +package main_test + +import ( + "fmt" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +func TestSurgery_Meta_Validate(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // validate the meta pages + rootCmd := main.NewRootCommand() + rootCmd.SetArgs([]string{ + "surgery", "meta", "validate", srcPath, + }) + err := rootCmd.Execute() + require.NoError(t, err) + + // TODO: add one more case that the validation may fail. We need to + // make the command output configurable, so that test cases can set + // a customized io.Writer. 
+} + +func TestSurgery_Meta_Update(t *testing.T) { + testCases := []struct { + name string + root common.Pgid + freelist common.Pgid + pgid common.Pgid + }{ + { + name: "root changed", + root: 50, + }, + { + name: "freelist changed", + freelist: 40, + }, + { + name: "pgid changed", + pgid: 600, + }, + { + name: "both root and freelist changed", + root: 45, + freelist: 46, + }, + { + name: "both pgid and freelist changed", + pgid: 256, + freelist: 47, + }, + { + name: "all fields changed", + root: 43, + freelist: 62, + pgid: 256, + }, + } + + for _, tc := range testCases { + for i := 0; i <= 1; i++ { + tc := tc + metaPageId := uint32(i) + + t.Run(tc.name, func(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + var fields []string + if tc.root != 0 { + fields = append(fields, fmt.Sprintf("root:%d", tc.root)) + } + if tc.freelist != 0 { + fields = append(fields, fmt.Sprintf("freelist:%d", tc.freelist)) + } + if tc.pgid != 0 { + fields = append(fields, fmt.Sprintf("pgid:%d", tc.pgid)) + } + + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "meta", "update", srcPath, + "--output", output, + "--meta-page", fmt.Sprintf("%d", metaPageId), + "--fields", strings.Join(fields, ","), + }) + err := rootCmd.Execute() + require.NoError(t, err) + + m, _, err := main.ReadMetaPageAt(output, metaPageId, 4096) + require.NoError(t, err) + + require.Equal(t, common.Magic, m.Magic()) + require.Equal(t, common.Version, m.Version()) + + if tc.root != 0 { + require.Equal(t, tc.root, m.RootBucket().RootPage()) + } + if tc.freelist != 0 { + require.Equal(t, tc.freelist, m.Freelist()) + } + if tc.pgid != 0 { + require.Equal(t, tc.pgid, m.Pgid()) + } + }) + } + } +} diff --git a/cmd/bbolt/command_surgery_test.go b/cmd/bbolt/command_surgery_test.go new file mode 100644 index 
0000000..b6bc83e --- /dev/null +++ b/cmd/bbolt/command_surgery_test.go @@ -0,0 +1,636 @@ +package main_test + +import ( + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +func TestSurgery_RevertMetaPage(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + srcFile, err := os.Open(srcPath) + require.NoError(t, err) + defer srcFile.Close() + + // Read both meta0 and meta1 from srcFile + srcBuf0 := readPage(t, srcPath, 0, pageSize) + srcBuf1 := readPage(t, srcPath, 1, pageSize) + meta0Page := common.LoadPageMeta(srcBuf0) + meta1Page := common.LoadPageMeta(srcBuf1) + + // Get the non-active meta page + nonActiveSrcBuf := srcBuf0 + nonActiveMetaPageId := 0 + if meta0Page.Txid() > meta1Page.Txid() { + nonActiveSrcBuf = srcBuf1 + nonActiveMetaPageId = 1 + } + t.Logf("non active meta page id: %d", nonActiveMetaPageId) + + // revert the meta page + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "revert-meta-page", srcPath, + "--output", output, + }) + err = rootCmd.Execute() + require.NoError(t, err) + + // read both meta0 and meta1 from dst file + dstBuf0 := readPage(t, output, 0, pageSize) + dstBuf1 := readPage(t, output, 1, pageSize) + + // check result. 
Note we should skip the page ID + assert.Equal(t, pageDataWithoutPageId(nonActiveSrcBuf), pageDataWithoutPageId(dstBuf0)) + assert.Equal(t, pageDataWithoutPageId(nonActiveSrcBuf), pageDataWithoutPageId(dstBuf1)) +} + +func TestSurgery_CopyPage(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + // Insert some sample data + t.Log("Insert some sample data") + err := db.Fill([]byte("data"), 1, 20, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 10) }, + ) + require.NoError(t, err) + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + // copy page 3 to page 2 + t.Log("copy page 3 to page 2") + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "dstdb") + rootCmd.SetArgs([]string{ + "surgery", "copy-page", srcPath, + "--output", output, + "--from-page", "3", + "--to-page", "2", + }) + err = rootCmd.Execute() + require.NoError(t, err) + + // The page 2 should have exactly the same data as page 3. + t.Log("Verify result") + srcPageId3Data := readPage(t, srcPath, 3, pageSize) + dstPageId3Data := readPage(t, output, 3, pageSize) + dstPageId2Data := readPage(t, output, 2, pageSize) + + assert.Equal(t, srcPageId3Data, dstPageId3Data) + assert.Equal(t, pageDataWithoutPageId(srcPageId3Data), pageDataWithoutPageId(dstPageId2Data)) +} + +// TODO(ahrtr): add test case below for `surgery clear-page` command: +// 1. The page is a branch page. All its children should become free pages. 
+func TestSurgery_ClearPage(t *testing.T) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + // Insert some sample data + t.Log("Insert some sample data") + err := db.Fill([]byte("data"), 1, 20, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 10) }, + ) + require.NoError(t, err) + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + // clear page 3 + t.Log("clear page 3") + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "dstdb") + rootCmd.SetArgs([]string{ + "surgery", "clear-page", srcPath, + "--output", output, + "--pageId", "3", + }) + err = rootCmd.Execute() + require.NoError(t, err) + + t.Log("Verify result") + dstPageId3Data := readPage(t, output, 3, pageSize) + + p := common.LoadPage(dstPageId3Data) + assert.Equal(t, uint16(0), p.Count()) + assert.Equal(t, uint32(0), p.Overflow()) +} + +func TestSurgery_ClearPageElements_Without_Overflow(t *testing.T) { + testCases := []struct { + name string + from int + to int + isBranchPage bool + setEndIdxAsCount bool + removeOnlyOneElement bool // only valid when setEndIdxAsCount == true, and startIdx = endIdx -1 in this case. 
+ expectError bool + }{ + // normal range in leaf page + { + name: "normal range in leaf page: [4, 8)", + from: 4, + to: 8, + }, + { + name: "normal range in leaf page: [5, -1)", + from: 5, + to: -1, + }, + { + name: "normal range in leaf page: all", + from: 0, + to: -1, + }, + { + name: "normal range in leaf page: [0, 7)", + from: 0, + to: 7, + }, + { + name: "normal range in leaf page: [3, count)", + from: 3, + setEndIdxAsCount: true, + }, + // normal range in branch page + { + name: "normal range in branch page: [4, 8)", + from: 4, + to: 8, + isBranchPage: true, + }, + { + name: "normal range in branch page: [5, -1)", + from: 5, + to: -1, + isBranchPage: true, + }, + { + name: "normal range in branch page: all", + from: 0, + to: -1, + isBranchPage: true, + }, + { + name: "normal range in branch page: [0, 7)", + from: 0, + to: 7, + isBranchPage: true, + }, + { + name: "normal range in branch page: [3, count)", + from: 3, + isBranchPage: true, + setEndIdxAsCount: true, + }, + // remove only one element + { + name: "one element: the first one", + from: 0, + to: 1, + }, + { + name: "one element: [6, 7)", + from: 6, + to: 7, + }, + { + name: "one element: the last one", + setEndIdxAsCount: true, + removeOnlyOneElement: true, + }, + // abnormal range: the from/to values must match the range in each name, + // so that every case exercises a distinct invalid input + { + name: "abnormal range: [-1, 4)", + from: -1, + to: 4, + expectError: true, + }, + { + name: "abnormal range: [-2, 5)", + from: -2, + to: 5, + expectError: true, + }, + { + name: "abnormal range: [3, 3)", + from: 3, + to: 3, + expectError: true, + }, + { + name: "abnormal range: [5, 3)", + from: 5, + to: 3, + expectError: true, + }, + { + name: "abnormal range: [3, -2)", + from: 3, + to: -2, + expectError: true, + }, + { + name: "abnormal range: [3, 1000000)", + from: 3, + to: 1000000, + expectError: true, + }, + } + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + testSurgeryClearPageElementsWithoutOverflow(t, tc.from, tc.to, tc.isBranchPage, tc.setEndIdxAsCount, tc.removeOnlyOneElement, 
tc.expectError) + }) + } +} + +func testSurgeryClearPageElementsWithoutOverflow(t *testing.T, startIdx, endIdx int, isBranchPage, setEndIdxAsCount, removeOnlyOne, expectError bool) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + // Generate sample db + t.Log("Generate some sample data") + err := db.Fill([]byte("data"), 10, 200, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", tx*10000+k)) }, + func(tx int, k int) []byte { return make([]byte, 10) }, + ) + require.NoError(t, err) + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + // find a page with at least 10 elements + var ( + pageId uint64 = 2 + elementCount uint16 = 0 + ) + for { + p, _, err := guts_cli.ReadPage(srcPath, pageId) + require.NoError(t, err) + + if isBranchPage { + if p.IsBranchPage() && p.Count() > 10 { + elementCount = p.Count() + break + } + } else { + if p.IsLeafPage() && p.Count() > 10 { + elementCount = p.Count() + break + } + } + pageId++ + } + t.Logf("The original element count: %d", elementCount) + + if setEndIdxAsCount { + t.Logf("Set the endIdx as the element count: %d", elementCount) + endIdx = int(elementCount) + if removeOnlyOne { + startIdx = endIdx - 1 + t.Logf("Set the startIdx as the endIdx-1: %d", startIdx) + } + } + + // clear elements [startIdx, endIdx) in the page + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "clear-page-elements", srcPath, + "--output", output, + "--pageId", fmt.Sprintf("%d", pageId), + "--from-index", fmt.Sprintf("%d", startIdx), + "--to-index", fmt.Sprintf("%d", endIdx), + }) + err = rootCmd.Execute() + if expectError { + require.Error(t, err) + return + } + + require.NoError(t, err) + + // check the element count again + expectedCnt := 0 + if endIdx == -1 { + expectedCnt = startIdx + } else { + expectedCnt = int(elementCount) - (endIdx - startIdx) + } + p, _, err := 
guts_cli.ReadPage(output, pageId) + require.NoError(t, err) + assert.Equal(t, expectedCnt, int(p.Count())) + + compareDataAfterClearingElement(t, srcPath, output, pageId, isBranchPage, startIdx, endIdx) +} + +// compareDataAfterClearingElement checks that every element of page pageId in srcPath whose index is +// outside [startIdx, endIdx) appears, in the same order, in the same page of dstPath. +// An endIdx of -1 means the cleared range runs to the end of the page. +func compareDataAfterClearingElement(t *testing.T, srcPath, dstPath string, pageId uint64, isBranchPage bool, startIdx, endIdx int) { + srcPage, _, err := guts_cli.ReadPage(srcPath, pageId) + require.NoError(t, err) + + dstPage, _, err := guts_cli.ReadPage(dstPath, pageId) + require.NoError(t, err) + + var dstIdx uint16 + for i := uint16(0); i < srcPage.Count(); i++ { + // skip the cleared elements; the range is expressed in source-page indexes, + // so it must be tested against i (dstIdx stops advancing inside the range) + if i >= uint16(startIdx) && (i < uint16(endIdx) || endIdx == -1) { + continue + } + + if isBranchPage { + srcElement := srcPage.BranchPageElement(i) + dstElement := dstPage.BranchPageElement(dstIdx) + + require.Equal(t, srcElement.Key(), dstElement.Key()) + require.Equal(t, srcElement.Pgid(), dstElement.Pgid()) + } else { + srcElement := srcPage.LeafPageElement(i) + dstElement := dstPage.LeafPageElement(dstIdx) + + require.Equal(t, srcElement.Flags(), dstElement.Flags()) + require.Equal(t, srcElement.Key(), dstElement.Key()) + require.Equal(t, srcElement.Value(), dstElement.Value()) + } + + dstIdx++ + } +} + +func TestSurgery_ClearPageElements_With_Overflow(t *testing.T) { + testCases := []struct { + name string + from int + to int + valueSizes []int + expectedOverflow int + }{ + // big element + { + name: "remove a big element at the end", + valueSizes: []int{500, 500, 500, 2600}, + from: 3, + to: 4, + expectedOverflow: 0, + }, + { + name: "remove a big element at the begin", + valueSizes: []int{2600, 500, 500, 500}, + from: 0, + to: 1, + expectedOverflow: 0, + }, + { + name: "remove a big element in the middle", + valueSizes: []int{500, 2600, 500, 500}, + from: 1, + to: 2, + expectedOverflow: 0, + }, + // small element + { + name: "remove a small element at the end", + valueSizes: []int{500, 500, 3100, 100}, + from: 3, + to: 4, + expectedOverflow: 1, + 
}, + { + name: "remove a small element at the begin", + valueSizes: []int{100, 500, 3100, 500}, + from: 0, + to: 1, + expectedOverflow: 1, + }, + { + name: "remove a small element in the middle", + valueSizes: []int{500, 100, 3100, 500}, + from: 1, + to: 2, + expectedOverflow: 1, + }, + { + name: "remove a small element at the end of page with big overflow", + valueSizes: []int{500, 500, 4096 * 5, 100}, + from: 3, + to: 4, + expectedOverflow: 5, + }, + { + name: "remove a small element at the begin of page with big overflow", + valueSizes: []int{100, 500, 4096 * 6, 500}, + from: 0, + to: 1, + expectedOverflow: 6, + }, + { + name: "remove a small element in the middle of page with big overflow", + valueSizes: []int{500, 100, 4096 * 4, 500}, + from: 1, + to: 2, + expectedOverflow: 4, + }, + // huge element + { + name: "remove a huge element at the end", + valueSizes: []int{500, 500, 500, 4096 * 5}, + from: 3, + to: 4, + expectedOverflow: 0, + }, + { + name: "remove a huge element at the begin", + valueSizes: []int{4096 * 5, 500, 500, 500}, + from: 0, + to: 1, + expectedOverflow: 0, + }, + { + name: "remove a huge element in the middle", + valueSizes: []int{500, 4096 * 5, 500, 500}, + from: 1, + to: 2, + expectedOverflow: 0, + }, + } + + for _, tc := range testCases { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + testSurgeryClearPageElementsWithOverflow(t, tc.from, tc.to, tc.valueSizes, tc.expectedOverflow) + }) + } +} + +func testSurgeryClearPageElementsWithOverflow(t *testing.T, startIdx, endIdx int, valueSizes []int, expectedOverflow int) { + pageSize := 4096 + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize}) + srcPath := db.Path() + + // Generate sample db + err := db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("data")) + for i, valueSize := range valueSizes { + key := []byte(fmt.Sprintf("%04d", i)) + val := make([]byte, valueSize) + if putErr := b.Put(key, val); putErr != nil { + return putErr + } 
+ } + return nil + }) + require.NoError(t, err) + + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + // find a page with overflow pages + var ( + pageId uint64 = 2 + elementCount uint16 = 0 + ) + for { + p, _, err := guts_cli.ReadPage(srcPath, pageId) + require.NoError(t, err) + + if p.Overflow() > 0 { + elementCount = p.Count() + break + } + pageId++ + } + t.Logf("The original element count: %d", elementCount) + + // clear elements [startIdx, endIdx) in the page + rootCmd := main.NewRootCommand() + output := filepath.Join(t.TempDir(), "db") + rootCmd.SetArgs([]string{ + "surgery", "clear-page-elements", srcPath, + "--output", output, + "--pageId", fmt.Sprintf("%d", pageId), + "--from-index", fmt.Sprintf("%d", startIdx), + "--to-index", fmt.Sprintf("%d", endIdx), + }) + err = rootCmd.Execute() + require.NoError(t, err) + + // check the element count again + expectedCnt := 0 + if endIdx == -1 { + expectedCnt = startIdx + } else { + expectedCnt = int(elementCount) - (endIdx - startIdx) + } + p, _, err := guts_cli.ReadPage(output, pageId) + require.NoError(t, err) + assert.Equal(t, expectedCnt, int(p.Count())) + + assert.Equal(t, expectedOverflow, int(p.Overflow())) + + compareDataAfterClearingElement(t, srcPath, output, pageId, false, startIdx, endIdx) +} + +func TestSurgeryRequiredFlags(t *testing.T) { + errMsgFmt := `required flag(s) "%s" not set` + testCases := []struct { + name string + args []string + expectedErrMsg string + }{ + // --output is required for all surgery commands + { + name: "no output flag for revert-meta-page", + args: []string{"surgery", "revert-meta-page", "db"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "output"), + }, + { + name: "no output flag for copy-page", + args: []string{"surgery", "copy-page", "db", "--from-page", "3", "--to-page", "2"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "output"), + }, + { + name: "no output flag for clear-page", + args: []string{"surgery", "clear-page", "db", "--pageId", "3"}, + expectedErrMsg: 
fmt.Sprintf(errMsgFmt, "output"), + }, + { + name: "no output flag for clear-page-element", + args: []string{"surgery", "clear-page-elements", "db", "--pageId", "4", "--from-index", "3", "--to-index", "5"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "output"), + }, + { + name: "no output flag for freelist abandon", + args: []string{"surgery", "freelist", "abandon", "db"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "output"), + }, + { + name: "no output flag for freelist rebuild", + args: []string{"surgery", "freelist", "rebuild", "db"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "output"), + }, + // --from-page and --to-page are required for 'surgery copy-page' command + { + name: "no from-page flag for copy-page", + args: []string{"surgery", "copy-page", "db", "--output", "db", "--to-page", "2"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "from-page"), + }, + { + name: "no to-page flag for copy-page", + args: []string{"surgery", "copy-page", "db", "--output", "db", "--from-page", "2"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "to-page"), + }, + // --pageId is required for 'surgery clear-page' command + { + name: "no pageId flag for clear-page", + args: []string{"surgery", "clear-page", "db", "--output", "db"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "pageId"), + }, + // --pageId, --from-index and --to-index are required for 'surgery clear-page-element' command + { + name: "no pageId flag for clear-page-element", + args: []string{"surgery", "clear-page-elements", "db", "--output", "newdb", "--from-index", "3", "--to-index", "5"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "pageId"), + }, + { + name: "no from-index flag for clear-page-element", + args: []string{"surgery", "clear-page-elements", "db", "--output", "newdb", "--pageId", "2", "--to-index", "5"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "from-index"), + }, + { + name: "no to-index flag for clear-page-element", + args: []string{"surgery", "clear-page-elements", "db", "--output", "newdb", "--pageId", "2", 
"--from-index", "3"}, + expectedErrMsg: fmt.Sprintf(errMsgFmt, "to-index"), + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + rootCmd := main.NewRootCommand() + rootCmd.SetArgs(tc.args) + err := rootCmd.Execute() + require.ErrorContains(t, err, tc.expectedErrMsg) + }) + } +} diff --git a/cmd/bbolt/command_version.go b/cmd/bbolt/command_version.go new file mode 100644 index 0000000..654e478 --- /dev/null +++ b/cmd/bbolt/command_version.go @@ -0,0 +1,25 @@ +package main + +import ( + "fmt" + "runtime" + + "github.com/spf13/cobra" + + "github.com/tutus-one/tutus-bolt/version" +) + +func newVersionCommand() *cobra.Command { + versionCmd := &cobra.Command{ + Use: "version", + Short: "print the current version of bbolt", + Long: "print the current version of bbolt", + Run: func(cmd *cobra.Command, args []string) { + fmt.Printf("bbolt Version: %s\n", version.Version) + fmt.Printf("Go Version: %s\n", runtime.Version()) + fmt.Printf("Go OS/Arch: %s/%s\n", runtime.GOOS, runtime.GOARCH) + }, + } + + return versionCmd +} diff --git a/cmd/bbolt/main.go b/cmd/bbolt/main.go new file mode 100644 index 0000000..941c2e2 --- /dev/null +++ b/cmd/bbolt/main.go @@ -0,0 +1,1795 @@ +package main + +import ( + "bytes" + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "errors" + "flag" + "fmt" + "io" + "math/rand" + "os" + "runtime" + "runtime/pprof" + "strconv" + "strings" + "sync/atomic" + "testing" + "time" + "unicode" + "unicode/utf8" + + bolt "github.com/tutus-one/tutus-bolt" + berrors "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +var ( + // ErrUsage is returned when a usage message was printed and the process + // should simply exit with an error. + ErrUsage = errors.New("usage") + + // ErrUnknownCommand is returned when a CLI command is not specified. 
+ ErrUnknownCommand = errors.New("unknown command") + + // ErrPathRequired is returned when the path to a Bolt database is not specified. + ErrPathRequired = errors.New("path required") + + // ErrFileNotFound is returned when a Bolt database does not exist. + ErrFileNotFound = errors.New("file not found") + + // ErrInvalidValue is returned when a benchmark reads an unexpected value. + ErrInvalidValue = errors.New("invalid value") + + // ErrNonDivisibleBatchSize is returned when the batch size can't be evenly + // divided by the iteration count. + ErrNonDivisibleBatchSize = errors.New("number of iterations must be divisible by the batch size") + + // ErrPageIDRequired is returned when a required page id is not specified. + ErrPageIDRequired = errors.New("page id required") + + // ErrBucketRequired is returned when a bucket is not specified. + ErrBucketRequired = errors.New("bucket required") + + // ErrKeyNotFound is returned when a key is not found. + ErrKeyNotFound = errors.New("key not found") + + // ErrNotEnoughArgs is returned with a cmd is being executed with fewer arguments. + ErrNotEnoughArgs = errors.New("not enough arguments") +) + +func main() { + m := NewMain() + if err := m.Run(os.Args[1:]...); err == ErrUsage { + os.Exit(2) + } else if err == ErrUnknownCommand { + cobraExecute() + } else if err != nil { + fmt.Println(err.Error()) + os.Exit(1) + } +} + +func cobraExecute() { + rootCmd := NewRootCommand() + if err := rootCmd.Execute(); err != nil { + if rootCmd.SilenceErrors { + fmt.Fprintln(os.Stderr, "Error:", err) + os.Exit(1) + } else { + os.Exit(1) + } + } +} + +type baseCommand struct { + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer +} + +// Main represents the main program execution. +type Main struct { + baseCommand +} + +// NewMain returns a new instance of Main connect to the standard input/output. 
+func NewMain() *Main { + return &Main{ + baseCommand: baseCommand{ + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, + }, + } +} + +// Run executes the program. +func (m *Main) Run(args ...string) error { + // Require a command at the beginning. + if len(args) == 0 || strings.HasPrefix(args[0], "-") { + fmt.Fprintln(m.Stderr, m.Usage()) + return ErrUsage + } + + // Execute command. + switch args[0] { + case "help": + fmt.Fprintln(m.Stderr, m.Usage()) + return ErrUsage + case "bench": + return newBenchCommand(m).Run(args[1:]...) + case "buckets": + return newBucketsCommand(m).Run(args[1:]...) + case "compact": + return newCompactCommand(m).Run(args[1:]...) + case "dump": + return newDumpCommand(m).Run(args[1:]...) + case "page-item": + return newPageItemCommand(m).Run(args[1:]...) + case "get": + return newGetCommand(m).Run(args[1:]...) + case "info": + return newInfoCommand(m).Run(args[1:]...) + case "keys": + return newKeysCommand(m).Run(args[1:]...) + case "page": + return newPageCommand(m).Run(args[1:]...) + case "pages": + return newPagesCommand(m).Run(args[1:]...) + case "stats": + return newStatsCommand(m).Run(args[1:]...) + default: + return ErrUnknownCommand + } +} + +// Usage returns the help message. +func (m *Main) Usage() string { + return strings.TrimLeft(` +Bbolt is a tool for inspecting bbolt databases. + +Usage: + + bbolt command [arguments] + +The commands are: + + version print the current version of bbolt + bench run synthetic benchmark against bbolt + buckets print a list of buckets + check verifies integrity of bbolt database + compact copies a bbolt database, compacting it in the process + dump print a hexadecimal dump of a single page + get print the value of a key in a bucket + info print basic info + keys print a list of keys in a bucket + help print this screen + page print one or more pages in human readable format + pages print list of pages with their types + page-item print the key and value of a page item. 
+ stats iterate over all pages and generate usage stats + inspect inspect the structure of the database + surgery perform surgery on bbolt database + +Use "bbolt [command] -h" for more information about a command. +`, "\n") +} + +// infoCommand represents the "info" command execution. +type infoCommand struct { + baseCommand +} + +// newInfoCommand returns a infoCommand. +func newInfoCommand(m *Main) *infoCommand { + c := &infoCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *infoCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open the database. + db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + // Print basic database info. + info := db.Info() + fmt.Fprintf(cmd.Stdout, "Page Size: %d\n", info.PageSize) + + return nil +} + +// Usage returns the help message. +func (cmd *infoCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt info PATH + +Info prints basic information about the Bolt database at PATH. +`, "\n") +} + +// dumpCommand represents the "dump" command execution. +type dumpCommand struct { + baseCommand +} + +// newDumpCommand returns a dumpCommand. +func newDumpCommand(m *Main) *dumpCommand { + c := &dumpCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *dumpCommand) Run(args ...string) error { + // Parse flags. 
+ fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path and page id. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Read page ids. + pageIDs, err := stringToPages(fs.Args()[1:]) + if err != nil { + return err + } else if len(pageIDs) == 0 { + return ErrPageIDRequired + } + + // Open database to retrieve page size. + pageSize, _, err := guts_cli.ReadPageAndHWMSize(path) + if err != nil { + return err + } + + // Open database file handler. + f, err := os.Open(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + + // Print each page listed. + for i, pageID := range pageIDs { + // Print a separator. + if i > 0 { + fmt.Fprintln(cmd.Stdout, "===============================================") + } + + // Print page to stdout. + if err := cmd.PrintPage(cmd.Stdout, f, pageID, uint64(pageSize)); err != nil { + return err + } + } + + return nil +} + +// PrintPage prints a given page as hexadecimal. +func (cmd *dumpCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID uint64, pageSize uint64) error { + const bytesPerLineN = 16 + + // Read page into buffer. + buf := make([]byte, pageSize) + addr := pageID * uint64(pageSize) + if n, err := r.ReadAt(buf, int64(addr)); err != nil { + return err + } else if uint64(n) != pageSize { + return io.ErrUnexpectedEOF + } + + // Write out to writer in 16-byte lines. + var prev []byte + var skipped bool + for offset := uint64(0); offset < pageSize; offset += bytesPerLineN { + // Retrieve current 16-byte line. + line := buf[offset : offset+bytesPerLineN] + isLastLine := (offset == (pageSize - bytesPerLineN)) + + // If it's the same as the previous line then print a skip. 
+ if bytes.Equal(line, prev) && !isLastLine { + if !skipped { + fmt.Fprintf(w, "%07x *\n", addr+offset) + skipped = true + } + } else { + // Print line as hexadecimal in 2-byte groups. + fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset, + line[0:2], line[2:4], line[4:6], line[6:8], + line[8:10], line[10:12], line[12:14], line[14:16], + ) + + skipped = false + } + + // Save the previous line. + prev = line + } + fmt.Fprint(w, "\n") + + return nil +} + +// Usage returns the help message. +func (cmd *dumpCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt dump PATH pageid [pageid...] + +Dump prints a hexadecimal dump of one or more pages. +`, "\n") +} + +// pageItemCommand represents the "page-item" command execution. +type pageItemCommand struct { + baseCommand +} + +// newPageItemCommand returns a pageItemCommand. +func newPageItemCommand(m *Main) *pageItemCommand { + c := &pageItemCommand{} + c.baseCommand = m.baseCommand + return c +} + +type pageItemOptions struct { + help bool + keyOnly bool + valueOnly bool + format string +} + +// Run executes the command. +func (cmd *pageItemCommand) Run(args ...string) error { + // Parse flags. + options := &pageItemOptions{} + fs := flag.NewFlagSet("", flag.ContinueOnError) + fs.BoolVar(&options.keyOnly, "key-only", false, "Print only the key") + fs.BoolVar(&options.valueOnly, "value-only", false, "Print only the value") + fs.StringVar(&options.format, "format", "auto", "Output format. One of: "+FORMAT_MODES) + fs.BoolVar(&options.help, "h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if options.help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + if options.keyOnly && options.valueOnly { + return errors.New("The --key-only or --value-only flag may be set, but not both.") + } + + // Require database path and page id. 
+ path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Read page id. + pageID, err := strconv.ParseUint(fs.Arg(1), 10, 64) + if err != nil { + return err + } + + // Read item id. + itemID, err := strconv.ParseUint(fs.Arg(2), 10, 64) + if err != nil { + return err + } + + // Open database file handler. + f, err := os.Open(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + + // Retrieve page info and page size. + _, buf, err := guts_cli.ReadPage(path, pageID) + if err != nil { + return err + } + + if !options.valueOnly { + err := cmd.PrintLeafItemKey(cmd.Stdout, buf, uint16(itemID), options.format) + if err != nil { + return err + } + } + if !options.keyOnly { + err := cmd.PrintLeafItemValue(cmd.Stdout, buf, uint16(itemID), options.format) + if err != nil { + return err + } + } + return nil +} + +func (cmd *pageItemCommand) leafPageElement(pageBytes []byte, index uint16) ([]byte, []byte, error) { + p := common.LoadPage(pageBytes) + if index >= p.Count() { + return nil, nil, fmt.Errorf("leafPageElement: expected item index less than %d, but got %d", p.Count(), index) + } + if p.Typ() != "leaf" { + return nil, nil, fmt.Errorf("leafPageElement: expected page type of 'leaf', but got '%s'", p.Typ()) + } + + e := p.LeafPageElement(index) + return e.Key(), e.Value(), nil +} + +const FORMAT_MODES = "auto|ascii-encoded|hex|bytes|redacted" + +// formatBytes converts bytes into string according to format. +// Supported formats: ascii-encoded, hex, bytes. 
+func formatBytes(b []byte, format string) (string, error) { + switch format { + case "ascii-encoded": + return fmt.Sprintf("%q", b), nil + case "hex": + return fmt.Sprintf("%x", b), nil + case "bytes": + return string(b), nil + case "auto": + return bytesToAsciiOrHex(b), nil + case "redacted": + hash := sha256.New() + hash.Write(b) + return fmt.Sprintf("", len(b), hash.Sum(nil)), nil + default: + return "", fmt.Errorf("formatBytes: unsupported format: %s", format) + } +} + +func parseBytes(str string, format string) ([]byte, error) { + switch format { + case "ascii-encoded": + return []byte(str), nil + case "hex": + return hex.DecodeString(str) + default: + return nil, fmt.Errorf("parseBytes: unsupported format: %s", format) + } +} + +// writelnBytes writes the byte to the writer. Supported formats: ascii-encoded, hex, bytes, auto, redacted. +// Terminates the write with a new line symbol; +func writelnBytes(w io.Writer, b []byte, format string) error { + str, err := formatBytes(b, format) + if err != nil { + return err + } + _, err = fmt.Fprintln(w, str) + return err +} + +// PrintLeafItemKey writes the bytes of a leaf element's key. +func (cmd *pageItemCommand) PrintLeafItemKey(w io.Writer, pageBytes []byte, index uint16, format string) error { + k, _, err := cmd.leafPageElement(pageBytes, index) + if err != nil { + return err + } + + return writelnBytes(w, k, format) +} + +// PrintLeafItemValue writes the bytes of a leaf element's value. +func (cmd *pageItemCommand) PrintLeafItemValue(w io.Writer, pageBytes []byte, index uint16, format string) error { + _, v, err := cmd.leafPageElement(pageBytes, index) + if err != nil { + return err + } + return writelnBytes(w, v, format) +} + +// Usage returns the help message. 
+func (cmd *pageItemCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt page-item [options] PATH pageid itemid + +Additional options include: + + --key-only + Print only the key + --value-only + Print only the value + --format + Output format. One of: `+FORMAT_MODES+` (default=auto) + +page-item prints a page item key and value. +`, "\n") +} + +// pagesCommand represents the "pages" command execution. +type pagesCommand struct { + baseCommand +} + +// newPagesCommand returns a pagesCommand. +func newPagesCommand(m *Main) *pagesCommand { + c := &pagesCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *pagesCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open database. + db, err := bolt.Open(path, 0600, &bolt.Options{ + ReadOnly: true, + PreLoadFreelist: true, + }) + if err != nil { + return err + } + defer func() { _ = db.Close() }() + + // Write header. + fmt.Fprintln(cmd.Stdout, "ID TYPE ITEMS OVRFLW") + fmt.Fprintln(cmd.Stdout, "======== ========== ====== ======") + + return db.View(func(tx *bolt.Tx) error { + var id int + for { + p, err := tx.Page(id) + if err != nil { + return &PageError{ID: id, Err: err} + } else if p == nil { + break + } + + // Only display count and overflow if this is a non-free page. + var count, overflow string + if p.Type != "free" { + count = strconv.Itoa(p.Count) + if p.OverflowCount > 0 { + overflow = strconv.Itoa(p.OverflowCount) + } + } + + // Print table row. 
+ fmt.Fprintf(cmd.Stdout, "%-8d %-10s %-6s %-6s\n", p.ID, p.Type, count, overflow) + + // Move to the next non-overflow page. + id += 1 + if p.Type != "free" { + id += p.OverflowCount + } + } + return nil + }) +} + +// Usage returns the help message. +func (cmd *pagesCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt pages PATH + +Pages prints a table of pages with their type (meta, leaf, branch, freelist). +Leaf and branch pages will show a key count in the "items" column while the +freelist will show the number of free pages in the "items" column. + +The "overflow" column shows the number of blocks that the page spills over +into. Normally there is no overflow but large keys and values can cause +a single page to take up multiple blocks. +`, "\n") +} + +// statsCommand represents the "stats" command execution. +type statsCommand struct { + baseCommand +} + +// newStatsCommand returns a statsCommand. +func newStatsCommand(m *Main) *statsCommand { + c := &statsCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *statsCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path, prefix := fs.Arg(0), fs.Arg(1) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open database. 
+ db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + return db.View(func(tx *bolt.Tx) error { + var s bolt.BucketStats + var count int + if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error { + if bytes.HasPrefix(name, []byte(prefix)) { + s.Add(b.Stats()) + count += 1 + } + return nil + }); err != nil { + return err + } + + fmt.Fprintf(cmd.Stdout, "Aggregate statistics for %d buckets\n\n", count) + + fmt.Fprintln(cmd.Stdout, "Page count statistics") + fmt.Fprintf(cmd.Stdout, "\tNumber of logical branch pages: %d\n", s.BranchPageN) + fmt.Fprintf(cmd.Stdout, "\tNumber of physical branch overflow pages: %d\n", s.BranchOverflowN) + fmt.Fprintf(cmd.Stdout, "\tNumber of logical leaf pages: %d\n", s.LeafPageN) + fmt.Fprintf(cmd.Stdout, "\tNumber of physical leaf overflow pages: %d\n", s.LeafOverflowN) + + fmt.Fprintln(cmd.Stdout, "Tree statistics") + fmt.Fprintf(cmd.Stdout, "\tNumber of keys/value pairs: %d\n", s.KeyN) + fmt.Fprintf(cmd.Stdout, "\tNumber of levels in B+tree: %d\n", s.Depth) + + fmt.Fprintln(cmd.Stdout, "Page size utilization") + fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical branch pages: %d\n", s.BranchAlloc) + var percentage int + if s.BranchAlloc != 0 { + percentage = int(float32(s.BranchInuse) * 100.0 / float32(s.BranchAlloc)) + } + fmt.Fprintf(cmd.Stdout, "\tBytes actually used for branch data: %d (%d%%)\n", s.BranchInuse, percentage) + fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical leaf pages: %d\n", s.LeafAlloc) + percentage = 0 + if s.LeafAlloc != 0 { + percentage = int(float32(s.LeafInuse) * 100.0 / float32(s.LeafAlloc)) + } + fmt.Fprintf(cmd.Stdout, "\tBytes actually used for leaf data: %d (%d%%)\n", s.LeafInuse, percentage) + + fmt.Fprintln(cmd.Stdout, "Bucket statistics") + fmt.Fprintf(cmd.Stdout, "\tTotal number of buckets: %d\n", s.BucketN) + percentage = 0 + if s.BucketN != 0 { + percentage = int(float32(s.InlineBucketN) * 100.0 / 
float32(s.BucketN)) + } + fmt.Fprintf(cmd.Stdout, "\tTotal number on inlined buckets: %d (%d%%)\n", s.InlineBucketN, percentage) + percentage = 0 + if s.LeafInuse != 0 { + percentage = int(float32(s.InlineBucketInuse) * 100.0 / float32(s.LeafInuse)) + } + fmt.Fprintf(cmd.Stdout, "\tBytes used for inlined buckets: %d (%d%%)\n", s.InlineBucketInuse, percentage) + + return nil + }) +} + +// Usage returns the help message. +func (cmd *statsCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt stats PATH + +Stats performs an extensive search of the database to track every page +reference. It starts at the current meta page and recursively iterates +through every accessible bucket. + +The following errors can be reported: + + already freed + The page is referenced more than once in the freelist. + + unreachable unfreed + The page is not referenced by a bucket or in the freelist. + + reachable freed + The page is referenced by a bucket but is also in the freelist. + + out of bounds + A page is referenced that is above the high water mark. + + multiple references + A page is referenced by more than one other page. + + invalid type + The page type is not "meta", "leaf", "branch", or "freelist". + +No errors should occur in your database. However, if for some reason you +experience corruption, please submit a ticket to the etcd-io/bbolt project page: + + https://github.com/etcd-io/bbolt/issues +`, "\n") +} + +// bucketsCommand represents the "buckets" command execution. +type bucketsCommand struct { + baseCommand +} + +// newBucketsCommand returns a bucketsCommand. +func newBucketsCommand(m *Main) *bucketsCommand { + c := &bucketsCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *bucketsCommand) Run(args ...string) error { + // Parse flags. 
+ fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open database. + db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + // Print buckets. + return db.View(func(tx *bolt.Tx) error { + return tx.ForEach(func(name []byte, _ *bolt.Bucket) error { + fmt.Fprintln(cmd.Stdout, string(name)) + return nil + }) + }) +} + +// Usage returns the help message. +func (cmd *bucketsCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt buckets PATH + +Print a list of buckets. +`, "\n") +} + +// keysCommand represents the "keys" command execution. +type keysCommand struct { + baseCommand +} + +// newKeysCommand returns a keysCommand. +func newKeysCommand(m *Main) *keysCommand { + c := &keysCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *keysCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + optionsFormat := fs.String("format", "auto", "Output format. One of: "+FORMAT_MODES+" (default: auto)") + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path and bucket. + relevantArgs := fs.Args() + if len(relevantArgs) < 2 { + return ErrNotEnoughArgs + } + path, buckets := relevantArgs[0], relevantArgs[1:] + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } else if len(buckets) == 0 { + return ErrBucketRequired + } + + // Open database. 
+ db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + // Print keys. + return db.View(func(tx *bolt.Tx) error { + // Find bucket. + lastBucket, err := findLastBucket(tx, buckets) + if err != nil { + return err + } + + // Iterate over each key. + return lastBucket.ForEach(func(key, _ []byte) error { + return writelnBytes(cmd.Stdout, key, *optionsFormat) + }) + }) +} + +// Usage returns the help message. +// TODO: Use https://pkg.go.dev/flag#FlagSet.PrintDefaults to print supported flags. +func (cmd *keysCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt keys PATH [BUCKET...] + +Print a list of keys in the given (sub)bucket. +======= + +Additional options include: + + --format + Output format. One of: `+FORMAT_MODES+` (default=auto) + +Print a list of keys in the given bucket. +`, "\n") +} + +// getCommand represents the "get" command execution. +type getCommand struct { + baseCommand +} + +// newGetCommand returns a getCommand. +func newGetCommand(m *Main) *getCommand { + c := &getCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *getCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + var parseFormat string + var format string + fs.StringVar(&parseFormat, "parse-format", "ascii-encoded", "Input format. One of: ascii-encoded|hex (default: ascii-encoded)") + fs.StringVar(&format, "format", "auto", "Output format. One of: "+FORMAT_MODES+" (default: auto)") + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path, bucket and key. 
+ relevantArgs := fs.Args() + if len(relevantArgs) < 3 { + return ErrNotEnoughArgs + } + path, buckets := relevantArgs[0], relevantArgs[1:len(relevantArgs)-1] + key, err := parseBytes(relevantArgs[len(relevantArgs)-1], parseFormat) + if err != nil { + return err + } + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } else if len(buckets) == 0 { + return ErrBucketRequired + } else if len(key) == 0 { + return berrors.ErrKeyRequired + } + + // Open database. + db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer db.Close() + + // Print value. + return db.View(func(tx *bolt.Tx) error { + // Find bucket. + lastBucket, err := findLastBucket(tx, buckets) + if err != nil { + return err + } + + // Find value for given key. + val := lastBucket.Get(key) + if val == nil { + return fmt.Errorf("Error %w for key: %q hex: \"%x\"", ErrKeyNotFound, key, string(key)) + } + + // TODO: In this particular case, it would be better to not terminate with '\n' + return writelnBytes(cmd.Stdout, val, format) + }) +} + +// Usage returns the help message. +func (cmd *getCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt get PATH [BUCKET..] KEY + +Print the value of the given key in the given (sub)bucket. + +Additional options include: + + --format + Output format. One of: `+FORMAT_MODES+` (default=auto) + --parse-format + Input format (of key). One of: ascii-encoded|hex (default=ascii-encoded)" +`, "\n") +} + +var benchBucketName = []byte("bench") + +// benchCommand represents the "bench" command execution. +type benchCommand struct { + baseCommand +} + +// newBenchCommand returns a BenchCommand using the +func newBenchCommand(m *Main) *benchCommand { + c := &benchCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the "bench" command. +func (cmd *benchCommand) Run(args ...string) error { + // Parse CLI arguments. 
+ options, err := cmd.ParseFlags(args) + if err != nil { + return err + } + + // Remove path if "-work" is not set. Otherwise keep path. + if options.Work { + fmt.Fprintf(cmd.Stderr, "work: %s\n", options.Path) + } else { + defer os.Remove(options.Path) + } + + // Create database. + db, err := bolt.Open(options.Path, 0600, nil) + if err != nil { + return err + } + db.NoSync = options.NoSync + defer db.Close() + + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + // Write to the database. + var writeResults BenchResults + + fmt.Fprintf(cmd.Stderr, "starting write benchmark.\n") + keys, err := cmd.runWrites(db, options, &writeResults, r) + if err != nil { + return fmt.Errorf("write: %v", err) + } + + if keys != nil { + r.Shuffle(len(keys), func(i, j int) { + keys[i], keys[j] = keys[j], keys[i] + }) + } + + var readResults BenchResults + fmt.Fprintf(cmd.Stderr, "starting read benchmark.\n") + // Read from the database. + if err := cmd.runReads(db, options, &readResults, keys); err != nil { + return fmt.Errorf("bench: read: %s", err) + } + + // Print results. + if options.GoBenchOutput { + // below replicates the output of testing.B benchmarks, e.g. 
for external tooling + benchWriteName := "BenchmarkWrite" + benchReadName := "BenchmarkRead" + maxLen := max(len(benchReadName), len(benchWriteName)) + printGoBenchResult(cmd.Stdout, writeResults, maxLen, benchWriteName) + printGoBenchResult(cmd.Stdout, readResults, maxLen, benchReadName) + } else { + fmt.Fprintf(cmd.Stdout, "# Write\t%v(ops)\t%v\t(%v/op)\t(%v op/sec)\n", writeResults.CompletedOps(), writeResults.Duration(), writeResults.OpDuration(), writeResults.OpsPerSecond()) + fmt.Fprintf(cmd.Stdout, "# Read\t%v(ops)\t%v\t(%v/op)\t(%v op/sec)\n", readResults.CompletedOps(), readResults.Duration(), readResults.OpDuration(), readResults.OpsPerSecond()) + } + fmt.Fprintln(cmd.Stderr, "") + + return nil +} + +func printGoBenchResult(w io.Writer, r BenchResults, maxLen int, benchName string) { + gobenchResult := testing.BenchmarkResult{} + gobenchResult.T = r.Duration() + gobenchResult.N = int(r.CompletedOps()) + fmt.Fprintf(w, "%-*s\t%s\n", maxLen, benchName, gobenchResult.String()) +} + +// ParseFlags parses the command line flags. +func (cmd *benchCommand) ParseFlags(args []string) (*BenchOptions, error) { + var options BenchOptions + + // Parse flagset. 
+ fs := flag.NewFlagSet("", flag.ContinueOnError) + fs.StringVar(&options.ProfileMode, "profile-mode", "rw", "") + fs.StringVar(&options.WriteMode, "write-mode", "seq", "") + fs.StringVar(&options.ReadMode, "read-mode", "seq", "") + fs.Int64Var(&options.Iterations, "count", 1000, "") + fs.Int64Var(&options.BatchSize, "batch-size", 0, "") + fs.IntVar(&options.KeySize, "key-size", 8, "") + fs.IntVar(&options.ValueSize, "value-size", 32, "") + fs.StringVar(&options.CPUProfile, "cpuprofile", "", "") + fs.StringVar(&options.MemProfile, "memprofile", "", "") + fs.StringVar(&options.BlockProfile, "blockprofile", "", "") + fs.Float64Var(&options.FillPercent, "fill-percent", bolt.DefaultFillPercent, "") + fs.BoolVar(&options.NoSync, "no-sync", false, "") + fs.BoolVar(&options.Work, "work", false, "") + fs.StringVar(&options.Path, "path", "", "") + fs.BoolVar(&options.GoBenchOutput, "gobench-output", false, "") + fs.SetOutput(cmd.Stderr) + if err := fs.Parse(args); err != nil { + return nil, err + } + + // Set batch size to iteration size if not set. + // Require that batch size can be evenly divided by the iteration count. + if options.BatchSize == 0 { + options.BatchSize = options.Iterations + } else if options.Iterations%options.BatchSize != 0 { + return nil, ErrNonDivisibleBatchSize + } + + // Generate temp path if one is not passed in. + if options.Path == "" { + f, err := os.CreateTemp("", "bolt-bench-") + if err != nil { + return nil, fmt.Errorf("temp file: %s", err) + } + f.Close() + os.Remove(f.Name()) + options.Path = f.Name() + } + + return &options, nil +} + +// Writes to the database. +func (cmd *benchCommand) runWrites(db *bolt.DB, options *BenchOptions, results *BenchResults, r *rand.Rand) ([]nestedKey, error) { + // Start profiling for writes. 
+ if options.ProfileMode == "rw" || options.ProfileMode == "w" { + cmd.startProfiling(options) + } + + finishChan := make(chan interface{}) + go checkProgress(results, finishChan, cmd.Stderr) + defer close(finishChan) + + t := time.Now() + + var keys []nestedKey + var err error + switch options.WriteMode { + case "seq": + keys, err = cmd.runWritesSequential(db, options, results) + case "rnd": + keys, err = cmd.runWritesRandom(db, options, results, r) + case "seq-nest": + keys, err = cmd.runWritesSequentialNested(db, options, results) + case "rnd-nest": + keys, err = cmd.runWritesRandomNested(db, options, results, r) + default: + return nil, fmt.Errorf("invalid write mode: %s", options.WriteMode) + } + + // Save time to write. + results.SetDuration(time.Since(t)) + + // Stop profiling for writes only. + if options.ProfileMode == "w" { + cmd.stopProfiling() + } + + return keys, err +} + +func (cmd *benchCommand) runWritesSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) ([]nestedKey, error) { + var i = uint32(0) + return cmd.runWritesWithSource(db, options, results, func() uint32 { i++; return i }) +} + +func (cmd *benchCommand) runWritesRandom(db *bolt.DB, options *BenchOptions, results *BenchResults, r *rand.Rand) ([]nestedKey, error) { + return cmd.runWritesWithSource(db, options, results, func() uint32 { return r.Uint32() }) +} + +func (cmd *benchCommand) runWritesSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) ([]nestedKey, error) { + var i = uint32(0) + return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { i++; return i }) +} + +func (cmd *benchCommand) runWritesRandomNested(db *bolt.DB, options *BenchOptions, results *BenchResults, r *rand.Rand) ([]nestedKey, error) { + return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { return r.Uint32() }) +} + +func (cmd *benchCommand) runWritesWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() 
uint32) ([]nestedKey, error) { + var keys []nestedKey + if options.ReadMode == "rnd" { + keys = make([]nestedKey, 0, options.Iterations) + } + + for i := int64(0); i < options.Iterations; i += options.BatchSize { + if err := db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists(benchBucketName) + b.FillPercent = options.FillPercent + + fmt.Fprintf(cmd.Stderr, "Starting write iteration %d\n", i) + for j := int64(0); j < options.BatchSize; j++ { + key := make([]byte, options.KeySize) + value := make([]byte, options.ValueSize) + + // Write key as uint32. + binary.BigEndian.PutUint32(key, keySource()) + + // Insert key/value. + if err := b.Put(key, value); err != nil { + return err + } + if keys != nil { + keys = append(keys, nestedKey{nil, key}) + } + results.AddCompletedOps(1) + } + fmt.Fprintf(cmd.Stderr, "Finished write iteration %d\n", i) + + return nil + }); err != nil { + return nil, err + } + } + return keys, nil +} + +func (cmd *benchCommand) runWritesNestedWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() uint32) ([]nestedKey, error) { + var keys []nestedKey + if options.ReadMode == "rnd" { + keys = make([]nestedKey, 0, options.Iterations) + } + + for i := int64(0); i < options.Iterations; i += options.BatchSize { + if err := db.Update(func(tx *bolt.Tx) error { + top, err := tx.CreateBucketIfNotExists(benchBucketName) + if err != nil { + return err + } + top.FillPercent = options.FillPercent + + // Create bucket key. + name := make([]byte, options.KeySize) + binary.BigEndian.PutUint32(name, keySource()) + + // Create bucket. + b, err := top.CreateBucketIfNotExists(name) + if err != nil { + return err + } + b.FillPercent = options.FillPercent + + fmt.Fprintf(cmd.Stderr, "Starting write iteration %d\n", i) + for j := int64(0); j < options.BatchSize; j++ { + var key = make([]byte, options.KeySize) + var value = make([]byte, options.ValueSize) + + // Generate key as uint32. 
+ binary.BigEndian.PutUint32(key, keySource()) + + // Insert value into subbucket. + if err := b.Put(key, value); err != nil { + return err + } + if keys != nil { + keys = append(keys, nestedKey{name, key}) + } + results.AddCompletedOps(1) + } + fmt.Fprintf(cmd.Stderr, "Finished write iteration %d\n", i) + + return nil + }); err != nil { + return nil, err + } + } + return keys, nil +} + +// Reads from the database. +func (cmd *benchCommand) runReads(db *bolt.DB, options *BenchOptions, results *BenchResults, keys []nestedKey) error { + // Start profiling for reads. + if options.ProfileMode == "r" { + cmd.startProfiling(options) + } + + finishChan := make(chan interface{}) + go checkProgress(results, finishChan, cmd.Stderr) + defer close(finishChan) + + t := time.Now() + + var err error + switch options.ReadMode { + case "seq": + switch options.WriteMode { + case "seq-nest", "rnd-nest": + err = cmd.runReadsSequentialNested(db, options, results) + default: + err = cmd.runReadsSequential(db, options, results) + } + case "rnd": + switch options.WriteMode { + case "seq-nest", "rnd-nest": + err = cmd.runReadsRandomNested(db, options, keys, results) + default: + err = cmd.runReadsRandom(db, options, keys, results) + } + default: + return fmt.Errorf("invalid read mode: %s", options.ReadMode) + } + + // Save read time. + results.SetDuration(time.Since(t)) + + // Stop profiling for reads. 
+ if options.ProfileMode == "rw" || options.ProfileMode == "r" { + cmd.stopProfiling() + } + + return err +} + +type nestedKey struct{ bucket, key []byte } + +func (cmd *benchCommand) runReadsSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) error { + return db.View(func(tx *bolt.Tx) error { + t := time.Now() + + for { + numReads := int64(0) + err := func() error { + defer func() { results.AddCompletedOps(numReads) }() + + c := tx.Bucket(benchBucketName).Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + numReads++ + if v == nil { + return ErrInvalidValue + } + } + + return nil + }() + + if err != nil { + return err + } + + if options.WriteMode == "seq" && numReads != options.Iterations { + return fmt.Errorf("read seq: iter mismatch: expected %d, got %d", options.Iterations, numReads) + } + + // Make sure we do this for at least a second. + if time.Since(t) >= time.Second { + break + } + } + + return nil + }) +} + +func (cmd *benchCommand) runReadsRandom(db *bolt.DB, options *BenchOptions, keys []nestedKey, results *BenchResults) error { + return db.View(func(tx *bolt.Tx) error { + t := time.Now() + + for { + numReads := int64(0) + err := func() error { + defer func() { results.AddCompletedOps(numReads) }() + + b := tx.Bucket(benchBucketName) + for _, key := range keys { + v := b.Get(key.key) + numReads++ + if v == nil { + return ErrInvalidValue + } + } + + return nil + }() + + if err != nil { + return err + } + + if options.WriteMode == "seq" && numReads != options.Iterations { + return fmt.Errorf("read seq: iter mismatch: expected %d, got %d", options.Iterations, numReads) + } + + // Make sure we do this for at least a second. 
+ if time.Since(t) >= time.Second { + break + } + } + + return nil + }) +} + +func (cmd *benchCommand) runReadsSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error { + return db.View(func(tx *bolt.Tx) error { + t := time.Now() + + for { + numReads := int64(0) + var top = tx.Bucket(benchBucketName) + if err := top.ForEach(func(name, _ []byte) error { + defer func() { results.AddCompletedOps(numReads) }() + if b := top.Bucket(name); b != nil { + c := b.Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + numReads++ + if v == nil { + return ErrInvalidValue + } + } + } + return nil + }); err != nil { + return err + } + + if options.WriteMode == "seq-nest" && numReads != options.Iterations { + return fmt.Errorf("read seq-nest: iter mismatch: expected %d, got %d", options.Iterations, numReads) + } + + // Make sure we do this for at least a second. + if time.Since(t) >= time.Second { + break + } + } + + return nil + }) +} + +func (cmd *benchCommand) runReadsRandomNested(db *bolt.DB, options *BenchOptions, nestedKeys []nestedKey, results *BenchResults) error { + return db.View(func(tx *bolt.Tx) error { + t := time.Now() + + for { + numReads := int64(0) + err := func() error { + defer func() { results.AddCompletedOps(numReads) }() + + var top = tx.Bucket(benchBucketName) + for _, nestedKey := range nestedKeys { + if b := top.Bucket(nestedKey.bucket); b != nil { + v := b.Get(nestedKey.key) + numReads++ + if v == nil { + return ErrInvalidValue + } + } + } + + return nil + }() + + if err != nil { + return err + } + + if options.WriteMode == "seq-nest" && numReads != options.Iterations { + return fmt.Errorf("read seq-nest: iter mismatch: expected %d, got %d", options.Iterations, numReads) + } + + // Make sure we do this for at least a second. 
+ if time.Since(t) >= time.Second { + break + } + } + + return nil + }) +} + +func checkProgress(results *BenchResults, finishChan chan interface{}, stderr io.Writer) { + ticker := time.Tick(time.Second) + lastCompleted, lastTime := int64(0), time.Now() + for { + select { + case <-finishChan: + return + case t := <-ticker: + completed, taken := results.CompletedOps(), t.Sub(lastTime) + fmt.Fprintf(stderr, "Completed %d requests, %d/s \n", + completed, ((completed-lastCompleted)*int64(time.Second))/int64(taken), + ) + lastCompleted, lastTime = completed, t + } + } +} + +// File handlers for the various profiles. +var cpuprofile, memprofile, blockprofile *os.File + +// Starts all profiles set on the options. +func (cmd *benchCommand) startProfiling(options *BenchOptions) { + var err error + + // Start CPU profiling. + if options.CPUProfile != "" { + cpuprofile, err = os.Create(options.CPUProfile) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not create cpu profile %q: %v\n", options.CPUProfile, err) + os.Exit(1) + } + err = pprof.StartCPUProfile(cpuprofile) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not start cpu profile %q: %v\n", options.CPUProfile, err) + os.Exit(1) + } + } + + // Start memory profiling. + if options.MemProfile != "" { + memprofile, err = os.Create(options.MemProfile) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not create memory profile %q: %v\n", options.MemProfile, err) + os.Exit(1) + } + runtime.MemProfileRate = 4096 + } + + // Start fatal profiling. + if options.BlockProfile != "" { + blockprofile, err = os.Create(options.BlockProfile) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not create block profile %q: %v\n", options.BlockProfile, err) + os.Exit(1) + } + runtime.SetBlockProfileRate(1) + } +} + +// Stops all profiles. 
+func (cmd *benchCommand) stopProfiling() { + if cpuprofile != nil { + pprof.StopCPUProfile() + cpuprofile.Close() + cpuprofile = nil + } + + if memprofile != nil { + err := pprof.Lookup("heap").WriteTo(memprofile, 0) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not write mem profile") + } + memprofile.Close() + memprofile = nil + } + + if blockprofile != nil { + err := pprof.Lookup("block").WriteTo(blockprofile, 0) + if err != nil { + fmt.Fprintf(cmd.Stderr, "bench: could not write block profile") + } + blockprofile.Close() + blockprofile = nil + runtime.SetBlockProfileRate(0) + } +} + +// BenchOptions represents the set of options that can be passed to "bolt bench". +type BenchOptions struct { + ProfileMode string + WriteMode string + ReadMode string + Iterations int64 + BatchSize int64 + KeySize int + ValueSize int + CPUProfile string + MemProfile string + BlockProfile string + StatsInterval time.Duration + FillPercent float64 + NoSync bool + Work bool + Path string + GoBenchOutput bool +} + +// BenchResults represents the performance results of the benchmark and is thread-safe. +type BenchResults struct { + completedOps int64 + duration int64 +} + +func (r *BenchResults) AddCompletedOps(amount int64) { + atomic.AddInt64(&r.completedOps, amount) +} + +func (r *BenchResults) CompletedOps() int64 { + return atomic.LoadInt64(&r.completedOps) +} + +func (r *BenchResults) SetDuration(dur time.Duration) { + atomic.StoreInt64(&r.duration, int64(dur)) +} + +func (r *BenchResults) Duration() time.Duration { + return time.Duration(atomic.LoadInt64(&r.duration)) +} + +// Returns the duration for a single read/write operation. +func (r *BenchResults) OpDuration() time.Duration { + if r.CompletedOps() == 0 { + return 0 + } + return r.Duration() / time.Duration(r.CompletedOps()) +} + +// Returns average number of read/write operations that can be performed per second. 
+func (r *BenchResults) OpsPerSecond() int { + var op = r.OpDuration() + if op == 0 { + return 0 + } + return int(time.Second) / int(op) +} + +type PageError struct { + ID int + Err error +} + +func (e *PageError) Error() string { + return fmt.Sprintf("page error: id=%d, err=%s", e.ID, e.Err) +} + +// isPrintable returns true if the string is valid unicode and contains only printable runes. +func isPrintable(s string) bool { + if !utf8.ValidString(s) { + return false + } + for _, ch := range s { + if !unicode.IsPrint(ch) { + return false + } + } + return true +} + +func bytesToAsciiOrHex(b []byte) string { + sb := string(b) + if isPrintable(sb) { + return sb + } else { + return hex.EncodeToString(b) + } +} + +func stringToPage(str string) (uint64, error) { + return strconv.ParseUint(str, 10, 64) +} + +// stringToPages parses a slice of strings into page ids. +func stringToPages(strs []string) ([]uint64, error) { + var a []uint64 + for _, str := range strs { + i, err := stringToPage(str) + if err != nil { + return nil, err + } + a = append(a, i) + } + return a, nil +} + +// compactCommand represents the "compact" command execution. +type compactCommand struct { + baseCommand + + SrcPath string + DstPath string + TxMaxSize int64 + DstNoSync bool +} + +// newCompactCommand returns a CompactCommand. +func newCompactCommand(m *Main) *compactCommand { + c := &compactCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *compactCommand) Run(args ...string) (err error) { + // Parse flags. 
+ fs := flag.NewFlagSet("", flag.ContinueOnError) + fs.SetOutput(io.Discard) + fs.StringVar(&cmd.DstPath, "o", "", "") + fs.Int64Var(&cmd.TxMaxSize, "tx-max-size", 65536, "") + fs.BoolVar(&cmd.DstNoSync, "no-sync", false, "") + if err := fs.Parse(args); err == flag.ErrHelp { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } else if err != nil { + return err + } else if cmd.DstPath == "" { + return errors.New("output file required") + } + + // Require database paths. + cmd.SrcPath = fs.Arg(0) + if cmd.SrcPath == "" { + return ErrPathRequired + } + + // Ensure source file exists. + fi, err := os.Stat(cmd.SrcPath) + if os.IsNotExist(err) { + return ErrFileNotFound + } else if err != nil { + return err + } + initialSize := fi.Size() + + // Open source database. + src, err := bolt.Open(cmd.SrcPath, 0400, &bolt.Options{ReadOnly: true}) + if err != nil { + return err + } + defer src.Close() + + // Open destination database. + dst, err := bolt.Open(cmd.DstPath, fi.Mode(), &bolt.Options{NoSync: cmd.DstNoSync}) + if err != nil { + return err + } + defer dst.Close() + + // Run compaction. + if err := bolt.Compact(dst, src, cmd.TxMaxSize); err != nil { + return err + } + + // Report stats on new size. + fi, err = os.Stat(cmd.DstPath) + if err != nil { + return err + } else if fi.Size() == 0 { + return fmt.Errorf("zero db size") + } + fmt.Fprintf(cmd.Stdout, "%d -> %d bytes (gain=%.2fx)\n", initialSize, fi.Size(), float64(initialSize)/float64(fi.Size())) + + return nil +} + +// Usage returns the help message. +func (cmd *compactCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt compact [options] -o DST SRC + +Compact opens a database at SRC path and walks it recursively, copying keys +as they are found from all buckets, to a newly created database at DST path. + +The original database is left untouched. + +Additional options include: + + -tx-max-size NUM + Specifies the maximum size of individual transactions. + Defaults to 64KB. 
+ + -no-sync BOOL + Skip fsync() calls after each commit (fast but unsafe) + Defaults to false +`, "\n") +} + +type cmdKvStringer struct{} + +func (cmdKvStringer) KeyToString(key []byte) string { + return bytesToAsciiOrHex(key) +} + +func (cmdKvStringer) ValueToString(value []byte) string { + return bytesToAsciiOrHex(value) +} + +func CmdKvStringer() bolt.KVStringer { + return cmdKvStringer{} +} + +func findLastBucket(tx *bolt.Tx, bucketNames []string) (*bolt.Bucket, error) { + lastbucket := tx.Bucket([]byte(bucketNames[0])) + if lastbucket == nil { + return nil, berrors.ErrBucketNotFound + } + for _, bucket := range bucketNames[1:] { + lastbucket = lastbucket.Bucket([]byte(bucket)) + if lastbucket == nil { + return nil, berrors.ErrBucketNotFound + } + } + return lastbucket, nil +} diff --git a/cmd/bbolt/main_test.go b/cmd/bbolt/main_test.go new file mode 100644 index 0000000..df31c66 --- /dev/null +++ b/cmd/bbolt/main_test.go @@ -0,0 +1,754 @@ +package main_test + +import ( + "bytes" + crypto "crypto/rand" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "math/rand" + "os" + "strconv" + "strings" + "sync" + "testing" + + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + main "github.com/tutus-one/tutus-bolt/cmd/bbolt" +) + +// Ensure the "info" command can print information about a database. +func TestInfoCommand_Run(t *testing.T) { + db := btesting.MustCreateDB(t) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // Run the info command. + m := NewMain() + if err := m.Run("info", db.Path()); err != nil { + t.Fatal(err) + } +} + +// Ensure the "stats" command executes correctly with an empty database. 
+func TestStatsCommand_Run_EmptyDatabase(t *testing.T) { + // Ignore + if os.Getpagesize() != 4096 { + t.Skip("system does not use 4KB page size") + } + + db := btesting.MustCreateDB(t) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // Generate expected result. + exp := "Aggregate statistics for 0 buckets\n\n" + + "Page count statistics\n" + + "\tNumber of logical branch pages: 0\n" + + "\tNumber of physical branch overflow pages: 0\n" + + "\tNumber of logical leaf pages: 0\n" + + "\tNumber of physical leaf overflow pages: 0\n" + + "Tree statistics\n" + + "\tNumber of keys/value pairs: 0\n" + + "\tNumber of levels in B+tree: 0\n" + + "Page size utilization\n" + + "\tBytes allocated for physical branch pages: 0\n" + + "\tBytes actually used for branch data: 0 (0%)\n" + + "\tBytes allocated for physical leaf pages: 0\n" + + "\tBytes actually used for leaf data: 0 (0%)\n" + + "Bucket statistics\n" + + "\tTotal number of buckets: 0\n" + + "\tTotal number on inlined buckets: 0 (0%)\n" + + "\tBytes used for inlined buckets: 0 (0%)\n" + + // Run the command. 
+ m := NewMain() + if err := m.Run("stats", db.Path()); err != nil { + t.Fatal(err) + } else if m.Stdout.String() != exp { + t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String()) + } +} + +func TestDumpCommand_Run(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + exp := `0000010 edda 0ced 0200 0000 0010 0000 0000 0000` + + m := NewMain() + err := m.Run("dump", db.Path(), "0") + require.NoError(t, err) + if !strings.Contains(m.Stdout.String(), exp) { + t.Fatalf("unexpected stdout:\n%s\n", m.Stdout.String()) + } +} + +func TestPageCommand_Run(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + exp := "Page ID: 0\n" + + "Page Type: meta\n" + + "Total Size: 4096 bytes\n" + + "Overflow pages: 0\n" + + "Version: 2\n" + + "Page Size: 4096 bytes\n" + + "Flags: 00000000\n" + + "Root: \n" + + "Freelist: \n" + + "HWM: \n" + + "Txn ID: 0\n" + + "Checksum: 07516e114689fdee\n\n" + + m := NewMain() + err := m.Run("page", db.Path(), "0") + require.NoError(t, err) + if m.Stdout.String() != exp { + t.Fatalf("unexpected stdout:\n%s\n%s", m.Stdout.String(), exp) + } +} + +func TestPageItemCommand_Run(t *testing.T) { + testCases := []struct { + name string + printable bool + itemId string + expectedKey string + expectedValue string + }{ + { + name: "printable items", + printable: true, + itemId: "0", + expectedKey: "key_0", + expectedValue: "value_0", + }, + { + name: "non printable items", + printable: false, + itemId: "0", + expectedKey: hex.EncodeToString(convertInt64IntoBytes(0 + 1)), + expectedValue: hex.EncodeToString(convertInt64IntoBytes(0 + 2)), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + srcPath := db.Path() + + t.Log("Insert 
some sample data") + err := db.Update(func(tx *bolt.Tx) error { + b, bErr := tx.CreateBucketIfNotExists([]byte("data")) + if bErr != nil { + return bErr + } + + for i := 0; i < 100; i++ { + if tc.printable { + if bErr = b.Put([]byte(fmt.Sprintf("key_%d", i)), []byte(fmt.Sprintf("value_%d", i))); bErr != nil { + return bErr + } + } else { + k, v := convertInt64IntoBytes(int64(i+1)), convertInt64IntoBytes(int64(i+2)) + if bErr = b.Put(k, v); bErr != nil { + return bErr + } + } + } + return nil + }) + require.NoError(t, err) + defer requireDBNoChange(t, dbData(t, srcPath), srcPath) + + meta := readMetaPage(t, srcPath) + leafPageId := 0 + for i := 2; i < int(meta.Pgid()); i++ { + p, _, err := guts_cli.ReadPage(srcPath, uint64(i)) + require.NoError(t, err) + if p.IsLeafPage() && p.Count() > 1 { + leafPageId = int(p.Id()) + } + } + require.NotEqual(t, 0, leafPageId) + + m := NewMain() + err = m.Run("page-item", db.Path(), fmt.Sprintf("%d", leafPageId), tc.itemId) + require.NoError(t, err) + if !strings.Contains(m.Stdout.String(), tc.expectedKey) || !strings.Contains(m.Stdout.String(), tc.expectedValue) { + t.Fatalf("Unexpected output:\n%s\n", m.Stdout.String()) + } + }) + } +} + +// Ensure the "stats" command can execute correctly. +func TestStatsCommand_Run(t *testing.T) { + // Ignore + if os.Getpagesize() != 4096 { + t.Skip("system does not use 4KB page size") + } + + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + // Create "foo" bucket. + b, err := tx.CreateBucket([]byte("foo")) + if err != nil { + return err + } + for i := 0; i < 10; i++ { + if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil { + return err + } + } + + // Create "bar" bucket. + b, err = tx.CreateBucket([]byte("bar")) + if err != nil { + return err + } + for i := 0; i < 100; i++ { + if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil { + return err + } + } + + // Create "baz" bucket. 
+ b, err = tx.CreateBucket([]byte("baz")) + if err != nil { + return err + } + if err := b.Put([]byte("key"), []byte("value")); err != nil { + return err + } + + return nil + }); err != nil { + t.Fatal(err) + } + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // Generate expected result. + exp := "Aggregate statistics for 3 buckets\n\n" + + "Page count statistics\n" + + "\tNumber of logical branch pages: 0\n" + + "\tNumber of physical branch overflow pages: 0\n" + + "\tNumber of logical leaf pages: 1\n" + + "\tNumber of physical leaf overflow pages: 0\n" + + "Tree statistics\n" + + "\tNumber of keys/value pairs: 111\n" + + "\tNumber of levels in B+tree: 1\n" + + "Page size utilization\n" + + "\tBytes allocated for physical branch pages: 0\n" + + "\tBytes actually used for branch data: 0 (0%)\n" + + "\tBytes allocated for physical leaf pages: 4096\n" + + "\tBytes actually used for leaf data: 1996 (48%)\n" + + "Bucket statistics\n" + + "\tTotal number of buckets: 3\n" + + "\tTotal number on inlined buckets: 2 (66%)\n" + + "\tBytes used for inlined buckets: 236 (11%)\n" + + // Run the command. + m := NewMain() + if err := m.Run("stats", db.Path()); err != nil { + t.Fatal(err) + } else if m.Stdout.String() != exp { + t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String()) + } +} + +// Ensure the "buckets" command can print a list of buckets. +func TestBucketsCommand_Run(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + for _, name := range []string{"foo", "bar", "baz"} { + _, err := tx.CreateBucket([]byte(name)) + if err != nil { + return err + } + } + return nil + }); err != nil { + t.Fatal(err) + } + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + expected := "bar\nbaz\nfoo\n" + + // Run the command. 
+ m := NewMain() + if err := m.Run("buckets", db.Path()); err != nil { + t.Fatal(err) + } else if actual := m.Stdout.String(); actual != expected { + t.Fatalf("unexpected stdout:\n\n%s", actual) + } +} + +// Ensure the "keys" command can print a list of keys for a bucket. +func TestKeysCommand_Run(t *testing.T) { + testCases := []struct { + name string + printable bool + testBucket string + expected string + }{ + { + name: "printable keys", + printable: true, + testBucket: "foo", + expected: "foo-0\nfoo-1\nfoo-2\n", + }, + { + name: "non printable keys", + printable: false, + testBucket: "bar", + expected: convertInt64KeysIntoHexString(100001, 100002, 100003), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Logf("creating test database for subtest '%s'", tc.name) + db := btesting.MustCreateDB(t) + + err := db.Update(func(tx *bolt.Tx) error { + t.Logf("creating test bucket %s", tc.testBucket) + b, bErr := tx.CreateBucketIfNotExists([]byte(tc.testBucket)) + if bErr != nil { + return fmt.Errorf("error creating test bucket %q: %v", tc.testBucket, bErr) + } + + t.Logf("inserting test data into test bucket %s", tc.testBucket) + if tc.printable { + for i := 0; i < 3; i++ { + key := fmt.Sprintf("%s-%d", tc.testBucket, i) + if pErr := b.Put([]byte(key), []byte{0}); pErr != nil { + return pErr + } + } + } else { + for i := 100001; i < 100004; i++ { + k := convertInt64IntoBytes(int64(i)) + if pErr := b.Put(k, []byte{0}); pErr != nil { + return pErr + } + } + } + return nil + }) + require.NoError(t, err) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + t.Log("running Keys cmd") + m := NewMain() + kErr := m.Run("keys", db.Path(), tc.testBucket) + require.NoError(t, kErr) + actual := m.Stdout.String() + assert.Equal(t, tc.expected, actual) + }) + } +} + +// Ensure the "get" command can print the value of a key in a bucket. 
+func TestGetCommand_Run(t *testing.T) { + testCases := []struct { + name string + printable bool + testBucket string + testKey string + expectedValue string + }{ + { + name: "printable data", + printable: true, + testBucket: "foo", + testKey: "foo-1", + expectedValue: "val-foo-1\n", + }, + { + name: "non printable data", + printable: false, + testBucket: "bar", + testKey: "100001", + expectedValue: hex.EncodeToString(convertInt64IntoBytes(100001)) + "\n", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte(tc.testBucket)) + if err != nil { + return err + } + if tc.printable { + val := fmt.Sprintf("val-%s", tc.testKey) + if err := b.Put([]byte(tc.testKey), []byte(val)); err != nil { + return err + } + } else { + if err := b.Put([]byte(tc.testKey), convertInt64IntoBytes(100001)); err != nil { + return err + } + } + return nil + }); err != nil { + t.Fatal(err) + } + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // Run the command. + m := NewMain() + if err := m.Run("get", db.Path(), tc.testBucket, tc.testKey); err != nil { + t.Fatal(err) + } + actual := m.Stdout.String() + assert.Equal(t, tc.expectedValue, actual) + }) + } +} + +// Ensure the "pages" command neither panic, nor change the db file. +func TestPagesCommand_Run(t *testing.T) { + db := btesting.MustCreateDB(t) + + err := db.Update(func(tx *bolt.Tx) error { + for _, name := range []string{"foo", "bar"} { + b, err := tx.CreateBucket([]byte(name)) + if err != nil { + return err + } + for i := 0; i < 3; i++ { + key := fmt.Sprintf("%s-%d", name, i) + val := fmt.Sprintf("val-%s-%d", name, i) + if err := b.Put([]byte(key), []byte(val)); err != nil { + return err + } + } + } + return nil + }) + require.NoError(t, err) + db.Close() + + defer requireDBNoChange(t, dbData(t, db.Path()), db.Path()) + + // Run the command. 
+ m := NewMain() + err = m.Run("pages", db.Path()) + require.NoError(t, err) +} + +// Ensure the "bench" command runs and exits without errors +func TestBenchCommand_Run(t *testing.T) { + tests := map[string]struct { + args []string + }{ + "no-args": {}, + "100k count": {[]string{"-count", "100000"}}, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + // Run the command. + m := NewMain() + args := append([]string{"bench"}, test.args...) + if err := m.Run(args...); err != nil { + t.Fatal(err) + } + + stderr := m.Stderr.String() + stdout := m.Stdout.String() + if !strings.Contains(stderr, "starting write benchmark.") || !strings.Contains(stderr, "starting read benchmark.") { + t.Fatal(fmt.Errorf("benchmark result does not contain read/write start output:\n%s", stderr)) + } + + if strings.Contains(stderr, "iter mismatch") { + t.Fatal(fmt.Errorf("found iter mismatch in stdout:\n%s", stderr)) + } + + if !strings.Contains(stdout, "# Write") || !strings.Contains(stdout, "# Read") { + t.Fatal(fmt.Errorf("benchmark result does not contain read/write output:\n%s", stdout)) + } + }) + } +} + +type ConcurrentBuffer struct { + m sync.Mutex + buf bytes.Buffer +} + +func (b *ConcurrentBuffer) Read(p []byte) (n int, err error) { + b.m.Lock() + defer b.m.Unlock() + + return b.buf.Read(p) +} + +func (b *ConcurrentBuffer) Write(p []byte) (n int, err error) { + b.m.Lock() + defer b.m.Unlock() + + return b.buf.Write(p) +} + +func (b *ConcurrentBuffer) String() string { + b.m.Lock() + defer b.m.Unlock() + + return b.buf.String() +} + +// Main represents a test wrapper for main.Main that records output. +type Main struct { + *main.Main + Stdin ConcurrentBuffer + Stdout ConcurrentBuffer + Stderr ConcurrentBuffer +} + +// NewMain returns a new instance of Main. 
+func NewMain() *Main { + m := &Main{Main: main.NewMain()} + m.Main.Stdin = &m.Stdin + m.Main.Stdout = &m.Stdout + m.Main.Stderr = &m.Stderr + return m +} + +func TestCompactCommand_Run(t *testing.T) { + dstdb := btesting.MustCreateDB(t) + dstdb.Close() + + // fill the db + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + n := 2 + rand.Intn(5) + for i := 0; i < n; i++ { + k := []byte(fmt.Sprintf("b%d", i)) + b, err := tx.CreateBucketIfNotExists(k) + if err != nil { + return err + } + if err := b.SetSequence(uint64(i)); err != nil { + return err + } + if err := fillBucket(b, append(k, '.')); err != nil { + return err + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + // make the db grow by adding large values, and delete them. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("large_vals")) + if err != nil { + return err + } + n := 5 + rand.Intn(5) + for i := 0; i < n; i++ { + v := make([]byte, 1000*1000*(1+rand.Intn(5))) + _, err := crypto.Read(v) + if err != nil { + return err + } + if err := b.Put([]byte(fmt.Sprintf("l%d", i)), v); err != nil { + return err + } + } + return nil + }); err != nil { + t.Fatal(err) + } + if err := db.Update(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("large_vals")).Cursor() + for k, _ := c.First(); k != nil; k, _ = c.Next() { + if err := c.Delete(); err != nil { + return err + } + } + return tx.DeleteBucket([]byte("large_vals")) + }); err != nil { + t.Fatal(err) + } + db.Close() + + dbChk, err := chkdb(db.Path()) + if err != nil { + t.Fatal(err) + } + + m := NewMain() + if err := m.Run("compact", "-o", dstdb.Path(), db.Path()); err != nil { + t.Fatal(err) + } + + dbChkAfterCompact, err := chkdb(db.Path()) + if err != nil { + t.Fatal(err) + } + + dstdbChk, err := chkdb(dstdb.Path()) + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(dbChk, dbChkAfterCompact) { + t.Error("the original db has been touched") + } + if !bytes.Equal(dbChk, 
dstdbChk) { + t.Error("the compacted db data isn't the same than the original db") + } +} + +func TestCommands_Run_NoArgs(t *testing.T) { + testCases := []struct { + name string + cmd string + expErr error + }{ + { + name: "get", + cmd: "get", + expErr: main.ErrNotEnoughArgs, + }, + { + name: "keys", + cmd: "keys", + expErr: main.ErrNotEnoughArgs, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + m := NewMain() + err := m.Run(tc.cmd) + require.ErrorIs(t, err, main.ErrNotEnoughArgs) + }) + } +} + +func fillBucket(b *bolt.Bucket, prefix []byte) error { + n := 10 + rand.Intn(50) + for i := 0; i < n; i++ { + v := make([]byte, 10*(1+rand.Intn(4))) + _, err := crypto.Read(v) + if err != nil { + return err + } + k := append(prefix, []byte(fmt.Sprintf("k%d", i))...) + if err := b.Put(k, v); err != nil { + return err + } + } + // limit depth of subbuckets + s := 2 + rand.Intn(4) + if len(prefix) > (2*s + 1) { + return nil + } + n = 1 + rand.Intn(3) + for i := 0; i < n; i++ { + k := append(prefix, []byte(fmt.Sprintf("b%d", i))...) + sb, err := b.CreateBucket(k) + if err != nil { + return err + } + if err := fillBucket(sb, append(k, '.')); err != nil { + return err + } + } + return nil +} + +func chkdb(path string) ([]byte, error) { + db, err := bolt.Open(path, 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + return nil, err + } + defer db.Close() + var buf bytes.Buffer + err = db.View(func(tx *bolt.Tx) error { + return tx.ForEach(func(name []byte, b *bolt.Bucket) error { + return walkBucket(b, name, nil, &buf) + }) + }) + if err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +func walkBucket(parent *bolt.Bucket, k []byte, v []byte, w io.Writer) error { + if _, err := fmt.Fprintf(w, "%d:%x=%x\n", parent.Sequence(), k, v); err != nil { + return err + } + + // not a bucket, exit. 
+ if v != nil { + return nil + } + return parent.ForEach(func(k, v []byte) error { + if v == nil { + return walkBucket(parent.Bucket(k), k, nil, w) + } + return walkBucket(parent, k, v, w) + }) +} + +func dbData(t *testing.T, filePath string) []byte { + data, err := os.ReadFile(filePath) + require.NoError(t, err) + return data +} + +func requireDBNoChange(t *testing.T, oldData []byte, filePath string) { + newData, err := os.ReadFile(filePath) + require.NoError(t, err) + + noChange := bytes.Equal(oldData, newData) + require.True(t, noChange) +} + +func convertInt64IntoBytes(num int64) []byte { + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutVarint(buf, num) + return buf[:n] +} + +func convertInt64KeysIntoHexString(nums ...int64) string { + var res []string + for _, num := range nums { + res = append(res, hex.EncodeToString(convertInt64IntoBytes(num))) + } + return strings.Join(res, "\n") + "\n" // last newline char +} diff --git a/cmd/bbolt/page_command.go b/cmd/bbolt/page_command.go new file mode 100644 index 0000000..02f2afe --- /dev/null +++ b/cmd/bbolt/page_command.go @@ -0,0 +1,290 @@ +package main + +import ( + "bytes" + "flag" + "fmt" + "io" + "os" + "strings" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +// pageCommand represents the "page" command execution. +type pageCommand struct { + baseCommand +} + +// newPageCommand returns a pageCommand. +func newPageCommand(m *Main) *pageCommand { + c := &pageCommand{} + c.baseCommand = m.baseCommand + return c +} + +// Run executes the command. +func (cmd *pageCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + all := fs.Bool("all", false, "list all pages") + formatValue := fs.String("format-value", "auto", "One of: "+FORMAT_MODES+" . 
Applies to values on the leaf page.") + + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path and page id. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + if !*all { + // Read page ids. + pageIDs, err := stringToPages(fs.Args()[1:]) + if err != nil { + return err + } else if len(pageIDs) == 0 { + return ErrPageIDRequired + } + cmd.printPages(pageIDs, path, formatValue) + } else { + cmd.printAllPages(path, formatValue) + } + return nil +} + +func (cmd *pageCommand) printPages(pageIDs []uint64, path string, formatValue *string) { + // Print each page listed. + for i, pageID := range pageIDs { + // Print a separator. + if i > 0 { + fmt.Fprintln(cmd.Stdout, "===============================================") + } + _, err2 := cmd.printPage(path, pageID, *formatValue) + if err2 != nil { + fmt.Fprintf(cmd.Stdout, "Prining page %d failed: %s. Continuing...\n", pageID, err2) + } + } +} + +func (cmd *pageCommand) printAllPages(path string, formatValue *string) { + _, hwm, err := guts_cli.ReadPageAndHWMSize(path) + if err != nil { + fmt.Fprintf(cmd.Stdout, "cannot read number of pages: %v", err) + } + + // Print each page listed. + for pageID := uint64(0); pageID < uint64(hwm); { + // Print a separator. + if pageID > 0 { + fmt.Fprintln(cmd.Stdout, "===============================================") + } + overflow, err2 := cmd.printPage(path, pageID, *formatValue) + if err2 != nil { + fmt.Fprintf(cmd.Stdout, "Prining page %d failed: %s. Continuing...\n", pageID, err2) + pageID++ + } else { + pageID += uint64(overflow) + 1 + } + } +} + +// printPage prints given page to cmd.Stdout and returns error or number of interpreted pages. 
+func (cmd *pageCommand) printPage(path string, pageID uint64, formatValue string) (numPages uint32, reterr error) { + defer func() { + if err := recover(); err != nil { + reterr = fmt.Errorf("%s", err) + } + }() + + // Retrieve page info and page size. + p, buf, err := guts_cli.ReadPage(path, pageID) + if err != nil { + return 0, err + } + + // Print basic page info. + fmt.Fprintf(cmd.Stdout, "Page ID: %d\n", p.Id()) + fmt.Fprintf(cmd.Stdout, "Page Type: %s\n", p.Typ()) + fmt.Fprintf(cmd.Stdout, "Total Size: %d bytes\n", len(buf)) + fmt.Fprintf(cmd.Stdout, "Overflow pages: %d\n", p.Overflow()) + + // Print type-specific data. + switch p.Typ() { + case "meta": + err = cmd.PrintMeta(cmd.Stdout, buf) + case "leaf": + err = cmd.PrintLeaf(cmd.Stdout, buf, formatValue) + case "branch": + err = cmd.PrintBranch(cmd.Stdout, buf) + case "freelist": + err = cmd.PrintFreelist(cmd.Stdout, buf) + } + if err != nil { + return 0, err + } + return p.Overflow(), nil +} + +// PrintMeta prints the data from the meta page. +func (cmd *pageCommand) PrintMeta(w io.Writer, buf []byte) error { + m := common.LoadPageMeta(buf) + m.Print(w) + return nil +} + +// PrintLeaf prints the data for a leaf page. +func (cmd *pageCommand) PrintLeaf(w io.Writer, buf []byte, formatValue string) error { + p := common.LoadPage(buf) + + // Print number of items. + fmt.Fprintf(w, "Item Count: %d\n", p.Count()) + fmt.Fprintf(w, "\n") + + // Print each key/value. + for i := uint16(0); i < p.Count(); i++ { + e := p.LeafPageElement(i) + + // Format key as string. + var k string + if isPrintable(string(e.Key())) { + k = fmt.Sprintf("%q", string(e.Key())) + } else { + k = fmt.Sprintf("%x", string(e.Key())) + } + + // Format value as string. 
+ var v string + if e.IsBucketEntry() { + b := e.Bucket() + v = b.String() + } else { + var err error + v, err = formatBytes(e.Value(), formatValue) + if err != nil { + return err + } + } + + fmt.Fprintf(w, "%s: %s\n", k, v) + } + fmt.Fprintf(w, "\n") + return nil +} + +// PrintBranch prints the data for a leaf page. +func (cmd *pageCommand) PrintBranch(w io.Writer, buf []byte) error { + p := common.LoadPage(buf) + + // Print number of items. + fmt.Fprintf(w, "Item Count: %d\n", p.Count()) + fmt.Fprintf(w, "\n") + + // Print each key/value. + for i := uint16(0); i < p.Count(); i++ { + e := p.BranchPageElement(i) + + // Format key as string. + var k string + if isPrintable(string(e.Key())) { + k = fmt.Sprintf("%q", string(e.Key())) + } else { + k = fmt.Sprintf("%x", string(e.Key())) + } + + fmt.Fprintf(w, "%s: \n", k, e.Pgid()) + } + fmt.Fprintf(w, "\n") + return nil +} + +// PrintFreelist prints the data for a freelist page. +func (cmd *pageCommand) PrintFreelist(w io.Writer, buf []byte) error { + p := common.LoadPage(buf) + + // Print number of items. + _, cnt := p.FreelistPageCount() + fmt.Fprintf(w, "Item Count: %d\n", cnt) + fmt.Fprintf(w, "Overflow: %d\n", p.Overflow()) + + fmt.Fprintf(w, "\n") + + // Print each page in the freelist. + ids := p.FreelistPageIds() + for _, ids := range ids { + fmt.Fprintf(w, "%d\n", ids) + } + fmt.Fprintf(w, "\n") + return nil +} + +// PrintPage prints a given page as hexadecimal. +func (cmd *pageCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSize int) error { + const bytesPerLineN = 16 + + // Read page into buffer. + buf := make([]byte, pageSize) + addr := pageID * pageSize + if n, err := r.ReadAt(buf, int64(addr)); err != nil { + return err + } else if n != pageSize { + return io.ErrUnexpectedEOF + } + + // Write out to writer in 16-byte lines. + var prev []byte + var skipped bool + for offset := 0; offset < pageSize; offset += bytesPerLineN { + // Retrieve current 16-byte line. 
+ line := buf[offset : offset+bytesPerLineN] + isLastLine := offset == (pageSize - bytesPerLineN) + + // If it's the same as the previous line then print a skip. + if bytes.Equal(line, prev) && !isLastLine { + if !skipped { + fmt.Fprintf(w, "%07x *\n", addr+offset) + skipped = true + } + } else { + // Print line as hexadecimal in 2-byte groups. + fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset, + line[0:2], line[2:4], line[4:6], line[6:8], + line[8:10], line[10:12], line[12:14], line[14:16], + ) + + skipped = false + } + + // Save the previous line. + prev = line + } + fmt.Fprint(w, "\n") + + return nil +} + +// Usage returns the help message. +func (cmd *pageCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt page PATH pageid [pageid...] + or: bolt page --all PATH + +Additional options include: + + --all + prints all pages (only skips pages that were considered successful overflow pages) + --format-value=`+FORMAT_MODES+` (default: auto) + prints values (on the leaf page) using the given format. + +Page prints one or more pages in human readable format. 
+`, "\n") +} diff --git a/cmd/bbolt/utils.go b/cmd/bbolt/utils.go new file mode 100644 index 0000000..71f1a3d --- /dev/null +++ b/cmd/bbolt/utils.go @@ -0,0 +1,16 @@ +package main + +import ( + "fmt" + "os" +) + +func checkSourceDBPath(srcPath string) (os.FileInfo, error) { + fi, err := os.Stat(srcPath) + if os.IsNotExist(err) { + return nil, fmt.Errorf("source database file %q doesn't exist", srcPath) + } else if err != nil { + return nil, fmt.Errorf("failed to open source database file %q: %v", srcPath, err) + } + return fi, nil +} diff --git a/cmd/bbolt/utils_test.go b/cmd/bbolt/utils_test.go new file mode 100644 index 0000000..c5dc0ef --- /dev/null +++ b/cmd/bbolt/utils_test.go @@ -0,0 +1,46 @@ +package main_test + +import ( + "os" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +func loadMetaPage(t *testing.T, dbPath string, pageID uint64) *common.Meta { + _, buf, err := guts_cli.ReadPage(dbPath, pageID) + require.NoError(t, err) + return common.LoadPageMeta(buf) +} + +func readMetaPage(t *testing.T, path string) *common.Meta { + _, activeMetaPageId, err := guts_cli.GetRootPage(path) + require.NoError(t, err) + _, buf, err := guts_cli.ReadPage(path, uint64(activeMetaPageId)) + require.NoError(t, err) + return common.LoadPageMeta(buf) +} + +func readPage(t *testing.T, path string, pageId int, pageSize int) []byte { + dbFile, err := os.Open(path) + require.NoError(t, err) + defer dbFile.Close() + + fi, err := dbFile.Stat() + require.NoError(t, err) + require.GreaterOrEqual(t, fi.Size(), int64((pageId+1)*pageSize)) + + buf := make([]byte, pageSize) + byteRead, err := dbFile.ReadAt(buf, int64(pageId*pageSize)) + require.NoError(t, err) + require.Equal(t, pageSize, byteRead) + + return buf +} + +func pageDataWithoutPageId(buf []byte) []byte { + return buf[8:] +} diff --git a/code-of-conduct.md b/code-of-conduct.md new file mode 100644 index 
0000000..f78dd84 --- /dev/null +++ b/code-of-conduct.md @@ -0,0 +1,3 @@ +# etcd Community Code of Conduct + +Please refer to [etcd Community Code of Conduct](https://github.com/etcd-io/etcd/blob/main/code-of-conduct.md). diff --git a/compact.go b/compact.go new file mode 100644 index 0000000..5f1d4c3 --- /dev/null +++ b/compact.go @@ -0,0 +1,119 @@ +package bbolt + +// Compact will create a copy of the source DB and in the destination DB. This may +// reclaim space that the source database no longer has use for. txMaxSize can be +// used to limit the transactions size of this process and may trigger intermittent +// commits. A value of zero will ignore transaction sizes. +// TODO: merge with: https://github.com/etcd-io/etcd/blob/b7f0f52a16dbf83f18ca1d803f7892d750366a94/mvcc/backend/backend.go#L349 +func Compact(dst, src *DB, txMaxSize int64) error { + // commit regularly, or we'll run out of memory for large datasets if using one transaction. + var size int64 + tx, err := dst.Begin(true) + if err != nil { + return err + } + defer func() { + if tempErr := tx.Rollback(); tempErr != nil { + err = tempErr + } + }() + + if err := walk(src, func(keys [][]byte, k, v []byte, seq uint64) error { + // On each key/value, check if we have exceeded tx size. + sz := int64(len(k) + len(v)) + if size+sz > txMaxSize && txMaxSize != 0 { + // Commit previous transaction. + if err := tx.Commit(); err != nil { + return err + } + + // Start new transaction. + tx, err = dst.Begin(true) + if err != nil { + return err + } + size = 0 + } + size += sz + + // Create bucket on the root transaction if this is the first level. + nk := len(keys) + if nk == 0 { + bkt, err := tx.CreateBucket(k) + if err != nil { + return err + } + if err := bkt.SetSequence(seq); err != nil { + return err + } + return nil + } + + // Create buckets on subsequent levels, if necessary. 
+ b := tx.Bucket(keys[0]) + if nk > 1 { + for _, k := range keys[1:] { + b = b.Bucket(k) + } + } + + // Fill the entire page for best compaction. + b.FillPercent = 1.0 + + // If there is no value then this is a bucket call. + if v == nil { + bkt, err := b.CreateBucket(k) + if err != nil { + return err + } + if err := bkt.SetSequence(seq); err != nil { + return err + } + return nil + } + + // Otherwise treat it as a key/value pair. + return b.Put(k, v) + }); err != nil { + return err + } + err = tx.Commit() + + return err +} + +// walkFunc is the type of the function called for keys (buckets and "normal" +// values) discovered by Walk. keys is the list of keys to descend to the bucket +// owning the discovered key/value pair k/v. +type walkFunc func(keys [][]byte, k, v []byte, seq uint64) error + +// walk walks recursively the bolt database db, calling walkFn for each key it finds. +func walk(db *DB, walkFn walkFunc) error { + return db.View(func(tx *Tx) error { + return tx.ForEach(func(name []byte, b *Bucket) error { + return walkBucket(b, nil, name, nil, b.Sequence(), walkFn) + }) + }) +} + +func walkBucket(b *Bucket, keypath [][]byte, k, v []byte, seq uint64, fn walkFunc) error { + // Execute callback. + if err := fn(keypath, k, v, seq); err != nil { + return err + } + + // If this is not a bucket then stop. + if v != nil { + return nil + } + + // Iterate over each child key/value. 
+ keypath = append(keypath, k) + return b.ForEach(func(k, v []byte) error { + if v == nil { + bkt := b.Bucket(k) + return walkBucket(bkt, keypath, k, nil, bkt.Sequence(), fn) + } + return walkBucket(b, keypath, k, v, b.Sequence(), fn) + }) +} diff --git a/concurrent_test.go b/concurrent_test.go new file mode 100644 index 0000000..4ac00a5 --- /dev/null +++ b/concurrent_test.go @@ -0,0 +1,956 @@ +package bbolt_test + +import ( + "bytes" + crand "crypto/rand" + "encoding/hex" + "encoding/json" + "fmt" + "io" + mrand "math/rand" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "testing" + "time" + "unicode/utf8" + + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" + + bolt "github.com/tutus-one/tutus-bolt" +) + +const ( + bucketPrefix = "bucket" + keyPrefix = "key" + noopTxKey = "%magic-no-op-key%" + + // TestConcurrentCaseDuration is used as a env variable to specify the + // concurrent test duration. + testConcurrentCaseDuration = "TEST_CONCURRENT_CASE_DURATION" + defaultConcurrentTestDuration = 30 * time.Second +) + +type duration struct { + min time.Duration + max time.Duration +} + +type bytesRange struct { + min int + max int +} + +type operationChance struct { + operation OperationType + chance int +} + +type concurrentConfig struct { + bucketCount int + keyCount int + workInterval duration + operationRatio []operationChance + readInterval duration // only used by readOperation + noopWriteRatio int // only used by writeOperation + writeBytes bytesRange // only used by writeOperation +} + +/* +TestConcurrentGenericReadAndWrite verifies: + 1. Repeatable read: a read transaction should always see the same data + view during its lifecycle. + 2. Any data written by a writing transaction should be visible to any + following reading transactions (with txid >= previous writing txid). + 3. The txid should never decrease. 
+*/ +func TestConcurrentGenericReadAndWrite(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + testDuration := concurrentTestDuration(t) + conf := concurrentConfig{ + bucketCount: 5, + keyCount: 10000, + workInterval: duration{}, + operationRatio: []operationChance{ + {operation: Read, chance: 60}, + {operation: Write, chance: 20}, + {operation: Delete, chance: 20}, + }, + readInterval: duration{ + min: 50 * time.Millisecond, + max: 100 * time.Millisecond, + }, + noopWriteRatio: 20, + writeBytes: bytesRange{ + min: 200, + max: 16000, + }, + } + + testCases := []struct { + name string + workerCount int + conf concurrentConfig + testDuration time.Duration + }{ + { + name: "1 worker", + workerCount: 1, + conf: conf, + testDuration: testDuration, + }, + { + name: "10 workers", + workerCount: 10, + conf: conf, + testDuration: testDuration, + }, + { + name: "50 workers", + workerCount: 50, + conf: conf, + testDuration: testDuration, + }, + { + name: "100 workers", + workerCount: 100, + conf: conf, + testDuration: testDuration, + }, + { + name: "200 workers", + workerCount: 200, + conf: conf, + testDuration: testDuration, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + concurrentReadAndWrite(t, + tc.workerCount, + tc.conf, + tc.testDuration) + }) + } +} + +func concurrentTestDuration(t *testing.T) time.Duration { + durationInEnv := strings.ToLower(os.Getenv(testConcurrentCaseDuration)) + if durationInEnv == "" { + t.Logf("%q not set, defaults to %s", testConcurrentCaseDuration, defaultConcurrentTestDuration) + return defaultConcurrentTestDuration + } + + d, err := time.ParseDuration(durationInEnv) + if err != nil { + t.Logf("Failed to parse %s=%s, error: %v, defaults to %s", testConcurrentCaseDuration, durationInEnv, err, defaultConcurrentTestDuration) + return defaultConcurrentTestDuration + } + + t.Logf("Concurrent test duration set by %s=%s", testConcurrentCaseDuration, d) + return d +} 
+ +func concurrentReadAndWrite(t *testing.T, + workerCount int, + conf concurrentConfig, + testDuration time.Duration) { + + t.Log("Preparing db.") + db := mustCreateDB(t, &bolt.Options{ + PageSize: 4096, + }) + defer db.Close() + err := db.Update(func(tx *bolt.Tx) error { + for i := 0; i < conf.bucketCount; i++ { + if _, err := tx.CreateBucketIfNotExists(bucketName(i)); err != nil { + return err + } + } + return nil + }) + require.NoError(t, err) + + var records historyRecords + // t.Failed() returns false during panicking. We need to forcibly + // save data on panicking. + // Refer to: https://github.com/golang/go/issues/49929 + panicked := true + defer func() { + t.Log("Save data if failed.") + saveDataIfFailed(t, db, records, panicked) + }() + + t.Log("Starting workers.") + records = runWorkers(t, + db, + workerCount, + conf, + testDuration) + + t.Log("Analyzing the history records.") + if err := validateSequential(records); err != nil { + t.Errorf("The history records are not sequential:\n %v", err) + } + + t.Log("Checking database consistency.") + if err := checkConsistency(t, db); err != nil { + t.Errorf("The data isn't consistency: %v", err) + } + + panicked = false + // TODO (ahrtr): + // 1. intentionally inject a random failpoint. +} + +// mustCreateDB is created in place of `btesting.MustCreateDB`, and it's +// only supposed to be used by the concurrent test case. The purpose is +// to ensure the test case can be executed on old branches or versions, +// e.g. `release-1.3` or `1.3.[5-7]`. 
+func mustCreateDB(t *testing.T, o *bolt.Options) *bolt.DB { + f := filepath.Join(t.TempDir(), "db") + + return mustOpenDB(t, f, o) +} + +func mustReOpenDB(t *testing.T, db *bolt.DB, o *bolt.Options) *bolt.DB { + f := db.Path() + + t.Logf("Closing bbolt DB at: %s", f) + err := db.Close() + require.NoError(t, err) + + return mustOpenDB(t, f, o) +} + +func mustOpenDB(t *testing.T, dbPath string, o *bolt.Options) *bolt.DB { + t.Logf("Opening bbolt DB at: %s", dbPath) + if o == nil { + o = bolt.DefaultOptions + } + + freelistType := bolt.FreelistArrayType + if env := os.Getenv("TEST_FREELIST_TYPE"); env == string(bolt.FreelistMapType) { + freelistType = bolt.FreelistMapType + } + + o.FreelistType = freelistType + + db, err := bolt.Open(dbPath, 0600, o) + require.NoError(t, err) + + return db +} + +func checkConsistency(t *testing.T, db *bolt.DB) error { + return db.View(func(tx *bolt.Tx) error { + cnt := 0 + for err := range tx.Check() { + t.Errorf("Consistency error: %v", err) + cnt++ + } + if cnt > 0 { + return fmt.Errorf("%d consistency errors found", cnt) + } + return nil + }) +} + +/* +********************************************************* +Data structures and functions/methods for running concurrent +workers, which execute different operations, including `Read`, +`Write` and `Delete`. +********************************************************* +*/ +func runWorkers(t *testing.T, + db *bolt.DB, + workerCount int, + conf concurrentConfig, + testDuration time.Duration) historyRecords { + stopCh := make(chan struct{}, 1) + errCh := make(chan error, workerCount) + + var mu sync.Mutex + var rs historyRecords + + g := new(errgroup.Group) + for i := 0; i < workerCount; i++ { + w := &worker{ + id: i, + db: db, + + conf: conf, + + errCh: errCh, + stopCh: stopCh, + t: t, + } + g.Go(func() error { + wrs, err := runWorker(t, w, errCh) + mu.Lock() + rs = append(rs, wrs...) 
+ mu.Unlock() + return err + }) + } + + t.Logf("Keep all workers running for about %s.", testDuration) + select { + case <-time.After(testDuration): + case <-errCh: + } + + close(stopCh) + t.Log("Waiting for all workers to finish.") + if err := g.Wait(); err != nil { + t.Errorf("Received error: %v", err) + } + + return rs +} + +func runWorker(t *testing.T, w *worker, errCh chan error) (historyRecords, error) { + rs, err := w.run() + if len(rs) > 0 && err == nil { + if terr := validateIncrementalTxid(rs); terr != nil { + txidErr := fmt.Errorf("[%s]: %w", w.name(), terr) + t.Error(txidErr) + errCh <- txidErr + return rs, txidErr + } + } + return rs, err +} + +type worker struct { + id int + db *bolt.DB + + conf concurrentConfig + + errCh chan error + stopCh chan struct{} + + t *testing.T +} + +func (w *worker) name() string { + return fmt.Sprintf("worker-%d", w.id) +} + +func (w *worker) run() (historyRecords, error) { + var rs historyRecords + + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + for { + select { + case <-w.stopCh: + return rs, nil + default: + } + + err := w.db.Update(func(tx *bolt.Tx) error { + for { + op := w.pickOperation() + bucket, key := w.pickBucket(), w.pickKey() + rec, eerr := executeOperation(op, tx, bucket, key, w.conf) + if eerr != nil { + opErr := fmt.Errorf("[%s: %s]: %w", w.name(), op, eerr) + w.t.Error(opErr) + w.errCh <- opErr + return opErr + } + + rs = append(rs, rec) + if w.conf.workInterval != (duration{}) { + time.Sleep(randomDurationInRange(w.conf.workInterval.min, w.conf.workInterval.max)) + } + + select { + case <-ticker.C: + return nil + case <-w.stopCh: + return nil + default: + } + } + }) + if err != nil { + return rs, err + } + } +} + +func (w *worker) pickBucket() []byte { + return bucketName(mrand.Intn(w.conf.bucketCount)) +} + +func bucketName(index int) []byte { + bucket := fmt.Sprintf("%s_%d", bucketPrefix, index) + return []byte(bucket) +} + +func (w *worker) pickKey() []byte { + key := 
fmt.Sprintf("%s_%d", keyPrefix, mrand.Intn(w.conf.keyCount)) + return []byte(key) +} + +func (w *worker) pickOperation() OperationType { + sum := 0 + for _, op := range w.conf.operationRatio { + sum += op.chance + } + roll := mrand.Int() % sum + for _, op := range w.conf.operationRatio { + if roll < op.chance { + return op.operation + } + roll -= op.chance + } + panic("unexpected") +} + +func executeOperation(op OperationType, tx *bolt.Tx, bucket []byte, key []byte, conf concurrentConfig) (historyRecord, error) { + switch op { + case Read: + return executeRead(tx, bucket, key, conf.readInterval) + case Write: + return executeWrite(tx, bucket, key, conf.writeBytes, conf.noopWriteRatio) + case Delete: + return executeDelete(tx, bucket, key) + default: + panic(fmt.Sprintf("unexpected operation type: %s", op)) + } +} + +func executeRead(tx *bolt.Tx, bucket []byte, key []byte, readInterval duration) (historyRecord, error) { + var rec historyRecord + + b := tx.Bucket(bucket) + + initialVal := b.Get(key) + time.Sleep(randomDurationInRange(readInterval.min, readInterval.max)) + val := b.Get(key) + + if !bytes.Equal(initialVal, val) { + return rec, fmt.Errorf("read different values for the same key (%q), value1: %q, value2: %q", + string(key), formatBytes(initialVal), formatBytes(val)) + } + + clonedVal := make([]byte, len(val)) + copy(clonedVal, val) + + rec = historyRecord{ + OperationType: Read, + Bucket: string(bucket), + Key: string(key), + Value: clonedVal, + Txid: tx.ID(), + } + + return rec, nil +} + +func executeWrite(tx *bolt.Tx, bucket []byte, key []byte, writeBytes bytesRange, noopWriteRatio int) (historyRecord, error) { + var rec historyRecord + + if mrand.Intn(100) < noopWriteRatio { + // A no-op write transaction has two consequences: + // 1. The txid increases by 1; + // 2. Two meta pages point to the same root page. 
+ rec = historyRecord{ + OperationType: Write, + Bucket: string(bucket), + Key: noopTxKey, + Value: nil, + Txid: tx.ID(), + } + return rec, nil + } + + b := tx.Bucket(bucket) + + valueBytes := randomIntInRange(writeBytes.min, writeBytes.max) + v := make([]byte, valueBytes) + if _, cErr := crand.Read(v); cErr != nil { + return rec, cErr + } + + putErr := b.Put(key, v) + if putErr == nil { + rec = historyRecord{ + OperationType: Write, + Bucket: string(bucket), + Key: string(key), + Value: v, + Txid: tx.ID(), + } + } + + return rec, putErr +} + +func executeDelete(tx *bolt.Tx, bucket []byte, key []byte) (historyRecord, error) { + var rec historyRecord + + b := tx.Bucket(bucket) + + err := b.Delete(key) + if err == nil { + rec = historyRecord{ + OperationType: Delete, + Bucket: string(bucket), + Key: string(key), + Txid: tx.ID(), + } + } + + return rec, err +} + +func randomDurationInRange(min, max time.Duration) time.Duration { + d := int64(max) - int64(min) + d = int64(mrand.Intn(int(d))) + int64(min) + return time.Duration(d) +} + +func randomIntInRange(min, max int) int { + return mrand.Intn(max-min) + min +} + +func formatBytes(val []byte) string { + if utf8.ValidString(string(val)) { + return string(val) + } + + return hex.EncodeToString(val) +} + +/* +********************************************************* +Functions for persisting test data, including db file +and operation history +********************************************************* +*/ +func saveDataIfFailed(t *testing.T, db *bolt.DB, rs historyRecords, force bool) { + if t.Failed() || force { + t.Log("Saving data...") + dbPath := db.Path() + if err := db.Close(); err != nil { + t.Errorf("Failed to close db: %v", err) + } + backupPath := testResultsDirectory(t) + backupDB(t, dbPath, backupPath) + persistHistoryRecords(t, rs, backupPath) + } +} + +func backupDB(t *testing.T, srcPath string, dstPath string) { + targetFile := filepath.Join(dstPath, "db.bak") + t.Logf("Saving the DB file to %s", 
targetFile) + err := copyFile(srcPath, targetFile) + require.NoError(t, err) + t.Logf("DB file saved to %s", targetFile) +} + +func copyFile(srcPath, dstPath string) error { + // Ensure source file exists. + _, err := os.Stat(srcPath) + if os.IsNotExist(err) { + return fmt.Errorf("source file %q not found", srcPath) + } else if err != nil { + return err + } + + // Ensure output file not exist. + _, err = os.Stat(dstPath) + if err == nil { + return fmt.Errorf("output file %q already exists", dstPath) + } else if !os.IsNotExist(err) { + return err + } + + srcDB, err := os.Open(srcPath) + if err != nil { + return fmt.Errorf("failed to open source file %q: %w", srcPath, err) + } + defer srcDB.Close() + dstDB, err := os.Create(dstPath) + if err != nil { + return fmt.Errorf("failed to create output file %q: %w", dstPath, err) + } + defer dstDB.Close() + written, err := io.Copy(dstDB, srcDB) + if err != nil { + return fmt.Errorf("failed to copy database file from %q to %q: %w", srcPath, dstPath, err) + } + + srcFi, err := srcDB.Stat() + if err != nil { + return fmt.Errorf("failed to get source file info %q: %w", srcPath, err) + } + initialSize := srcFi.Size() + if initialSize != written { + return fmt.Errorf("the byte copied (%q: %d) isn't equal to the initial db size (%q: %d)", dstPath, written, srcPath, initialSize) + } + + return nil +} + +func persistHistoryRecords(t *testing.T, rs historyRecords, path string) { + recordFilePath := filepath.Join(path, "history_records.json") + t.Logf("Saving history records to %s", recordFilePath) + recordFile, err := os.OpenFile(recordFilePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) + require.NoError(t, err) + defer recordFile.Close() + encoder := json.NewEncoder(recordFile) + for _, rec := range rs { + err := encoder.Encode(rec) + require.NoError(t, err) + } +} + +func testResultsDirectory(t *testing.T) string { + resultsDirectory, ok := os.LookupEnv("RESULTS_DIR") + var err error + if !ok { + resultsDirectory, err = 
os.MkdirTemp("", "*.db") + require.NoError(t, err) + } + resultsDirectory, err = filepath.Abs(resultsDirectory) + require.NoError(t, err) + + path, err := filepath.Abs(filepath.Join(resultsDirectory, strings.ReplaceAll(t.Name(), "/", "_"))) + require.NoError(t, err) + + err = os.RemoveAll(path) + require.NoError(t, err) + + err = os.MkdirAll(path, 0700) + require.NoError(t, err) + + return path +} + +/* +********************************************************* +Data structures and functions for analyzing history records +********************************************************* +*/ +type OperationType string + +const ( + Read OperationType = "read" + Write OperationType = "write" + Delete OperationType = "delete" +) + +type historyRecord struct { + OperationType OperationType `json:"operationType,omitempty"` + Txid int `json:"txid,omitempty"` + Bucket string `json:"bucket,omitempty"` + Key string `json:"key,omitempty"` + Value []byte `json:"value,omitempty"` +} + +type historyRecords []historyRecord + +func (rs historyRecords) Len() int { + return len(rs) +} + +func (rs historyRecords) Less(i, j int) bool { + // Sorted by (bucket, key) firstly: all records in the same + // (bucket, key) are grouped together. 
+ bucketCmp := strings.Compare(rs[i].Bucket, rs[j].Bucket) + if bucketCmp != 0 { + return bucketCmp < 0 + } + keyCmp := strings.Compare(rs[i].Key, rs[j].Key) + if keyCmp != 0 { + return keyCmp < 0 + } + + // Sorted by txid + return rs[i].Txid < rs[j].Txid +} + +func (rs historyRecords) Swap(i, j int) { + rs[i], rs[j] = rs[j], rs[i] +} + +func validateIncrementalTxid(rs historyRecords) error { + lastTxid := rs[0].Txid + + for i := 1; i < len(rs); i++ { + if rs[i].Txid < lastTxid { + return fmt.Errorf("detected non-incremental txid(%d, %d) in %s mode", lastTxid, rs[i].Txid, rs[i].OperationType) + } + lastTxid = rs[i].Txid + } + + return nil +} + +func validateSequential(rs historyRecords) error { + sort.Stable(rs) + + type bucketAndKey struct { + bucket string + key string + } + lastWriteKeyValueMap := make(map[bucketAndKey]*historyRecord) + + for _, rec := range rs { + bk := bucketAndKey{ + bucket: rec.Bucket, + key: rec.Key, + } + if v, ok := lastWriteKeyValueMap[bk]; ok { + if rec.OperationType == Write { + v.Txid = rec.Txid + if rec.Key != noopTxKey { + v.Value = rec.Value + } + } else if rec.OperationType == Delete { + delete(lastWriteKeyValueMap, bk) + } else { + if !bytes.Equal(v.Value, rec.Value) { + return fmt.Errorf("readOperation[txid: %d, bucket: %s, key: %s] read %x, \nbut writer[txid: %d] wrote %x", + rec.Txid, rec.Bucket, rec.Key, rec.Value, v.Txid, v.Value) + } + } + } else { + if rec.OperationType == Write && rec.Key != noopTxKey { + lastWriteKeyValueMap[bk] = &historyRecord{ + OperationType: Write, + Bucket: rec.Bucket, + Key: rec.Key, + Value: rec.Value, + Txid: rec.Txid, + } + } else if rec.OperationType == Read { + if len(rec.Value) != 0 { + return fmt.Errorf("expected the first readOperation[txid: %d, bucket: %s, key: %s] read nil, \nbut got %x", + rec.Txid, rec.Bucket, rec.Key, rec.Value) + } + } + } + } + + return nil +} + +/* +TestConcurrentRepeatableRead verifies repeatable read. 
The case +intentionally creates a scenario that read and write transactions +are interleaved. It performs several writing operations after starting +each long-running read transaction to ensure it has a larger txid +than previous read transaction. It verifies that bbolt correctly +releases free pages, and will not pollute (e.g. prematurely release) +any pages which are still being used by any read transaction. +*/ +func TestConcurrentRepeatableRead(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + testCases := []struct { + name string + noFreelistSync bool + freelistType bolt.FreelistType + }{ + // [array] freelist + { + name: "sync array freelist", + noFreelistSync: false, + freelistType: bolt.FreelistArrayType, + }, + { + name: "not sync array freelist", + noFreelistSync: true, + freelistType: bolt.FreelistArrayType, + }, + // [map] freelist + { + name: "sync map freelist", + noFreelistSync: false, + freelistType: bolt.FreelistMapType, + }, + { + name: "not sync map freelist", + noFreelistSync: true, + freelistType: bolt.FreelistMapType, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + + t.Log("Preparing db.") + var ( + bucket = []byte("data") + key = []byte("mykey") + + option = &bolt.Options{ + PageSize: 4096, + NoFreelistSync: tc.noFreelistSync, + FreelistType: tc.freelistType, + } + ) + + db := mustCreateDB(t, option) + defer func() { + // The db will be reopened later, so put `db.Close()` in a function + // to avoid premature evaluation of `db`. Note that the execution + // of a deferred function is deferred to the moment the surrounding + // function returns, but the function value and parameters to the + // call are evaluated as usual and saved anew. 
+ db.Close() + }() + + // Create lots of K/V to allocate some pages + err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists(bucket) + if err != nil { + return err + } + for i := 0; i < 1000; i++ { + k := fmt.Sprintf("key_%d", i) + if err := b.Put([]byte(k), make([]byte, 1024)); err != nil { + return err + } + } + return nil + }) + require.NoError(t, err) + + // Remove all K/V to create some free pages + err = db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucket) + for i := 0; i < 1000; i++ { + k := fmt.Sprintf("key_%d", i) + if err := b.Delete([]byte(k)); err != nil { + return err + } + } + return b.Put(key, []byte("randomValue")) + }) + require.NoError(t, err) + + // bbolt will not release free pages directly after committing + // a writing transaction; instead all pages freed are putting + // into a pending list. Accordingly, the free pages might not + // be able to be reused by following writing transactions. So + // we reopen the db to completely release all free pages. 
+ db = mustReOpenDB(t, db, option) + + var ( + wg sync.WaitGroup + longRunningReaderCount = 10 + stopCh = make(chan struct{}) + errCh = make(chan error, longRunningReaderCount) + readInterval = duration{5 * time.Millisecond, 10 * time.Millisecond} + + writeOperationCountInBetween = 5 + writeBytes = bytesRange{10, 20} + + testDuration = 10 * time.Second + ) + + for i := 0; i < longRunningReaderCount; i++ { + readWorkerName := fmt.Sprintf("reader_%d", i) + t.Logf("Starting long running read operation: %s", readWorkerName) + wg.Add(1) + go func() { + defer wg.Done() + rErr := executeLongRunningRead(t, readWorkerName, db, bucket, key, readInterval, stopCh) + if rErr != nil { + errCh <- rErr + } + }() + time.Sleep(500 * time.Millisecond) + + t.Logf("Perform %d write operations after starting a long running read operation", writeOperationCountInBetween) + for j := 0; j < writeOperationCountInBetween; j++ { + err := db.Update(func(tx *bolt.Tx) error { + _, eerr := executeWrite(tx, bucket, key, writeBytes, 0) + return eerr + }) + + require.NoError(t, err) + } + } + + t.Log("Perform lots of write operations to check whether the long running read operations will read dirty data") + wg.Add(1) + go func() { + defer wg.Done() + cnt := longRunningReaderCount * writeOperationCountInBetween + for i := 0; i < cnt; i++ { + select { + case <-stopCh: + return + default: + } + err := db.Update(func(tx *bolt.Tx) error { + _, eerr := executeWrite(tx, bucket, key, writeBytes, 0) + return eerr + }) + require.NoError(t, err) + } + }() + + t.Log("Waiting for result") + select { + case err := <-errCh: + close(stopCh) + t.Errorf("Detected dirty read: %v", err) + case <-time.After(testDuration): + close(stopCh) + } + + wg.Wait() + }) + } +} + +func executeLongRunningRead(t *testing.T, name string, db *bolt.DB, bucket []byte, key []byte, readInterval duration, stopCh chan struct{}) error { + err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket(bucket) + + initialVal := b.Get(key) + + for { 
+ select { + case <-stopCh: + t.Logf("%q finished.", name) + return nil + default: + } + + time.Sleep(randomDurationInRange(readInterval.min, readInterval.max)) + val := b.Get(key) + + if !bytes.Equal(initialVal, val) { + dirtyReadErr := fmt.Errorf("read different values for the same key (%q), value1: %q, value2: %q", + string(key), formatBytes(initialVal), formatBytes(val)) + return dirtyReadErr + } + } + }) + + return err +} diff --git a/cursor.go b/cursor.go new file mode 100644 index 0000000..b29ee9e --- /dev/null +++ b/cursor.go @@ -0,0 +1,432 @@ +package bbolt + +import ( + "bytes" + "fmt" + "sort" + + "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// Cursor represents an iterator that can traverse over all key/value pairs in a bucket +// in lexicographical order. +// Cursors see nested buckets with value == nil. +// Cursors can be obtained from a transaction and are valid as long as the transaction is open. +// +// Keys and values returned from the cursor are only valid for the life of the transaction. +// +// Changing data while traversing with a cursor may cause it to be invalidated +// and return unexpected keys and/or values. You must reposition your cursor +// after mutating data. +type Cursor struct { + bucket *Bucket + stack []elemRef +} + +// Bucket returns the bucket that this cursor was created from. +func (c *Cursor) Bucket() *Bucket { + return c.bucket +} + +// First moves the cursor to the first item in the bucket and returns its key and value. +// If the bucket is empty then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. 
+func (c *Cursor) First() (key []byte, value []byte) { + common.Assert(c.bucket.tx.db != nil, "tx closed") + k, v, flags := c.first() + if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +func (c *Cursor) first() (key []byte, value []byte, flags uint32) { + c.stack = c.stack[:0] + p, n := c.bucket.pageNode(c.bucket.RootPage()) + c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) + c.goToFirstElementOnTheStack() + + // If we land on an empty page then move to the next value. + // https://github.com/boltdb/bolt/issues/450 + if c.stack[len(c.stack)-1].count() == 0 { + c.next() + } + + k, v, flags := c.keyValue() + if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil, flags + } + return k, v, flags +} + +// Last moves the cursor to the last item in the bucket and returns its key and value. +// If the bucket is empty then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Last() (key []byte, value []byte) { + common.Assert(c.bucket.tx.db != nil, "tx closed") + c.stack = c.stack[:0] + p, n := c.bucket.pageNode(c.bucket.RootPage()) + ref := elemRef{page: p, node: n} + ref.index = ref.count() - 1 + c.stack = append(c.stack, ref) + c.last() + + // If this is an empty page (calling Delete may result in empty pages) + // we call prev to find the last page that is not empty + for len(c.stack) > 1 && c.stack[len(c.stack)-1].count() == 0 { + c.prev() + } + + if len(c.stack) == 0 { + return nil, nil + } + + k, v, flags := c.keyValue() + if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Next moves the cursor to the next item in the bucket and returns its key and value. +// If the cursor is at the end of the bucket then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. 
+func (c *Cursor) Next() (key []byte, value []byte) { + common.Assert(c.bucket.tx.db != nil, "tx closed") + k, v, flags := c.next() + if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Prev moves the cursor to the previous item in the bucket and returns its key and value. +// If the cursor is at the beginning of the bucket then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Prev() (key []byte, value []byte) { + common.Assert(c.bucket.tx.db != nil, "tx closed") + k, v, flags := c.prev() + if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Seek moves the cursor to a given key using a b-tree search and returns it. +// If the key does not exist then the next key is used. If no keys +// follow, a nil key is returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) { + common.Assert(c.bucket.tx.db != nil, "tx closed") + + k, v, flags := c.seek(seek) + + // If we ended up after the last element of a page then move to the next one. + if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() { + k, v, flags = c.next() + } + + if k == nil { + return nil, nil + } else if (flags & uint32(common.BucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Delete removes the current key/value under the cursor from the bucket. +// Delete fails if current key/value is a bucket or if the transaction is not writable. +func (c *Cursor) Delete() error { + if c.bucket.tx.db == nil { + return errors.ErrTxClosed + } else if !c.bucket.Writable() { + return errors.ErrTxNotWritable + } + + key, _, flags := c.keyValue() + // Return an error if current value is a bucket. 
+ if (flags & common.BucketLeafFlag) != 0 { + return errors.ErrIncompatibleValue + } + c.node().del(key) + + return nil +} + +// seek moves the cursor to a given key and returns it. +// If the key does not exist then the next key is used. +func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { + // Start from root page/node and traverse to correct page. + c.stack = c.stack[:0] + c.search(seek, c.bucket.RootPage()) + + // If this is a bucket then return a nil value. + return c.keyValue() +} + +// first moves the cursor to the first leaf element under the last page in the stack. +func (c *Cursor) goToFirstElementOnTheStack() { + for { + // Exit when we hit a leaf page. + var ref = &c.stack[len(c.stack)-1] + if ref.isLeaf() { + break + } + + // Keep adding pages pointing to the first element to the stack. + var pgId common.Pgid + if ref.node != nil { + pgId = ref.node.inodes[ref.index].Pgid() + } else { + pgId = ref.page.BranchPageElement(uint16(ref.index)).Pgid() + } + p, n := c.bucket.pageNode(pgId) + c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) + } +} + +// last moves the cursor to the last leaf element under the last page in the stack. +func (c *Cursor) last() { + for { + // Exit when we hit a leaf page. + ref := &c.stack[len(c.stack)-1] + if ref.isLeaf() { + break + } + + // Keep adding pages pointing to the last element in the stack. + var pgId common.Pgid + if ref.node != nil { + pgId = ref.node.inodes[ref.index].Pgid() + } else { + pgId = ref.page.BranchPageElement(uint16(ref.index)).Pgid() + } + p, n := c.bucket.pageNode(pgId) + + var nextRef = elemRef{page: p, node: n} + nextRef.index = nextRef.count() - 1 + c.stack = append(c.stack, nextRef) + } +} + +// next moves to the next leaf element and returns the key and value. +// If the cursor is at the last leaf element then it stays there and returns nil. 
+func (c *Cursor) next() (key []byte, value []byte, flags uint32) { + for { + // Attempt to move over one element until we're successful. + // Move up the stack as we hit the end of each page in our stack. + var i int + for i = len(c.stack) - 1; i >= 0; i-- { + elem := &c.stack[i] + if elem.index < elem.count()-1 { + elem.index++ + break + } + } + + // If we've hit the root page then stop and return. This will leave the + // cursor on the last element of the last page. + if i == -1 { + return nil, nil, 0 + } + + // Otherwise start from where we left off in the stack and find the + // first element of the first leaf page. + c.stack = c.stack[:i+1] + c.goToFirstElementOnTheStack() + + // If this is an empty page then restart and move back up the stack. + // https://github.com/boltdb/bolt/issues/450 + if c.stack[len(c.stack)-1].count() == 0 { + continue + } + + return c.keyValue() + } +} + +// prev moves the cursor to the previous item in the bucket and returns its key and value. +// If the cursor is at the beginning of the bucket then a nil key and value are returned. +func (c *Cursor) prev() (key []byte, value []byte, flags uint32) { + // Attempt to move back one element until we're successful. + // Move up the stack as we hit the beginning of each page in our stack. + for i := len(c.stack) - 1; i >= 0; i-- { + elem := &c.stack[i] + if elem.index > 0 { + elem.index-- + break + } + // If we've hit the beginning, we should stop moving the cursor, + // and stay at the first element, so that users can continue to + // iterate over the elements in reverse direction by calling `Next`. + // We should return nil in such case. + // Refer to https://github.com/etcd-io/bbolt/issues/733 + if len(c.stack) == 1 { + c.first() + return nil, nil, 0 + } + c.stack = c.stack[:i] + } + + // If we've hit the end then return nil. + if len(c.stack) == 0 { + return nil, nil, 0 + } + + // Move down the stack to find the last element of the last leaf under this branch. 
+ c.last() + return c.keyValue() +} + +// search recursively performs a binary search against a given page/node until it finds a given key. +func (c *Cursor) search(key []byte, pgId common.Pgid) { + p, n := c.bucket.pageNode(pgId) + if p != nil && !p.IsBranchPage() && !p.IsLeafPage() { + panic(fmt.Sprintf("invalid page type: %d: %x", p.Id(), p.Flags())) + } + e := elemRef{page: p, node: n} + c.stack = append(c.stack, e) + + // If we're on a leaf page/node then find the specific node. + if e.isLeaf() { + c.nsearch(key) + return + } + + if n != nil { + c.searchNode(key, n) + return + } + c.searchPage(key, p) +} + +func (c *Cursor) searchNode(key []byte, n *node) { + var exact bool + index := sort.Search(len(n.inodes), func(i int) bool { + // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. + // sort.Search() finds the lowest index where f() != -1 but we need the highest index. + ret := bytes.Compare(n.inodes[i].Key(), key) + if ret == 0 { + exact = true + } + return ret != -1 + }) + if !exact && index > 0 { + index-- + } + c.stack[len(c.stack)-1].index = index + + // Recursively search to the next page. + c.search(key, n.inodes[index].Pgid()) +} + +func (c *Cursor) searchPage(key []byte, p *common.Page) { + // Binary search for the correct range. + inodes := p.BranchPageElements() + + var exact bool + index := sort.Search(int(p.Count()), func(i int) bool { + // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. + // sort.Search() finds the lowest index where f() != -1 but we need the highest index. + ret := bytes.Compare(inodes[i].Key(), key) + if ret == 0 { + exact = true + } + return ret != -1 + }) + if !exact && index > 0 { + index-- + } + c.stack[len(c.stack)-1].index = index + + // Recursively search to the next page. + c.search(key, inodes[index].Pgid()) +} + +// nsearch searches the leaf node on the top of the stack for a key. 
+func (c *Cursor) nsearch(key []byte) { + e := &c.stack[len(c.stack)-1] + p, n := e.page, e.node + + // If we have a node then search its inodes. + if n != nil { + index := sort.Search(len(n.inodes), func(i int) bool { + return bytes.Compare(n.inodes[i].Key(), key) != -1 + }) + e.index = index + return + } + + // If we have a page then search its leaf elements. + inodes := p.LeafPageElements() + index := sort.Search(int(p.Count()), func(i int) bool { + return bytes.Compare(inodes[i].Key(), key) != -1 + }) + e.index = index +} + +// keyValue returns the key and value of the current leaf element. +func (c *Cursor) keyValue() ([]byte, []byte, uint32) { + ref := &c.stack[len(c.stack)-1] + + // If the cursor is pointing to the end of page/node then return nil. + if ref.count() == 0 || ref.index >= ref.count() { + return nil, nil, 0 + } + + // Retrieve value from node. + if ref.node != nil { + inode := &ref.node.inodes[ref.index] + return inode.Key(), inode.Value(), inode.Flags() + } + + // Or retrieve value from page. + elem := ref.page.LeafPageElement(uint16(ref.index)) + return elem.Key(), elem.Value(), elem.Flags() +} + +// node returns the node that the cursor is currently positioned on. +func (c *Cursor) node() *node { + common.Assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack") + + // If the top of the stack is a leaf node then just return it. + if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() { + return ref.node + } + + // Start from root and traverse down the hierarchy. + var n = c.stack[0].node + if n == nil { + n = c.bucket.node(c.stack[0].page.Id(), nil) + } + for _, ref := range c.stack[:len(c.stack)-1] { + common.Assert(!n.isLeaf, "expected branch node") + n = n.childAt(ref.index) + } + common.Assert(n.isLeaf, "expected leaf node") + return n +} + +// elemRef represents a reference to an element on a given page/node. 
+type elemRef struct { + page *common.Page + node *node + index int +} + +// isLeaf returns whether the ref is pointing at a leaf page/node. +func (r *elemRef) isLeaf() bool { + if r.node != nil { + return r.node.isLeaf + } + return r.page.IsLeafPage() +} + +// count returns the number of inodes or page elements. +func (r *elemRef) count() int { + if r.node != nil { + return len(r.node.inodes) + } + return int(r.page.Count()) +} diff --git a/cursor_test.go b/cursor_test.go new file mode 100644 index 0000000..1b980d9 --- /dev/null +++ b/cursor_test.go @@ -0,0 +1,986 @@ +package bbolt_test + +import ( + "bytes" + "encoding/binary" + "fmt" + "log" + "os" + "reflect" + "sort" + "testing" + "testing/quick" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +// TestCursor_RepeatOperations verifies that a cursor can continue to +// iterate over all elements in reverse direction when it has already +// reached to the end or beginning. 
+// Refer to https://github.com/etcd-io/bbolt/issues/733 +func TestCursor_RepeatOperations(t *testing.T) { + testCases := []struct { + name string + testFunc func(t2 *testing.T, bucket *bolt.Bucket) + }{ + { + name: "Repeat NextPrevNext", + testFunc: testRepeatCursorOperations_NextPrevNext, + }, + { + name: "Repeat PrevNextPrev", + testFunc: testRepeatCursorOperations_PrevNextPrev, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + + bucketName := []byte("data") + + _ = db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists(bucketName) + testCursorRepeatOperations_PrepareData(t, b) + return nil + }) + + _ = db.View(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + tc.testFunc(t, b) + return nil + }) + }) + } +} + +func testCursorRepeatOperations_PrepareData(t *testing.T, b *bolt.Bucket) { + // ensure we have at least one branch page. + for i := 0; i < 1000; i++ { + k := []byte(fmt.Sprintf("%05d", i)) + err := b.Put(k, k) + require.NoError(t, err) + } +} + +func testRepeatCursorOperations_NextPrevNext(t *testing.T, b *bolt.Bucket) { + c := b.Cursor() + c.First() + startKey := []byte(fmt.Sprintf("%05d", 2)) + returnedKey, _ := c.Seek(startKey) + require.Equal(t, startKey, returnedKey) + + // Step 1: verify next + for i := 3; i < 1000; i++ { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Next() + require.Equal(t, expectedKey, actualKey) + } + + // Once we've reached the end, it should always return nil no matter how many times we call `Next`. + for i := 0; i < 10; i++ { + k, _ := c.Next() + require.Equal(t, []byte(nil), k) + } + + // Step 2: verify prev + for i := 998; i >= 0; i-- { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Prev() + require.Equal(t, expectedKey, actualKey) + } + + // Once we've reached the beginning, it should always return nil no matter how many times we call `Prev`. 
+ for i := 0; i < 10; i++ { + k, _ := c.Prev() + require.Equal(t, []byte(nil), k) + } + + // Step 3: verify next again + for i := 1; i < 1000; i++ { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Next() + require.Equal(t, expectedKey, actualKey) + } +} + +func testRepeatCursorOperations_PrevNextPrev(t *testing.T, b *bolt.Bucket) { + c := b.Cursor() + + startKey := []byte(fmt.Sprintf("%05d", 998)) + returnedKey, _ := c.Seek(startKey) + require.Equal(t, startKey, returnedKey) + + // Step 1: verify prev + for i := 997; i >= 0; i-- { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Prev() + require.Equal(t, expectedKey, actualKey) + } + + // Once we've reached the beginning, it should always return nil no matter how many times we call `Prev`. + for i := 0; i < 10; i++ { + k, _ := c.Prev() + require.Equal(t, []byte(nil), k) + } + + // Step 2: verify next + for i := 1; i < 1000; i++ { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Next() + require.Equal(t, expectedKey, actualKey) + } + + // Once we've reached the end, it should always return nil no matter how many times we call `Next`. + for i := 0; i < 10; i++ { + k, _ := c.Next() + require.Equal(t, []byte(nil), k) + } + + // Step 3: verify prev again + for i := 998; i >= 0; i-- { + expectedKey := []byte(fmt.Sprintf("%05d", i)) + actualKey, _ := c.Prev() + require.Equal(t, expectedKey, actualKey) + } +} + +// Ensure that a cursor can return a reference to the bucket that created it. +func TestCursor_Bucket(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if cb := b.Cursor().Bucket(); !reflect.DeepEqual(cb, b) { + t.Fatal("cursor bucket mismatch") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can seek to the appropriate keys. 
+func TestCursor_Seek(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("0001")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("bar"), []byte("0002")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte("0003")); err != nil { + t.Fatal(err) + } + + if _, err := b.CreateBucket([]byte("bkt")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + + // Exact match should go to the key. + if k, v := c.Seek([]byte("bar")); !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte("0002")) { + t.Fatalf("unexpected value: %v", v) + } + + // Inexact match should go to the next key. + if k, v := c.Seek([]byte("bas")); !bytes.Equal(k, []byte("baz")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte("0003")) { + t.Fatalf("unexpected value: %v", v) + } + + // Low key should go to the first key. + if k, v := c.Seek([]byte("")); !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte("0002")) { + t.Fatalf("unexpected value: %v", v) + } + + // High key should return no key. + if k, v := c.Seek([]byte("zzz")); k != nil { + t.Fatalf("expected nil key: %v", k) + } else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + // Buckets should return their key but no value. 
+ if k, v := c.Seek([]byte("bkt")); !bytes.Equal(k, []byte("bkt")) { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +func TestCursor_Delete(t *testing.T) { + db := btesting.MustCreateDB(t) + + const count = 1000 + + // Insert every other key between 0 and $count. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + for i := 0; i < count; i += 1 { + k := make([]byte, 8) + binary.BigEndian.PutUint64(k, uint64(i)) + if err := b.Put(k, make([]byte, 100)); err != nil { + t.Fatal(err) + } + } + if _, err := b.CreateBucket([]byte("sub")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.Update(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + bound := make([]byte, 8) + binary.BigEndian.PutUint64(bound, uint64(count/2)) + for key, _ := c.First(); bytes.Compare(key, bound) < 0; key, _ = c.Next() { + if err := c.Delete(); err != nil { + t.Fatal(err) + } + } + + c.Seek([]byte("sub")) + if err := c.Delete(); err != errors.ErrIncompatibleValue { + t.Fatalf("unexpected error: %s", err) + } + + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + stats := tx.Bucket([]byte("widgets")).Stats() + if stats.KeyN != count/2+1 { + t.Fatalf("unexpected KeyN: %d", stats.KeyN) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can seek to the appropriate keys when there are a +// large number of keys. This test also checks that seek will always move +// forward to the next key. +// +// Related: https://github.com/boltdb/bolt/pull/187 +func TestCursor_Seek_Large(t *testing.T) { + db := btesting.MustCreateDB(t) + + var count = 10000 + + // Insert every other key between 0 and $count. 
+ if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < count; i += 100 { + for j := i; j < i+100; j += 2 { + k := make([]byte, 8) + binary.BigEndian.PutUint64(k, uint64(j)) + if err := b.Put(k, make([]byte, 100)); err != nil { + t.Fatal(err) + } + } + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + for i := 0; i < count; i++ { + seek := make([]byte, 8) + binary.BigEndian.PutUint64(seek, uint64(i)) + + k, _ := c.Seek(seek) + + // The last seek is beyond the end of the range so + // it should return nil. + if i == count-1 { + if k != nil { + t.Fatal("expected nil key") + } + continue + } + + // Otherwise we should seek to the exact key or the next key. + num := binary.BigEndian.Uint64(k) + if i%2 == 0 { + if num != uint64(i) { + t.Fatalf("unexpected num: %d", num) + } + } else { + if num != uint64(i+1) { + t.Fatalf("unexpected num: %d", num) + } + } + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a cursor can iterate over an empty bucket without error. +func TestCursor_EmptyBucket(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + k, v := c.First() + if k != nil { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can reverse iterate over an empty bucket without error. 
+func TestCursor_EmptyBucketReverse(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + if err := db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + k, v := c.Last() + if k != nil { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can iterate over a single root with a couple elements. +func TestCursor_Iterate_Leaf(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte{}); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte{0}); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("bar"), []byte{1}); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + tx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + defer func() { _ = tx.Rollback() }() + + c := tx.Bucket([]byte("widgets")).Cursor() + + k, v := c.First() + if !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{1}) { + t.Fatalf("unexpected value: %v", v) + } + + k, v = c.Next() + if !bytes.Equal(k, []byte("baz")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{}) { + t.Fatalf("unexpected value: %v", v) + } + + k, v = c.Next() + if !bytes.Equal(k, []byte("foo")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{0}) { + t.Fatalf("unexpected value: %v", v) + } + + k, v = c.Next() + if k != nil { + t.Fatalf("expected nil key: %v", k) + } else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + k, v = c.Next() + if k != nil { + t.Fatalf("expected nil key: %v", k) + } 
else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can iterate in reverse over a single root with a couple elements. +func TestCursor_LeafRootReverse(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte{}); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte{0}); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("bar"), []byte{1}); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + tx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + c := tx.Bucket([]byte("widgets")).Cursor() + + if k, v := c.Last(); !bytes.Equal(k, []byte("foo")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{0}) { + t.Fatalf("unexpected value: %v", v) + } + + if k, v := c.Prev(); !bytes.Equal(k, []byte("baz")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{}) { + t.Fatalf("unexpected value: %v", v) + } + + if k, v := c.Prev(); !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, []byte{1}) { + t.Fatalf("unexpected value: %v", v) + } + + if k, v := c.Prev(); k != nil { + t.Fatalf("expected nil key: %v", k) + } else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + if k, v := c.Prev(); k != nil { + t.Fatalf("expected nil key: %v", k) + } else if v != nil { + t.Fatalf("expected nil value: %v", v) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can restart from the beginning. 
+func TestCursor_Restart(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("bar"), []byte{}); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte{}); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + tx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + c := tx.Bucket([]byte("widgets")).Cursor() + + if k, _ := c.First(); !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } + if k, _ := c.Next(); !bytes.Equal(k, []byte("foo")) { + t.Fatalf("unexpected key: %v", k) + } + + if k, _ := c.First(); !bytes.Equal(k, []byte("bar")) { + t.Fatalf("unexpected key: %v", k) + } + if k, _ := c.Next(); !bytes.Equal(k, []byte("foo")) { + t.Fatalf("unexpected key: %v", k) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } +} + +// Ensure that a cursor can skip over empty pages that have been deleted. +func TestCursor_First_EmptyPages(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Create 1000 keys in the "widgets" bucket. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 1000; i++ { + if err := b.Put(u64tob(uint64(i)), []byte{}); err != nil { + t.Fatal(err) + } + } + + return nil + }); err != nil { + t.Fatal(err) + } + + // Delete half the keys and then try to iterate. 
+ if err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 0; i < 600; i++ { + if err := b.Delete(u64tob(uint64(i))); err != nil { + t.Fatal(err) + } + } + + c := b.Cursor() + var n int + for k, _ := c.First(); k != nil; k, _ = c.Next() { + n++ + } + if n != 400 { + t.Fatalf("unexpected key count: %d", n) + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a cursor can skip over empty pages that have been deleted. +func TestCursor_Last_EmptyPages(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Create 1000 keys in the "widgets" bucket. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 1000; i++ { + if err := b.Put(u64tob(uint64(i)), []byte{}); err != nil { + t.Fatal(err) + } + } + + return nil + }); err != nil { + t.Fatal(err) + } + + // Delete last 800 elements to ensure last page is empty + if err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 200; i < 1000; i++ { + if err := b.Delete(u64tob(uint64(i))); err != nil { + t.Fatal(err) + } + } + + c := b.Cursor() + var n int + for k, _ := c.Last(); k != nil; k, _ = c.Prev() { + n++ + } + if n != 200 { + t.Fatalf("unexpected key count: %d", n) + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx can iterate over all elements in a bucket. +func TestCursor_QuickCheck(t *testing.T) { + f := func(items testdata) bool { + db := btesting.MustCreateDB(t) + defer db.MustClose() + + // Bulk insert all values. + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + for _, item := range items { + if err := b.Put(item.Key, item.Value); err != nil { + t.Fatal(err) + } + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + + // Sort test data. 
+ sort.Sort(items) + + // Iterate over all items and check consistency. + var index = 0 + tx, err = db.Begin(false) + if err != nil { + t.Fatal(err) + } + + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.First(); k != nil && index < len(items); k, v = c.Next() { + if !bytes.Equal(k, items[index].Key) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, items[index].Value) { + t.Fatalf("unexpected value: %v", v) + } + index++ + } + if len(items) != index { + t.Fatalf("unexpected item count: %v, expected %v", len(items), index) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can iterate over all elements in a bucket in reverse. +func TestCursor_QuickCheck_Reverse(t *testing.T) { + f := func(items testdata) bool { + db := btesting.MustCreateDB(t) + defer db.MustClose() + + // Bulk insert all values. + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + for _, item := range items { + if err := b.Put(item.Key, item.Value); err != nil { + t.Fatal(err) + } + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + + // Sort test data. + sort.Sort(revtestdata(items)) + + // Iterate over all items and check consistency. 
+ var index = 0 + tx, err = db.Begin(false) + if err != nil { + t.Fatal(err) + } + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.Last(); k != nil && index < len(items); k, v = c.Prev() { + if !bytes.Equal(k, items[index].Key) { + t.Fatalf("unexpected key: %v", k) + } else if !bytes.Equal(v, items[index].Value) { + t.Fatalf("unexpected value: %v", v) + } + index++ + } + if len(items) != index { + t.Fatalf("unexpected item count: %v, expected %v", len(items), index) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a Tx cursor can iterate over subbuckets. +func TestCursor_QuickCheck_BucketsOnly(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("foo")); err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("bar")); err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("baz")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + var names []string + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + names = append(names, string(k)) + if v != nil { + t.Fatalf("unexpected value: %v", v) + } + } + if !reflect.DeepEqual(names, []string{"bar", "baz", "foo"}) { + t.Fatalf("unexpected names: %+v", names) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx cursor can reverse iterate over subbuckets. 
+func TestCursor_QuickCheck_BucketsOnly_Reverse(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("foo")); err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("bar")); err != nil { + t.Fatal(err) + } + if _, err := b.CreateBucket([]byte("baz")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + var names []string + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.Last(); k != nil; k, v = c.Prev() { + names = append(names, string(k)) + if v != nil { + t.Fatalf("unexpected value: %v", v) + } + } + if !reflect.DeepEqual(names, []string{"foo", "baz", "bar"}) { + t.Fatalf("unexpected names: %+v", names) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +func ExampleCursor() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Start a read-write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + // Create a new bucket. + b, err := tx.CreateBucket([]byte("animals")) + if err != nil { + return err + } + + // Insert data into a bucket. + if err := b.Put([]byte("dog"), []byte("fun")); err != nil { + log.Fatal(err) + } + if err := b.Put([]byte("cat"), []byte("lame")); err != nil { + log.Fatal(err) + } + if err := b.Put([]byte("liger"), []byte("awesome")); err != nil { + log.Fatal(err) + } + + // Create a cursor for iteration. + c := b.Cursor() + + // Iterate over items in sorted key order. This starts from the + // first key/value pair and updates the k/v variables to the + // next key/value on each iteration. + // + // The loop finishes at the end of the cursor when a nil key is returned. 
+ for k, v := c.First(); k != nil; k, v = c.Next() { + fmt.Printf("A %s is %s.\n", k, v) + } + + return nil + }); err != nil { + log.Fatal(err) + } + + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // A cat is lame. + // A dog is fun. + // A liger is awesome. +} + +func ExampleCursor_reverse() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Start a read-write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + // Create a new bucket. + b, err := tx.CreateBucket([]byte("animals")) + if err != nil { + return err + } + + // Insert data into a bucket. + if err := b.Put([]byte("dog"), []byte("fun")); err != nil { + log.Fatal(err) + } + if err := b.Put([]byte("cat"), []byte("lame")); err != nil { + log.Fatal(err) + } + if err := b.Put([]byte("liger"), []byte("awesome")); err != nil { + log.Fatal(err) + } + + // Create a cursor for iteration. + c := b.Cursor() + + // Iterate over items in reverse sorted key order. This starts + // from the last key/value pair and updates the k/v variables to + // the previous key/value on each iteration. + // + // The loop finishes at the beginning of the cursor when a nil key + // is returned. + for k, v := c.Last(); k != nil; k, v = c.Prev() { + fmt.Printf("A %s is %s.\n", k, v) + } + + return nil + }); err != nil { + log.Fatal(err) + } + + // Close the database to release the file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // A liger is awesome. + // A dog is fun. + // A cat is lame. 
+} diff --git a/db.go b/db.go new file mode 100644 index 0000000..8f7b131 --- /dev/null +++ b/db.go @@ -0,0 +1,1417 @@ +package bbolt + +import ( + "errors" + "fmt" + "io" + "os" + "runtime" + "sync" + "time" + "unsafe" + + berrors "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/common" + fl "github.com/tutus-one/tutus-bolt/internal/freelist" +) + +// The time elapsed between consecutive file locking attempts. +const flockRetryTimeout = 50 * time.Millisecond + +// FreelistType is the type of the freelist backend +type FreelistType string + +// TODO(ahrtr): eventually we should (step by step) +// 1. default to `FreelistMapType`; +// 2. remove the `FreelistArrayType`, do not export `FreelistMapType` +// and remove field `FreelistType' from both `DB` and `Options`; +const ( + // FreelistArrayType indicates backend freelist type is array + FreelistArrayType = FreelistType("array") + // FreelistMapType indicates backend freelist type is hashmap + FreelistMapType = FreelistType("hashmap") +) + +// DB represents a collection of buckets persisted to a file on disk. +// All data access is performed through transactions which can be obtained through the DB. +// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. +type DB struct { + // Put `stats` at the first field to ensure it's 64-bit aligned. Note that + // the first word in an allocated struct can be relied upon to be 64-bit + // aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. Also + // refer to discussion in https://github.com/etcd-io/bbolt/issues/577. + stats Stats + + // When enabled, the database will perform a Check() after every commit. + // A panic is issued if the database is in an inconsistent state. This + // flag has a large performance impact so it should only be used for + // debugging purposes. + StrictMode bool + + // Setting the NoSync flag will cause the database to skip fsync() + // calls after each commit. 
This can be useful when bulk loading data + // into a database and you can restart the bulk load in the event of + // a system failure or database corruption. Do not set this flag for + // normal use. + // + // If the package global IgnoreNoSync constant is true, this value is + // ignored. See the comment on that constant for more details. + // + // THIS IS UNSAFE. PLEASE USE WITH CAUTION. + NoSync bool + + // When true, skips syncing freelist to disk. This improves the database + // write performance under normal operation, but requires a full database + // re-sync during recovery. + NoFreelistSync bool + + // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures + // dramatic performance degradation if database is large and fragmentation in freelist is common. + // The alternative one is using hashmap, it is faster in almost all circumstances + // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. + // The default type is array + FreelistType FreelistType + + // When true, skips the truncate call when growing the database. + // Setting this to true is only safe on non-ext3/ext4 systems. + // Skipping truncation avoids preallocation of hard drive space and + // bypasses a truncate() and fsync() syscall on remapping. + // + // https://github.com/boltdb/bolt/issues/284 + NoGrowSync bool + + // When `true`, bbolt will always load the free pages when opening the DB. + // When opening db in write mode, this flag will always automatically + // set to `true`. + PreLoadFreelist bool + + // If you want to read the entire database fast, you can set MmapFlag to + // syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead. + MmapFlags int + + // MaxBatchSize is the maximum size of a batch. Default value is + // copied from DefaultMaxBatchSize in Open. + // + // If <=0, disables batching. + // + // Do not change concurrently with calls to Batch. 
+ MaxBatchSize int + + // MaxBatchDelay is the maximum delay before a batch starts. + // Default value is copied from DefaultMaxBatchDelay in Open. + // + // If <=0, effectively disables batching. + // + // Do not change concurrently with calls to Batch. + MaxBatchDelay time.Duration + + // AllocSize is the amount of space allocated when the database + // needs to create new pages. This is done to amortize the cost + // of truncate() and fsync() when growing the data file. + AllocSize int + + // MaxSize is the maximum size (in bytes) allowed for the data file. + // If a caller's attempt to add data results in the need to grow + // the data file, an error will be returned and the data file will not grow. + // <=0 means no limit. + MaxSize int + + // Mlock locks database file in memory when set to true. + // It prevents major page faults, however used memory can't be reclaimed. + // + // Supported only on Unix via mlock/munlock syscalls. + Mlock bool + + logger Logger + + path string + openFile func(string, int, os.FileMode) (*os.File, error) + file *os.File + // `dataref` isn't used at all on Windows, and the golangci-lint + // always fails on Windows platform. + //nolint + dataref []byte // mmap'ed readonly, write throws SEGV + data *[maxMapSize]byte + datasz int + meta0 *common.Meta + meta1 *common.Meta + pageSize int + opened bool + rwtx *Tx + txs []*Tx + + freelist fl.Interface + freelistLoad sync.Once + + pagePool sync.Pool + + batchMu sync.Mutex + batch *batch + + rwlock sync.Mutex // Allows only one writer at a time. + metalock sync.Mutex // Protects meta page access. + mmaplock sync.RWMutex // Protects mmap access during remapping. + statlock sync.RWMutex // Protects stats access. + + ops struct { + writeAt func(b []byte, off int64) (n int, err error) + } + + // Read only mode. + // When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately. + readOnly bool +} + +// Path returns the path to currently open database file. 
+func (db *DB) Path() string { + return db.path +} + +// GoString returns the Go string representation of the database. +func (db *DB) GoString() string { + return fmt.Sprintf("bolt.DB{path:%q}", db.path) +} + +// String returns the string representation of the database. +func (db *DB) String() string { + return fmt.Sprintf("DB<%q>", db.path) +} + +// Open creates and opens a database at the given path with a given file mode. +// If the file does not exist then it will be created automatically with a given file mode. +// Passing in nil options will cause Bolt to open the database with the default options. +// Note: For read/write transactions, ensure the owner has write permission on the created/opened database file, e.g. 0600 +func Open(path string, mode os.FileMode, options *Options) (db *DB, err error) { + db = &DB{ + opened: true, + } + + // Set default options if no options are provided. + if options == nil { + options = DefaultOptions + } + db.NoSync = options.NoSync + db.NoGrowSync = options.NoGrowSync + db.MmapFlags = options.MmapFlags + db.NoFreelistSync = options.NoFreelistSync + db.PreLoadFreelist = options.PreLoadFreelist + db.FreelistType = options.FreelistType + db.Mlock = options.Mlock + db.MaxSize = options.MaxSize + + // Set default values for later DB operations. 
+ db.MaxBatchSize = common.DefaultMaxBatchSize + db.MaxBatchDelay = common.DefaultMaxBatchDelay + db.AllocSize = common.DefaultAllocSize + + if options.Logger == nil { + db.logger = getDiscardLogger() + } else { + db.logger = options.Logger + } + + lg := db.Logger() + if lg != discardLogger { + lg.Infof("Opening db file (%s) with mode %s and with options: %s", path, mode, options) + defer func() { + if err != nil { + lg.Errorf("Opening bbolt db (%s) failed: %v", path, err) + } else { + lg.Infof("Opening bbolt db (%s) successfully", path) + } + }() + } + + flag := os.O_RDWR + if options.ReadOnly { + flag = os.O_RDONLY + db.readOnly = true + } else { + // always load free pages in write mode + db.PreLoadFreelist = true + flag |= os.O_CREATE + } + + db.openFile = options.OpenFile + if db.openFile == nil { + db.openFile = os.OpenFile + } + + // Open data file and separate sync handler for metadata writes. + if db.file, err = db.openFile(path, flag, mode); err != nil { + _ = db.close() + lg.Errorf("failed to open db file (%s): %v", path, err) + return nil, err + } + db.path = db.file.Name() + + // Lock file so that other processes using Bolt in read-write mode cannot + // use the database at the same time. This would cause corruption since + // the two processes would write meta pages and free pages separately. + // The database file is locked exclusively (only one process can grab the lock) + // if !options.ReadOnly. + // The database file is locked using the shared lock (more than one process may + // hold a lock at the same time) otherwise (options.ReadOnly is set). + if err = flock(db, !db.readOnly, options.Timeout); err != nil { + _ = db.close() + lg.Errorf("failed to lock db file (%s), readonly: %t, error: %v", path, db.readOnly, err) + return nil, err + } + + // Default values for test hooks + db.ops.writeAt = db.file.WriteAt + + if db.pageSize = options.PageSize; db.pageSize == 0 { + // Set the default page size to the OS page size. 
+ db.pageSize = common.DefaultPageSize + } + + // Initialize the database if it doesn't exist. + if info, statErr := db.file.Stat(); statErr != nil { + _ = db.close() + lg.Errorf("failed to get db file's stats (%s): %v", path, err) + return nil, statErr + } else if info.Size() == 0 { + // Initialize new files with meta pages. + if err = db.init(); err != nil { + // clean up file descriptor on initialization fail + _ = db.close() + lg.Errorf("failed to initialize db file (%s): %v", path, err) + return nil, err + } + } else { + // try to get the page size from the metadata pages + if db.pageSize, err = db.getPageSize(); err != nil { + _ = db.close() + lg.Errorf("failed to get page size from db file (%s): %v", path, err) + return nil, err + } + } + + // Initialize page pool. + db.pagePool = sync.Pool{ + New: func() interface{} { + return make([]byte, db.pageSize) + }, + } + + // Memory map the data file. + if err = db.mmap(options.InitialMmapSize); err != nil { + _ = db.close() + lg.Errorf("failed to map db file (%s): %v", path, err) + return nil, err + } + + if db.PreLoadFreelist { + db.loadFreelist() + } + + if db.readOnly { + return db, nil + } + + // Flush freelist when transitioning from no sync to sync so + // NoFreelistSync unaware boltdb can open the db later. + if !db.NoFreelistSync && !db.hasSyncedFreelist() { + tx, txErr := db.Begin(true) + if tx != nil { + txErr = tx.Commit() + } + if txErr != nil { + lg.Errorf("starting readwrite transaction failed: %v", txErr) + _ = db.close() + return nil, txErr + } + } + + // Mark the database as opened and return. + return db, nil +} + +// getPageSize reads the pageSize from the meta pages. It tries +// to read the first meta page firstly. If the first page is invalid, +// then it tries to read the second page using the default page size. +func (db *DB) getPageSize() (int, error) { + var ( + meta0CanRead, meta1CanRead bool + ) + + // Read the first meta page to determine the page size. 
+ if pgSize, canRead, err := db.getPageSizeFromFirstMeta(); err != nil { + // We cannot read the page size from page 0, but can read page 0. + meta0CanRead = canRead + } else { + return pgSize, nil + } + + // Read the second meta page to determine the page size. + if pgSize, canRead, err := db.getPageSizeFromSecondMeta(); err != nil { + // We cannot read the page size from page 1, but can read page 1. + meta1CanRead = canRead + } else { + return pgSize, nil + } + + // If we can't read the page size from both pages, but can read + // either page, then we assume it's the same as the OS or the one + // given, since that's how the page size was chosen in the first place. + // + // If both pages are invalid, and (this OS uses a different page size + // from what the database was created with or the given page size is + // different from what the database was created with), then we are out + // of luck and cannot access the database. + if meta0CanRead || meta1CanRead { + return db.pageSize, nil + } + + return 0, berrors.ErrInvalid +} + +// getPageSizeFromFirstMeta reads the pageSize from the first meta page +func (db *DB) getPageSizeFromFirstMeta() (int, bool, error) { + var buf [0x1000]byte + var metaCanRead bool + if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { + metaCanRead = true + if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil { + return int(m.PageSize()), metaCanRead, nil + } + } + return 0, metaCanRead, berrors.ErrInvalid +} + +// getPageSizeFromSecondMeta reads the pageSize from the second meta page +func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) { + var ( + fileSize int64 + metaCanRead bool + ) + + // get the db file size + if info, err := db.file.Stat(); err != nil { + return 0, metaCanRead, err + } else { + fileSize = info.Size() + } + + // We need to read the second meta page, so we should skip the first page; + // but we don't know the exact page size yet, it's chicken & egg problem. 
+ // The solution is to try all the possible page sizes, which starts from 1KB + // and until 16MB (1024<<14) or the end of the db file + // + // TODO: should we support larger page size? + for i := 0; i <= 14; i++ { + var buf [0x1000]byte + var pos int64 = 1024 << uint(i) + if pos >= fileSize-1024 { + break + } + bw, err := db.file.ReadAt(buf[:], pos) + if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) { + metaCanRead = true + if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil { + return int(m.PageSize()), metaCanRead, nil + } + } + } + + return 0, metaCanRead, berrors.ErrInvalid +} + +// loadFreelist reads the freelist if it is synced, or reconstructs it +// by scanning the DB if it is not synced. It assumes there are no +// concurrent accesses being made to the freelist. +func (db *DB) loadFreelist() { + db.freelistLoad.Do(func() { + db.freelist = newFreelist(db.FreelistType) + if !db.hasSyncedFreelist() { + // Reconstruct free list by scanning the DB. + db.freelist.Init(db.freepages()) + } else { + // Read free list from freelist page. + db.freelist.Read(db.page(db.meta().Freelist())) + } + db.stats.FreePageN = db.freelist.FreeCount() + }) +} + +func (db *DB) hasSyncedFreelist() bool { + return db.meta().Freelist() != common.PgidNoFreelist +} + +func (db *DB) fileSize() (int, error) { + info, err := db.file.Stat() + if err != nil { + return 0, fmt.Errorf("file stat error: %w", err) + } + sz := int(info.Size()) + if sz < db.pageSize*2 { + return 0, fmt.Errorf("file size too small %d", sz) + } + return sz, nil +} + +// mmap opens the underlying memory-mapped file and initializes the meta references. +// minsz is the minimum size that the new mmap can be. +func (db *DB) mmap(minsz int) (err error) { + db.mmaplock.Lock() + defer db.mmaplock.Unlock() + + lg := db.Logger() + + // Ensure the size is at least the minimum size. 
+ var fileSize int + fileSize, err = db.fileSize() + if err != nil { + lg.Errorf("getting file size failed: %w", err) + return err + } + var size = fileSize + if size < minsz { + size = minsz + } + size, err = db.mmapSize(size) + if err != nil { + lg.Errorf("getting map size failed: %w", err) + return err + } + + if db.Mlock { + // Unlock db memory + if err := db.munlock(fileSize); err != nil { + return err + } + } + + // Dereference all mmap references before unmapping. + if db.rwtx != nil { + db.rwtx.root.dereference() + } + + // Unmap existing data before continuing. + if err = db.munmap(); err != nil { + return err + } + + // Memory-map the data file as a byte slice. + // gofail: var mapError string + // return errors.New(mapError) + if err = mmap(db, size); err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] mmap failed, size: %d, error: %v", runtime.GOOS, runtime.GOARCH, size, err) + return err + } + + // Perform unmmap on any error to reset all data fields: + // dataref, data, datasz, meta0 and meta1. + defer func() { + if err != nil { + if unmapErr := db.munmap(); unmapErr != nil { + err = fmt.Errorf("%w; rollback unmap also failed: %v", err, unmapErr) + } + } + }() + + if db.Mlock { + // Don't allow swapping of data file + if err := db.mlock(fileSize); err != nil { + return err + } + } + + // Save references to the meta pages. + db.meta0 = db.page(0).Meta() + db.meta1 = db.page(1).Meta() + + // Validate the meta pages. We only return an error if both meta pages fail + // validation, since meta0 failing validation means that it wasn't saved + // properly -- but we can recover using meta1. And vice-versa. 
+ err0 := db.meta0.Validate() + err1 := db.meta1.Validate() + if err0 != nil && err1 != nil { + lg.Errorf("both meta pages are invalid, meta0: %v, meta1: %v", err0, err1) + return err0 + } + + return nil +} + +func (db *DB) invalidate() { + db.dataref = nil + db.data = nil + db.datasz = 0 + + db.meta0 = nil + db.meta1 = nil +} + +// munmap unmaps the data file from memory. +func (db *DB) munmap() error { + defer db.invalidate() + + // gofail: var unmapError string + // return errors.New(unmapError) + if err := munmap(db); err != nil { + db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munmap failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err) + return fmt.Errorf("unmap error: %v", err.Error()) + } + + return nil +} + +// mmapSize determines the appropriate size for the mmap given the current size +// of the database. The minimum size is 32KB and doubles until it reaches 1GB. +// Returns an error if the new mmap size is greater than the max allowed. +func (db *DB) mmapSize(size int) (int, error) { + // Double the size from 32KB until 1GB. + for i := uint(15); i <= 30; i++ { + if size <= 1< maxMapSize { + return 0, errors.New("mmap too large") + } + + // If larger than 1GB then grow by 1GB at a time. + sz := int64(size) + if remainder := sz % int64(common.MaxMmapStep); remainder > 0 { + sz += int64(common.MaxMmapStep) - remainder + } + + // Ensure that the mmap size is a multiple of the page size. + // This should always be true since we're incrementing in MBs. + pageSize := int64(db.pageSize) + if (sz % pageSize) != 0 { + sz = ((sz / pageSize) + 1) * pageSize + } + + // If we've exceeded the max size then only grow up to the max size. 
+ if sz > maxMapSize { + sz = maxMapSize + } + + return int(sz), nil +} + +func (db *DB) munlock(fileSize int) error { + // gofail: var munlockError string + // return errors.New(munlockError) + if err := munlock(db, fileSize); err != nil { + db.Logger().Errorf("[GOOS: %s, GOARCH: %s] munlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err) + return fmt.Errorf("munlock error: %v", err.Error()) + } + return nil +} + +func (db *DB) mlock(fileSize int) error { + // gofail: var mlockError string + // return errors.New(mlockError) + if err := mlock(db, fileSize); err != nil { + db.Logger().Errorf("[GOOS: %s, GOARCH: %s] mlock failed, fileSize: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, fileSize, db.datasz, err) + return fmt.Errorf("mlock error: %v", err.Error()) + } + return nil +} + +func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error { + if err := db.munlock(fileSizeFrom); err != nil { + return err + } + if err := db.mlock(fileSizeTo); err != nil { + return err + } + return nil +} + +// init creates a new database file and initializes its meta pages. +func (db *DB) init() error { + // Create two meta pages on a buffer. + buf := make([]byte, db.pageSize*4) + for i := 0; i < 2; i++ { + p := db.pageInBuffer(buf, common.Pgid(i)) + p.SetId(common.Pgid(i)) + p.SetFlags(common.MetaPageFlag) + + // Initialize the meta page. + m := p.Meta() + m.SetMagic(common.Magic) + m.SetVersion(common.Version) + m.SetPageSize(uint32(db.pageSize)) + m.SetFreelist(2) + m.SetRootBucket(common.NewInBucket(3, 0)) + m.SetPgid(4) + m.SetTxid(common.Txid(i)) + m.SetChecksum(m.Sum64()) + } + + // Write an empty freelist at page 3. + p := db.pageInBuffer(buf, common.Pgid(2)) + p.SetId(2) + p.SetFlags(common.FreelistPageFlag) + p.SetCount(0) + + // Write an empty leaf page at page 4. 
+ p = db.pageInBuffer(buf, common.Pgid(3)) + p.SetId(3) + p.SetFlags(common.LeafPageFlag) + p.SetCount(0) + + // Write the buffer to our data file. + if _, err := db.ops.writeAt(buf, 0); err != nil { + db.Logger().Errorf("writeAt failed: %w", err) + return err + } + if err := fdatasync(db); err != nil { + db.Logger().Errorf("[GOOS: %s, GOARCH: %s] fdatasync failed: %w", runtime.GOOS, runtime.GOARCH, err) + return err + } + + return nil +} + +// Close releases all database resources. +// It will block waiting for any open transactions to finish +// before closing the database and returning. +func (db *DB) Close() error { + db.rwlock.Lock() + defer db.rwlock.Unlock() + + db.metalock.Lock() + defer db.metalock.Unlock() + + db.mmaplock.Lock() + defer db.mmaplock.Unlock() + + return db.close() +} + +func (db *DB) close() error { + if !db.opened { + return nil + } + + db.opened = false + + db.freelist = nil + + // Clear ops. + db.ops.writeAt = nil + + var errs []error + // Close the mmap. + if err := db.munmap(); err != nil { + errs = append(errs, err) + } + + // Close file handles. + if db.file != nil { + // No need to unlock read-only file. + if !db.readOnly { + // Unlock the file. + if err := funlock(db); err != nil { + errs = append(errs, fmt.Errorf("bolt.Close(): funlock error: %w", err)) + } + } + + // Close the file descriptor. + if err := db.file.Close(); err != nil { + errs = append(errs, fmt.Errorf("db file close: %w", err)) + } + db.file = nil + } + + db.path = "" + + if len(errs) > 0 { + return errs[0] + } + return nil +} + +// Begin starts a new transaction. +// Multiple read-only transactions can be used concurrently but only one +// write transaction can be used at a time. Starting multiple write transactions +// will cause the calls to block and be serialized until the current write +// transaction finishes. +// +// Transactions should not be dependent on one another. 
// Opening a read transaction and a write transaction in the same goroutine
// can cause the writer to deadlock because the database periodically needs
// to re-mmap itself as it grows and it cannot do that while a read
// transaction is open.
//
// If a long running read transaction (for example, a snapshot transaction) is
// needed, you might want to set DB.InitialMmapSize to a large enough value
// to avoid potential blocking of write transaction.
//
// IMPORTANT: You must close read-only transactions after you are finished or
// else the database will not reclaim old pages.
func (db *DB) Begin(writable bool) (t *Tx, err error) {
	// Only pay for the debug/error logging when a real logger is installed.
	if lg := db.Logger(); lg != discardLogger {
		lg.Debugf("Starting a new transaction [writable: %t]", writable)
		// err is the named result, so the deferred closure sees the final outcome.
		defer func() {
			if err != nil {
				lg.Errorf("Starting a new transaction [writable: %t] failed: %v", writable, err)
			} else {
				lg.Debugf("Starting a new transaction [writable: %t] successfully", writable)
			}
		}()
	}

	if writable {
		return db.beginRWTx()
	}
	return db.beginTx()
}

// Logger returns the logger configured on db. When db is nil or has no logger
// set, the discard logger is returned instead, so the result is never nil.
func (db *DB) Logger() Logger {
	if db == nil || db.logger == nil {
		return getDiscardLogger()
	}
	return db.logger
}

// beginTx starts a read-only transaction. It returns ErrDatabaseNotOpen when
// the database is not open and ErrInvalidMapping when no data is mapped. On
// success the mmap read lock stays held; removeTx releases it.
func (db *DB) beginTx() (*Tx, error) {
	// Lock the meta pages while we initialize the transaction. We obtain
	// the meta lock before the mmap lock because that's the order that the
	// write transaction will obtain them.
	db.metalock.Lock()

	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
	// obtain a write lock so all transactions must finish before it can be
	// remapped.
	db.mmaplock.RLock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.mmaplock.RUnlock()
		db.metalock.Unlock()
		return nil, berrors.ErrDatabaseNotOpen
	}

	// Exit if the database is not correctly mapped.
	if db.data == nil {
		db.mmaplock.RUnlock()
		db.metalock.Unlock()
		return nil, berrors.ErrInvalidMapping
	}

	// Create a transaction associated with the database.
	t := &Tx{}
	t.init(db)

	// Keep track of transaction until it closes.
	db.txs = append(db.txs, t)
	n := len(db.txs)
	if db.freelist != nil {
		db.freelist.AddReadonlyTXID(t.meta.Txid())
	}

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Update the transaction stats.
	db.statlock.Lock()
	db.stats.TxN++
	db.stats.OpenTxN = n
	db.statlock.Unlock()

	return t, nil
}

// beginRWTx starts the read-write transaction. It returns ErrDatabaseReadOnly
// for a read-only database, and ErrDatabaseNotOpen or ErrInvalidMapping (after
// releasing the writer lock) when the database is not usable. On success the
// writer lock stays held and the new transaction is recorded in db.rwtx.
func (db *DB) beginRWTx() (*Tx, error) {
	// If the database was opened with Options.ReadOnly, return an error.
	if db.readOnly {
		return nil, berrors.ErrDatabaseReadOnly
	}

	// Obtain writer lock. This is released by the transaction when it closes.
	// This enforces only one writer transaction at a time.
	db.rwlock.Lock()

	// Once we have the writer lock then we can lock the meta pages so that
	// we can set up the transaction.
	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.rwlock.Unlock()
		return nil, berrors.ErrDatabaseNotOpen
	}

	// Exit if the database is not correctly mapped.
	if db.data == nil {
		db.rwlock.Unlock()
		return nil, berrors.ErrInvalidMapping
	}

	// Create a transaction associated with the database.
	t := &Tx{writable: true}
	t.init(db)
	db.rwtx = t
	db.freelist.ReleasePendingPages()
	return t, nil
}

// removeTx removes a transaction from the database.
// It releases the mmap read lock taken in beginTx, drops tx from db.txs and
// folds the transaction's stats into the database stats.
func (db *DB) removeTx(tx *Tx) {
	// Release the read lock on the mmap.
	db.mmaplock.RUnlock()

	// Use the meta lock to restrict access to the DB object.
	db.metalock.Lock()

	// Remove the transaction.
	// Order of db.txs is not preserved: the last element is swapped into the
	// freed slot and the vacated tail entry is cleared before truncating.
	for i, t := range db.txs {
		if t == tx {
			last := len(db.txs) - 1
			db.txs[i] = db.txs[last]
			db.txs[last] = nil
			db.txs = db.txs[:last]
			break
		}
	}
	n := len(db.txs)
	if db.freelist != nil {
		db.freelist.RemoveReadonlyTXID(tx.meta.Txid())
	}

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Merge statistics.
	db.statlock.Lock()
	db.stats.OpenTxN = n
	db.stats.TxStats.add(&tx.stats)
	db.statlock.Unlock()
}

// Update executes a function within the context of a read-write managed transaction.
// If no error is returned from the function then the transaction is committed.
// If an error is returned then the entire transaction is rolled back.
// Any error that is returned from the function or returned from the commit is
// returned from the Update() method.
//
// Attempting to manually commit or rollback within the function will cause a panic.
func (db *DB) Update(fn func(*Tx) error) error {
	t, err := db.Begin(true)
	if err != nil {
		return err
	}

	// Make sure the transaction rolls back in the event of a panic.
	// t.db is still set only if the transaction was never closed.
	defer func() {
		if t.db != nil {
			t.rollback()
		}
	}()

	// Mark as a managed tx so that the inner function cannot manually commit.
	t.managed = true

	// If an error is returned from the function then rollback and return error.
	err = fn(t)
	t.managed = false
	if err != nil {
		// The function's error takes precedence over any rollback error.
		_ = t.Rollback()
		return err
	}

	return t.Commit()
}

// View executes a function within the context of a managed read-only transaction.
// Any error that is returned from the function is returned from the View() method.
//
// Attempting to manually rollback within the function will cause a panic.
func (db *DB) View(fn func(*Tx) error) error {
	t, err := db.Begin(false)
	if err != nil {
		return err
	}

	// Make sure the transaction rolls back in the event of a panic.
	defer func() {
		if t.db != nil {
			t.rollback()
		}
	}()

	// Mark as a managed tx so that the inner function cannot manually rollback.
	t.managed = true

	// If an error is returned from the function then pass it through.
	err = fn(t)
	t.managed = false
	if err != nil {
		// The function's error takes precedence over any rollback error.
		_ = t.Rollback()
		return err
	}

	// A read-only transaction is always rolled back; on the success path the
	// rollback error (if any) is what the caller sees.
	return t.Rollback()
}

// Batch calls fn as part of a batch. It behaves similar to Update,
// except:
//
// 1.
concurrent Batch calls can be combined into a single Bolt +// transaction. +// +// 2. the function passed to Batch may be called multiple times, +// regardless of whether it returns error or not. +// +// This means that Batch function side effects must be idempotent and +// take permanent effect only after a successful return is seen in +// caller. +// +// The maximum batch size and delay can be adjusted with DB.MaxBatchSize +// and DB.MaxBatchDelay, respectively. +// +// Batch is only useful when there are multiple goroutines calling it. +func (db *DB) Batch(fn func(*Tx) error) error { + errCh := make(chan error, 1) + + db.batchMu.Lock() + if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) { + // There is no existing batch, or the existing batch is full; start a new one. + db.batch = &batch{ + db: db, + } + db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger) + } + db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh}) + if len(db.batch.calls) >= db.MaxBatchSize { + // wake up batch, it's ready to run + go db.batch.trigger() + } + db.batchMu.Unlock() + + err := <-errCh + if err == trySolo { + err = db.Update(fn) + } + return err +} + +type call struct { + fn func(*Tx) error + err chan<- error +} + +type batch struct { + db *DB + timer *time.Timer + start sync.Once + calls []call +} + +// trigger runs the batch if it hasn't already been run. +func (b *batch) trigger() { + b.start.Do(b.run) +} + +// run performs the transactions in the batch and communicates results +// back to DB.Batch. +func (b *batch) run() { + b.db.batchMu.Lock() + b.timer.Stop() + // Make sure no new work is added to this batch, but don't break + // other batches. 
+ if b.db.batch == b { + b.db.batch = nil + } + b.db.batchMu.Unlock() + +retry: + for len(b.calls) > 0 { + var failIdx = -1 + err := b.db.Update(func(tx *Tx) error { + for i, c := range b.calls { + if err := safelyCall(c.fn, tx); err != nil { + failIdx = i + return err + } + } + return nil + }) + + if failIdx >= 0 { + // take the failing transaction out of the batch. it's + // safe to shorten b.calls here because db.batch no longer + // points to us, and we hold the mutex anyway. + c := b.calls[failIdx] + b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1] + // tell the submitter re-run it solo, continue with the rest of the batch + c.err <- trySolo + continue retry + } + + // pass success, or bolt internal errors, to all callers + for _, c := range b.calls { + c.err <- err + } + break retry + } +} + +// trySolo is a special sentinel error value used for signaling that a +// transaction function should be re-run. It should never be seen by +// callers. +var trySolo = errors.New("batch function returned an error and should be re-run solo") + +type panicked struct { + reason interface{} +} + +func (p panicked) Error() string { + if err, ok := p.reason.(error); ok { + return err.Error() + } + return fmt.Sprintf("panic: %v", p.reason) +} + +func safelyCall(fn func(*Tx) error, tx *Tx) (err error) { + defer func() { + if p := recover(); p != nil { + err = panicked{p} + } + }() + return fn(tx) +} + +// Sync executes fdatasync() against the database file handle. +// +// This is not necessary under normal operation, however, if you use NoSync +// then it allows you to force the database file to sync against the disk. 
+func (db *DB) Sync() (err error) { + if lg := db.Logger(); lg != discardLogger { + lg.Debugf("Syncing bbolt db (%s)", db.path) + defer func() { + if err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] syncing bbolt db (%s) failed: %v", runtime.GOOS, runtime.GOARCH, db.path, err) + } else { + lg.Debugf("Syncing bbolt db (%s) successfully", db.path) + } + }() + } + + return fdatasync(db) +} + +// Stats retrieves ongoing performance stats for the database. +// This is only updated when a transaction closes. +func (db *DB) Stats() Stats { + db.statlock.RLock() + defer db.statlock.RUnlock() + return db.stats +} + +// This is for internal access to the raw data bytes from the C cursor, use +// carefully, or not at all. +func (db *DB) Info() *Info { + common.Assert(db.data != nil, "database file isn't correctly mapped") + return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} +} + +// page retrieves a page reference from the mmap based on the current page size. +func (db *DB) page(id common.Pgid) *common.Page { + pos := id * common.Pgid(db.pageSize) + return (*common.Page)(unsafe.Pointer(&db.data[pos])) +} + +// pageInBuffer retrieves a page reference from a given byte array based on the current page size. +func (db *DB) pageInBuffer(b []byte, id common.Pgid) *common.Page { + return (*common.Page)(unsafe.Pointer(&b[id*common.Pgid(db.pageSize)])) +} + +// meta retrieves the current meta page reference. +func (db *DB) meta() *common.Meta { + // We have to return the meta with the highest txid which doesn't fail + // validation. Otherwise, we can cause errors when in fact the database is + // in a consistent state. metaA is the one with the higher txid. + metaA := db.meta0 + metaB := db.meta1 + if db.meta1.Txid() > db.meta0.Txid() { + metaA = db.meta1 + metaB = db.meta0 + } + + // Use higher meta page if valid. Otherwise, fallback to previous, if valid. 
+ if err := metaA.Validate(); err == nil { + return metaA + } else if err := metaB.Validate(); err == nil { + return metaB + } + + // This should never be reached, because both meta1 and meta0 were validated + // on mmap() and we do fsync() on every write. + panic("bolt.DB.meta(): invalid meta pages") +} + +// allocate returns a contiguous block of memory starting at a given page. +func (db *DB) allocate(txid common.Txid, count int) (*common.Page, error) { + // Allocate a temporary buffer for the page. + var buf []byte + if count == 1 { + buf = db.pagePool.Get().([]byte) + } else { + buf = make([]byte, count*db.pageSize) + } + p := (*common.Page)(unsafe.Pointer(&buf[0])) + p.SetOverflow(uint32(count - 1)) + + // Use pages from the freelist if they are available. + p.SetId(db.freelist.Allocate(txid, count)) + if p.Id() != 0 { + return p, nil + } + + // Resize mmap() if we're at the end. + p.SetId(db.rwtx.meta.Pgid()) + var minsz = int((p.Id()+common.Pgid(count))+1) * db.pageSize + if minsz >= db.datasz { + if err := db.mmap(minsz); err != nil { + if err == berrors.ErrMaxSizeReached { + return nil, err + } else { + return nil, fmt.Errorf("mmap allocate error: %s", err) + } + } + } + + // Move the page id high water mark. + curPgid := db.rwtx.meta.Pgid() + db.rwtx.meta.SetPgid(curPgid + common.Pgid(count)) + + return p, nil +} + +// grow grows the size of the database to the given sz. +func (db *DB) grow(sz int) error { + // Ignore if the new size is less than available file size. + lg := db.Logger() + fileSize, err := db.fileSize() + if err != nil { + lg.Errorf("getting file size failed: %w", err) + return err + } + if sz <= fileSize { + return nil + } + + // If the data is smaller than the alloc size then only allocate what's needed. + // Once it goes over the allocation size then allocate in chunks. 
+ if db.datasz <= db.AllocSize { + sz = db.datasz + } else { + sz += db.AllocSize + } + + if !db.readOnly && db.MaxSize > 0 && sz > db.MaxSize { + lg.Errorf("[GOOS: %s, GOARCH: %s] maximum db size reached, size: %d, db.MaxSize: %d", runtime.GOOS, runtime.GOARCH, sz, db.MaxSize) + return berrors.ErrMaxSizeReached + } + + // Truncate and fsync to ensure file size metadata is flushed. + // https://github.com/boltdb/bolt/issues/284 + if !db.NoGrowSync && !db.readOnly { + if runtime.GOOS != "windows" { + // gofail: var resizeFileError string + // return errors.New(resizeFileError) + if err := db.file.Truncate(int64(sz)); err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] truncating file failed, size: %d, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, sz, db.datasz, err) + return fmt.Errorf("file resize error: %s", err) + } + } + if err := db.file.Sync(); err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] syncing file failed, db.datasz: %d, error: %v", runtime.GOOS, runtime.GOARCH, db.datasz, err) + return fmt.Errorf("file sync error: %s", err) + } + if db.Mlock { + // unlock old file and lock new one + if err := db.mrelock(fileSize, sz); err != nil { + return fmt.Errorf("mlock/munlock error: %s", err) + } + } + } + + return nil +} + +func (db *DB) IsReadOnly() bool { + return db.readOnly +} + +func (db *DB) freepages() []common.Pgid { + tx, err := db.beginTx() + defer func() { + err = tx.Rollback() + if err != nil { + panic("freepages: failed to rollback tx") + } + }() + if err != nil { + panic("freepages: failed to open read only tx") + } + + reachable := make(map[common.Pgid]*common.Page) + nofreed := make(map[common.Pgid]bool) + ech := make(chan error) + go func() { + for e := range ech { + panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e)) + } + }() + tx.recursivelyCheckBucket(&tx.root, reachable, nofreed, HexKVStringer(), ech) + close(ech) + + // TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the 
pages. + + var fids []common.Pgid + for i := common.Pgid(2); i < db.meta().Pgid(); i++ { + if _, ok := reachable[i]; !ok { + fids = append(fids, i) + } + } + return fids +} + +func newFreelist(freelistType FreelistType) fl.Interface { + if freelistType == FreelistMapType { + return fl.NewHashMapFreelist() + } + return fl.NewArrayFreelist() +} + +// Options represents the options that can be set when opening a database. +type Options struct { + // Timeout is the amount of time to wait to obtain a file lock. + // When set to zero it will wait indefinitely. + Timeout time.Duration + + // Sets the DB.NoGrowSync flag before memory mapping the file. + NoGrowSync bool + + // Do not sync freelist to disk. This improves the database write performance + // under normal operation, but requires a full database re-sync during recovery. + NoFreelistSync bool + + // PreLoadFreelist sets whether to load the free pages when opening + // the db file. Note when opening db in write mode, bbolt will always + // load the free pages. + PreLoadFreelist bool + + // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures + // dramatic performance degradation if database is large and fragmentation in freelist is common. + // The alternative one is using hashmap, it is faster in almost all circumstances + // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. + // The default type is array + FreelistType FreelistType + + // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to + // grab a shared lock (UNIX). + ReadOnly bool + + // Sets the DB.MmapFlags flag before memory mapping the file. + MmapFlags int + + // InitialMmapSize is the initial mmap size of the database + // in bytes. Read transactions won't block write transaction + // if the InitialMmapSize is large enough to hold database mmap + // size. (See DB.Begin for more information) + // + // If <=0, the initial map size is 0. 
+ // If initialMmapSize is smaller than the previous database size, + // it takes no effect. + // + // Note: On Windows, due to platform limitations, the database file size + // will be immediately resized to match `InitialMmapSize` (aligned to page size) + // when the DB is opened. On non-Windows platforms, the file size will grow + // dynamically based on the actual amount of written data, regardless of `InitialMmapSize`. + // Refer to https://github.com/etcd-io/bbolt/issues/378#issuecomment-1378121966. + InitialMmapSize int + + // PageSize overrides the default OS page size. + PageSize int + + // MaxSize sets the maximum size of the data file. <=0 means no maximum. + MaxSize int + + // NoSync sets the initial value of DB.NoSync. Normally this can just be + // set directly on the DB itself when returned from Open(), but this option + // is useful in APIs which expose Options but not the underlying DB. + NoSync bool + + // OpenFile is used to open files. It defaults to os.OpenFile. This option + // is useful for writing hermetic tests. + OpenFile func(string, int, os.FileMode) (*os.File, error) + + // Mlock locks database file in memory when set to true. + // It prevents potential page faults, however + // used memory can't be reclaimed. (UNIX only) + Mlock bool + + // Logger is the logger used for bbolt. + Logger Logger +} + +func (o *Options) String() string { + if o == nil { + return "{}" + } + + return fmt.Sprintf("{Timeout: %s, NoGrowSync: %t, NoFreelistSync: %t, PreLoadFreelist: %t, FreelistType: %s, ReadOnly: %t, MmapFlags: %x, InitialMmapSize: %d, PageSize: %d, MaxSize: %d, NoSync: %t, OpenFile: %p, Mlock: %t, Logger: %p}", + o.Timeout, o.NoGrowSync, o.NoFreelistSync, o.PreLoadFreelist, o.FreelistType, o.ReadOnly, o.MmapFlags, o.InitialMmapSize, o.PageSize, o.MaxSize, o.NoSync, o.OpenFile, o.Mlock, o.Logger) + +} + +// DefaultOptions represent the options used if nil options are passed into Open(). 
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
var DefaultOptions = &Options{
	Timeout:      0,
	NoGrowSync:   false,
	FreelistType: FreelistArrayType,
}

// Stats represents statistics about the database.
type Stats struct {
	// Put `TxStats` at the first field to ensure it's 64-bit aligned. Note
	// that the first word in an allocated struct can be relied upon to be
	// 64-bit aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG.
	// Also refer to discussion in https://github.com/etcd-io/bbolt/issues/577.
	TxStats TxStats // global, ongoing stats.

	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions
}

// Sub calculates and returns the difference between two sets of database stats.
// This is useful when obtaining stats at two different points in time and
// you need the performance counters that occurred within that time span.
func (s *Stats) Sub(other *Stats) Stats {
	// With nothing to subtract, the result is a copy of the receiver.
	if other == nil {
		return *s
	}
	var diff Stats
	// The freelist figures are copied from the receiver as they are, not
	// subtracted.
	diff.FreePageN = s.FreePageN
	diff.PendingPageN = s.PendingPageN
	diff.FreeAlloc = s.FreeAlloc
	diff.FreelistInuse = s.FreelistInuse
	// NOTE(review): OpenTxN is not carried into the result and is therefore
	// always zero here — confirm whether that is intentional.
	diff.TxN = s.TxN - other.TxN
	diff.TxStats = s.TxStats.Sub(&other.TxStats)
	return diff
}

// Info is the value returned by DB.Info: the address of the first mapped
// byte and the page size.
type Info struct {
	Data     uintptr
	PageSize int
}
diff --git a/db_test.go b/db_test.go
new file mode 100644
index 0000000..f70ab4a
--- /dev/null
+++ b/db_test.go
@@ -0,0 +1,1904 @@
package bbolt_test

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"hash/fnv"
	"log"
	"math/rand"
	"os"
	"path/filepath"
	"reflect"
	"runtime"
	"strings"
	"sync"
	"testing"
	"time"
	"unsafe"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	bolt "github.com/tutus-one/tutus-bolt"
	berrors "github.com/tutus-one/tutus-bolt/errors"
	"github.com/tutus-one/tutus-bolt/internal/btesting"
)

// pageSize is the size of one page in the data file.
const pageSize = 4096

// pageHeaderSize is the size of a page header.
const pageHeaderSize = 16

// meta represents a simplified version of a database meta page for testing.
// Tests overlay it on raw file bytes through unsafe.Pointer, so the field
// order and sizes must not be changed. Only version and pgid are named;
// the blank fields are padding for the parts the tests do not touch.
type meta struct {
	_       uint32
	version uint32
	_       uint32
	_       uint32
	_       [16]byte
	_       uint64
	pgid    uint64
	_       uint64
	_       uint64
}

// Ensure that a database can be opened without error.
func TestOpen(t *testing.T) {
	path := tempfile()
	defer os.RemoveAll(path)

	db, err := bolt.Open(path, 0600, nil)
	if err != nil {
		t.Fatal(err)
	} else if db == nil {
		t.Fatal("expected db")
	}

	// The opened database must report the path it was opened with.
	if s := db.Path(); s != path {
		t.Fatalf("unexpected path: %s", s)
	}

	if err := db.Close(); err != nil {
		t.Fatal(err)
	}
}

// Regression validation for https://github.com/etcd-io/bbolt/pull/122.
// Tests multiple goroutines simultaneously opening a database.
+func TestOpen_MultipleGoroutines(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode") + } + + const ( + instances = 30 + iterations = 30 + ) + path := tempfile() + defer os.RemoveAll(path) + var wg sync.WaitGroup + errCh := make(chan error, iterations*instances) + for iteration := 0; iteration < iterations; iteration++ { + for instance := 0; instance < instances; instance++ { + wg.Add(1) + go func() { + defer wg.Done() + db, err := bolt.Open(path, 0600, nil) + if err != nil { + errCh <- err + return + } + if err := db.Close(); err != nil { + errCh <- err + return + } + }() + } + wg.Wait() + } + close(errCh) + for err := range errCh { + if err != nil { + t.Fatalf("error from inside goroutine: %v", err) + } + } +} + +// Ensure that opening a database with a blank path returns an error. +func TestOpen_ErrPathRequired(t *testing.T) { + _, err := bolt.Open("", 0600, nil) + if err == nil { + t.Fatalf("expected error") + } +} + +// Ensure that opening a database with a bad path returns an error. +func TestOpen_ErrNotExists(t *testing.T) { + _, err := bolt.Open(filepath.Join(tempfile(), "bad-path"), 0600, nil) + if err == nil { + t.Fatal("expected error") + } +} + +// Ensure that opening a file that is not a Bolt database returns ErrInvalid. +func TestOpen_ErrInvalid(t *testing.T) { + path := tempfile() + defer os.RemoveAll(path) + + f, err := os.Create(path) + if err != nil { + t.Fatal(err) + } + if _, err := fmt.Fprintln(f, "this is not a bolt database"); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + + if _, err := bolt.Open(path, 0600, nil); err != berrors.ErrInvalid { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that opening a file with two invalid versions returns ErrVersionMismatch. +func TestOpen_ErrVersionMismatch(t *testing.T) { + if pageSize != os.Getpagesize() { + t.Skip("page size mismatch") + } + + // Create empty database. 
+ db := btesting.MustCreateDB(t) + path := db.Path() + + // Close database. + if err := db.Close(); err != nil { + t.Fatal(err) + } + + // Read data file. + buf, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + + // Rewrite meta pages. + meta0 := (*meta)(unsafe.Pointer(&buf[pageHeaderSize])) + meta0.version++ + meta1 := (*meta)(unsafe.Pointer(&buf[pageSize+pageHeaderSize])) + meta1.version++ + if err := os.WriteFile(path, buf, 0666); err != nil { + t.Fatal(err) + } + + // Reopen data file. + if _, err := bolt.Open(path, 0600, nil); err != berrors.ErrVersionMismatch { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that opening a file with two invalid checksums returns ErrChecksum. +func TestOpen_ErrChecksum(t *testing.T) { + if pageSize != os.Getpagesize() { + t.Skip("page size mismatch") + } + + // Create empty database. + db := btesting.MustCreateDB(t) + path := db.Path() + + // Close database. + if err := db.Close(); err != nil { + t.Fatal(err) + } + + // Read data file. + buf, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + + // Rewrite meta pages. + meta0 := (*meta)(unsafe.Pointer(&buf[pageHeaderSize])) + meta0.pgid++ + meta1 := (*meta)(unsafe.Pointer(&buf[pageSize+pageHeaderSize])) + meta1.pgid++ + if err := os.WriteFile(path, buf, 0666); err != nil { + t.Fatal(err) + } + + // Reopen data file. + if _, err := bolt.Open(path, 0600, nil); err != berrors.ErrChecksum { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that it can read the page size from the second meta page if the first one is invalid. +// The page size is expected to be the OS's page size in this case. +func TestOpen_ReadPageSize_FromMeta1_OS(t *testing.T) { + // Create empty database. + db := btesting.MustCreateDB(t) + path := db.Path() + // Close the database + db.MustClose() + + // Read data file. + buf, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + + // Rewrite first meta page. 
+ meta0 := (*meta)(unsafe.Pointer(&buf[pageHeaderSize])) + meta0.pgid++ + if err := os.WriteFile(path, buf, 0666); err != nil { + t.Fatal(err) + } + + // Reopen data file. + db = btesting.MustOpenDBWithOption(t, path, nil) + require.Equalf(t, os.Getpagesize(), db.Info().PageSize, "check page size failed") +} + +// Ensure that it can read the page size from the second meta page if the first one is invalid. +// The page size is expected to be the given page size in this case. +func TestOpen_ReadPageSize_FromMeta1_Given(t *testing.T) { + // test page size from 1KB (1024<<0) to 16MB(1024<<14) + for i := 0; i <= 14; i++ { + givenPageSize := 1024 << uint(i) + t.Logf("Testing page size %d", givenPageSize) + // Create empty database. + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: givenPageSize}) + path := db.Path() + // Close the database + db.MustClose() + + // Read data file. + buf, err := os.ReadFile(path) + require.NoError(t, err) + + // Rewrite meta pages. + if i%3 == 0 { + t.Logf("#%d: Intentionally corrupt the first meta page for pageSize %d", i, givenPageSize) + meta0 := (*meta)(unsafe.Pointer(&buf[pageHeaderSize])) + meta0.pgid++ + err = os.WriteFile(path, buf, 0666) + require.NoError(t, err) + } + + // Reopen data file. + db = btesting.MustOpenDBWithOption(t, path, nil) + require.Equalf(t, givenPageSize, db.Info().PageSize, "check page size failed") + db.MustClose() + } +} + +// Ensure that opening a database does not increase its size. +// https://github.com/boltdb/bolt/issues/291 +func TestOpen_Size(t *testing.T) { + // Open a data file. + db := btesting.MustCreateDB(t) + + pagesize := db.Info().PageSize + + // Insert until we get above the minimum 4MB size. 
+ err := db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 1000) }, + ) + if err != nil { + t.Fatal(err) + } + + path := db.Path() + db.MustClose() + + sz := fileSize(path) + if sz == 0 { + t.Fatalf("unexpected new file size: %d", sz) + } + + db.MustReopen() + if err := db.Update(func(tx *bolt.Tx) error { + if err := tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + if err := db.Close(); err != nil { + t.Fatal(err) + } + newSz := fileSize(path) + if newSz == 0 { + t.Fatalf("unexpected new file size: %d", newSz) + } + + // Compare the original size with the new size. + // db size might increase by a few page sizes due to the new small update. + if sz < newSz-5*int64(pagesize) { + t.Fatalf("unexpected file growth: %d => %d", sz, newSz) + } +} + +// Ensure that opening a database beyond the max step size does not increase its size. +// https://github.com/boltdb/bolt/issues/303 +func TestOpen_Size_Large(t *testing.T) { + if testing.Short() { + t.Skip("short mode") + } + + // Open a data file. + db := btesting.MustCreateDB(t) + path := db.Path() + + pagesize := db.Info().PageSize + + // Insert until we get above the minimum 4MB size. + var index uint64 + for i := 0; i < 10000; i++ { + if err := db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("data")) + for j := 0; j < 1000; j++ { + if err := b.Put(u64tob(index), make([]byte, 50)); err != nil { + t.Fatal(err) + } + index++ + } + return nil + }); err != nil { + t.Fatal(err) + } + } + + // Close database and grab the size. + if err := db.Close(); err != nil { + t.Fatal(err) + } + sz := fileSize(path) + if sz == 0 { + t.Fatalf("unexpected new file size: %d", sz) + } else if sz < (1 << 30) { + t.Fatalf("expected larger initial size: %d", sz) + } + + // Reopen database, update, and check size again. 
+ db0, err := bolt.Open(path, 0600, nil) + if err != nil { + t.Fatal(err) + } + if err := db0.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}) + }); err != nil { + t.Fatal(err) + } + if err := db0.Close(); err != nil { + t.Fatal(err) + } + + newSz := fileSize(path) + if newSz == 0 { + t.Fatalf("unexpected new file size: %d", newSz) + } + + // Compare the original size with the new size. + // db size might increase by a few page sizes due to the new small update. + if sz < newSz-5*int64(pagesize) { + t.Fatalf("unexpected file growth: %d => %d", sz, newSz) + } +} + +// Ensure that a re-opened database is consistent. +func TestOpen_Check(t *testing.T) { + path := tempfile() + defer os.RemoveAll(path) + + db, err := bolt.Open(path, 0600, nil) + if err != nil { + t.Fatal(err) + } + if err = db.View(func(tx *bolt.Tx) error { return <-tx.Check() }); err != nil { + t.Fatal(err) + } + if err = db.Close(); err != nil { + t.Fatal(err) + } + + db, err = bolt.Open(path, 0600, nil) + if err != nil { + t.Fatal(err) + } + if err := db.View(func(tx *bolt.Tx) error { return <-tx.Check() }); err != nil { + t.Fatal(err) + } + if err := db.Close(); err != nil { + t.Fatal(err) + } +} + +// Ensure that write errors to the meta file handler during initialization are returned. +func TestOpen_MetaInitWriteError(t *testing.T) { + t.Skip("pending") +} + +// Ensure that a database that is too small returns an error. 
+func TestOpen_FileTooSmall(t *testing.T) { + path := tempfile() + defer os.RemoveAll(path) + + db, err := bolt.Open(path, 0600, nil) + if err != nil { + t.Fatal(err) + } + pageSize := int64(db.Info().PageSize) + if err = db.Close(); err != nil { + t.Fatal(err) + } + + // corrupt the database + if err = os.Truncate(path, pageSize); err != nil { + t.Fatal(err) + } + + _, err = bolt.Open(path, 0600, nil) + if err == nil || !strings.Contains(err.Error(), "file size too small") { + t.Fatalf("unexpected error: %s", err) + } +} + +// TestDB_Open_InitialMmapSize tests if having InitialMmapSize large enough +// to hold data from concurrent write transaction resolves the issue that +// read transaction blocks the write transaction and causes deadlock. +// This is a very hacky test since the mmap size is not exposed. +func TestDB_Open_InitialMmapSize(t *testing.T) { + path := tempfile() + defer os.Remove(path) + + initMmapSize := 1 << 30 // 1GB + testWriteSize := 1 << 27 // 134MB + + db, err := bolt.Open(path, 0600, &bolt.Options{InitialMmapSize: initMmapSize}) + if err != nil { + t.Fatal(err) + } + + // create a long-running read transaction + // that never gets closed while writing + rtx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + + // create a write transaction + wtx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + + b, err := wtx.CreateBucket([]byte("test")) + if err != nil { + t.Fatal(err) + } + + // and commit a large write + err = b.Put([]byte("foo"), make([]byte, testWriteSize)) + if err != nil { + t.Fatal(err) + } + + done := make(chan error, 1) + + go func() { + err := wtx.Commit() + done <- err + }() + + select { + case <-time.After(5 * time.Second): + t.Errorf("unexpected that the reader blocks writer") + case err := <-done: + if err != nil { + t.Fatal(err) + } + } + + if err := rtx.Rollback(); err != nil { + t.Fatal(err) + } +} + +// TestDB_Open_ReadOnly checks a database in read only mode can read but not write. 
+func TestDB_Open_ReadOnly(t *testing.T) { + // Create a writable db, write k-v and close it. + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + if err := db.Close(); err != nil { + t.Fatal(err) + } + + f := db.Path() + o := &bolt.Options{ReadOnly: true} + readOnlyDB, err := bolt.Open(f, 0600, o) + if err != nil { + panic(err) + } + + if !readOnlyDB.IsReadOnly() { + t.Fatal("expect db in read only mode") + } + + // Read from a read-only transaction. + if err := readOnlyDB.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + if !bytes.Equal(value, []byte("bar")) { + t.Fatal("expect value 'bar', got", value) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Can't launch read-write transaction. + if _, err := readOnlyDB.Begin(true); err != berrors.ErrDatabaseReadOnly { + t.Fatalf("unexpected error: %s", err) + } + + if err := readOnlyDB.Close(); err != nil { + t.Fatal(err) + } +} + +func TestDB_Open_ReadOnly_NoCreate(t *testing.T) { + f := filepath.Join(t.TempDir(), "db") + _, err := bolt.Open(f, 0600, &bolt.Options{ReadOnly: true}) + require.ErrorIs(t, err, os.ErrNotExist) +} + +// TestOpen_BigPage checks the database uses bigger pages when +// changing PageSize. 
+func TestOpen_BigPage(t *testing.T) { + pageSize := os.Getpagesize() + + db1 := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize * 2}) + + db2 := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: pageSize * 4}) + + if db1sz, db2sz := fileSize(db1.Path()), fileSize(db2.Path()); db1sz >= db2sz { + t.Errorf("expected %d < %d", db1sz, db2sz) + } +} + +// TestOpen_RecoverFreeList tests opening the DB with free-list +// write-out after no free list sync will recover the free list +// and write it out. +func TestOpen_RecoverFreeList(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{NoFreelistSync: true}) + + // Write some pages. + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + wbuf := make([]byte, 8192) + for i := 0; i < 100; i++ { + s := fmt.Sprintf("%d", i) + b, err := tx.CreateBucket([]byte(s)) + if err != nil { + t.Fatal(err) + } + if err = b.Put([]byte(s), wbuf); err != nil { + t.Fatal(err) + } + } + if err = tx.Commit(); err != nil { + t.Fatal(err) + } + + // Generate free pages. + if tx, err = db.Begin(true); err != nil { + t.Fatal(err) + } + for i := 0; i < 50; i++ { + s := fmt.Sprintf("%d", i) + b := tx.Bucket([]byte(s)) + if b == nil { + t.Fatal(err) + } + if err := b.Delete([]byte(s)); err != nil { + t.Fatal(err) + } + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + db.MustClose() + + // Record freelist count from opening with NoFreelistSync. + db.MustReopen() + freepages := db.Stats().FreePageN + if freepages == 0 { + t.Fatalf("no free pages on NoFreelistSync reopen") + } + db.MustClose() + + // Check free page count is reconstructed when opened with freelist sync. + db.SetOptions(&bolt.Options{}) + db.MustReopen() + // One less free page for syncing the free list on open. 
+ freepages-- + if fp := db.Stats().FreePageN; fp < freepages { + t.Fatalf("closed with %d free pages, opened with %d", freepages, fp) + } +} + +// Ensure that a database cannot open a transaction when it's not open. +func TestDB_Begin_ErrDatabaseNotOpen(t *testing.T) { + var db bolt.DB + if _, err := db.Begin(false); err != berrors.ErrDatabaseNotOpen { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that a read-write transaction can be retrieved. +func TestDB_BeginRW(t *testing.T) { + db := btesting.MustCreateDB(t) + + tx, err := db.Begin(true) + require.NoError(t, err) + require.NotNil(t, tx, "expected tx") + defer func() { require.NoError(t, tx.Commit()) }() + + require.True(t, tx.Writable(), "expected writable tx") + require.Same(t, db.DB, tx.DB()) +} + +// TestDB_Concurrent_WriteTo checks that issuing WriteTo operations concurrently +// with commits does not produce corrupted db files. It also verifies that all +// readonly transactions, which are created based on the same data view, should +// always read the same data. 
+func TestDB_Concurrent_WriteTo_and_ConsistentRead(t *testing.T) { + o := &bolt.Options{ + NoFreelistSync: false, + PageSize: 4096, + } + db := btesting.MustCreateDBWithOption(t, o) + + wtxs, rtxs := 50, 5 + bucketName := []byte("data") + + var dataLock sync.Mutex + dataCache := make(map[int][]map[string]string) + + var wg sync.WaitGroup + wg.Add(wtxs * rtxs) + f := func(round int, tx *bolt.Tx) { + defer wg.Done() + time.Sleep(time.Duration(rand.Intn(200)+10) * time.Millisecond) + f := filepath.Join(t.TempDir(), fmt.Sprintf("%d-bolt-", round)) + err := tx.CopyFile(f, 0600) + require.NoError(t, err) + + // read all the data + b := tx.Bucket(bucketName) + data := make(map[string]string) + err = b.ForEach(func(k, v []byte) error { + data[string(k)] = string(v) + return nil + }) + require.NoError(t, err) + + // cache the data + dataLock.Lock() + dataSlice := dataCache[round] + dataSlice = append(dataSlice, data) + dataCache[round] = dataSlice + dataLock.Unlock() + + err = tx.Rollback() + require.NoError(t, err) + + copyOpt := *o + snap := btesting.MustOpenDBWithOption(t, f, ©Opt) + defer snap.MustClose() + snap.MustCheck() + } + + err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket(bucketName) + return err + }) + require.NoError(t, err) + + for i := 0; i < wtxs; i++ { + tx, err := db.Begin(true) + require.NoError(t, err) + + b := tx.Bucket(bucketName) + + for j := 0; j < rtxs; j++ { + rtx, rerr := db.Begin(false) + require.NoError(t, rerr) + go f(i, rtx) + + for k := 0; k < 10; k++ { + key, value := fmt.Sprintf("key_%d", rand.Intn(10)), fmt.Sprintf("value_%d", rand.Intn(100)) + perr := b.Put([]byte(key), []byte(value)) + require.NoError(t, perr) + } + } + err = tx.Commit() + require.NoError(t, err) + } + wg.Wait() + + // compare the data. The data generated in the same round + // should be exactly the same. 
+ for round, dataSlice := range dataCache { + data0 := dataSlice[0] + + for i := 1; i < len(dataSlice); i++ { + datai := dataSlice[i] + same := reflect.DeepEqual(data0, datai) + require.True(t, same, fmt.Sprintf("found inconsistent data in round %d, data[0]: %v, data[%d] : %v", round, data0, i, datai)) + } + } +} + +// Ensure that opening a transaction while the DB is closed returns an error. +func TestDB_BeginRW_Closed(t *testing.T) { + var db bolt.DB + if _, err := db.Begin(true); err != berrors.ErrDatabaseNotOpen { + t.Fatalf("unexpected error: %s", err) + } +} + +func TestDB_Close_PendingTx_RW(t *testing.T) { testDB_Close_PendingTx(t, true) } +func TestDB_Close_PendingTx_RO(t *testing.T) { testDB_Close_PendingTx(t, false) } + +// Ensure that a database cannot close while transactions are open. +func testDB_Close_PendingTx(t *testing.T, writable bool) { + db := btesting.MustCreateDB(t) + + // Start transaction. + tx, err := db.Begin(writable) + if err != nil { + t.Fatal(err) + } + + // Open update in separate goroutine. + startCh := make(chan struct{}, 1) + done := make(chan error, 1) + go func() { + startCh <- struct{}{} + err := db.Close() + done <- err + }() + // wait for the above goroutine to get scheduled. + <-startCh + + // Ensure database hasn't closed. + time.Sleep(100 * time.Millisecond) + select { + case err := <-done: + if err != nil { + t.Errorf("error from inside goroutine: %v", err) + } + t.Fatal("database closed too early") + default: + } + + // Commit/close transaction. + if writable { + err = tx.Commit() + } else { + err = tx.Rollback() + } + if err != nil { + t.Fatal(err) + } + + // Ensure database closed now. + select { + case err := <-done: + if err != nil { + t.Fatalf("error from inside goroutine: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatalf("database did not close") + } +} + +// Ensure a database can provide a transactional block. 
+func TestDB_Update(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte("bat")); err != nil { + t.Fatal(err) + } + if err := b.Delete([]byte("foo")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + if v := b.Get([]byte("foo")); v != nil { + t.Fatalf("expected nil value, got: %v", v) + } + if v := b.Get([]byte("baz")); !bytes.Equal(v, []byte("bat")) { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure a closed database returns an error while running a transaction block +func TestDB_Update_Closed(t *testing.T) { + var db bolt.DB + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != berrors.ErrDatabaseNotOpen { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure a panic occurs while trying to commit a managed transaction. +func TestDB_Update_ManualCommit(t *testing.T) { + db := btesting.MustCreateDB(t) + + var panicked bool + if err := db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + }() + return nil + }); err != nil { + t.Fatal(err) + } else if !panicked { + t.Fatal("expected panic") + } +} + +// Ensure a panic occurs while trying to rollback a managed transaction. 
+func TestDB_Update_ManualRollback(t *testing.T) { + db := btesting.MustCreateDB(t) + + var panicked bool + if err := db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + }() + return nil + }); err != nil { + t.Fatal(err) + } else if !panicked { + t.Fatal("expected panic") + } +} + +// Ensure a panic occurs while trying to commit a managed transaction. +func TestDB_View_ManualCommit(t *testing.T) { + db := btesting.MustCreateDB(t) + + var panicked bool + if err := db.View(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + }() + return nil + }); err != nil { + t.Fatal(err) + } else if !panicked { + t.Fatal("expected panic") + } +} + +// Ensure a panic occurs while trying to rollback a managed transaction. +func TestDB_View_ManualRollback(t *testing.T) { + db := btesting.MustCreateDB(t) + + var panicked bool + if err := db.View(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + }() + return nil + }); err != nil { + t.Fatal(err) + } else if !panicked { + t.Fatal("expected panic") + } +} + +// Ensure a write transaction that panics does not hold open locks. +func TestDB_Update_Panic(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Panic during update but recover. + func() { + defer func() { + if r := recover(); r != nil { + t.Log("recover: update", r) + } + }() + + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + panic("omg") + }); err != nil { + t.Fatal(err) + } + }() + + // Verify we can update again. 
+ if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Verify that our change persisted. + if err := db.Update(func(tx *bolt.Tx) error { + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure a database can return an error through a read-only transactional block. +func TestDB_View_Error(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.View(func(tx *bolt.Tx) error { + return errors.New("xxx") + }); err == nil || err.Error() != "xxx" { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure a read transaction that panics does not hold open locks. +func TestDB_View_Panic(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Panic during view transaction but recover. + func() { + defer func() { + if r := recover(); r != nil { + t.Log("recover: view", r) + } + }() + + if err := db.View(func(tx *bolt.Tx) error { + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + panic("omg") + }); err != nil { + t.Fatal(err) + } + }() + + // Verify that we can still use read transactions. + if err := db.View(func(tx *bolt.Tx) error { + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that DB stats can be returned. 
+func TestDB_Stats(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + + stats := db.Stats() + if stats.TxStats.GetPageCount() != 2 { + t.Fatalf("unexpected TxStats.PageCount: %d", stats.TxStats.GetPageCount()) + } else if stats.FreePageN != 0 { + t.Fatalf("unexpected FreePageN != 0: %d", stats.FreePageN) + } else if stats.PendingPageN != 2 { + t.Fatalf("unexpected PendingPageN != 2: %d", stats.PendingPageN) + } +} + +// Ensure that database pages are in expected order and type. +func TestDB_Consistency(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + + for i := 0; i < 10; i++ { + if err := db.Update(func(tx *bolt.Tx) error { + if err := tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + } + + if err := db.Update(func(tx *bolt.Tx) error { + if p, _ := tx.Page(0); p == nil { + t.Fatal("expected page") + } else if p.Type != "meta" { + t.Fatalf("unexpected page type: %s", p.Type) + } + + if p, _ := tx.Page(1); p == nil { + t.Fatal("expected page") + } else if p.Type != "meta" { + t.Fatalf("unexpected page type: %s", p.Type) + } + + if p, _ := tx.Page(2); p == nil { + t.Fatal("expected page") + } else if p.Type != "free" { + t.Fatalf("unexpected page type: %s", p.Type) + } + + if p, _ := tx.Page(3); p == nil { + t.Fatal("expected page") + } else if p.Type != "free" { + t.Fatalf("unexpected page type: %s", p.Type) + } + + if p, _ := tx.Page(4); p == nil { + t.Fatal("expected page") + } else if p.Type != "leaf" { + t.Fatalf("unexpected page type: %s", p.Type) + } + + if p, _ := tx.Page(5); p == nil { + t.Fatal("expected page") + } else if p.Type != "freelist" { + t.Fatalf("unexpected 
page type: %s", p.Type) + } + + if p, _ := tx.Page(6); p != nil { + t.Fatal("unexpected page") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that DB stats can be subtracted from one another. +func TestDBStats_Sub(t *testing.T) { + var a, b bolt.Stats + a.TxStats.PageCount = 3 + a.FreePageN = 4 + b.TxStats.PageCount = 10 + b.FreePageN = 14 + diff := b.Sub(&a) + if diff.TxStats.GetPageCount() != 7 { + t.Fatalf("unexpected TxStats.PageCount: %d", diff.TxStats.GetPageCount()) + } + + // free page stats are copied from the receiver and not subtracted + if diff.FreePageN != 14 { + t.Fatalf("unexpected FreePageN: %d", diff.FreePageN) + } +} + +// Ensure two functions can perform updates in a single batch. +func TestDB_Batch(t *testing.T) { + db := btesting.MustCreateDB(t) + + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Iterate over multiple updates in separate goroutines. + n := 2 + ch := make(chan error, n) + for i := 0; i < n; i++ { + go func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + }(i) + } + + // Check all responses to make sure there's no error. + for i := 0; i < n; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 0; i < n; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +func TestDB_Batch_Panic(t *testing.T) { + db := btesting.MustCreateDB(t) + + var sentinel int + var bork = &sentinel + var problem interface{} + var err error + + // Execute a function inside a batch that panics. 
+ func() { + defer func() { + if p := recover(); p != nil { + problem = p + } + }() + err = db.Batch(func(tx *bolt.Tx) error { + panic(bork) + }) + }() + + // Verify there is no error. + if g, e := err, error(nil); g != e { + t.Fatalf("wrong error: %v != %v", g, e) + } + // Verify the panic was captured. + if g, e := problem, bork; g != e { + t.Fatalf("wrong error: %v != %v", g, e) + } +} + +func TestDB_BatchFull(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + + const size = 3 + // buffered so we never leak goroutines + ch := make(chan error, size) + put := func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + } + + db.MaxBatchSize = size + // high enough to never trigger here + db.MaxBatchDelay = 1 * time.Hour + + go put(1) + go put(2) + + // Give the batch a chance to exhibit bugs. + time.Sleep(10 * time.Millisecond) + + // not triggered yet + select { + case <-ch: + t.Fatalf("batch triggered too early") + default: + } + + go put(3) + + // Check all responses to make sure there's no error. + for i := 0; i < size; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. 
+ if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i <= size; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +func TestDB_BatchTime(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + t.Fatal(err) + } + + const size = 1 + // buffered so we never leak goroutines + ch := make(chan error, size) + put := func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + } + + db.MaxBatchSize = 1000 + db.MaxBatchDelay = 0 + + go put(1) + + // Batch must trigger by time alone. + + // Check all responses to make sure there's no error. + for i := 0; i < size; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. + if err := db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i <= size; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// TestDBUnmap verifes that `dataref`, `data` and `datasz` must be reset +// to zero values respectively after unmapping the db. +func TestDBUnmap(t *testing.T) { + db := btesting.MustCreateDB(t) + + require.NoError(t, db.DB.Close()) + + // Ignore the following error: + // Error: copylocks: call of reflect.ValueOf copies lock value: github.com/tutus-one/tutus-bolt.DB contains sync.Once contains sync.Mutex (govet) + //nolint:govet + v := reflect.ValueOf(*db.DB) + dataref := v.FieldByName("dataref") + data := v.FieldByName("data") + datasz := v.FieldByName("datasz") + assert.True(t, dataref.IsNil()) + assert.True(t, data.IsNil()) + assert.True(t, datasz.IsZero()) + + // Set db.DB to nil to prevent MustCheck from panicking. 
+ db.DB = nil +} + +// Convenience function for inserting a bunch of keys with 1000 byte values +func fillDBWithKeys(db *btesting.DB, numKeys int) error { + return db.Fill([]byte("data"), 1, numKeys, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 1000) }, + ) +} + +// Creates a new database size, forces a specific allocation size jump, and fills it with the number of keys specified +func createFilledDB(t testing.TB, o *bolt.Options, allocSize int, numKeys int) *btesting.DB { + // Open a data file. + db := btesting.MustCreateDBWithOption(t, o) + db.AllocSize = allocSize + + // Insert a reasonable amount of data below the max size. + err := db.Fill([]byte("data"), 1, numKeys, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 1000) }, + ) + if err != nil { + t.Fatal(err) + } + return db +} + +// Ensure that a database cannot exceed its maximum size +// https://github.com/etcd-io/bbolt/issues/928 +func TestDB_MaxSizeNotExceeded(t *testing.T) { + testCases := []struct { + name string + options bolt.Options + }{ + { + name: "Standard case", + options: bolt.Options{ + MaxSize: 5 * 1024 * 1024, // 5 MiB + PageSize: 4096, + }, + }, + { + name: "NoGrowSync", + options: bolt.Options{ + MaxSize: 5 * 1024 * 1024, // 5 MiB + PageSize: 4096, + NoGrowSync: true, + }, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + db := createFilledDB(t, + &testCase.options, + 4*1024*1024, // adjust allocation jumps to 4 MiB + 2000, + ) + + path := db.Path() + + // The data file should be 4 MiB now (expanded once from zero). 
+ // It should have space for roughly 16 more entries before trying to grow + // Keep inserting until grow is required + err := fillDBWithKeys(db, 100) + assert.ErrorIs(t, err, berrors.ErrMaxSizeReached) + + newSz := fileSize(path) + require.Greater(t, newSz, int64(0), "unexpected new file size: %d", newSz) + assert.LessOrEqual(t, newSz, int64(db.MaxSize), "The size of the data file should not exceed db.MaxSize") + + err = db.Close() + require.NoError(t, err, "Closing the re-opened database should succeed") + }) + } +} + +// Ensure that opening a database that is beyond the maximum size succeeds +// The maximum size should only apply to growing the data file +// https://github.com/etcd-io/bbolt/issues/928 +func TestDB_MaxSizeExceededCanOpen(t *testing.T) { + // Open a data file. + db := createFilledDB(t, nil, 4*1024*1024, 2000) // adjust allocation jumps to 4 MiB, fill with 2000, 1KB keys + path := db.Path() + + // Insert a reasonable amount of data below the max size. + err := fillDBWithKeys(db, 2000) + require.NoError(t, err, "fillDbWithKeys should succeed") + + err = db.Close() + require.NoError(t, err, "Close should succeed") + + // The data file should be 4 MiB now (expanded once from zero). + minimumSizeForTest := int64(1024 * 1024) + newSz := fileSize(path) + require.GreaterOrEqual(t, newSz, minimumSizeForTest, "unexpected new file size: %d. 
Expected at least %d", newSz, minimumSizeForTest) + + // Now try to re-open the database with an extremely small max size + t.Logf("Reopening bbolt DB at: %s", path) + db, err = btesting.OpenDBWithOption(t, path, &bolt.Options{ + MaxSize: 1, + }) + assert.NoError(t, err, "Should be able to open database bigger than MaxSize") + + err = db.Close() + require.NoError(t, err, "Closing the re-opened database should succeed") +} + +// Ensure that opening a database that is beyond the maximum size succeeds, +// even when InitialMmapSize is above the limit (mmaps should not affect file size) +// This test exists for platforms where Truncate should not be called during mmap +// https://github.com/etcd-io/bbolt/issues/928 +func TestDB_MaxSizeExceededCanOpenWithHighMmap(t *testing.T) { + if runtime.GOOS == "windows" { + // In Windows, the file must be expanded to the mmap initial size, + // so this test doesn't run in Windows. + t.SkipNow() + } + + // Open a data file. + db := createFilledDB(t, nil, 4*1024*1024, 2000) // adjust allocation jumps to 4 MiB, fill with 2000 1KB entries + path := db.Path() + + err := db.Close() + require.NoError(t, err, "Close should succeed") + + // The data file should be 4 MiB now (expanded once from zero). + minimumSizeForTest := int64(1024 * 1024) + newSz := fileSize(path) + require.GreaterOrEqual(t, newSz, minimumSizeForTest, "unexpected new file size: %d. 
Expected at least %d", newSz, minimumSizeForTest) + + // Now try to re-open the database with an extremely small max size + t.Logf("Reopening bbolt DB at: %s", path) + db, err = btesting.OpenDBWithOption(t, path, &bolt.Options{ + MaxSize: 1, + InitialMmapSize: int(minimumSizeForTest) * 2, + }) + assert.NoError(t, err, "Should be able to open database bigger than MaxSize when InitialMmapSize set high") + + err = db.Close() + require.NoError(t, err, "Closing the re-opened database should succeed") +} + +// Ensure that when InitialMmapSize is above the limit, opening a database +// that is beyond the maximum size fails in Windows. +// In Windows, the file must be expanded to the mmap initial size. +// https://github.com/etcd-io/bbolt/issues/928 +func TestDB_MaxSizeExceededDoesNotGrow(t *testing.T) { + if runtime.GOOS != "windows" { + // This test is only relevant on Windows + t.SkipNow() + } + + // Open a data file. + db := createFilledDB(t, nil, 4*1024*1024, 2000) // adjust allocation jumps to 4 MiB, fill with 2000 1KB entries + path := db.Path() + + err := db.Close() + require.NoError(t, err, "Close should succeed") + + // The data file should be 4 MiB now (expanded once from zero). + minimumSizeForTest := int64(1024 * 1024) + newSz := fileSize(path) + assert.GreaterOrEqual(t, newSz, minimumSizeForTest, "unexpected new file size: %d. Expected at least %d", newSz, minimumSizeForTest) + + // Now try to re-open the database with an extremely small max size and + // an initial mmap size to be greater than the actual file size, forcing an illegal grow on open + t.Logf("Reopening bbolt DB at: %s", path) + _, err = btesting.OpenDBWithOption(t, path, &bolt.Options{ + MaxSize: 1, + InitialMmapSize: int(newSz) * 2, + }) + assert.Error(t, err, "Opening the DB with InitialMmapSize > MaxSize should cause an error on Windows") +} + +func ExampleDB_Update() { + // Open the database. 
+ db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Execute several commands within a read-write transaction. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + return err + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + return err + } + return nil + }); err != nil { + log.Fatal(err) + } + + // Read the value back from a separate read-only transaction. + if err := db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value of 'foo' is: %s\n", value) + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release the file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // The value of 'foo' is: bar +} + +func ExampleDB_View() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Insert data into a bucket. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("people")) + if err != nil { + return err + } + if err := b.Put([]byte("john"), []byte("doe")); err != nil { + return err + } + if err := b.Put([]byte("susy"), []byte("que")); err != nil { + return err + } + return nil + }); err != nil { + log.Fatal(err) + } + + // Access data from within a read-only transactional block. + if err := db.View(func(tx *bolt.Tx) error { + v := tx.Bucket([]byte("people")).Get([]byte("john")) + fmt.Printf("John's last name is %s.\n", v) + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release the file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // John's last name is doe. +} + +func ExampleDB_Begin() { + // Open the database. 
+ db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Create a bucket using a read-write transaction. + if err = db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + log.Fatal(err) + } + + // Create several keys in a transaction. + tx, err := db.Begin(true) + if err != nil { + log.Fatal(err) + } + b := tx.Bucket([]byte("widgets")) + if err = b.Put([]byte("john"), []byte("blue")); err != nil { + log.Fatal(err) + } + if err = b.Put([]byte("abby"), []byte("red")); err != nil { + log.Fatal(err) + } + if err = b.Put([]byte("zephyr"), []byte("purple")); err != nil { + log.Fatal(err) + } + if err = tx.Commit(); err != nil { + log.Fatal(err) + } + + // Iterate over the values in sorted key order. + tx, err = db.Begin(false) + if err != nil { + log.Fatal(err) + } + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + fmt.Printf("%s likes %s\n", k, v) + } + + if err = tx.Rollback(); err != nil { + log.Fatal(err) + } + + if err = db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // abby likes red + // john likes blue + // zephyr likes purple +} + +func BenchmarkDBBatchAutomatic(b *testing.B) { + db := btesting.MustCreateDB(b) + + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("bench")) + return err + }); err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := make(chan struct{}) + var wg sync.WaitGroup + + for round := 0; round < 1000; round++ { + wg.Add(1) + + go func(id uint32) { + defer wg.Done() + <-start + + h := fnv.New32a() + buf := make([]byte, 4) + binary.LittleEndian.PutUint32(buf, id) + _, _ = h.Write(buf[:]) + k := h.Sum(nil) + insert := func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("bench")) + return b.Put(k, []byte("filler")) + } + if err := db.Batch(insert); err != nil { + b.Error(err) + return + } + 
}(uint32(round)) + } + close(start) + wg.Wait() + } + + b.StopTimer() + validateBatchBench(b, db) +} + +func BenchmarkDBBatchSingle(b *testing.B) { + db := btesting.MustCreateDB(b) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("bench")) + return err + }); err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := make(chan struct{}) + var wg sync.WaitGroup + + for round := 0; round < 1000; round++ { + wg.Add(1) + go func(id uint32) { + defer wg.Done() + <-start + + h := fnv.New32a() + buf := make([]byte, 4) + binary.LittleEndian.PutUint32(buf, id) + _, _ = h.Write(buf[:]) + k := h.Sum(nil) + insert := func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("bench")) + return b.Put(k, []byte("filler")) + } + if err := db.Update(insert); err != nil { + b.Error(err) + return + } + }(uint32(round)) + } + close(start) + wg.Wait() + } + + b.StopTimer() + validateBatchBench(b, db) +} + +func BenchmarkDBBatchManual10x100(b *testing.B) { + db := btesting.MustCreateDB(b) + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("bench")) + return err + }); err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := make(chan struct{}) + var wg sync.WaitGroup + errCh := make(chan error, 10) + + for major := 0; major < 10; major++ { + wg.Add(1) + go func(id uint32) { + defer wg.Done() + <-start + + insert100 := func(tx *bolt.Tx) error { + h := fnv.New32a() + buf := make([]byte, 4) + for minor := uint32(0); minor < 100; minor++ { + binary.LittleEndian.PutUint32(buf, uint32(id*100+minor)) + h.Reset() + _, _ = h.Write(buf[:]) + k := h.Sum(nil) + b := tx.Bucket([]byte("bench")) + if err := b.Put(k, []byte("filler")); err != nil { + return err + } + } + return nil + } + err := db.Update(insert100) + errCh <- err + }(uint32(major)) + } + close(start) + wg.Wait() + close(errCh) + for err := range errCh { + if err != nil { + b.Fatal(err) + } + } + } + + b.StopTimer() 
+ validateBatchBench(b, db) +} + +func validateBatchBench(b *testing.B, db *btesting.DB) { + var rollback = errors.New("sentinel error to cause rollback") + validate := func(tx *bolt.Tx) error { + bucket := tx.Bucket([]byte("bench")) + h := fnv.New32a() + buf := make([]byte, 4) + for id := uint32(0); id < 1000; id++ { + binary.LittleEndian.PutUint32(buf, id) + h.Reset() + _, _ = h.Write(buf[:]) + k := h.Sum(nil) + v := bucket.Get(k) + if v == nil { + b.Errorf("not found id=%d key=%x", id, k) + continue + } + if g, e := v, []byte("filler"); !bytes.Equal(g, e) { + b.Errorf("bad value for id=%d key=%x: %s != %q", id, k, g, e) + } + if err := bucket.Delete(k); err != nil { + return err + } + } + // should be empty now + c := bucket.Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + b.Errorf("unexpected key: %x = %q", k, v) + } + return rollback + } + if err := db.Update(validate); err != nil && err != rollback { + b.Error(err) + } +} + +// tempfile returns a temporary file path. +func tempfile() string { + f, err := os.CreateTemp("", "bolt-") + if err != nil { + panic(err) + } + if err := f.Close(); err != nil { + panic(err) + } + if err := os.Remove(f.Name()); err != nil { + panic(err) + } + return f.Name() +} + +func trunc(b []byte, length int) []byte { + if length < len(b) { + return b[:length] + } + return b +} + +func fileSize(path string) int64 { + fi, err := os.Stat(path) + if err != nil { + return 0 + } + return fi.Size() +} + +// u64tob converts a uint64 into an 8-byte slice. 
+func u64tob(v uint64) []byte { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, v) + return b +} diff --git a/db_whitebox_test.go b/db_whitebox_test.go new file mode 100644 index 0000000..0f56e6a --- /dev/null +++ b/db_whitebox_test.go @@ -0,0 +1,126 @@ +package bbolt + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt/errors" +) + +func TestOpenWithPreLoadFreelist(t *testing.T) { + testCases := []struct { + name string + readonly bool + preLoadFreePage bool + expectedFreePagesLoaded bool + }{ + { + name: "write mode always load free pages", + readonly: false, + preLoadFreePage: false, + expectedFreePagesLoaded: true, + }, + { + name: "readonly mode load free pages when flag set", + readonly: true, + preLoadFreePage: true, + expectedFreePagesLoaded: true, + }, + { + name: "readonly mode doesn't load free pages when flag not set", + readonly: true, + preLoadFreePage: false, + expectedFreePagesLoaded: false, + }, + } + + fileName, err := prepareData(t) + require.NoError(t, err) + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + db, err := Open(fileName, 0666, &Options{ + ReadOnly: tc.readonly, + PreLoadFreelist: tc.preLoadFreePage, + }) + require.NoError(t, err) + + assert.Equal(t, tc.expectedFreePagesLoaded, db.freelist != nil) + + assert.NoError(t, db.Close()) + }) + } +} + +func TestMethodPage(t *testing.T) { + testCases := []struct { + name string + readonly bool + preLoadFreePage bool + expectedError error + }{ + { + name: "write mode", + readonly: false, + preLoadFreePage: false, + expectedError: nil, + }, + { + name: "readonly mode with preloading free pages", + readonly: true, + preLoadFreePage: true, + expectedError: nil, + }, + { + name: "readonly mode without preloading free pages", + readonly: true, + preLoadFreePage: false, + expectedError: errors.ErrFreePagesNotLoaded, + }, + } + + fileName, err := prepareData(t) + 
require.NoError(t, err) + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + db, err := Open(fileName, 0666, &Options{ + ReadOnly: tc.readonly, + PreLoadFreelist: tc.preLoadFreePage, + }) + require.NoError(t, err) + defer db.Close() + + tx, err := db.Begin(!tc.readonly) + require.NoError(t, err) + + _, err = tx.Page(0) + require.Equal(t, tc.expectedError, err) + + if tc.readonly { + require.NoError(t, tx.Rollback()) + } else { + require.NoError(t, tx.Commit()) + } + + require.NoError(t, db.Close()) + }) + } +} + +func prepareData(t *testing.T) (string, error) { + fileName := filepath.Join(t.TempDir(), "db") + db, err := Open(fileName, 0666, nil) + if err != nil { + return "", err + } + if err := db.Close(); err != nil { + return "", err + } + + return fileName, nil +} diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..d1007e4 --- /dev/null +++ b/doc.go @@ -0,0 +1,40 @@ +/* +Package bbolt implements a low-level key/value store in pure Go. It supports +fully serializable transactions, ACID semantics, and lock-free MVCC with +multiple readers and a single writer. Bolt can be used for projects that +want a simple data store without the need to add large dependencies such as +Postgres or MySQL. + +Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is +optimized for fast read access and does not require recovery in the event of a +system crash. Transactions which have not finished committing will simply be +rolled back in the event of a crash. + +The design of Bolt is based on Howard Chu's LMDB database project. + +Bolt currently works on Windows, Mac OS X, and Linux. + +# Basics + +There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is +a collection of buckets and is represented by a single file on disk. A bucket is +a collection of unique keys that are associated with values. + +Transactions provide either read-only or read-write access to the database. 
+Read-only transactions can retrieve key/value pairs and can use Cursors to +iterate over the dataset sequentially. Read-write transactions can create and +delete buckets and can insert and remove keys. Only one read-write transaction +is allowed at a time. + +# Caveats + +The database uses a read-only, memory-mapped data file to ensure that +applications cannot corrupt the database, however, this means that keys and +values returned from Bolt cannot be changed. Writing to a read-only byte slice +will cause Go to panic. + +Keys and values retrieved from the database are only valid for the life of +the transaction. When used outside the transaction, these byte slices can +point to different data or can point to invalid memory which will cause a panic. +*/ +package bbolt diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..f06112d --- /dev/null +++ b/errors.go @@ -0,0 +1,108 @@ +package bbolt + +import "github.com/tutus-one/tutus-bolt/errors" + +// These errors can be returned when opening or calling methods on a DB. +var ( + // ErrDatabaseNotOpen is returned when a DB instance is accessed before it + // is opened or after it is closed. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrDatabaseNotOpen = errors.ErrDatabaseNotOpen + + // ErrInvalid is returned when both meta pages on a database are invalid. + // This typically occurs when a file is not a bolt database. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrInvalid = errors.ErrInvalid + + // ErrInvalidMapping is returned when the database file fails to get mapped. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrInvalidMapping = errors.ErrInvalidMapping + + // ErrVersionMismatch is returned when the data file was created with a + // different version of Bolt. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. 
+ ErrVersionMismatch = errors.ErrVersionMismatch + + // ErrChecksum is returned when a checksum mismatch occurs on either of the two meta pages. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrChecksum = errors.ErrChecksum + + // ErrTimeout is returned when a database cannot obtain an exclusive lock + // on the data file after the timeout passed to Open(). + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrTimeout = errors.ErrTimeout +) + +// These errors can occur when beginning or committing a Tx. +var ( + // ErrTxNotWritable is returned when performing a write operation on a + // read-only transaction. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrTxNotWritable = errors.ErrTxNotWritable + + // ErrTxClosed is returned when committing or rolling back a transaction + // that has already been committed or rolled back. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrTxClosed = errors.ErrTxClosed + + // ErrDatabaseReadOnly is returned when a mutating transaction is started on a + // read-only database. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrDatabaseReadOnly = errors.ErrDatabaseReadOnly + + // ErrFreePagesNotLoaded is returned when a readonly transaction without + // preloading the free pages is trying to access the free pages. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrFreePagesNotLoaded = errors.ErrFreePagesNotLoaded +) + +// These errors can occur when putting or deleting a value or a bucket. +var ( + // ErrBucketNotFound is returned when trying to access a bucket that has + // not been created yet. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrBucketNotFound = errors.ErrBucketNotFound + + // ErrBucketExists is returned when creating a bucket that already exists. 
+ // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrBucketExists = errors.ErrBucketExists + + // ErrBucketNameRequired is returned when creating a bucket with a blank name. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrBucketNameRequired = errors.ErrBucketNameRequired + + // ErrKeyRequired is returned when inserting a zero-length key. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrKeyRequired = errors.ErrKeyRequired + + // ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrKeyTooLarge = errors.ErrKeyTooLarge + + // ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrValueTooLarge = errors.ErrValueTooLarge + + // ErrIncompatibleValue is returned when trying to create or delete a bucket + // on an existing non-bucket key or when trying to create or delete a + // non-bucket key on an existing bucket key. + // + // Deprecated: Use the error variables defined in the bbolt/errors package. + ErrIncompatibleValue = errors.ErrIncompatibleValue +) diff --git a/errors/errors.go b/errors/errors.go new file mode 100644 index 0000000..dbebd63 --- /dev/null +++ b/errors/errors.go @@ -0,0 +1,87 @@ +// Package errors defines the error variables that may be returned +// during bbolt operations. +package errors + +import "errors" + +// These errors can be returned when opening or calling methods on a DB. +var ( + // ErrDatabaseNotOpen is returned when a DB instance is accessed before it + // is opened or after it is closed. + ErrDatabaseNotOpen = errors.New("database not open") + + // ErrInvalid is returned when both meta pages on a database are invalid. + // This typically occurs when a file is not a bolt database. 
+ ErrInvalid = errors.New("invalid database") + + // ErrInvalidMapping is returned when the database file fails to get mapped. + ErrInvalidMapping = errors.New("database isn't correctly mapped") + + // ErrVersionMismatch is returned when the data file was created with a + // different version of Bolt. + ErrVersionMismatch = errors.New("version mismatch") + + // ErrChecksum is returned when a checksum mismatch occurs on either of the two meta pages. + ErrChecksum = errors.New("checksum error") + + // ErrTimeout is returned when a database cannot obtain an exclusive lock + // on the data file after the timeout passed to Open(). + ErrTimeout = errors.New("timeout") +) + +// These errors can occur when beginning or committing a Tx. +var ( + // ErrTxNotWritable is returned when performing a write operation on a + // read-only transaction. + ErrTxNotWritable = errors.New("tx not writable") + + // ErrTxClosed is returned when committing or rolling back a transaction + // that has already been committed or rolled back. + ErrTxClosed = errors.New("tx closed") + + // ErrDatabaseReadOnly is returned when a mutating transaction is started on a + // read-only database. + ErrDatabaseReadOnly = errors.New("database is in read-only mode") + + // ErrFreePagesNotLoaded is returned when a readonly transaction without + // preloading the free pages is trying to access the free pages. + ErrFreePagesNotLoaded = errors.New("free pages are not pre-loaded") +) + +// These errors can occur when putting or deleting a value or a bucket. +var ( + // ErrBucketNotFound is returned when trying to access a bucket that has + // not been created yet. + ErrBucketNotFound = errors.New("bucket not found") + + // ErrBucketExists is returned when creating a bucket that already exists. + ErrBucketExists = errors.New("bucket already exists") + + // ErrBucketNameRequired is returned when creating a bucket with a blank name. 
+ ErrBucketNameRequired = errors.New("bucket name required") + + // ErrKeyRequired is returned when inserting a zero-length key. + ErrKeyRequired = errors.New("key required") + + // ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize. + ErrKeyTooLarge = errors.New("key too large") + + // ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize. + ErrValueTooLarge = errors.New("value too large") + + // ErrMaxSizeReached is returned when the configured maximum size of the data file is reached. + ErrMaxSizeReached = errors.New("database reached maximum size") + + // ErrIncompatibleValue is returned when trying to create or delete a bucket + // on an existing non-bucket key or when trying to create or delete a + // non-bucket key on an existing bucket key. + ErrIncompatibleValue = errors.New("incompatible value") + + // ErrSameBuckets is returned when trying to move a sub-bucket between + // source and target buckets, while source and target buckets are the same. + ErrSameBuckets = errors.New("the source and target are the same bucket") + + // ErrDifferentDB is returned when trying to move a sub-bucket between + // source and target buckets, while source and target buckets are in different database files. 
+ ErrDifferentDB = errors.New("the source and target buckets are in different database files") +) diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..185e312 --- /dev/null +++ b/go.mod @@ -0,0 +1,21 @@ +module github.com/tutus-one/tutus-bolt + +go 1.24 + +toolchain go1.24.3 + +require ( + github.com/spf13/cobra v1.9.1 + github.com/spf13/pflag v1.0.6 + github.com/stretchr/testify v1.10.0 + go.etcd.io/gofail v0.2.0 + golang.org/x/sync v0.14.0 + golang.org/x/sys v0.33.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..de26c3e --- /dev/null +++ b/go.sum @@ -0,0 +1,24 @@ +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify 
v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +go.etcd.io/gofail v0.2.0 h1:p19drv16FKK345a09a1iubchlw/vmRuksmRzgBIGjcA= +go.etcd.io/gofail v0.2.0/go.mod h1:nL3ILMGfkXTekKI3clMBNazKnjUZjYLKmBHzsVAnC1o= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/btesting/btesting.go b/internal/btesting/btesting.go new file mode 100644 index 0000000..1409393 --- /dev/null +++ b/internal/btesting/btesting.go @@ -0,0 +1,230 @@ +package btesting + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" +) + +var statsFlag = flag.Bool("stats", false, "show performance stats") + +const ( + // TestFreelistType is used as an env variable for test to indicate the backend type. + TestFreelistType = "TEST_FREELIST_TYPE" + // TestEnableStrictMode is used to enable strict check by default after opening each DB. + TestEnableStrictMode = "TEST_ENABLE_STRICT_MODE" +) + +// DB is a test wrapper for bolt.DB. +type DB struct { + *bolt.DB + f string + o *bolt.Options + t testing.TB +} + +// MustCreateDB returns a new, open DB at a temporary location. 
+func MustCreateDB(t testing.TB) *DB { + return MustCreateDBWithOption(t, nil) +} + +// MustCreateDBWithOption returns a new, open DB at a temporary location with given options. +func MustCreateDBWithOption(t testing.TB, o *bolt.Options) *DB { + f := filepath.Join(t.TempDir(), "db") + return MustOpenDBWithOption(t, f, o) +} + +func MustOpenDBWithOption(t testing.TB, f string, o *bolt.Options) *DB { + db, err := OpenDBWithOption(t, f, o) + require.NoError(t, err) + require.NotNil(t, db) + return db +} + +func OpenDBWithOption(t testing.TB, f string, o *bolt.Options) (*DB, error) { + t.Logf("Opening bbolt DB at: %s", f) + if o == nil { + o = bolt.DefaultOptions + } + + freelistType := bolt.FreelistArrayType + if env := os.Getenv(TestFreelistType); env == string(bolt.FreelistMapType) { + freelistType = bolt.FreelistMapType + } + + o.FreelistType = freelistType + + db, err := bolt.Open(f, 0600, o) + if err != nil { + return nil, err + } + resDB := &DB{ + DB: db, + f: f, + o: o, + t: t, + } + resDB.strictModeEnabledDefault() + t.Cleanup(resDB.PostTestCleanup) + return resDB, nil +} + +func (db *DB) PostTestCleanup() { + // Check database consistency after every test. + if db.DB != nil { + db.MustCheck() + db.MustClose() + } +} + +// Close closes the database but does NOT delete the underlying file. +func (db *DB) Close() error { + if db.DB != nil { + // Log statistics. + if *statsFlag { + db.PrintStats() + } + db.t.Logf("Closing bbolt DB at: %s", db.f) + err := db.DB.Close() + if err != nil { + return err + } + db.DB = nil + } + return nil +} + +// MustClose closes the database but does NOT delete the underlying file. +func (db *DB) MustClose() { + err := db.Close() + require.NoError(db.t, err) +} + +func (db *DB) MustDeleteFile() { + err := os.Remove(db.Path()) + require.NoError(db.t, err) +} + +func (db *DB) SetOptions(o *bolt.Options) { + db.o = o +} + +// MustReopen reopen the database. Panic on error. 
+func (db *DB) MustReopen() { + if db.DB != nil { + panic("Please call Close() before MustReopen()") + } + db.t.Logf("Reopening bbolt DB at: %s", db.f) + indb, err := bolt.Open(db.Path(), 0600, db.o) + require.NoError(db.t, err) + db.DB = indb + db.strictModeEnabledDefault() +} + +// MustCheck runs a consistency check on the database and exits the process if any errors are found. +func (db *DB) MustCheck() { + err := db.View(func(tx *bolt.Tx) error { + // Collect all the errors. + var errors []error + for err := range tx.Check() { + errors = append(errors, err) + if len(errors) > 10 { + break + } + } + + // If errors occurred, copy the DB and print the errors. + if len(errors) > 0 { + var path = filepath.Join(db.t.TempDir(), "db.backup") + err := tx.CopyFile(path, 0600) + require.NoError(db.t, err) + + // Print errors. + fmt.Print("\n\n") + fmt.Printf("consistency check failed (%d errors)\n", len(errors)) + for _, err := range errors { + fmt.Println(err) + } + fmt.Println("") + fmt.Println("db saved to:") + fmt.Println(path) + fmt.Print("\n\n") + os.Exit(-1) + } + + return nil + }) + require.NoError(db.t, err) +} + +// Fill - fills the DB using numTx transactions and numKeysPerTx. +func (db *DB) Fill(bucket []byte, numTx int, numKeysPerTx int, + keyGen func(tx int, key int) []byte, + valueGen func(tx int, key int) []byte) error { + for tr := 0; tr < numTx; tr++ { + err := db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists(bucket) + for i := 0; i < numKeysPerTx; i++ { + if err := b.Put(keyGen(tr, i), valueGen(tr, i)); err != nil { + return err + } + } + return nil + }) + if err != nil { + return err + } + } + return nil +} + +func (db *DB) Path() string { + return db.f +} + +// CopyTempFile copies a database to a temporary file. 
+func (db *DB) CopyTempFile() { + path := filepath.Join(db.t.TempDir(), "db.copy") + err := db.View(func(tx *bolt.Tx) error { + return tx.CopyFile(path, 0600) + }) + require.NoError(db.t, err) + fmt.Println("db copied to: ", path) +} + +// PrintStats prints the database stats +func (db *DB) PrintStats() { + var stats = db.Stats() + fmt.Printf("[db] %-20s %-20s %-20s\n", + fmt.Sprintf("pg(%d/%d)", stats.TxStats.GetPageCount(), stats.TxStats.GetPageAlloc()), + fmt.Sprintf("cur(%d)", stats.TxStats.GetCursorCount()), + fmt.Sprintf("node(%d/%d)", stats.TxStats.GetNodeCount(), stats.TxStats.GetNodeDeref()), + ) + fmt.Printf(" %-20s %-20s %-20s\n", + fmt.Sprintf("rebal(%d/%v)", stats.TxStats.GetRebalance(), truncDuration(stats.TxStats.GetRebalanceTime())), + fmt.Sprintf("spill(%d/%v)", stats.TxStats.GetSpill(), truncDuration(stats.TxStats.GetSpillTime())), + fmt.Sprintf("w(%d/%v)", stats.TxStats.GetWrite(), truncDuration(stats.TxStats.GetWriteTime())), + ) +} + +func truncDuration(d time.Duration) string { + return regexp.MustCompile(`^(\d+)(\.\d+)`).ReplaceAllString(d.String(), "$1") +} + +func (db *DB) strictModeEnabledDefault() { + strictModeEnabled := strings.ToLower(os.Getenv(TestEnableStrictMode)) + db.StrictMode = strictModeEnabled == "true" +} + +func (db *DB) ForceDisableStrictMode() { + db.StrictMode = false +} diff --git a/internal/common/bucket.go b/internal/common/bucket.go new file mode 100644 index 0000000..2b4ab14 --- /dev/null +++ b/internal/common/bucket.go @@ -0,0 +1,54 @@ +package common + +import ( + "fmt" + "unsafe" +) + +const BucketHeaderSize = int(unsafe.Sizeof(InBucket{})) + +// InBucket represents the on-file representation of a bucket. +// This is stored as the "value" of a bucket key. If the bucket is small enough, +// then its root page can be stored inline in the "value", after the bucket +// header. In the case of inline buckets, the "root" will be 0. 
+type InBucket struct { + root Pgid // page id of the bucket's root-level page + sequence uint64 // monotonically incrementing, used by NextSequence() +} + +func NewInBucket(root Pgid, seq uint64) InBucket { + return InBucket{ + root: root, + sequence: seq, + } +} + +func (b *InBucket) RootPage() Pgid { + return b.root +} + +func (b *InBucket) SetRootPage(id Pgid) { + b.root = id +} + +// InSequence returns the sequence. The reason why not naming it `Sequence` +// is to avoid duplicated name as `(*Bucket) Sequence()` +func (b *InBucket) InSequence() uint64 { + return b.sequence +} + +func (b *InBucket) SetInSequence(v uint64) { + b.sequence = v +} + +func (b *InBucket) IncSequence() { + b.sequence++ +} + +func (b *InBucket) InlinePage(v []byte) *Page { + return (*Page)(unsafe.Pointer(&v[BucketHeaderSize])) +} + +func (b *InBucket) String() string { + return fmt.Sprintf("<pgid=%d,seq=%d>", b.root, b.sequence) +} diff --git a/internal/common/inode.go b/internal/common/inode.go new file mode 100644 index 0000000..080b9af --- /dev/null +++ b/internal/common/inode.go @@ -0,0 +1,115 @@ +package common + +import "unsafe" + +// Inode represents an internal node inside of a node. +// It can be used to point to elements in a page or point +// to an element which hasn't been added to a page yet. 
+type Inode struct { + flags uint32 + pgid Pgid + key []byte + value []byte +} + +type Inodes []Inode + +func (in *Inode) Flags() uint32 { + return in.flags +} + +func (in *Inode) SetFlags(flags uint32) { + in.flags = flags +} + +func (in *Inode) Pgid() Pgid { + return in.pgid +} + +func (in *Inode) SetPgid(id Pgid) { + in.pgid = id +} + +func (in *Inode) Key() []byte { + return in.key +} + +func (in *Inode) SetKey(key []byte) { + in.key = key +} + +func (in *Inode) Value() []byte { + return in.value +} + +func (in *Inode) SetValue(value []byte) { + in.value = value +} + +func ReadInodeFromPage(p *Page) Inodes { + inodes := make(Inodes, int(p.Count())) + isLeaf := p.IsLeafPage() + for i := 0; i < int(p.Count()); i++ { + inode := &inodes[i] + if isLeaf { + elem := p.LeafPageElement(uint16(i)) + inode.SetFlags(elem.Flags()) + inode.SetKey(elem.Key()) + inode.SetValue(elem.Value()) + } else { + elem := p.BranchPageElement(uint16(i)) + inode.SetPgid(elem.Pgid()) + inode.SetKey(elem.Key()) + } + Assert(len(inode.Key()) > 0, "read: zero-length inode key") + } + + return inodes +} + +func WriteInodeToPage(inodes Inodes, p *Page) uint32 { + // Loop over each item and write it to the page. + // off tracks the offset into p of the start of the next data. + off := unsafe.Sizeof(*p) + p.PageElementSize()*uintptr(len(inodes)) + isLeaf := p.IsLeafPage() + for i, item := range inodes { + Assert(len(item.Key()) > 0, "write: zero-length inode key") + + // Create a slice to write into of needed size and advance + // byte pointer for next iteration. + sz := len(item.Key()) + len(item.Value()) + b := UnsafeByteSlice(unsafe.Pointer(p), off, 0, sz) + off += uintptr(sz) + + // Write the page element. 
+ if isLeaf { + elem := p.LeafPageElement(uint16(i)) + elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))) + elem.SetFlags(item.Flags()) + elem.SetKsize(uint32(len(item.Key()))) + elem.SetVsize(uint32(len(item.Value()))) + } else { + elem := p.BranchPageElement(uint16(i)) + elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))) + elem.SetKsize(uint32(len(item.Key()))) + elem.SetPgid(item.Pgid()) + Assert(elem.Pgid() != p.Id(), "write: circular dependency occurred") + } + + // Write data for the element to the end of the page. + l := copy(b, item.Key()) + copy(b[l:], item.Value()) + } + + return uint32(off) +} + +func UsedSpaceInPage(inodes Inodes, p *Page) uint32 { + off := unsafe.Sizeof(*p) + p.PageElementSize()*uintptr(len(inodes)) + for _, item := range inodes { + sz := len(item.Key()) + len(item.Value()) + off += uintptr(sz) + } + + return uint32(off) +} diff --git a/internal/common/meta.go b/internal/common/meta.go new file mode 100644 index 0000000..31e5413 --- /dev/null +++ b/internal/common/meta.go @@ -0,0 +1,161 @@ +package common + +import ( + "fmt" + "hash/fnv" + "io" + "unsafe" + + "github.com/tutus-one/tutus-bolt/errors" +) + +type Meta struct { + magic uint32 + version uint32 + pageSize uint32 + flags uint32 + root InBucket + freelist Pgid + pgid Pgid + txid Txid + checksum uint64 +} + +// Validate checks the marker bytes and version of the meta page to ensure it matches this binary. +func (m *Meta) Validate() error { + if m.magic != Magic { + return errors.ErrInvalid + } else if m.version != Version { + return errors.ErrVersionMismatch + } else if m.checksum != m.Sum64() { + return errors.ErrChecksum + } + return nil +} + +// Copy copies one meta object to another. +func (m *Meta) Copy(dest *Meta) { + *dest = *m +} + +// Write writes the meta onto a page. 
+func (m *Meta) Write(p *Page) { + if m.root.root >= m.pgid { + panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) + } else if m.freelist >= m.pgid && m.freelist != PgidNoFreelist { + // TODO: reject pgidNoFreeList if !NoFreelistSync + panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) + } + + // Page id is either going to be 0 or 1 which we can determine by the transaction ID. + p.id = Pgid(m.txid % 2) + p.SetFlags(MetaPageFlag) + + // Calculate the checksum. + m.checksum = m.Sum64() + + m.Copy(p.Meta()) +} + +// Sum64 generates the checksum for the meta. +func (m *Meta) Sum64() uint64 { + var h = fnv.New64a() + _, _ = h.Write((*[unsafe.Offsetof(Meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) + return h.Sum64() +} + +func (m *Meta) Magic() uint32 { + return m.magic +} + +func (m *Meta) SetMagic(v uint32) { + m.magic = v +} + +func (m *Meta) Version() uint32 { + return m.version +} + +func (m *Meta) SetVersion(v uint32) { + m.version = v +} + +func (m *Meta) PageSize() uint32 { + return m.pageSize +} + +func (m *Meta) SetPageSize(v uint32) { + m.pageSize = v +} + +func (m *Meta) Flags() uint32 { + return m.flags +} + +func (m *Meta) SetFlags(v uint32) { + m.flags = v +} + +func (m *Meta) SetRootBucket(b InBucket) { + m.root = b +} + +func (m *Meta) RootBucket() *InBucket { + return &m.root +} + +func (m *Meta) Freelist() Pgid { + return m.freelist +} + +func (m *Meta) SetFreelist(v Pgid) { + m.freelist = v +} + +func (m *Meta) IsFreelistPersisted() bool { + return m.freelist != PgidNoFreelist +} + +func (m *Meta) Pgid() Pgid { + return m.pgid +} + +func (m *Meta) SetPgid(id Pgid) { + m.pgid = id +} + +func (m *Meta) Txid() Txid { + return m.txid +} + +func (m *Meta) SetTxid(id Txid) { + m.txid = id +} + +func (m *Meta) IncTxid() { + m.txid += 1 +} + +func (m *Meta) DecTxid() { + m.txid -= 1 +} + +func (m *Meta) Checksum() uint64 { + return m.checksum +} + +func (m *Meta) SetChecksum(v 
uint64) { + m.checksum = v +} + +func (m *Meta) Print(w io.Writer) { + fmt.Fprintf(w, "Version: %d\n", m.version) + fmt.Fprintf(w, "Page Size: %d bytes\n", m.pageSize) + fmt.Fprintf(w, "Flags: %08x\n", m.flags) + fmt.Fprintf(w, "Root: <pgid=%d>\n", m.root.root) + fmt.Fprintf(w, "Freelist: <pgid=%d>\n", m.freelist) + fmt.Fprintf(w, "HWM: <pgid=%d>\n", m.pgid) + fmt.Fprintf(w, "Txn ID: %d\n", m.txid) + fmt.Fprintf(w, "Checksum: %016x\n", m.checksum) + fmt.Fprintf(w, "\n") +} diff --git a/internal/common/page.go b/internal/common/page.go new file mode 100644 index 0000000..4453160 --- /dev/null +++ b/internal/common/page.go @@ -0,0 +1,391 @@ +package common + +import ( + "fmt" + "os" + "sort" + "unsafe" +) + +const PageHeaderSize = unsafe.Sizeof(Page{}) + +const MinKeysPerPage = 2 + +const BranchPageElementSize = unsafe.Sizeof(branchPageElement{}) +const LeafPageElementSize = unsafe.Sizeof(leafPageElement{}) +const pgidSize = unsafe.Sizeof(Pgid(0)) + +const ( + BranchPageFlag = 0x01 + LeafPageFlag = 0x02 + MetaPageFlag = 0x04 + FreelistPageFlag = 0x10 +) + +const ( + BucketLeafFlag = 0x01 +) + +type Pgid uint64 + +type Page struct { + id Pgid + flags uint16 + count uint16 + overflow uint32 +} + +func NewPage(id Pgid, flags, count uint16, overflow uint32) *Page { + return &Page{ + id: id, + flags: flags, + count: count, + overflow: overflow, + } +} + +// Typ returns a human-readable page type string used for debugging. 
+func (p *Page) Typ() string { + if p.IsBranchPage() { + return "branch" + } else if p.IsLeafPage() { + return "leaf" + } else if p.IsMetaPage() { + return "meta" + } else if p.IsFreelistPage() { + return "freelist" + } + return fmt.Sprintf("unknown<%02x>", p.flags) +} + +func (p *Page) IsBranchPage() bool { + return p.flags == BranchPageFlag +} + +func (p *Page) IsLeafPage() bool { + return p.flags == LeafPageFlag +} + +func (p *Page) IsMetaPage() bool { + return p.flags == MetaPageFlag +} + +func (p *Page) IsFreelistPage() bool { + return p.flags == FreelistPageFlag +} + +// Meta returns a pointer to the metadata section of the page. +func (p *Page) Meta() *Meta { + return (*Meta)(UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))) +} + +func (p *Page) FastCheck(id Pgid) { + Assert(p.id == id, "Page expected to be: %v, but self identifies as %v", id, p.id) + // Only one flag of page-type can be set. + Assert(p.IsBranchPage() || + p.IsLeafPage() || + p.IsMetaPage() || + p.IsFreelistPage(), + "page %v: has unexpected type/flags: %x", p.id, p.flags) +} + +// LeafPageElement retrieves the leaf node by index +func (p *Page) LeafPageElement(index uint16) *leafPageElement { + return (*leafPageElement)(UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), + LeafPageElementSize, int(index))) +} + +// LeafPageElements retrieves a list of leaf nodes. +func (p *Page) LeafPageElements() []leafPageElement { + if p.count == 0 { + return nil + } + data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + elems := unsafe.Slice((*leafPageElement)(data), int(p.count)) + return elems +} + +// BranchPageElement retrieves the branch node by index +func (p *Page) BranchPageElement(index uint16) *branchPageElement { + return (*branchPageElement)(UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), + unsafe.Sizeof(branchPageElement{}), int(index))) +} + +// BranchPageElements retrieves a list of branch nodes. 
+func (p *Page) BranchPageElements() []branchPageElement { + if p.count == 0 { + return nil + } + data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + elems := unsafe.Slice((*branchPageElement)(data), int(p.count)) + return elems +} + +func (p *Page) FreelistPageCount() (int, int) { + Assert(p.IsFreelistPage(), fmt.Sprintf("can't get freelist page count from a non-freelist page: %2x", p.flags)) + + // If the page.count is at the max uint16 value (64k) then it's considered + // an overflow and the size of the freelist is stored as the first element. + var idx, count = 0, int(p.count) + if count == 0xFFFF { + idx = 1 + c := *(*Pgid)(UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))) + count = int(c) + if count < 0 { + panic(fmt.Sprintf("leading element count %d overflows int", c)) + } + } + + return idx, count +} + +func (p *Page) FreelistPageIds() []Pgid { + Assert(p.IsFreelistPage(), fmt.Sprintf("can't get freelist page IDs from a non-freelist page: %2x", p.flags)) + + idx, count := p.FreelistPageCount() + + if count == 0 { + return nil + } + + data := UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), pgidSize, idx) + ids := unsafe.Slice((*Pgid)(data), count) + + return ids +} + +// hexdump writes n bytes of the page to STDERR as hex output. 
+func (p *Page) hexdump(n int) { + buf := UnsafeByteSlice(unsafe.Pointer(p), 0, 0, n) + fmt.Fprintf(os.Stderr, "%x\n", buf) +} + +func (p *Page) PageElementSize() uintptr { + if p.IsLeafPage() { + return LeafPageElementSize + } + return BranchPageElementSize +} + +func (p *Page) Id() Pgid { + return p.id +} + +func (p *Page) SetId(target Pgid) { + p.id = target +} + +func (p *Page) Flags() uint16 { + return p.flags +} + +func (p *Page) SetFlags(v uint16) { + p.flags = v +} + +func (p *Page) Count() uint16 { + return p.count +} + +func (p *Page) SetCount(target uint16) { + p.count = target +} + +func (p *Page) Overflow() uint32 { + return p.overflow +} + +func (p *Page) SetOverflow(target uint32) { + p.overflow = target +} + +func (p *Page) String() string { + return fmt.Sprintf("ID: %d, Type: %s, count: %d, overflow: %d", p.id, p.Typ(), p.count, p.overflow) +} + +type Pages []*Page + +func (s Pages) Len() int { return len(s) } +func (s Pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s Pages) Less(i, j int) bool { return s[i].id < s[j].id } + +// branchPageElement represents a node on a branch page. +type branchPageElement struct { + pos uint32 + ksize uint32 + pgid Pgid +} + +func (n *branchPageElement) Pos() uint32 { + return n.pos +} + +func (n *branchPageElement) SetPos(v uint32) { + n.pos = v +} + +func (n *branchPageElement) Ksize() uint32 { + return n.ksize +} + +func (n *branchPageElement) SetKsize(v uint32) { + n.ksize = v +} + +func (n *branchPageElement) Pgid() Pgid { + return n.pgid +} + +func (n *branchPageElement) SetPgid(v Pgid) { + n.pgid = v +} + +// Key returns a byte slice of the node key. +func (n *branchPageElement) Key() []byte { + return UnsafeByteSlice(unsafe.Pointer(n), 0, int(n.pos), int(n.pos)+int(n.ksize)) +} + +// leafPageElement represents a node on a leaf page. 
+type leafPageElement struct { + flags uint32 + pos uint32 + ksize uint32 + vsize uint32 +} + +func NewLeafPageElement(flags, pos, ksize, vsize uint32) *leafPageElement { + return &leafPageElement{ + flags: flags, + pos: pos, + ksize: ksize, + vsize: vsize, + } +} + +func (n *leafPageElement) Flags() uint32 { + return n.flags +} + +func (n *leafPageElement) SetFlags(v uint32) { + n.flags = v +} + +func (n *leafPageElement) Pos() uint32 { + return n.pos +} + +func (n *leafPageElement) SetPos(v uint32) { + n.pos = v +} + +func (n *leafPageElement) Ksize() uint32 { + return n.ksize +} + +func (n *leafPageElement) SetKsize(v uint32) { + n.ksize = v +} + +func (n *leafPageElement) Vsize() uint32 { + return n.vsize +} + +func (n *leafPageElement) SetVsize(v uint32) { + n.vsize = v +} + +// Key returns a byte slice of the node key. +func (n *leafPageElement) Key() []byte { + i := int(n.pos) + j := i + int(n.ksize) + return UnsafeByteSlice(unsafe.Pointer(n), 0, i, j) +} + +// Value returns a byte slice of the node value. +func (n *leafPageElement) Value() []byte { + i := int(n.pos) + int(n.ksize) + j := i + int(n.vsize) + return UnsafeByteSlice(unsafe.Pointer(n), 0, i, j) +} + +func (n *leafPageElement) IsBucketEntry() bool { + return n.flags&uint32(BucketLeafFlag) != 0 +} + +func (n *leafPageElement) Bucket() *InBucket { + if n.IsBucketEntry() { + return LoadBucket(n.Value()) + } else { + return nil + } +} + +// PageInfo represents human readable information about a page. +type PageInfo struct { + ID int + Type string + Count int + OverflowCount int +} + +type Pgids []Pgid + +func (s Pgids) Len() int { return len(s) } +func (s Pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s Pgids) Less(i, j int) bool { return s[i] < s[j] } + +// Merge returns the sorted union of a and b. +func (s Pgids) Merge(b Pgids) Pgids { + // Return the opposite slice if one is nil. 
+ if len(s) == 0 { + return b + } + if len(b) == 0 { + return s + } + merged := make(Pgids, len(s)+len(b)) + Mergepgids(merged, s, b) + return merged +} + +// Mergepgids copies the sorted union of a and b into dst. +// If dst is too small, it panics. +func Mergepgids(dst, a, b Pgids) { + if len(dst) < len(a)+len(b) { + panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b))) + } + // Copy in the opposite slice if one is nil. + if len(a) == 0 { + copy(dst, b) + return + } + if len(b) == 0 { + copy(dst, a) + return + } + + // Merged will hold all elements from both lists. + merged := dst[:0] + + // Assign lead to the slice with a lower starting value, follow to the higher value. + lead, follow := a, b + if b[0] < a[0] { + lead, follow = b, a + } + + // Continue while there are elements in the lead. + for len(lead) > 0 { + // Merge largest prefix of lead that is ahead of follow[0]. + n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] }) + merged = append(merged, lead[:n]...) + if n >= len(lead) { + break + } + + // Swap lead and follow. + lead, follow = follow, lead[n:] + } + + // Append what's left in follow. + _ = append(merged, follow...) +} diff --git a/internal/common/page_test.go b/internal/common/page_test.go new file mode 100644 index 0000000..376ab6a --- /dev/null +++ b/internal/common/page_test.go @@ -0,0 +1,72 @@ +package common + +import ( + "reflect" + "sort" + "testing" + "testing/quick" +) + +// Ensure that the page type can be returned in human readable format. 
+func TestPage_typ(t *testing.T) { + if typ := (&Page{flags: BranchPageFlag}).Typ(); typ != "branch" { + t.Fatalf("exp=branch; got=%v", typ) + } + if typ := (&Page{flags: LeafPageFlag}).Typ(); typ != "leaf" { + t.Fatalf("exp=leaf; got=%v", typ) + } + if typ := (&Page{flags: MetaPageFlag}).Typ(); typ != "meta" { + t.Fatalf("exp=meta; got=%v", typ) + } + if typ := (&Page{flags: FreelistPageFlag}).Typ(); typ != "freelist" { + t.Fatalf("exp=freelist; got=%v", typ) + } + if typ := (&Page{flags: 20000}).Typ(); typ != "unknown<4e20>" { + t.Fatalf("exp=unknown<4e20>; got=%v", typ) + } +} + +// Ensure that the hexdump debugging function doesn't blow up. +func TestPage_dump(t *testing.T) { + (&Page{id: 256}).hexdump(16) +} + +func TestPgids_merge(t *testing.T) { + a := Pgids{4, 5, 6, 10, 11, 12, 13, 27} + b := Pgids{1, 3, 8, 9, 25, 30} + c := a.Merge(b) + if !reflect.DeepEqual(c, Pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) { + t.Errorf("mismatch: %v", c) + } + + a = Pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36} + b = Pgids{8, 9, 25, 30} + c = a.Merge(b) + if !reflect.DeepEqual(c, Pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) { + t.Errorf("mismatch: %v", c) + } +} + +func TestPgids_merge_quick(t *testing.T) { + if err := quick.Check(func(a, b Pgids) bool { + // Sort incoming lists. + sort.Sort(a) + sort.Sort(b) + + // Merge the two lists together. + got := a.Merge(b) + + // The expected value should be the two lists combined and sorted. + exp := append(a, b...) + sort.Sort(exp) + + if !reflect.DeepEqual(exp, got) { + t.Errorf("\nexp=%+v\ngot=%+v\n", exp, got) + return false + } + + return true + }, nil); err != nil { + t.Fatal(err) + } +} diff --git a/internal/common/types.go b/internal/common/types.go new file mode 100644 index 0000000..8ad8279 --- /dev/null +++ b/internal/common/types.go @@ -0,0 +1,40 @@ +package common + +import ( + "os" + "runtime" + "time" +) + +// MaxMmapStep is the largest step that can be taken when remapping the mmap. 
+const MaxMmapStep = 1 << 30 // 1GB + +// Version represents the data file format version. +const Version uint32 = 2 + +// Magic represents a marker value to indicate that a file is a Bolt DB. +const Magic uint32 = 0xED0CDAED + +const PgidNoFreelist Pgid = 0xffffffffffffffff + +// DO NOT EDIT. Copied from the "bolt" package. +const pageMaxAllocSize = 0xFFFFFFF + +// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when +// syncing changes to a file. This is required as some operating systems, +// such as OpenBSD, do not have a unified buffer cache (UBC) and writes +// must be synchronized using the msync(2) syscall. +const IgnoreNoSync = runtime.GOOS == "openbsd" + +// Default values if not set in a DB instance. +const ( + DefaultMaxBatchSize int = 1000 + DefaultMaxBatchDelay = 10 * time.Millisecond + DefaultAllocSize = 16 * 1024 * 1024 +) + +// DefaultPageSize is the default page size for db which is set to the OS page size. +var DefaultPageSize = os.Getpagesize() + +// Txid represents the internal transaction identifier. +type Txid uint64 diff --git a/internal/common/unsafe.go b/internal/common/unsafe.go new file mode 100644 index 0000000..9b77dd7 --- /dev/null +++ b/internal/common/unsafe.go @@ -0,0 +1,27 @@ +package common + +import ( + "unsafe" +) + +func UnsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer { + return unsafe.Pointer(uintptr(base) + offset) +} + +func UnsafeIndex(base unsafe.Pointer, offset uintptr, elemsz uintptr, n int) unsafe.Pointer { + return unsafe.Pointer(uintptr(base) + offset + uintptr(n)*elemsz) +} + +func UnsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte { + // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + // + // This memory is not allocated from C, but it is unmanaged by Go's + // garbage collector and should behave similarly, and the compiler + // should produce similar code. 
Note that this conversion allows a + // subslice to begin after the base address, with an optional offset, + // while the URL above does not cover this case and only slices from + // index 0. However, the wiki never says that the address must be to + // the beginning of a C allocation (or even that malloc was used at + // all), so this is believed to be correct. + return (*[pageMaxAllocSize]byte)(UnsafeAdd(base, offset))[i:j:j] +} diff --git a/internal/common/utils.go b/internal/common/utils.go new file mode 100644 index 0000000..bdf82a7 --- /dev/null +++ b/internal/common/utils.go @@ -0,0 +1,64 @@ +package common + +import ( + "fmt" + "io" + "os" + "unsafe" +) + +func LoadBucket(buf []byte) *InBucket { + return (*InBucket)(unsafe.Pointer(&buf[0])) +} + +func LoadPage(buf []byte) *Page { + return (*Page)(unsafe.Pointer(&buf[0])) +} + +func LoadPageMeta(buf []byte) *Meta { + return (*Meta)(unsafe.Pointer(&buf[PageHeaderSize])) +} + +func CopyFile(srcPath, dstPath string) error { + // Ensure source file exists. + _, err := os.Stat(srcPath) + if os.IsNotExist(err) { + return fmt.Errorf("source file %q not found", srcPath) + } else if err != nil { + return err + } + + // Ensure output file not exist. 
+ _, err = os.Stat(dstPath) + if err == nil { + return fmt.Errorf("output file %q already exists", dstPath) + } else if !os.IsNotExist(err) { + return err + } + + srcDB, err := os.Open(srcPath) + if err != nil { + return fmt.Errorf("failed to open source file %q: %w", srcPath, err) + } + defer srcDB.Close() + dstDB, err := os.Create(dstPath) + if err != nil { + return fmt.Errorf("failed to create output file %q: %w", dstPath, err) + } + defer dstDB.Close() + written, err := io.Copy(dstDB, srcDB) + if err != nil { + return fmt.Errorf("failed to copy database file from %q to %q: %w", srcPath, dstPath, err) + } + + srcFi, err := srcDB.Stat() + if err != nil { + return fmt.Errorf("failed to get source file info %q: %w", srcPath, err) + } + initialSize := srcFi.Size() + if initialSize != written { + return fmt.Errorf("the byte copied (%q: %d) isn't equal to the initial db size (%q: %d)", dstPath, written, srcPath, initialSize) + } + + return nil +} diff --git a/internal/common/verify.go b/internal/common/verify.go new file mode 100644 index 0000000..eac95e2 --- /dev/null +++ b/internal/common/verify.go @@ -0,0 +1,67 @@ +// Copied from https://github.com/etcd-io/etcd/blob/main/client/pkg/verify/verify.go +package common + +import ( + "fmt" + "os" + "strings" +) + +const ENV_VERIFY = "BBOLT_VERIFY" + +type VerificationType string + +const ( + ENV_VERIFY_VALUE_ALL VerificationType = "all" + ENV_VERIFY_VALUE_ASSERT VerificationType = "assert" +) + +func getEnvVerify() string { + return strings.ToLower(os.Getenv(ENV_VERIFY)) +} + +func IsVerificationEnabled(verification VerificationType) bool { + env := getEnvVerify() + return env == string(ENV_VERIFY_VALUE_ALL) || env == strings.ToLower(string(verification)) +} + +// EnableVerifications sets `ENV_VERIFY` and returns a function that +// can be used to bring the original settings. 
+func EnableVerifications(verification VerificationType) func() { + previousEnv := getEnvVerify() + os.Setenv(ENV_VERIFY, string(verification)) + return func() { + os.Setenv(ENV_VERIFY, previousEnv) + } +} + +// EnableAllVerifications enables verification and returns a function +// that can be used to bring the original settings. +func EnableAllVerifications() func() { + return EnableVerifications(ENV_VERIFY_VALUE_ALL) +} + +// DisableVerifications unsets `ENV_VERIFY` and returns a function that +// can be used to bring the original settings. +func DisableVerifications() func() { + previousEnv := getEnvVerify() + os.Unsetenv(ENV_VERIFY) + return func() { + os.Setenv(ENV_VERIFY, previousEnv) + } +} + +// Verify performs verification if the assertions are enabled. +// In the default setup running in tests and skipped in the production code. +func Verify(f func()) { + if IsVerificationEnabled(ENV_VERIFY_VALUE_ASSERT) { + f() + } +} + +// Assert will panic with a given formatted message if the given condition is false. +func Assert(condition bool, msg string, v ...any) { + if !condition { + panic(fmt.Sprintf("assertion failed: "+msg, v...)) + } +} diff --git a/internal/freelist/array.go b/internal/freelist/array.go new file mode 100644 index 0000000..bf688dc --- /dev/null +++ b/internal/freelist/array.go @@ -0,0 +1,108 @@ +package freelist + +import ( + "fmt" + "sort" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +type array struct { + *shared + + ids []common.Pgid // all free and available free page ids. +} + +func (f *array) Init(ids common.Pgids) { + f.ids = ids + f.reindex() +} + +func (f *array) Allocate(txid common.Txid, n int) common.Pgid { + if len(f.ids) == 0 { + return 0 + } + + var initial, previd common.Pgid + for i, id := range f.ids { + if id <= 1 { + panic(fmt.Sprintf("invalid page allocation: %d", id)) + } + + // Reset initial page if this is not contiguous. 
+ if previd == 0 || id-previd != 1 { + initial = id + } + + // If we found a contiguous block then remove it and return it. + if (id-initial)+1 == common.Pgid(n) { + // If we're allocating off the beginning then take the fast path + // and just adjust the existing slice. This will use extra memory + // temporarily but the append() in free() will realloc the slice + // as is necessary. + if (i + 1) == n { + f.ids = f.ids[i+1:] + } else { + copy(f.ids[i-n+1:], f.ids[i+1:]) + f.ids = f.ids[:len(f.ids)-n] + } + + // Remove from the free cache. + for i := common.Pgid(0); i < common.Pgid(n); i++ { + delete(f.cache, initial+i) + } + f.allocs[initial] = txid + return initial + } + + previd = id + } + return 0 +} + +func (f *array) FreeCount() int { + return len(f.ids) +} + +func (f *array) freePageIds() common.Pgids { + return f.ids +} + +func (f *array) mergeSpans(ids common.Pgids) { + sort.Sort(ids) + common.Verify(func() { + idsIdx := make(map[common.Pgid]struct{}) + for _, id := range f.ids { + // The existing f.ids shouldn't have duplicated free ID. + if _, ok := idsIdx[id]; ok { + panic(fmt.Sprintf("detected duplicated free page ID: %d in existing f.ids: %v", id, f.ids)) + } + idsIdx[id] = struct{}{} + } + + prev := common.Pgid(0) + for _, id := range ids { + // The ids shouldn't have duplicated free ID. Note page 0 and 1 + // are reserved for meta pages, so they can never be free page IDs. + if prev == id { + panic(fmt.Sprintf("detected duplicated free ID: %d in ids: %v", id, ids)) + } + prev = id + + // The ids shouldn't have any overlap with the existing f.ids. 
+ if _, ok := idsIdx[id]; ok { + panic(fmt.Sprintf("detected overlapped free page ID: %d between ids: %v and existing f.ids: %v", id, ids, f.ids)) + } + } + }) + f.ids = common.Pgids(f.ids).Merge(ids) +} + +func NewArrayFreelist() Interface { + a := &array{ + shared: newShared(), + ids: []common.Pgid{}, + } + a.Interface = a + return a +} diff --git a/internal/freelist/array_test.go b/internal/freelist/array_test.go new file mode 100644 index 0000000..cb4d840 --- /dev/null +++ b/internal/freelist/array_test.go @@ -0,0 +1,91 @@ +package freelist + +import ( + "reflect" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// Ensure that a freelist can find contiguous blocks of pages. +func TestFreelistArray_allocate(t *testing.T) { + f := NewArrayFreelist() + ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} + f.Init(ids) + if id := int(f.Allocate(1, 3)); id != 3 { + t.Fatalf("exp=3; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 6 { + t.Fatalf("exp=6; got=%v", id) + } + if id := int(f.Allocate(1, 3)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.Allocate(1, 2)); id != 12 { + t.Fatalf("exp=12; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 7 { + t.Fatalf("exp=7; got=%v", id) + } + if id := int(f.Allocate(1, 0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.Allocate(1, 0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := common.Pgids([]common.Pgid{9, 18}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } + + if id := int(f.Allocate(1, 1)); id != 9 { + t.Fatalf("exp=9; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 18 { + t.Fatalf("exp=18; got=%v", id) + } + if id := int(f.Allocate(1, 1)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := common.Pgids([]common.Pgid{}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } +} + +func 
TestInvalidArrayAllocation(t *testing.T) { + f := NewArrayFreelist() + // page 0 and 1 are reserved for meta pages, so they should never be free pages. + ids := []common.Pgid{1} + f.Init(ids) + require.Panics(t, func() { + f.Allocate(common.Txid(1), 1) + }) +} + +func Test_Freelist_Array_Rollback(t *testing.T) { + f := newTestArrayFreelist() + + f.Init([]common.Pgid{3, 5, 6, 7, 12, 13}) + + f.Free(100, common.NewPage(20, 0, 0, 1)) + f.Allocate(100, 3) + f.Free(100, common.NewPage(25, 0, 0, 0)) + f.Allocate(100, 2) + + require.Equal(t, map[common.Pgid]common.Txid{5: 100, 12: 100}, f.allocs) + require.Equal(t, map[common.Txid]*txPending{100: { + ids: []common.Pgid{20, 21, 25}, + alloctx: []common.Txid{0, 0, 0}, + }}, f.pending) + + f.Rollback(100) + + require.Equal(t, map[common.Pgid]common.Txid{}, f.allocs) + require.Equal(t, map[common.Txid]*txPending{}, f.pending) +} + +func newTestArrayFreelist() *array { + f := NewArrayFreelist() + return f.(*array) +} diff --git a/internal/freelist/freelist.go b/internal/freelist/freelist.go new file mode 100644 index 0000000..77c4c77 --- /dev/null +++ b/internal/freelist/freelist.go @@ -0,0 +1,82 @@ +package freelist + +import ( + "github.com/tutus-one/tutus-bolt/internal/common" +) + +type ReadWriter interface { + // Read calls Init with the page ids stored in the given page. + Read(page *common.Page) + + // Write writes the freelist into the given page. + Write(page *common.Page) + + // EstimatedWritePageSize returns the size in bytes of the freelist after serialization in Write. + // This should never underestimate the size. + EstimatedWritePageSize() int +} + +type Interface interface { + ReadWriter + + // Init initializes this freelist with the given list of pages. + Init(ids common.Pgids) + + // Allocate tries to allocate the given number of contiguous pages + // from the free list pages. It returns the starting page ID if + // available; otherwise, it returns 0. 
+ Allocate(txid common.Txid, numPages int) common.Pgid + + // Count returns the number of free and pending pages. + Count() int + + // FreeCount returns the number of free pages. + FreeCount() int + + // PendingCount returns the number of pending pages. + PendingCount() int + + // AddReadonlyTXID adds a given read-only transaction id for pending page tracking. + AddReadonlyTXID(txid common.Txid) + + // RemoveReadonlyTXID removes a given read-only transaction id for pending page tracking. + RemoveReadonlyTXID(txid common.Txid) + + // ReleasePendingPages releases any pages associated with closed read-only transactions. + ReleasePendingPages() + + // Free releases a page and its overflow for a given transaction id. + // If the page is already free or is one of the meta pages, then a panic will occur. + Free(txId common.Txid, p *common.Page) + + // Freed returns whether a given page is in the free list. + Freed(pgId common.Pgid) bool + + // Rollback removes the pages from a given pending tx. + Rollback(txId common.Txid) + + // Copyall copies a list of all free ids and all pending ids in one sorted list. + // f.count returns the minimum length required for dst. + Copyall(dst []common.Pgid) + + // Reload reads the freelist from a page and filters out pending items. + Reload(p *common.Page) + + // NoSyncReload reads the freelist from Pgids and filters out pending items. + NoSyncReload(pgIds common.Pgids) + + // freePageIds returns the IDs of all free pages. Returns an empty slice if no free pages are available. + freePageIds() common.Pgids + + // pendingPageIds returns all pending pages by transaction id. + pendingPageIds() map[common.Txid]*txPending + + // release moves all page ids for a transaction id (or older) to the freelist. + release(txId common.Txid) + + // releaseRange moves pending pages allocated within an extent [begin,end] to the free list. 
+ releaseRange(begin, end common.Txid) + + // mergeSpans is merging the given pages into the freelist + mergeSpans(ids common.Pgids) +} diff --git a/internal/freelist/freelist_test.go b/internal/freelist/freelist_test.go new file mode 100644 index 0000000..834b42b --- /dev/null +++ b/internal/freelist/freelist_test.go @@ -0,0 +1,622 @@ +package freelist + +import ( + "fmt" + "math" + "math/rand" + "os" + "reflect" + "slices" + "sort" + "testing" + "testing/quick" + "unsafe" + + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// TestFreelistType is used as a env variable for test to indicate the backend type +const TestFreelistType = "TEST_FREELIST_TYPE" + +// Ensure that a page is added to a transaction's freelist. +func TestFreelist_free(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 0)) + if !reflect.DeepEqual([]common.Pgid{12}, f.pendingPageIds()[100].ids) { + t.Fatalf("exp=%v; got=%v", []common.Pgid{12}, f.pendingPageIds()[100].ids) + } +} + +// Ensure that a page and its overflow is added to a transaction's freelist. 
+func TestFreelist_free_overflow(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 3)) + if exp := []common.Pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pendingPageIds()[100].ids) { + t.Fatalf("exp=%v; got=%v", exp, f.pendingPageIds()[100].ids) + } +} + +// Ensure that double freeing a page is causing a panic +func TestFreelist_free_double_free_panics(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 3)) + require.Panics(t, func() { + f.Free(100, common.NewPage(12, 0, 0, 3)) + }) +} + +// Ensure that attempting to free the meta page panics +func TestFreelist_free_meta_panics(t *testing.T) { + f := newTestFreelist() + require.Panics(t, func() { + f.Free(100, common.NewPage(0, 0, 0, 0)) + }) + require.Panics(t, func() { + f.Free(100, common.NewPage(1, 0, 0, 0)) + }) +} + +func TestFreelist_free_freelist(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, common.FreelistPageFlag, 0, 0)) + pp := f.pendingPageIds()[100] + require.Equal(t, []common.Pgid{12}, pp.ids) + require.Equal(t, []common.Txid{0}, pp.alloctx) +} + +func TestFreelist_free_freelist_alloctx(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, common.FreelistPageFlag, 0, 0)) + f.Rollback(100) + require.Empty(t, f.freePageIds()) + require.Empty(t, f.pendingPageIds()) + require.False(t, f.Freed(12)) + + f.Free(101, common.NewPage(12, common.FreelistPageFlag, 0, 0)) + require.True(t, f.Freed(12)) + if exp := []common.Pgid{12}; !reflect.DeepEqual(exp, f.pendingPageIds()[101].ids) { + t.Fatalf("exp=%v; got=%v", exp, f.pendingPageIds()[101].ids) + } + f.ReleasePendingPages() + require.True(t, f.Freed(12)) + require.Empty(t, f.pendingPageIds()) + if exp := common.Pgids([]common.Pgid{12}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } +} + +// Ensure that a transaction's free pages can be released. 
+func TestFreelist_release(t *testing.T) { + f := newTestFreelist() + f.Free(100, common.NewPage(12, 0, 0, 1)) + f.Free(100, common.NewPage(9, 0, 0, 0)) + f.Free(102, common.NewPage(39, 0, 0, 0)) + f.release(100) + f.release(101) + if exp := common.Pgids([]common.Pgid{9, 12, 13}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } + + f.release(102) + if exp := common.Pgids([]common.Pgid{9, 12, 13, 39}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } +} + +// Ensure that releaseRange handles boundary conditions correctly +func TestFreelist_releaseRange(t *testing.T) { + type testRange struct { + begin, end common.Txid + } + + type testPage struct { + id common.Pgid + n int + allocTxn common.Txid + freeTxn common.Txid + } + + var releaseRangeTests = []struct { + title string + pagesIn []testPage + releaseRanges []testRange + wantFree []common.Pgid + }{ + { + title: "Single pending in range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 300}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending with minimum end range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 200}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending outsize minimum end range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{1, 199}}, + wantFree: []common.Pgid{}, + }, + { + title: "Single pending with minimum begin range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{100, 300}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending outside minimum begin range", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, + releaseRanges: []testRange{{101, 300}}, + wantFree: []common.Pgid{}, + }, + { + title: "Single pending in minimum range", + pagesIn: 
[]testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, + releaseRanges: []testRange{{199, 200}}, + wantFree: []common.Pgid{3}, + }, + { + title: "Single pending and read transaction at 199", + pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, + releaseRanges: []testRange{{100, 198}, {200, 300}}, + wantFree: []common.Pgid{}, + }, + { + title: "Adjacent pending and read transactions at 199, 200", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, + }, + releaseRanges: []testRange{ + {100, 198}, + {200, 199}, // Simulate the ranges db.freePages might produce. + {201, 300}, + }, + wantFree: []common.Pgid{}, + }, + { + title: "Out of order ranges", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 199, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 200, freeTxn: 201}, + }, + releaseRanges: []testRange{ + {201, 199}, + {201, 200}, + {200, 200}, + }, + wantFree: []common.Pgid{}, + }, + { + title: "Multiple pending, read transaction at 150", + pagesIn: []testPage{ + {id: 3, n: 1, allocTxn: 100, freeTxn: 200}, + {id: 4, n: 1, allocTxn: 100, freeTxn: 125}, + {id: 5, n: 1, allocTxn: 125, freeTxn: 150}, + {id: 6, n: 1, allocTxn: 125, freeTxn: 175}, + {id: 7, n: 2, allocTxn: 150, freeTxn: 175}, + {id: 9, n: 2, allocTxn: 175, freeTxn: 200}, + }, + releaseRanges: []testRange{{50, 149}, {151, 300}}, + wantFree: []common.Pgid{4, 9, 10}, + }, + } + + for _, c := range releaseRangeTests { + t.Run(c.title, func(t *testing.T) { + f := newTestFreelist() + var ids []common.Pgid + for _, p := range c.pagesIn { + for i := uint64(0); i < uint64(p.n); i++ { + ids = append(ids, common.Pgid(uint64(p.id)+i)) + } + } + f.Init(ids) + for _, p := range c.pagesIn { + f.Allocate(p.allocTxn, p.n) + } + + for _, p := range c.pagesIn { + f.Free(p.freeTxn, common.NewPage(p.id, 0, 0, uint32(p.n-1))) + } + + for _, r := range c.releaseRanges { + f.releaseRange(r.begin, r.end) + } + + require.Equal(t, common.Pgids(c.wantFree), 
f.freePageIds()) + }) + } +} + +func TestFreeList_init(t *testing.T) { + buf := make([]byte, 4096) + f := newTestFreelist() + f.Init(common.Pgids{5, 6, 8}) + + p := common.LoadPage(buf) + f.Write(p) + + f2 := newTestFreelist() + f2.Read(p) + require.Equal(t, common.Pgids{5, 6, 8}, f2.freePageIds()) + + // When initializing the freelist with an empty list of page ID, + // it should reset the freelist page IDs. + f2.Init([]common.Pgid{}) + require.Equal(t, common.Pgids{}, f2.freePageIds()) +} + +func TestFreeList_reload(t *testing.T) { + buf := make([]byte, 4096) + f := newTestFreelist() + f.Init(common.Pgids{5, 6, 8}) + + p := common.LoadPage(buf) + f.Write(p) + + f2 := newTestFreelist() + f2.Read(p) + require.Equal(t, common.Pgids{5, 6, 8}, f2.freePageIds()) + + f2.Free(common.Txid(5), common.NewPage(10, common.LeafPageFlag, 0, 2)) + + // reload shouldn't affect the pending list + f2.Reload(p) + + require.Equal(t, common.Pgids{5, 6, 8}, f2.freePageIds()) + require.Equal(t, []common.Pgid{10, 11, 12}, f2.pendingPageIds()[5].ids) +} + +// Ensure that the txIDx swap, less and len are properly implemented +func TestTxidSorting(t *testing.T) { + require.NoError(t, quick.Check(func(a []uint64) bool { + var txids []common.Txid + for _, txid := range a { + txids = append(txids, common.Txid(txid)) + } + + sort.Sort(txIDx(txids)) + + var r []uint64 + for _, txid := range txids { + r = append(r, uint64(txid)) + } + + if !slices.IsSorted(r) { + t.Errorf("txids were not sorted correctly=%v", txids) + return false + } + + return true + }, nil)) +} + +// Ensure that a freelist can deserialize from a freelist page. +func TestFreelist_read(t *testing.T) { + // Create a page. + var buf [4096]byte + page := (*common.Page)(unsafe.Pointer(&buf[0])) + page.SetFlags(common.FreelistPageFlag) + page.SetCount(2) + + // Insert 2 page ids. 
+ ids := (*[3]common.Pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page))) + ids[0] = 23 + ids[1] = 50 + + // Deserialize page into a freelist. + f := newTestFreelist() + f.Read(page) + + // Ensure that there are two page ids in the freelist. + if exp := common.Pgids([]common.Pgid{23, 50}); !reflect.DeepEqual(exp, f.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f.freePageIds()) + } +} + +// Ensure that we never read a non-freelist page +func TestFreelist_read_panics(t *testing.T) { + buf := make([]byte, 4096) + page := common.LoadPage(buf) + page.SetFlags(common.BranchPageFlag) + page.SetCount(2) + f := newTestFreelist() + require.Panics(t, func() { + f.Read(page) + }) +} + +// Ensure that a freelist can serialize into a freelist page. +func TestFreelist_write(t *testing.T) { + // Create a freelist and write it to a page. + var buf [4096]byte + f := newTestFreelist() + + f.Init([]common.Pgid{12, 39}) + f.pendingPageIds()[100] = &txPending{ids: []common.Pgid{28, 11}} + f.pendingPageIds()[101] = &txPending{ids: []common.Pgid{3}} + p := (*common.Page)(unsafe.Pointer(&buf[0])) + f.Write(p) + + // Read the page back out. + f2 := newTestFreelist() + f2.Read(p) + + // Ensure that the freelist is correct. + // All pages should be present and in reverse order. 
+ if exp := common.Pgids([]common.Pgid{3, 11, 12, 28, 39}); !reflect.DeepEqual(exp, f2.freePageIds()) { + t.Fatalf("exp=%v; got=%v", exp, f2.freePageIds()) + } +} + +func TestFreelist_E2E_HappyPath(t *testing.T) { + f := newTestFreelist() + f.Init([]common.Pgid{}) + requirePages(t, f, common.Pgids{}, common.Pgids{}) + + allocated := f.Allocate(common.Txid(1), 5) + require.Equal(t, common.Pgid(0), allocated) + // tx.go may now allocate more space, and eventually we need to delete a page again + f.Free(common.Txid(2), common.NewPage(5, common.LeafPageFlag, 0, 0)) + f.Free(common.Txid(2), common.NewPage(3, common.LeafPageFlag, 0, 0)) + f.Free(common.Txid(2), common.NewPage(8, common.LeafPageFlag, 0, 0)) + // the above will only mark the pages as pending, so free pages should not return anything + requirePages(t, f, common.Pgids{}, common.Pgids{3, 5, 8}) + + // someone wants to do a read on top of the next tx id + f.AddReadonlyTXID(common.Txid(3)) + // this should free the above pages for tx 2 entirely + f.ReleasePendingPages() + requirePages(t, f, common.Pgids{3, 5, 8}, common.Pgids{}) + + // no span of two pages available should yield a zero-page result + require.Equal(t, common.Pgid(0), f.Allocate(common.Txid(4), 2)) + // we should be able to allocate those pages independently however, + // map and array differ in the order they return the pages + expectedPgids := map[common.Pgid]struct{}{3: {}, 5: {}, 8: {}} + for i := 0; i < 3; i++ { + allocated = f.Allocate(common.Txid(4), 1) + require.Contains(t, expectedPgids, allocated, "expected to find pgid %d", allocated) + require.False(t, f.Freed(allocated)) + delete(expectedPgids, allocated) + } + require.Emptyf(t, expectedPgids, "unexpectedly more than one page was still found") + // no more free pages to allocate + require.Equal(t, common.Pgid(0), f.Allocate(common.Txid(4), 1)) +} + +func TestFreelist_E2E_MultiSpanOverflows(t *testing.T) { + f := newTestFreelist() + f.Init([]common.Pgid{}) + f.Free(common.Txid(10), 
common.NewPage(20, common.LeafPageFlag, 0, 1)) + f.Free(common.Txid(10), common.NewPage(25, common.LeafPageFlag, 0, 2)) + f.Free(common.Txid(10), common.NewPage(35, common.LeafPageFlag, 0, 3)) + f.Free(common.Txid(10), common.NewPage(39, common.LeafPageFlag, 0, 2)) + f.Free(common.Txid(10), common.NewPage(45, common.LeafPageFlag, 0, 4)) + requirePages(t, f, common.Pgids{}, common.Pgids{20, 21, 25, 26, 27, 35, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49}) + f.ReleasePendingPages() + requirePages(t, f, common.Pgids{20, 21, 25, 26, 27, 35, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49}, common.Pgids{}) + + // that sequence, regardless of implementation, should always yield the same blocks of pages + allocSequence := []int{7, 5, 3, 2} + expectedSpanStarts := []common.Pgid{35, 45, 25, 20} + for i, pageNums := range allocSequence { + allocated := f.Allocate(common.Txid(11), pageNums) + require.Equal(t, expectedSpanStarts[i], allocated) + // ensure all pages in that span are not considered free anymore + for i := 0; i < pageNums; i++ { + require.False(t, f.Freed(allocated+common.Pgid(i))) + } + } +} + +func TestFreelist_E2E_Rollbacks(t *testing.T) { + freelist := newTestFreelist() + freelist.Init([]common.Pgid{}) + freelist.Free(common.Txid(2), common.NewPage(5, common.LeafPageFlag, 0, 1)) + freelist.Free(common.Txid(2), common.NewPage(8, common.LeafPageFlag, 0, 0)) + requirePages(t, freelist, common.Pgids{}, common.Pgids{5, 6, 8}) + freelist.Rollback(common.Txid(2)) + requirePages(t, freelist, common.Pgids{}, common.Pgids{}) + + // unknown transaction should not trigger anything + freelist.Free(common.Txid(4), common.NewPage(13, common.LeafPageFlag, 0, 3)) + requirePages(t, freelist, common.Pgids{}, common.Pgids{13, 14, 15, 16}) + freelist.ReleasePendingPages() + requirePages(t, freelist, common.Pgids{13, 14, 15, 16}, common.Pgids{}) + freelist.Rollback(common.Txid(1337)) + requirePages(t, freelist, common.Pgids{13, 14, 15, 16}, common.Pgids{}) +} + +func 
TestFreelist_E2E_RollbackPanics(t *testing.T) { + freelist := newTestFreelist() + freelist.Init([]common.Pgid{5}) + requirePages(t, freelist, common.Pgids{5}, common.Pgids{}) + + _ = freelist.Allocate(common.Txid(5), 1) + require.Panics(t, func() { + // depending on the verification level, either should panic + freelist.Free(common.Txid(5), common.NewPage(5, common.LeafPageFlag, 0, 0)) + freelist.Rollback(5) + }) +} + +// tests the reloading from another physical page +func TestFreelist_E2E_Reload(t *testing.T) { + freelist := newTestFreelist() + freelist.Init([]common.Pgid{}) + freelist.Free(common.Txid(2), common.NewPage(5, common.LeafPageFlag, 0, 1)) + freelist.Free(common.Txid(2), common.NewPage(8, common.LeafPageFlag, 0, 0)) + freelist.ReleasePendingPages() + requirePages(t, freelist, common.Pgids{5, 6, 8}, common.Pgids{}) + buf := make([]byte, 4096) + p := common.LoadPage(buf) + freelist.Write(p) + + freelist.Free(common.Txid(3), common.NewPage(3, common.LeafPageFlag, 0, 1)) + freelist.Free(common.Txid(3), common.NewPage(10, common.LeafPageFlag, 0, 2)) + requirePages(t, freelist, common.Pgids{5, 6, 8}, common.Pgids{3, 4, 10, 11, 12}) + + otherBuf := make([]byte, 4096) + px := common.LoadPage(otherBuf) + freelist.Write(px) + + loadFreeList := newTestFreelist() + loadFreeList.Init([]common.Pgid{}) + loadFreeList.Read(px) + requirePages(t, loadFreeList, common.Pgids{3, 4, 5, 6, 8, 10, 11, 12}, common.Pgids{}) + // restore the original freelist again + loadFreeList.Reload(p) + requirePages(t, loadFreeList, common.Pgids{5, 6, 8}, common.Pgids{}) + + // reload another page with different free pages to test we are deduplicating the free pages with the pending ones correctly + freelist = newTestFreelist() + freelist.Init([]common.Pgid{}) + freelist.Free(common.Txid(5), common.NewPage(5, common.LeafPageFlag, 0, 4)) + freelist.Reload(p) + requirePages(t, freelist, common.Pgids{}, common.Pgids{5, 6, 7, 8, 9}) +} + +// tests the loading and reloading from physical pages 
+func TestFreelist_E2E_SerDe_HappyPath(t *testing.T) { + freelist := newTestFreelist() + freelist.Init([]common.Pgid{}) + freelist.Free(common.Txid(2), common.NewPage(5, common.LeafPageFlag, 0, 1)) + freelist.Free(common.Txid(2), common.NewPage(8, common.LeafPageFlag, 0, 0)) + freelist.ReleasePendingPages() + requirePages(t, freelist, common.Pgids{5, 6, 8}, common.Pgids{}) + + freelist.Free(common.Txid(3), common.NewPage(3, common.LeafPageFlag, 0, 1)) + freelist.Free(common.Txid(3), common.NewPage(10, common.LeafPageFlag, 0, 2)) + requirePages(t, freelist, common.Pgids{5, 6, 8}, common.Pgids{3, 4, 10, 11, 12}) + + buf := make([]byte, 4096) + p := common.LoadPage(buf) + require.Equal(t, 80, freelist.EstimatedWritePageSize()) + freelist.Write(p) + + loadFreeList := newTestFreelist() + loadFreeList.Init([]common.Pgid{}) + loadFreeList.Read(p) + requirePages(t, loadFreeList, common.Pgids{3, 4, 5, 6, 8, 10, 11, 12}, common.Pgids{}) +} + +// tests the loading of a freelist against other implementations with various sizes +func TestFreelist_E2E_SerDe_AcrossImplementations(t *testing.T) { + testSizes := []int{0, 1, 10, 100, 1000, math.MaxUint16, math.MaxUint16 + 1, math.MaxUint16 * 2} + for _, size := range testSizes { + t.Run(fmt.Sprintf("n=%d", size), func(t *testing.T) { + freelist := newTestFreelist() + expectedFreePgids := common.Pgids{} + for i := 0; i < size; i++ { + pgid := common.Pgid(i + 2) + freelist.Free(common.Txid(1), common.NewPage(pgid, common.LeafPageFlag, 0, 0)) + expectedFreePgids = append(expectedFreePgids, pgid) + } + freelist.ReleasePendingPages() + requirePages(t, freelist, expectedFreePgids, common.Pgids{}) + buf := make([]byte, freelist.EstimatedWritePageSize()) + p := common.LoadPage(buf) + freelist.Write(p) + + for n, loadFreeList := range map[string]Interface{ + "hashmap": NewHashMapFreelist(), + "array": NewArrayFreelist(), + } { + t.Run(n, func(t *testing.T) { + loadFreeList.Read(p) + requirePages(t, loadFreeList, expectedFreePgids, 
common.Pgids{}) + }) + } + }) + } +} + +func requirePages(t *testing.T, f Interface, freePageIds common.Pgids, pendingPageIds common.Pgids) { + require.Equal(t, f.FreeCount()+f.PendingCount(), f.Count()) + require.Equalf(t, freePageIds, f.freePageIds(), "unexpected free pages") + require.Equal(t, len(freePageIds), f.FreeCount()) + + pp := allPendingPages(f.pendingPageIds()) + require.Equalf(t, pendingPageIds, pp, "unexpected pending pages") + require.Equal(t, len(pp), f.PendingCount()) + + for _, pgid := range f.freePageIds() { + require.Truef(t, f.Freed(pgid), "expected free page to return true on Freed") + } + + for _, pgid := range pp { + require.Truef(t, f.Freed(pgid), "expected pending page to return true on Freed") + } +} + +func allPendingPages(p map[common.Txid]*txPending) common.Pgids { + pgids := common.Pgids{} + for _, pending := range p { + pgids = append(pgids, pending.ids...) + } + sort.Sort(pgids) + return pgids +} + +func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) } +func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) } +func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) } +func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) } + +func benchmark_FreelistRelease(b *testing.B, size int) { + ids := randomPgids(size) + pending := randomPgids(len(ids) / 400) + b.ResetTimer() + for i := 0; i < b.N; i++ { + txp := &txPending{ids: pending} + f := newTestFreelist() + f.pendingPageIds()[1] = txp + f.Init(ids) + f.release(1) + } +} + +func randomPgids(n int) []common.Pgid { + pgids := make(common.Pgids, n) + for i := range pgids { + pgids[i] = common.Pgid(rand.Int63()) + } + sort.Sort(pgids) + return pgids +} + +func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) { + f := newTestFreelist() + exp := common.Pgids([]common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}) + + f.Init(exp) + + if got := f.freePageIds(); 
!reflect.DeepEqual(exp, got) { + t.Fatalf("exp=%v; got=%v", exp, got) + } + + f2 := newTestFreelist() + exp2 := []common.Pgid{} + f2.Init(exp2) + + if got2 := f2.freePageIds(); !reflect.DeepEqual(got2, common.Pgids(exp2)) { + t.Fatalf("exp2=%#v; got2=%#v", exp2, got2) + } + +} + +// newTestFreelist get the freelist type from env and initial the freelist +func newTestFreelist() Interface { + if env := os.Getenv(TestFreelistType); env == "hashmap" { + return NewHashMapFreelist() + } + + return NewArrayFreelist() +} diff --git a/internal/freelist/hashmap.go b/internal/freelist/hashmap.go new file mode 100644 index 0000000..3138004 --- /dev/null +++ b/internal/freelist/hashmap.go @@ -0,0 +1,292 @@ +package freelist + +import ( + "fmt" + "reflect" + "sort" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// pidSet holds the set of starting pgids which have the same span size +type pidSet map[common.Pgid]struct{} + +type hashMap struct { + *shared + + freePagesCount uint64 // count of free pages(hashmap version) + freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size + forwardMap map[common.Pgid]uint64 // key is start pgid, value is its span size + backwardMap map[common.Pgid]uint64 // key is end pgid, value is its span size +} + +func (f *hashMap) Init(pgids common.Pgids) { + // reset the counter when freelist init + f.freePagesCount = 0 + f.freemaps = make(map[uint64]pidSet) + f.forwardMap = make(map[common.Pgid]uint64) + f.backwardMap = make(map[common.Pgid]uint64) + + if len(pgids) == 0 { + return + } + + if !sort.SliceIsSorted([]common.Pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) { + panic("pgids not sorted") + } + + size := uint64(1) + start := pgids[0] + + for i := 1; i < len(pgids); i++ { + // continuous page + if pgids[i] == pgids[i-1]+1 { + size++ + } else { + f.addSpan(start, size) + + size = 1 + start = pgids[i] + } + } + + // init the tail + if 
size != 0 && start != 0 { + f.addSpan(start, size) + } + + f.reindex() +} + +func (f *hashMap) Allocate(txid common.Txid, n int) common.Pgid { + if n == 0 { + return 0 + } + + // if we have a exact size match just return short path + if bm, ok := f.freemaps[uint64(n)]; ok { + for pid := range bm { + // remove the span + f.delSpan(pid, uint64(n)) + + f.allocs[pid] = txid + + for i := common.Pgid(0); i < common.Pgid(n); i++ { + delete(f.cache, pid+i) + } + return pid + } + } + + // lookup the map to find larger span + for size, bm := range f.freemaps { + if size < uint64(n) { + continue + } + + for pid := range bm { + // remove the initial + f.delSpan(pid, size) + + f.allocs[pid] = txid + + remain := size - uint64(n) + + // add remain span + f.addSpan(pid+common.Pgid(n), remain) + + for i := common.Pgid(0); i < common.Pgid(n); i++ { + delete(f.cache, pid+i) + } + return pid + } + } + + return 0 +} + +func (f *hashMap) FreeCount() int { + common.Verify(func() { + expectedFreePageCount := f.hashmapFreeCountSlow() + common.Assert(int(f.freePagesCount) == expectedFreePageCount, + "freePagesCount (%d) is out of sync with free pages map (%d)", f.freePagesCount, expectedFreePageCount) + }) + return int(f.freePagesCount) +} + +func (f *hashMap) freePageIds() common.Pgids { + count := f.FreeCount() + if count == 0 { + return common.Pgids{} + } + + m := make([]common.Pgid, 0, count) + + startPageIds := make([]common.Pgid, 0, len(f.forwardMap)) + for k := range f.forwardMap { + startPageIds = append(startPageIds, k) + } + sort.Sort(common.Pgids(startPageIds)) + + for _, start := range startPageIds { + if size, ok := f.forwardMap[start]; ok { + for i := 0; i < int(size); i++ { + m = append(m, start+common.Pgid(i)) + } + } + } + + return m +} + +func (f *hashMap) hashmapFreeCountSlow() int { + count := 0 + for _, size := range f.forwardMap { + count += int(size) + } + return count +} + +func (f *hashMap) addSpan(start common.Pgid, size uint64) { + 
f.backwardMap[start-1+common.Pgid(size)] = size + f.forwardMap[start] = size + if _, ok := f.freemaps[size]; !ok { + f.freemaps[size] = make(map[common.Pgid]struct{}) + } + + f.freemaps[size][start] = struct{}{} + f.freePagesCount += size +} + +func (f *hashMap) delSpan(start common.Pgid, size uint64) { + delete(f.forwardMap, start) + delete(f.backwardMap, start+common.Pgid(size-1)) + delete(f.freemaps[size], start) + if len(f.freemaps[size]) == 0 { + delete(f.freemaps, size) + } + f.freePagesCount -= size +} + +func (f *hashMap) mergeSpans(ids common.Pgids) { + common.Verify(func() { + ids1Freemap := f.idsFromFreemaps() + ids2Forward := f.idsFromForwardMap() + ids3Backward := f.idsFromBackwardMap() + + if !reflect.DeepEqual(ids1Freemap, ids2Forward) { + panic(fmt.Sprintf("Detected mismatch, f.freemaps: %v, f.forwardMap: %v", f.freemaps, f.forwardMap)) + } + if !reflect.DeepEqual(ids1Freemap, ids3Backward) { + panic(fmt.Sprintf("Detected mismatch, f.freemaps: %v, f.backwardMap: %v", f.freemaps, f.backwardMap)) + } + + sort.Sort(ids) + prev := common.Pgid(0) + for _, id := range ids { + // The ids shouldn't have duplicated free ID. + if prev == id { + panic(fmt.Sprintf("detected duplicated free ID: %d in ids: %v", id, ids)) + } + prev = id + + // The ids shouldn't have any overlap with the existing f.freemaps. 
+ if _, ok := ids1Freemap[id]; ok { + panic(fmt.Sprintf("detected overlapped free page ID: %d between ids: %v and existing f.freemaps: %v", id, ids, f.freemaps)) + } + } + }) + for _, id := range ids { + // try to see if we can merge and update + f.mergeWithExistingSpan(id) + } +} + +// mergeWithExistingSpan merges pid to the existing free spans, try to merge it backward and forward +func (f *hashMap) mergeWithExistingSpan(pid common.Pgid) { + prev := pid - 1 + next := pid + 1 + + preSize, mergeWithPrev := f.backwardMap[prev] + nextSize, mergeWithNext := f.forwardMap[next] + newStart := pid + newSize := uint64(1) + + if mergeWithPrev { + //merge with previous span + start := prev + 1 - common.Pgid(preSize) + f.delSpan(start, preSize) + + newStart -= common.Pgid(preSize) + newSize += preSize + } + + if mergeWithNext { + // merge with next span + f.delSpan(next, nextSize) + newSize += nextSize + } + + f.addSpan(newStart, newSize) +} + +// idsFromFreemaps get all free page IDs from f.freemaps. +// used by test only. +func (f *hashMap) idsFromFreemaps() map[common.Pgid]struct{} { + ids := make(map[common.Pgid]struct{}) + for size, idSet := range f.freemaps { + for start := range idSet { + for i := 0; i < int(size); i++ { + id := start + common.Pgid(i) + if _, ok := ids[id]; ok { + panic(fmt.Sprintf("detected duplicated free page ID: %d in f.freemaps: %v", id, f.freemaps)) + } + ids[id] = struct{}{} + } + } + } + return ids +} + +// idsFromForwardMap get all free page IDs from f.forwardMap. +// used by test only. +func (f *hashMap) idsFromForwardMap() map[common.Pgid]struct{} { + ids := make(map[common.Pgid]struct{}) + for start, size := range f.forwardMap { + for i := 0; i < int(size); i++ { + id := start + common.Pgid(i) + if _, ok := ids[id]; ok { + panic(fmt.Sprintf("detected duplicated free page ID: %d in f.forwardMap: %v", id, f.forwardMap)) + } + ids[id] = struct{}{} + } + } + return ids +} + +// idsFromBackwardMap get all free page IDs from f.backwardMap. 
+// used by test only. +func (f *hashMap) idsFromBackwardMap() map[common.Pgid]struct{} { + ids := make(map[common.Pgid]struct{}) + for end, size := range f.backwardMap { + for i := 0; i < int(size); i++ { + id := end - common.Pgid(i) + if _, ok := ids[id]; ok { + panic(fmt.Sprintf("detected duplicated free page ID: %d in f.backwardMap: %v", id, f.backwardMap)) + } + ids[id] = struct{}{} + } + } + return ids +} + +func NewHashMapFreelist() Interface { + hm := &hashMap{ + shared: newShared(), + freemaps: make(map[uint64]pidSet), + forwardMap: make(map[common.Pgid]uint64), + backwardMap: make(map[common.Pgid]uint64), + } + hm.Interface = hm + return hm +} diff --git a/internal/freelist/hashmap_test.go b/internal/freelist/hashmap_test.go new file mode 100644 index 0000000..1992466 --- /dev/null +++ b/internal/freelist/hashmap_test.go @@ -0,0 +1,187 @@ +package freelist + +import ( + "math/rand" + "reflect" + "sort" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +func TestFreelistHashmap_init_panics(t *testing.T) { + f := NewHashMapFreelist() + require.Panics(t, func() { + // init expects sorted input + f.Init([]common.Pgid{25, 5}) + }) +} + +func TestFreelistHashmap_allocate(t *testing.T) { + f := NewHashMapFreelist() + + ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} + f.Init(ids) + + f.Allocate(1, 3) + if x := f.FreeCount(); x != 6 { + t.Fatalf("exp=6; got=%v", x) + } + + f.Allocate(1, 2) + if x := f.FreeCount(); x != 4 { + t.Fatalf("exp=4; got=%v", x) + } + f.Allocate(1, 1) + if x := f.FreeCount(); x != 3 { + t.Fatalf("exp=3; got=%v", x) + } + + f.Allocate(1, 0) + if x := f.FreeCount(); x != 3 { + t.Fatalf("exp=3; got=%v", x) + } +} + +func TestFreelistHashmap_mergeWithExist(t *testing.T) { + bm1 := pidSet{1: struct{}{}} + + bm2 := pidSet{5: struct{}{}} + tests := []struct { + name string + ids common.Pgids + pgid common.Pgid + want common.Pgids + wantForwardmap map[common.Pgid]uint64 + 
wantBackwardmap map[common.Pgid]uint64 + wantfreemap map[uint64]pidSet + }{ + { + name: "test1", + ids: []common.Pgid{1, 2, 4, 5, 6}, + pgid: 3, + want: []common.Pgid{1, 2, 3, 4, 5, 6}, + wantForwardmap: map[common.Pgid]uint64{1: 6}, + wantBackwardmap: map[common.Pgid]uint64{6: 6}, + wantfreemap: map[uint64]pidSet{6: bm1}, + }, + { + name: "test2", + ids: []common.Pgid{1, 2, 5, 6}, + pgid: 3, + want: []common.Pgid{1, 2, 3, 5, 6}, + wantForwardmap: map[common.Pgid]uint64{1: 3, 5: 2}, + wantBackwardmap: map[common.Pgid]uint64{6: 2, 3: 3}, + wantfreemap: map[uint64]pidSet{3: bm1, 2: bm2}, + }, + { + name: "test3", + ids: []common.Pgid{1, 2}, + pgid: 3, + want: []common.Pgid{1, 2, 3}, + wantForwardmap: map[common.Pgid]uint64{1: 3}, + wantBackwardmap: map[common.Pgid]uint64{3: 3}, + wantfreemap: map[uint64]pidSet{3: bm1}, + }, + { + name: "test4", + ids: []common.Pgid{2, 3}, + pgid: 1, + want: []common.Pgid{1, 2, 3}, + wantForwardmap: map[common.Pgid]uint64{1: 3}, + wantBackwardmap: map[common.Pgid]uint64{3: 3}, + wantfreemap: map[uint64]pidSet{3: bm1}, + }, + } + for _, tt := range tests { + f := newTestHashMapFreelist() + f.Init(tt.ids) + + f.mergeWithExistingSpan(tt.pgid) + + if got := f.freePageIds(); !reflect.DeepEqual(tt.want, got) { + t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.want, got) + } + if got := f.forwardMap; !reflect.DeepEqual(tt.wantForwardmap, got) { + t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantForwardmap, got) + } + if got := f.backwardMap; !reflect.DeepEqual(tt.wantBackwardmap, got) { + t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantBackwardmap, got) + } + if got := f.freemaps; !reflect.DeepEqual(tt.wantfreemap, got) { + t.Fatalf("name %s; exp=%v; got=%v", tt.name, tt.wantfreemap, got) + } + } +} + +func TestFreelistHashmap_GetFreePageIDs(t *testing.T) { + f := newTestHashMapFreelist() + + N := int32(100000) + fm := make(map[common.Pgid]uint64) + i := int32(0) + val := int32(0) + for i = 0; i < N; { + val = rand.Int31n(1000) + 
fm[common.Pgid(i)] = uint64(val) + i += val + f.freePagesCount += uint64(val) + } + + f.forwardMap = fm + res := f.freePageIds() + + if !sort.SliceIsSorted(res, func(i, j int) bool { return res[i] < res[j] }) { + t.Fatalf("pgids not sorted") + } +} + +func Test_Freelist_Hashmap_Rollback(t *testing.T) { + f := newTestHashMapFreelist() + + f.Init([]common.Pgid{3, 5, 6, 7, 12, 13}) + + f.Free(100, common.NewPage(20, 0, 0, 1)) + f.Allocate(100, 3) + f.Free(100, common.NewPage(25, 0, 0, 0)) + f.Allocate(100, 2) + + require.Equal(t, map[common.Pgid]common.Txid{5: 100, 12: 100}, f.allocs) + require.Equal(t, map[common.Txid]*txPending{100: { + ids: []common.Pgid{20, 21, 25}, + alloctx: []common.Txid{0, 0, 0}, + }}, f.pending) + + f.Rollback(100) + + require.Equal(t, map[common.Pgid]common.Txid{}, f.allocs) + require.Equal(t, map[common.Txid]*txPending{}, f.pending) +} + +func Benchmark_freelist_hashmapGetFreePageIDs(b *testing.B) { + f := newTestHashMapFreelist() + N := int32(100000) + fm := make(map[common.Pgid]uint64) + i := int32(0) + val := int32(0) + for i = 0; i < N; { + val = rand.Int31n(1000) + fm[common.Pgid(i)] = uint64(val) + i += val + } + + f.forwardMap = fm + + b.ReportAllocs() + b.ResetTimer() + for n := 0; n < b.N; n++ { + f.freePageIds() + } +} + +func newTestHashMapFreelist() *hashMap { + f := NewHashMapFreelist() + return f.(*hashMap) +} diff --git a/internal/freelist/shared.go b/internal/freelist/shared.go new file mode 100644 index 0000000..0fa79e6 --- /dev/null +++ b/internal/freelist/shared.go @@ -0,0 +1,310 @@ +package freelist + +import ( + "fmt" + "math" + "sort" + "unsafe" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +type txPending struct { + ids []common.Pgid + alloctx []common.Txid // txids allocating the ids + lastReleaseBegin common.Txid // beginning txid of last matching releaseRange +} + +type shared struct { + Interface + + readonlyTXIDs []common.Txid // all readonly transaction IDs. 
+ allocs map[common.Pgid]common.Txid // mapping of Txid that allocated a pgid. + cache map[common.Pgid]struct{} // fast lookup of all free and pending page ids. + pending map[common.Txid]*txPending // mapping of soon-to-be free page ids by tx. +} + +func newShared() *shared { + return &shared{ + pending: make(map[common.Txid]*txPending), + allocs: make(map[common.Pgid]common.Txid), + cache: make(map[common.Pgid]struct{}), + } +} + +func (t *shared) pendingPageIds() map[common.Txid]*txPending { + return t.pending +} + +func (t *shared) PendingCount() int { + var count int + for _, txp := range t.pending { + count += len(txp.ids) + } + return count +} + +func (t *shared) Count() int { + return t.FreeCount() + t.PendingCount() +} + +func (t *shared) Freed(pgId common.Pgid) bool { + _, ok := t.cache[pgId] + return ok +} + +func (t *shared) Free(txid common.Txid, p *common.Page) { + if p.Id() <= 1 { + panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.Id())) + } + + // Free page and all its overflow pages. + txp := t.pending[txid] + if txp == nil { + txp = &txPending{} + t.pending[txid] = txp + } + allocTxid, ok := t.allocs[p.Id()] + common.Verify(func() { + if allocTxid == txid { + panic(fmt.Sprintf("free: freed page (%d) was allocated by the same transaction (%d)", p.Id(), txid)) + } + }) + if ok { + delete(t.allocs, p.Id()) + } + + for id := p.Id(); id <= p.Id()+common.Pgid(p.Overflow()); id++ { + // Verify that page is not already free. + if _, ok := t.cache[id]; ok { + panic(fmt.Sprintf("page %d already freed", id)) + } + // Add to the freelist and cache. + txp.ids = append(txp.ids, id) + txp.alloctx = append(txp.alloctx, allocTxid) + t.cache[id] = struct{}{} + } +} + +func (t *shared) Rollback(txid common.Txid) { + // Remove page ids from cache. 
+ txp := t.pending[txid] + if txp == nil { + return + } + for i, pgid := range txp.ids { + delete(t.cache, pgid) + tx := txp.alloctx[i] + if tx == 0 { + continue + } + if tx != txid { + // Pending free aborted; restore page back to alloc list. + t.allocs[pgid] = tx + } else { + // A writing TXN should never free a page which was allocated by itself. + panic(fmt.Sprintf("rollback: freed page (%d) was allocated by the same transaction (%d)", pgid, txid)) + } + } + // Remove pages from pending list and mark as free if allocated by txid. + delete(t.pending, txid) + + // Remove pgids which are allocated by this txid + for pgid, tid := range t.allocs { + if tid == txid { + delete(t.allocs, pgid) + } + } +} + +func (t *shared) AddReadonlyTXID(tid common.Txid) { + t.readonlyTXIDs = append(t.readonlyTXIDs, tid) +} + +func (t *shared) RemoveReadonlyTXID(tid common.Txid) { + for i := range t.readonlyTXIDs { + if t.readonlyTXIDs[i] == tid { + last := len(t.readonlyTXIDs) - 1 + t.readonlyTXIDs[i] = t.readonlyTXIDs[last] + t.readonlyTXIDs = t.readonlyTXIDs[:last] + break + } + } +} + +type txIDx []common.Txid + +func (t txIDx) Len() int { return len(t) } +func (t txIDx) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t txIDx) Less(i, j int) bool { return t[i] < t[j] } + +func (t *shared) ReleasePendingPages() { + // Free all pending pages prior to the earliest open transaction. + sort.Sort(txIDx(t.readonlyTXIDs)) + minid := common.Txid(math.MaxUint64) + if len(t.readonlyTXIDs) > 0 { + minid = t.readonlyTXIDs[0] + } + if minid > 0 { + t.release(minid - 1) + } + // Release unused txid extents. + for _, tid := range t.readonlyTXIDs { + t.releaseRange(minid, tid-1) + minid = tid + 1 + } + t.releaseRange(minid, common.Txid(math.MaxUint64)) + // Any page both allocated and freed in an extent is safe to release. 
// releaseRange moves pending pages to the free list when both the transaction
// that freed them and the transaction that allocated them fall inside the
// inclusive range [begin, end]. An inverted range (begin > end) is a no-op.
func (t *shared) releaseRange(begin, end common.Txid) {
	if begin > end {
		return
	}
	// m collects every page id released by this call.
	m := common.Pgids{}
	for tid, txp := range t.pending {
		// Only transactions that freed pages inside the range are considered.
		if tid < begin || tid > end {
			continue
		}
		// Don't recompute freed pages if ranges haven't updated.
		if txp.lastReleaseBegin == begin {
			continue
		}
		for i := 0; i < len(txp.ids); i++ {
			// Keep pages whose allocating transaction lies outside the range.
			if atx := txp.alloctx[i]; atx < begin || atx > end {
				continue
			}
			m = append(m, txp.ids[i])
			// Remove entry i from both parallel slices by overwriting it with
			// the last entry and shrinking by one; order is not preserved.
			txp.ids[i] = txp.ids[len(txp.ids)-1]
			txp.ids = txp.ids[:len(txp.ids)-1]
			txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1]
			txp.alloctx = txp.alloctx[:len(txp.alloctx)-1]
			// Step back so the entry just swapped into slot i is examined too.
			i--
		}
		// Remember the range start so an identical follow-up call is skipped.
		txp.lastReleaseBegin = begin
		// Drop the transaction's record once nothing is pending under it.
		if len(txp.ids) == 0 {
			delete(t.pending, tid)
		}
	}
	// Hand the released ids to the implementation's free-span bookkeeping.
	t.mergeSpans(m)
}
+ a := []common.Pgid{} + for _, id := range pgIds { + if !pcache[id] { + a = append(a, id) + } + } + + t.Init(a) +} + +// reindex rebuilds the free cache based on available and pending free lists. +func (t *shared) reindex() { + free := t.freePageIds() + pending := t.pendingPageIds() + t.cache = make(map[common.Pgid]struct{}, len(free)) + for _, id := range free { + t.cache[id] = struct{}{} + } + for _, txp := range pending { + for _, pendingID := range txp.ids { + t.cache[pendingID] = struct{}{} + } + } +} + +func (t *shared) Read(p *common.Page) { + if !p.IsFreelistPage() { + panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.Id(), p.Typ())) + } + + ids := p.FreelistPageIds() + + // Copy the list of page ids from the freelist. + if len(ids) == 0 { + t.Init([]common.Pgid{}) + } else { + // copy the ids, so we don't modify on the freelist page directly + idsCopy := make([]common.Pgid, len(ids)) + copy(idsCopy, ids) + // Make sure they're sorted. + sort.Sort(common.Pgids(idsCopy)) + + t.Init(idsCopy) + } +} + +func (t *shared) EstimatedWritePageSize() int { + n := t.Count() + if n >= 0xFFFF { + // The first element will be used to store the count. See freelist.write. + n++ + } + return int(common.PageHeaderSize) + (int(unsafe.Sizeof(common.Pgid(0))) * n) +} + +func (t *shared) Write(p *common.Page) { + // Combine the old free pgids and pgids waiting on an open transaction. + + // Update the header flag. + p.SetFlags(common.FreelistPageFlag) + + // The page.count can only hold up to 64k elements so if we overflow that + // number then we handle it by putting the size in the first element. 
+ l := t.Count() + if l == 0 { + p.SetCount(uint16(l)) + } else if l < 0xFFFF { + p.SetCount(uint16(l)) + data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + ids := unsafe.Slice((*common.Pgid)(data), l) + t.Copyall(ids) + } else { + p.SetCount(0xFFFF) + data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) + ids := unsafe.Slice((*common.Pgid)(data), l+1) + ids[0] = common.Pgid(l) + t.Copyall(ids[1:]) + } +} diff --git a/internal/guts_cli/guts_cli.go b/internal/guts_cli/guts_cli.go new file mode 100644 index 0000000..2611e3e --- /dev/null +++ b/internal/guts_cli/guts_cli.go @@ -0,0 +1,141 @@ +package guts_cli + +// Low level access to pages / data-structures of the bbolt file. + +import ( + "errors" + "fmt" + "io" + "os" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +var ( + // ErrCorrupt is returned when a checking a data file finds errors. + ErrCorrupt = errors.New("invalid value") +) + +// ReadPage reads Page info & full Page data from a path. +// This is not transactionally safe. +func ReadPage(path string, pageID uint64) (*common.Page, []byte, error) { + // Find Page size. + pageSize, hwm, err := ReadPageAndHWMSize(path) + if err != nil { + return nil, nil, fmt.Errorf("read Page size: %s", err) + } + + // Open database file. + f, err := os.Open(path) + if err != nil { + return nil, nil, err + } + defer f.Close() + + // Read one block into buffer. + buf := make([]byte, pageSize) + if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil { + return nil, nil, err + } else if n != len(buf) { + return nil, nil, io.ErrUnexpectedEOF + } + + // Determine total number of blocks. + p := common.LoadPage(buf) + if p.Id() != common.Pgid(pageID) { + return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.Id(), pageID) + } + overflowN := p.Overflow() + if overflowN >= uint32(hwm)-3 { // we exclude 2 Meta pages and the current Page. 
+ return nil, nil, fmt.Errorf("error: %w, Page claims to have %d overflow pages (>=hwm=%d). Interrupting to avoid risky OOM", ErrCorrupt, overflowN, hwm) + } + + if overflowN == 0 { + return p, buf, nil + } + + // Re-read entire Page (with overflow) into buffer. + buf = make([]byte, (uint64(overflowN)+1)*pageSize) + if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil { + return nil, nil, err + } else if n != len(buf) { + return nil, nil, io.ErrUnexpectedEOF + } + p = common.LoadPage(buf) + if p.Id() != common.Pgid(pageID) { + return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.Id(), pageID) + } + + return p, buf, nil +} + +func WritePage(path string, pageBuf []byte) error { + page := common.LoadPage(pageBuf) + pageSize, _, err := ReadPageAndHWMSize(path) + if err != nil { + return err + } + expectedLen := pageSize * (uint64(page.Overflow()) + 1) + if expectedLen != uint64(len(pageBuf)) { + return fmt.Errorf("WritePage: len(buf):%d != pageSize*(overflow+1):%d", len(pageBuf), expectedLen) + } + f, err := os.OpenFile(path, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + _, err = f.WriteAt(pageBuf, int64(page.Id())*int64(pageSize)) + return err +} + +// ReadPageAndHWMSize reads Page size and HWM (id of the last+1 Page). +// This is not transactionally safe. +func ReadPageAndHWMSize(path string) (uint64, common.Pgid, error) { + // Open database file. + f, err := os.Open(path) + if err != nil { + return 0, 0, err + } + defer f.Close() + + // Read 4KB chunk. + buf := make([]byte, 4096) + if _, err := io.ReadFull(f, buf); err != nil { + return 0, 0, err + } + + // Read Page size from metadata. + m := common.LoadPageMeta(buf) + if m.Magic() != common.Magic { + return 0, 0, fmt.Errorf("the Meta Page has wrong (unexpected) magic") + } + return uint64(m.PageSize()), common.Pgid(m.Pgid()), nil +} + +// GetRootPage returns the root-page (according to the most recent transaction). 
+func GetRootPage(path string) (root common.Pgid, activeMeta common.Pgid, err error) { + m, id, err := GetActiveMetaPage(path) + if err != nil { + return 0, id, err + } + return m.RootBucket().RootPage(), id, nil +} + +// GetActiveMetaPage returns the active meta page and its page ID (0 or 1). +func GetActiveMetaPage(path string) (*common.Meta, common.Pgid, error) { + _, buf0, err0 := ReadPage(path, 0) + if err0 != nil { + return nil, 0, err0 + } + m0 := common.LoadPageMeta(buf0) + _, buf1, err1 := ReadPage(path, 1) + if err1 != nil { + return nil, 1, err1 + } + m1 := common.LoadPageMeta(buf1) + if m0.Txid() < m1.Txid() { + return m1, 1, nil + } else { + return m0, 0, nil + } +} diff --git a/internal/surgeon/surgeon.go b/internal/surgeon/surgeon.go new file mode 100644 index 0000000..c1c0182 --- /dev/null +++ b/internal/surgeon/surgeon.go @@ -0,0 +1,156 @@ +package surgeon + +import ( + "fmt" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +func CopyPage(path string, srcPage common.Pgid, target common.Pgid) error { + p1, d1, err1 := guts_cli.ReadPage(path, uint64(srcPage)) + if err1 != nil { + return err1 + } + p1.SetId(target) + return guts_cli.WritePage(path, d1) +} + +func ClearPage(path string, pgId common.Pgid) (bool, error) { + return ClearPageElements(path, pgId, 0, -1, false) +} + +// ClearPageElements supports clearing elements in both branch and leaf +// pages. Note if the ${abandonFreelist} is true, the freelist may be cleaned +// in the meta pages in the following two cases, and bbolt needs to scan the +// db to reconstruct free list. It may cause some delay on next startup, +// depending on the db size. +// 1. Any branch elements are cleared; +// 2. An object saved in overflow pages is cleared; +// +// Usually ${abandonFreelist} defaults to false, it means it will not clear the +// freelist in meta pages automatically. 
Users will receive a warning message +// to remind them to explicitly execute `bbolt surgery abandom-freelist` +// afterwards; the first return parameter will be true in such case. But if +// the freelist isn't synced at all, no warning message will be displayed. +func ClearPageElements(path string, pgId common.Pgid, start, end int, abandonFreelist bool) (bool, error) { + // Read the page + p, buf, err := guts_cli.ReadPage(path, uint64(pgId)) + if err != nil { + return false, fmt.Errorf("ReadPage failed: %w", err) + } + + if !p.IsLeafPage() && !p.IsBranchPage() { + return false, fmt.Errorf("can't clear elements in %q page", p.Typ()) + } + + elementCnt := int(p.Count()) + + if elementCnt == 0 { + return false, nil + } + + if start < 0 || start >= elementCnt { + return false, fmt.Errorf("the start index (%d) is out of range [0, %d)", start, elementCnt) + } + + if (end < 0 || end > elementCnt) && end != -1 { + return false, fmt.Errorf("the end index (%d) is out of range [0, %d]", end, elementCnt) + } + + if start > end && end != -1 { + return false, fmt.Errorf("the start index (%d) is bigger than the end index (%d)", start, end) + } + + if start == end { + return false, fmt.Errorf("invalid: the start index (%d) is equal to the end index (%d)", start, end) + } + + preOverflow := p.Overflow() + + var ( + dataWritten uint32 + ) + if end == int(p.Count()) || end == -1 { + inodes := common.ReadInodeFromPage(p) + inodes = inodes[:start] + + p.SetCount(uint16(start)) + // no need to write inode & data again, we just need to get + // the data size which will be kept. + dataWritten = common.UsedSpaceInPage(inodes, p) + } else { + inodes := common.ReadInodeFromPage(p) + inodes = append(inodes[:start], inodes[end:]...) 
+ + p.SetCount(uint16(len(inodes))) + dataWritten = common.WriteInodeToPage(inodes, p) + } + + pageSize, _, err := guts_cli.ReadPageAndHWMSize(path) + if err != nil { + return false, fmt.Errorf("ReadPageAndHWMSize failed: %w", err) + } + if dataWritten%uint32(pageSize) == 0 { + p.SetOverflow(dataWritten/uint32(pageSize) - 1) + } else { + p.SetOverflow(dataWritten / uint32(pageSize)) + } + + datasz := pageSize * (uint64(p.Overflow()) + 1) + if err := guts_cli.WritePage(path, buf[0:datasz]); err != nil { + return false, fmt.Errorf("WritePage failed: %w", err) + } + + if preOverflow != p.Overflow() || p.IsBranchPage() { + if abandonFreelist { + return false, ClearFreelist(path) + } + return true, nil + } + + return false, nil +} + +func ClearFreelist(path string) error { + if err := clearFreelistInMetaPage(path, 0); err != nil { + return fmt.Errorf("clearFreelist on meta page 0 failed: %w", err) + } + if err := clearFreelistInMetaPage(path, 1); err != nil { + return fmt.Errorf("clearFreelist on meta page 1 failed: %w", err) + } + return nil +} + +func clearFreelistInMetaPage(path string, pageId uint64) error { + _, buf, err := guts_cli.ReadPage(path, pageId) + if err != nil { + return fmt.Errorf("ReadPage %d failed: %w", pageId, err) + } + + meta := common.LoadPageMeta(buf) + meta.SetFreelist(common.PgidNoFreelist) + meta.SetChecksum(meta.Sum64()) + + if err := guts_cli.WritePage(path, buf); err != nil { + return fmt.Errorf("WritePage %d failed: %w", pageId, err) + } + + return nil +} + +// RevertMetaPage replaces the newer metadata page with the older. +// It usually means that one transaction is being lost. But frequently +// data corruption happens on the last transaction pages and the +// previous state is consistent. 
+func RevertMetaPage(path string) error { + _, activeMetaPage, err := guts_cli.GetRootPage(path) + if err != nil { + return err + } + if activeMetaPage == 0 { + return CopyPage(path, 1, 0) + } else { + return CopyPage(path, 0, 1) + } +} diff --git a/internal/surgeon/surgeon_test.go b/internal/surgeon/surgeon_test.go new file mode 100644 index 0000000..67bdefa --- /dev/null +++ b/internal/surgeon/surgeon_test.go @@ -0,0 +1,57 @@ +package surgeon_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/surgeon" +) + +func TestRevertMetaPage(t *testing.T) { + db := btesting.MustCreateDB(t) + assert.NoError(t, + db.Fill([]byte("data"), 1, 500, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + )) + assert.NoError(t, + db.Update( + func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("data")) + assert.NoError(t, b.Put([]byte("0123"), []byte("new Value for 123"))) + assert.NoError(t, b.Put([]byte("1234b"), []byte("additional object"))) + assert.NoError(t, b.Delete([]byte("0246"))) + return nil + })) + + assert.NoError(t, + db.View( + func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("data")) + assert.Equal(t, []byte("new Value for 123"), b.Get([]byte("0123"))) + assert.Equal(t, []byte("additional object"), b.Get([]byte("1234b"))) + assert.Nil(t, b.Get([]byte("0246"))) + return nil + })) + + db.Close() + + // This causes the whole tree to be linked to the previous state + assert.NoError(t, surgeon.RevertMetaPage(db.Path())) + + db.MustReopen() + db.MustCheck() + assert.NoError(t, + db.View( + func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("data")) + assert.Equal(t, make([]byte, 100), b.Get([]byte("0123"))) + assert.Nil(t, b.Get([]byte("1234b"))) + assert.Equal(t, make([]byte, 100), b.Get([]byte("0246"))) + return nil + })) +} diff 
--git a/internal/surgeon/xray.go b/internal/surgeon/xray.go new file mode 100644 index 0000000..7167b59 --- /dev/null +++ b/internal/surgeon/xray.go @@ -0,0 +1,102 @@ +package surgeon + +// Library contains raw access to bbolt files for sake of testing or fixing of corrupted files. +// +// The library must not be used bbolt btree - just by CLI or tests. +// It's not optimized for performance. + +import ( + "bytes" + "fmt" + + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +type XRay struct { + path string +} + +func NewXRay(path string) XRay { + return XRay{path} +} + +func (n XRay) traverse(stack []common.Pgid, callback func(page *common.Page, stack []common.Pgid) error) error { + p, data, err := guts_cli.ReadPage(n.path, uint64(stack[len(stack)-1])) + if err != nil { + return fmt.Errorf("failed reading page (stack %v): %w", stack, err) + } + err = callback(p, stack) + if err != nil { + return fmt.Errorf("failed callback for page (stack %v): %w", stack, err) + } + switch p.Typ() { + case "meta": + { + m := common.LoadPageMeta(data) + r := m.RootBucket().RootPage() + return n.traverse(append(stack, r), callback) + } + case "branch": + { + for i := uint16(0); i < p.Count(); i++ { + bpe := p.BranchPageElement(i) + if err := n.traverse(append(stack, bpe.Pgid()), callback); err != nil { + return err + } + } + } + case "leaf": + for i := uint16(0); i < p.Count(); i++ { + lpe := p.LeafPageElement(i) + if lpe.IsBucketEntry() { + pgid := lpe.Bucket().RootPage() + if pgid > 0 { + if err := n.traverse(append(stack, pgid), callback); err != nil { + return err + } + } else { + inlinePage := lpe.Bucket().InlinePage(lpe.Value()) + if err := callback(inlinePage, stack); err != nil { + return fmt.Errorf("failed callback for inline page (stack %v): %w", stack, err) + } + } + } + } + case "freelist": + return nil + // Free does not have children. 
+ } + return nil +} + +// FindPathsToKey finds all paths from root to the page that contains the given key. +// As it traverses multiple buckets, so in theory there might be multiple keys with the given name. +// Note: For simplicity it's currently implemented as traversing of the whole reachable tree. +// If key is a bucket name, a page-path referencing the key will be returned as well. +func (n XRay) FindPathsToKey(key []byte) ([][]common.Pgid, error) { + var found [][]common.Pgid + + rootPage, _, err := guts_cli.GetRootPage(n.path) + if err != nil { + return nil, err + } + err = n.traverse([]common.Pgid{rootPage}, + func(page *common.Page, stack []common.Pgid) error { + if page.Typ() == "leaf" { + for i := uint16(0); i < page.Count(); i++ { + if bytes.Equal(page.LeafPageElement(i).Key(), key) { + var copyPath []common.Pgid + copyPath = append(copyPath, stack...) + found = append(found, copyPath) + } + } + } + return nil + }) + if err != nil { + return nil, err + } else { + return found, nil + } +} diff --git a/internal/surgeon/xray_test.go b/internal/surgeon/xray_test.go new file mode 100644 index 0000000..09ea1d9 --- /dev/null +++ b/internal/surgeon/xray_test.go @@ -0,0 +1,66 @@ +package surgeon_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" + "github.com/tutus-one/tutus-bolt/internal/surgeon" +) + +func TestFindPathsToKey(t *testing.T) { + db := btesting.MustCreateDB(t) + assert.NoError(t, + db.Fill([]byte("data"), 1, 500, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + )) + assert.NoError(t, db.Close()) + + navigator := surgeon.NewXRay(db.Path()) + path1, err := navigator.FindPathsToKey([]byte("0451")) + assert.NoError(t, err) + assert.NotEmpty(t, path1) + + page 
:= path1[0][len(path1[0])-1] + p, _, err := guts_cli.ReadPage(db.Path(), uint64(page)) + assert.NoError(t, err) + assert.GreaterOrEqual(t, []byte("0451"), p.LeafPageElement(0).Key()) + assert.LessOrEqual(t, []byte("0451"), p.LeafPageElement(p.Count()-1).Key()) +} + +func TestFindPathsToKey_Bucket(t *testing.T) { + rootBucket := []byte("data") + subBucket := []byte("0451A") + + db := btesting.MustCreateDB(t) + assert.NoError(t, + db.Fill(rootBucket, 1, 500, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + )) + require.NoError(t, db.Update(func(tx *bbolt.Tx) error { + sb, err := tx.Bucket(rootBucket).CreateBucket(subBucket) + require.NoError(t, err) + require.NoError(t, sb.Put([]byte("foo"), []byte("bar"))) + return nil + })) + + assert.NoError(t, db.Close()) + + navigator := surgeon.NewXRay(db.Path()) + path1, err := navigator.FindPathsToKey(subBucket) + assert.NoError(t, err) + assert.NotEmpty(t, path1) + + page := path1[0][len(path1[0])-1] + p, _, err := guts_cli.ReadPage(db.Path(), uint64(page)) + assert.NoError(t, err) + assert.GreaterOrEqual(t, subBucket, p.LeafPageElement(0).Key()) + assert.LessOrEqual(t, subBucket, p.LeafPageElement(p.Count()-1).Key()) +} diff --git a/internal/tests/tx_check_test.go b/internal/tests/tx_check_test.go new file mode 100644 index 0000000..5a1d0a8 --- /dev/null +++ b/internal/tests/tx_check_test.go @@ -0,0 +1,91 @@ +package tests_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" + "github.com/tutus-one/tutus-bolt/internal/surgeon" +) + +func TestTx_RecursivelyCheckPages_MisplacedPage(t *testing.T) { + db := btesting.MustCreateDB(t) + db.ForceDisableStrictMode() + require.NoError(t, + db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return 
[]byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + )) + require.NoError(t, db.Close()) + + xRay := surgeon.NewXRay(db.Path()) + + path1, err := xRay.FindPathsToKey([]byte("0451")) + require.NoError(t, err, "cannot find page that contains key:'0451'") + require.Len(t, path1, 1, "Expected only one page that contains key:'0451'") + + path2, err := xRay.FindPathsToKey([]byte("7563")) + require.NoError(t, err, "cannot find page that contains key:'7563'") + require.Len(t, path2, 1, "Expected only one page that contains key:'7563'") + + srcPage := path1[0][len(path1[0])-1] + targetPage := path2[0][len(path2[0])-1] + require.NoError(t, surgeon.CopyPage(db.Path(), srcPage, targetPage)) + + db.MustReopen() + db.ForceDisableStrictMode() + require.NoError(t, db.Update(func(tx *bolt.Tx) error { + // Collect all the errors. + var errors []error + for err := range tx.Check() { + errors = append(errors, err) + } + require.Len(t, errors, 1) + require.ErrorContains(t, errors[0], fmt.Sprintf("leaf page(%v) needs to be >= the key in the ancestor", targetPage)) + return nil + })) + require.NoError(t, db.Close()) +} + +func TestTx_RecursivelyCheckPages_CorruptedLeaf(t *testing.T) { + db := btesting.MustCreateDB(t) + db.ForceDisableStrictMode() + require.NoError(t, + db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + )) + require.NoError(t, db.Close()) + + xray := surgeon.NewXRay(db.Path()) + + path1, err := xray.FindPathsToKey([]byte("0451")) + require.NoError(t, err, "cannot find page that contains key:'0451'") + require.Len(t, path1, 1, "Expected only one page that contains key:'0451'") + + srcPage := path1[0][len(path1[0])-1] + p, pbuf, err := guts_cli.ReadPage(db.Path(), uint64(srcPage)) + require.NoError(t, err) + require.Positive(t, p.Count(), "page must be not empty") + p.LeafPageElement(p.Count() / 2).Key()[0] = 'z' + 
// Logger describes a leveled logger. Every level comes as a print-style
// method and a printf-style method.
type Logger interface {
	Debug(v ...interface{})
	Debugf(format string, v ...interface{})

	Error(v ...interface{})
	Errorf(format string, v ...interface{})

	Info(v ...interface{})
	Infof(format string, v ...interface{})

	Warning(v ...interface{})
	Warningf(format string, v ...interface{})

	Fatal(v ...interface{})
	Fatalf(format string, v ...interface{})

	Panic(v ...interface{})
	Panicf(format string, v ...interface{})
}

// getDiscardLogger returns the shared logger whose output goes to io.Discard.
func getDiscardLogger() Logger {
	return discardLogger
}

// discardLogger is a DefaultLogger that writes to io.Discard.
var discardLogger = &DefaultLogger{Logger: log.New(io.Discard, "", 0)}

// calldepth is the depth handed to log.Logger.Output when it is called
// directly from an exported DefaultLogger method, so that file/line flags
// refer to the caller of that method.
const calldepth = 2

// DefaultLogger is a default implementation of the Logger interface.
// Debug output is suppressed until EnableDebug is called.
type DefaultLogger struct {
	*log.Logger
	debug bool
}

// EnableTimestamps adds the date and time flags to the logger's flags.
func (l *DefaultLogger) EnableTimestamps() {
	l.SetFlags(l.Flags() | log.Ldate | log.Ltime)
}

// EnableDebug turns on the output of Debug and Debugf.
func (l *DefaultLogger) EnableDebug() {
	l.debug = true
}

// emit writes msg with the given level prefix. It sits one frame below the
// exported methods, hence calldepth+1. A failing write is deliberately
// ignored: logging must not fail the caller.
func (l *DefaultLogger) emit(level, msg string) {
	_ = l.Output(calldepth+1, header(level, msg))
}

// Debug logs v at DEBUG level when debug output is enabled.
func (l *DefaultLogger) Debug(v ...interface{}) {
	if !l.debug {
		return
	}
	l.emit("DEBUG", fmt.Sprint(v...))
}

// Debugf logs the formatted message at DEBUG level when debug output is enabled.
func (l *DefaultLogger) Debugf(format string, v ...interface{}) {
	if !l.debug {
		return
	}
	l.emit("DEBUG", fmt.Sprintf(format, v...))
}

// Info logs v at INFO level.
func (l *DefaultLogger) Info(v ...interface{}) {
	l.emit("INFO", fmt.Sprint(v...))
}

// Infof logs the formatted message at INFO level.
func (l *DefaultLogger) Infof(format string, v ...interface{}) {
	l.emit("INFO", fmt.Sprintf(format, v...))
}

// Error logs v at ERROR level.
func (l *DefaultLogger) Error(v ...interface{}) {
	l.emit("ERROR", fmt.Sprint(v...))
}

// Errorf logs the formatted message at ERROR level.
func (l *DefaultLogger) Errorf(format string, v ...interface{}) {
	l.emit("ERROR", fmt.Sprintf(format, v...))
}

// Warning logs v at WARN level.
func (l *DefaultLogger) Warning(v ...interface{}) {
	l.emit("WARN", fmt.Sprint(v...))
}

// Warningf logs the formatted message at WARN level.
func (l *DefaultLogger) Warningf(format string, v ...interface{}) {
	l.emit("WARN", fmt.Sprintf(format, v...))
}

// Fatal logs v at FATAL level and then exits the process with status 1.
func (l *DefaultLogger) Fatal(v ...interface{}) {
	l.emit("FATAL", fmt.Sprint(v...))
	os.Exit(1)
}

// Fatalf logs the formatted message at FATAL level and then exits the
// process with status 1.
func (l *DefaultLogger) Fatalf(format string, v ...interface{}) {
	l.emit("FATAL", fmt.Sprintf(format, v...))
	os.Exit(1)
}

// Panic delegates to the embedded log.Logger's Panic.
func (l *DefaultLogger) Panic(v ...interface{}) {
	l.Logger.Panic(v...)
}

// Panicf delegates to the embedded log.Logger's Panicf.
func (l *DefaultLogger) Panicf(format string, v ...interface{}) {
	l.Logger.Panicf(format, v...)
}

// header prefixes msg with its level, separated by ": ".
func header(lvl, msg string) string {
	return lvl + ": " + msg
}
unix.Mlock(db.dataref[:sizeToLock]); err != nil { + return err + } + return nil +} + +// munlock unlocks memory of db file +func munlock(db *DB, fileSize int) error { + if db.dataref == nil { + return nil + } + + sizeToUnlock := fileSize + if sizeToUnlock > db.datasz { + // Can't unlock more than mmaped slice + sizeToUnlock = db.datasz + } + + if err := unix.Munlock(db.dataref[:sizeToUnlock]); err != nil { + return err + } + return nil +} diff --git a/mlock_windows.go b/mlock_windows.go new file mode 100644 index 0000000..00b0fb4 --- /dev/null +++ b/mlock_windows.go @@ -0,0 +1,11 @@ +package bbolt + +// mlock locks memory of db file +func mlock(_ *DB, _ int) error { + panic("mlock is supported only on UNIX systems") +} + +// munlock unlocks memory of db file +func munlock(_ *DB, _ int) error { + panic("munlock is supported only on UNIX systems") +} diff --git a/movebucket_test.go b/movebucket_test.go new file mode 100644 index 0000000..a083cc6 --- /dev/null +++ b/movebucket_test.go @@ -0,0 +1,398 @@ +package bbolt_test + +import ( + crand "crypto/rand" + "math/rand" + "os" + "path/filepath" + "testing" + + "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/btesting" + + "github.com/stretchr/testify/require" +) + +func TestTx_MoveBucket(t *testing.T) { + testCases := []struct { + name string + srcBucketPath []string + dstBucketPath []string + bucketToMove string + bucketExistInSrc bool + bucketExistInDst bool + hasIncompatibleKeyInSrc bool + hasIncompatibleKeyInDst bool + expectedErr error + }{ + // normal cases + { + name: "normal case", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: nil, + }, + { + name: "the source and target bucket share the same grandparent", + srcBucketPath: 
[]string{"grandparent", "sb2"}, + dstBucketPath: []string{"grandparent", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: nil, + }, + { + name: "bucketToMove is a top level bucket", + srcBucketPath: []string{}, + dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: nil, + }, + { + name: "convert bucketToMove to a top level bucket", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: nil, + }, + // negative cases + { + name: "bucketToMove not exist in source bucket", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: false, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: errors.ErrBucketNotFound, + }, + { + name: "bucketToMove exist in target bucket", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: true, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: errors.ErrBucketExists, + }, + { + name: "incompatible key exist in source bucket", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: false, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: true, + hasIncompatibleKeyInDst: false, + expectedErr: errors.ErrIncompatibleValue, + }, + { + name: "incompatible key exist in target bucket", + srcBucketPath: []string{"sb1", "sb2"}, + 
dstBucketPath: []string{"db1", "db2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: true, + expectedErr: errors.ErrIncompatibleValue, + }, + { + name: "the source and target are the same bucket", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"sb1", "sb2"}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: errors.ErrSameBuckets, + }, + { + name: "both the source and target are the root bucket", + srcBucketPath: []string{}, + dstBucketPath: []string{}, + bucketToMove: "bucketToMove", + bucketExistInSrc: true, + bucketExistInDst: false, + hasIncompatibleKeyInSrc: false, + hasIncompatibleKeyInDst: false, + expectedErr: errors.ErrSameBuckets, + }, + } + + for _, tc := range testCases { + + t.Run(tc.name, func(*testing.T) { + db := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + + dumpBucketBeforeMoving := filepath.Join(t.TempDir(), "dbBeforeMove") + dumpBucketAfterMoving := filepath.Join(t.TempDir(), "dbAfterMove") + + t.Log("Creating sample db and populate some data") + err := db.Update(func(tx *bbolt.Tx) error { + srcBucket := prepareBuckets(t, tx, tc.srcBucketPath...) + dstBucket := prepareBuckets(t, tx, tc.dstBucketPath...) 
+ + if tc.bucketExistInSrc { + _ = createBucketAndPopulateData(t, tx, srcBucket, tc.bucketToMove) + } + + if tc.bucketExistInDst { + _ = createBucketAndPopulateData(t, tx, dstBucket, tc.bucketToMove) + } + + if tc.hasIncompatibleKeyInSrc { + putErr := srcBucket.Put([]byte(tc.bucketToMove), []byte("bar")) + require.NoError(t, putErr) + } + + if tc.hasIncompatibleKeyInDst { + putErr := dstBucket.Put([]byte(tc.bucketToMove), []byte("bar")) + require.NoError(t, putErr) + } + + return nil + }) + require.NoError(t, err) + + t.Log("Moving bucket") + err = db.Update(func(tx *bbolt.Tx) error { + srcBucket := prepareBuckets(t, tx, tc.srcBucketPath...) + dstBucket := prepareBuckets(t, tx, tc.dstBucketPath...) + + if tc.expectedErr == nil { + t.Logf("Dump the bucket to %s before moving it", dumpBucketBeforeMoving) + bk := openBucket(tx, srcBucket, tc.bucketToMove) + dumpErr := dumpBucket([]byte(tc.bucketToMove), bk, dumpBucketBeforeMoving) + require.NoError(t, dumpErr) + } + + mErr := tx.MoveBucket([]byte(tc.bucketToMove), srcBucket, dstBucket) + require.Equal(t, tc.expectedErr, mErr) + + if tc.expectedErr == nil { + t.Logf("Dump the bucket to %s after moving it", dumpBucketAfterMoving) + bk := openBucket(tx, dstBucket, tc.bucketToMove) + dumpErr := dumpBucket([]byte(tc.bucketToMove), bk, dumpBucketAfterMoving) + require.NoError(t, dumpErr) + } + + return nil + }) + require.NoError(t, err) + + // skip assertion if failure expected + if tc.expectedErr != nil { + return + } + + t.Log("Verifying the bucket should be identical before and after being moved") + dataBeforeMove, err := os.ReadFile(dumpBucketBeforeMoving) + require.NoError(t, err) + dataAfterMove, err := os.ReadFile(dumpBucketAfterMoving) + require.NoError(t, err) + require.Equal(t, dataBeforeMove, dataAfterMove) + }) + } +} + +func TestBucket_MoveBucket_DiffDB(t *testing.T) { + srcBucketPath := []string{"sb1", "sb2"} + dstBucketPath := []string{"db1", "db2"} + bucketToMove := "bucketToMove" + + var srcBucket 
*bbolt.Bucket + + t.Log("Creating source bucket and populate some data") + srcDB := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + err := srcDB.Update(func(tx *bbolt.Tx) error { + srcBucket = prepareBuckets(t, tx, srcBucketPath...) + return nil + }) + require.NoError(t, err) + defer func() { + require.NoError(t, srcDB.Close()) + }() + + t.Log("Creating target bucket and populate some data") + dstDB := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + err = dstDB.Update(func(tx *bbolt.Tx) error { + prepareBuckets(t, tx, dstBucketPath...) + return nil + }) + require.NoError(t, err) + defer func() { + require.NoError(t, dstDB.Close()) + }() + + t.Log("Reading source bucket in a separate RWTx") + sTx, sErr := srcDB.Begin(true) + require.NoError(t, sErr) + defer func() { + require.NoError(t, sTx.Rollback()) + }() + srcBucket = prepareBuckets(t, sTx, srcBucketPath...) + + t.Log("Moving the sub-bucket in a separate RWTx") + err = dstDB.Update(func(tx *bbolt.Tx) error { + dstBucket := prepareBuckets(t, tx, dstBucketPath...) 
+ mErr := srcBucket.MoveBucket([]byte(bucketToMove), dstBucket) + require.Equal(t, errors.ErrDifferentDB, mErr) + + return nil + }) + require.NoError(t, err) +} + +func TestBucket_MoveBucket_DiffTx(t *testing.T) { + testCases := []struct { + name string + srcBucketPath []string + dstBucketPath []string + isSrcReadonlyTx bool + isDstReadonlyTx bool + bucketToMove string + expectedErr error + }{ + { + name: "src is RWTx and target is RTx", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + isSrcReadonlyTx: true, + isDstReadonlyTx: false, + bucketToMove: "bucketToMove", + expectedErr: errors.ErrTxNotWritable, + }, + { + name: "src is RTx and target is RWTx", + srcBucketPath: []string{"sb1", "sb2"}, + dstBucketPath: []string{"db1", "db2"}, + isSrcReadonlyTx: false, + isDstReadonlyTx: true, + bucketToMove: "bucketToMove", + expectedErr: errors.ErrTxNotWritable, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var srcBucket *bbolt.Bucket + var dstBucket *bbolt.Bucket + + t.Log("Creating source and target buckets and populate some data") + db := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + err := db.Update(func(tx *bbolt.Tx) error { + srcBucket = prepareBuckets(t, tx, tc.srcBucketPath...) + dstBucket = prepareBuckets(t, tx, tc.dstBucketPath...) + return nil + }) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + + t.Log("Opening source bucket in a separate Tx") + sTx, sErr := db.Begin(tc.isSrcReadonlyTx) + require.NoError(t, sErr) + defer func() { + require.NoError(t, sTx.Rollback()) + }() + srcBucket = prepareBuckets(t, sTx, tc.srcBucketPath...) + + t.Log("Opening target bucket in a separate Tx") + dTx, dErr := db.Begin(tc.isDstReadonlyTx) + require.NoError(t, dErr) + defer func() { + require.NoError(t, dTx.Rollback()) + }() + dstBucket = prepareBuckets(t, dTx, tc.dstBucketPath...) 
+ + t.Log("Moving the sub-bucket") + err = db.View(func(tx *bbolt.Tx) error { + mErr := srcBucket.MoveBucket([]byte(tc.bucketToMove), dstBucket) + require.Equal(t, tc.expectedErr, mErr) + + return nil + }) + require.NoError(t, err) + }) + } +} + +// prepareBuckets opens the bucket chain. For each bucket in the chain, +// open it if existed, otherwise create it and populate sample data. +func prepareBuckets(t testing.TB, tx *bbolt.Tx, buckets ...string) *bbolt.Bucket { + var bk *bbolt.Bucket + + for _, key := range buckets { + if childBucket := openBucket(tx, bk, key); childBucket == nil { + bk = createBucketAndPopulateData(t, tx, bk, key) + } else { + bk = childBucket + } + } + return bk +} + +func openBucket(tx *bbolt.Tx, bk *bbolt.Bucket, bucketToOpen string) *bbolt.Bucket { + if bk == nil { + return tx.Bucket([]byte(bucketToOpen)) + } + return bk.Bucket([]byte(bucketToOpen)) +} + +func createBucketAndPopulateData(t testing.TB, tx *bbolt.Tx, bk *bbolt.Bucket, bucketName string) *bbolt.Bucket { + if bk == nil { + newBucket, err := tx.CreateBucket([]byte(bucketName)) + require.NoError(t, err, "failed to create bucket %s", bucketName) + populateSampleDataInBucket(t, newBucket, rand.Intn(4096)) + return newBucket + } + + newBucket, err := bk.CreateBucket([]byte(bucketName)) + require.NoError(t, err, "failed to create bucket %s", bucketName) + populateSampleDataInBucket(t, newBucket, rand.Intn(4096)) + return newBucket +} + +func populateSampleDataInBucket(t testing.TB, bk *bbolt.Bucket, n int) { + var min, max = 1, 1024 + + for i := 0; i < n; i++ { + // generate rand key/value length + keyLength := rand.Intn(max-min) + min + valLength := rand.Intn(max-min) + min + + keyData := make([]byte, keyLength) + valData := make([]byte, valLength) + + _, err := crand.Read(keyData) + require.NoError(t, err) + + _, err = crand.Read(valData) + require.NoError(t, err) + + err = bk.Put(keyData, valData) + require.NoError(t, err) + } +} diff --git a/node.go b/node.go new file mode 
100644 index 0000000..8f3ea79 --- /dev/null +++ b/node.go @@ -0,0 +1,538 @@ +package bbolt + +import ( + "bytes" + "fmt" + "sort" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// node represents an in-memory, deserialized page. +type node struct { + bucket *Bucket + isLeaf bool + unbalanced bool + spilled bool + key []byte + pgid common.Pgid + parent *node + children nodes + inodes common.Inodes +} + +// root returns the top-level node this node is attached to. +func (n *node) root() *node { + if n.parent == nil { + return n + } + return n.parent.root() +} + +// minKeys returns the minimum number of inodes this node should have. +func (n *node) minKeys() int { + if n.isLeaf { + return 1 + } + return 2 +} + +// size returns the size of the node after serialization. +func (n *node) size() int { + sz, elsz := common.PageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value())) + } + return int(sz) +} + +// sizeLessThan returns true if the node is less than a given size. +// This is an optimization to avoid calculating a large node when we only need +// to know if it fits inside a certain page size. +func (n *node) sizeLessThan(v uintptr) bool { + sz, elsz := common.PageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value())) + if sz >= v { + return false + } + } + return true +} + +// pageElementSize returns the size of each page element based on the type of node. +func (n *node) pageElementSize() uintptr { + if n.isLeaf { + return common.LeafPageElementSize + } + return common.BranchPageElementSize +} + +// childAt returns the child node at a given index. 
+func (n *node) childAt(index int) *node { + if n.isLeaf { + panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) + } + return n.bucket.node(n.inodes[index].Pgid(), n) +} + +// childIndex returns the index of a given child node. +func (n *node) childIndex(child *node) int { + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 }) + return index +} + +// numChildren returns the number of children. +func (n *node) numChildren() int { + return len(n.inodes) +} + +// nextSibling returns the next node with the same parent. +func (n *node) nextSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index >= n.parent.numChildren()-1 { + return nil + } + return n.parent.childAt(index + 1) +} + +// prevSibling returns the previous node with the same parent. +func (n *node) prevSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index == 0 { + return nil + } + return n.parent.childAt(index - 1) +} + +// put inserts a key/value. +func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) { + if pgId >= n.bucket.tx.meta.Pgid() { + panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid())) + } else if len(oldKey) <= 0 { + panic("put: zero-length old key") + } else if len(newKey) <= 0 { + panic("put: zero-length new key") + } + + // Find insertion index. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 }) + + // Add capacity and shift nodes if we don't have an exact match and need to insert. 
+ exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey) + if !exact { + n.inodes = append(n.inodes, common.Inode{}) + copy(n.inodes[index+1:], n.inodes[index:]) + } + + inode := &n.inodes[index] + inode.SetFlags(flags) + inode.SetKey(newKey) + inode.SetValue(value) + inode.SetPgid(pgId) + common.Assert(len(inode.Key()) > 0, "put: zero-length inode key") +} + +// del removes a key from the node. +func (n *node) del(key []byte) { + // Find index of key. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 }) + + // Exit if the key isn't found. + if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].Key(), key) { + return + } + + // Delete inode from the node. + n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) + + // Mark the node as needing rebalancing. + n.unbalanced = true +} + +// read initializes the node from a page. +func (n *node) read(p *common.Page) { + n.pgid = p.Id() + n.isLeaf = p.IsLeafPage() + n.inodes = common.ReadInodeFromPage(p) + + // Save first key, so we can find the node in the parent when we spill. + if len(n.inodes) > 0 { + n.key = n.inodes[0].Key() + common.Assert(len(n.key) > 0, "read: zero-length node key") + } else { + n.key = nil + } +} + +// write writes the items onto one or more pages. +// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set +// and the rest should be zeroed. +func (n *node) write(p *common.Page) { + common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page") + + // Initialize page. + if n.isLeaf { + p.SetFlags(common.LeafPageFlag) + } else { + p.SetFlags(common.BranchPageFlag) + } + + if len(n.inodes) >= 0xFFFF { + panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id())) + } + p.SetCount(uint16(len(n.inodes))) + + // Stop here if there are no items to write. 
+ if p.Count() == 0 { + return + } + + common.WriteInodeToPage(n.inodes, p) + + // DEBUG ONLY: n.dump() +} + +// split breaks up a node into multiple smaller nodes, if appropriate. +// This should only be called from the spill() function. +func (n *node) split(pageSize uintptr) []*node { + var nodes []*node + + node := n + for { + // Split node into two. + a, b := node.splitTwo(pageSize) + nodes = append(nodes, a) + + // If we can't split then exit the loop. + if b == nil { + break + } + + // Set node to b so it gets split on the next iteration. + node = b + } + + return nodes +} + +// splitTwo breaks up a node into two smaller nodes, if appropriate. +// This should only be called from the split() function. +func (n *node) splitTwo(pageSize uintptr) (*node, *node) { + // Ignore the split if the page doesn't have at least enough nodes for + // two pages or if the nodes can fit in a single page. + if len(n.inodes) <= (common.MinKeysPerPage*2) || n.sizeLessThan(pageSize) { + return n, nil + } + + // Determine the threshold before starting a new node. + var fillPercent = n.bucket.FillPercent + if fillPercent < minFillPercent { + fillPercent = minFillPercent + } else if fillPercent > maxFillPercent { + fillPercent = maxFillPercent + } + threshold := int(float64(pageSize) * fillPercent) + + // Determine split position and sizes of the two pages. + splitIndex, _ := n.splitIndex(threshold) + + // Split node into two separate nodes. + // If there's no parent then we'll need to create one. + if n.parent == nil { + n.parent = &node{bucket: n.bucket, children: []*node{n}} + } + + // Create a new node and add it to the parent. + next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} + n.parent.children = append(n.parent.children, next) + + // Split inodes across two nodes. + next.inodes = n.inodes[splitIndex:] + n.inodes = n.inodes[:splitIndex] + + // Update the statistics. 
+ n.bucket.tx.stats.IncSplit(1) + + return n, next +} + +// splitIndex finds the position where a page will fill a given threshold. +// It returns the index as well as the size of the first page. +// This is only be called from split(). +func (n *node) splitIndex(threshold int) (index, sz uintptr) { + sz = common.PageHeaderSize + + // Loop until we only have the minimum number of keys required for the second page. + for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ { + index = uintptr(i) + inode := n.inodes[i] + elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value())) + + // If we have at least the minimum number of keys and adding another + // node would put us over the threshold then exit and return. + if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) { + break + } + + // Add the element size to the total size. + sz += elsize + } + + return +} + +// spill writes the nodes to dirty pages and splits nodes as it goes. +// Returns an error if dirty pages cannot be allocated. +func (n *node) spill() error { + var tx = n.bucket.tx + if n.spilled { + return nil + } + + // Spill child nodes first. Child nodes can materialize sibling nodes in + // the case of split-merge so we cannot use a range loop. We have to check + // the children size on every loop iteration. + sort.Sort(n.children) + for i := 0; i < len(n.children); i++ { + if err := n.children[i].spill(); err != nil { + return err + } + } + + // We no longer need the child list because it's only used for spill tracking. + n.children = nil + + // Split nodes into appropriate sizes. The first node will always be n. + var nodes = n.split(uintptr(tx.db.pageSize)) + for _, node := range nodes { + // Add node's page to the freelist if it's not new. + if node.pgid > 0 { + tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid)) + node.pgid = 0 + } + + // Allocate contiguous space for the node. 
+ p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) + if err != nil { + return err + } + + // Write the node. + if p.Id() >= tx.meta.Pgid() { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid())) + } + node.pgid = p.Id() + node.write(p) + node.spilled = true + + // Insert into parent inodes. + if node.parent != nil { + var key = node.key + if key == nil { + key = node.inodes[0].Key() + } + + node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0) + node.key = node.inodes[0].Key() + common.Assert(len(node.key) > 0, "spill: zero-length node key") + } + + // Update the statistics. + tx.stats.IncSpill(1) + } + + // If the root node split and created a new root then we need to spill that + // as well. We'll clear out the children to make sure it doesn't try to respill. + if n.parent != nil && n.parent.pgid == 0 { + n.children = nil + return n.parent.spill() + } + + return nil +} + +// rebalance attempts to combine the node with sibling nodes if the node fill +// size is below a threshold or if there are not enough keys. +func (n *node) rebalance() { + if !n.unbalanced { + return + } + n.unbalanced = false + + // Update statistics. + n.bucket.tx.stats.IncRebalance(1) + + // Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys. + var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2 + if n.size() > threshold && len(n.inodes) > n.minKeys() { + return + } + + // Root node has special handling. + if n.parent == nil { + // If root node is a branch and only has one node then collapse it. + if !n.isLeaf && len(n.inodes) == 1 { + // Move root's child up. + child := n.bucket.node(n.inodes[0].Pgid(), n) + n.isLeaf = child.isLeaf + n.inodes = child.inodes[:] + n.children = child.children + + // Reparent all child nodes being moved. 
+ for _, inode := range n.inodes { + if child, ok := n.bucket.nodes[inode.Pgid()]; ok { + child.parent = n + } + } + + // Remove old child. + child.parent = nil + delete(n.bucket.nodes, child.pgid) + child.free() + } + + return + } + + // If node has no keys then just remove it. + if n.numChildren() == 0 { + n.parent.del(n.key) + n.parent.removeChild(n) + delete(n.bucket.nodes, n.pgid) + n.free() + n.parent.rebalance() + return + } + + common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children") + + // Merge with right sibling if idx == 0, otherwise left sibling. + var leftNode, rightNode *node + var useNextSibling = n.parent.childIndex(n) == 0 + if useNextSibling { + leftNode = n + rightNode = n.nextSibling() + } else { + leftNode = n.prevSibling() + rightNode = n + } + + // If both nodes are too small then merge them. + // Reparent all child nodes being moved. + for _, inode := range rightNode.inodes { + if child, ok := n.bucket.nodes[inode.Pgid()]; ok { + child.parent.removeChild(child) + child.parent = leftNode + child.parent.children = append(child.parent.children, child) + } + } + + // Copy over inodes from right node to left node and remove right node. + leftNode.inodes = append(leftNode.inodes, rightNode.inodes...) + n.parent.del(rightNode.key) + n.parent.removeChild(rightNode) + delete(n.bucket.nodes, rightNode.pgid) + rightNode.free() + + // Either this node or the sibling node was deleted from the parent so rebalance it. + n.parent.rebalance() +} + +// removes a node from the list of in-memory children. +// This does not affect the inodes. +func (n *node) removeChild(target *node) { + for i, child := range n.children { + if child == target { + n.children = append(n.children[:i], n.children[i+1:]...) + return + } + } +} + +// dereference causes the node to copy all its inode key/value references to heap memory. +// This is required when the mmap is reallocated so inodes are not pointing to stale data. 
+func (n *node) dereference() { + if n.key != nil { + key := make([]byte, len(n.key)) + copy(key, n.key) + n.key = key + common.Assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") + } + + for i := range n.inodes { + inode := &n.inodes[i] + + key := make([]byte, len(inode.Key())) + copy(key, inode.Key()) + inode.SetKey(key) + common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key") + + value := make([]byte, len(inode.Value())) + copy(value, inode.Value()) + inode.SetValue(value) + } + + // Recursively dereference children. + for _, child := range n.children { + child.dereference() + } + + // Update statistics. + n.bucket.tx.stats.IncNodeDeref(1) +} + +// free adds the node's underlying page to the freelist. +func (n *node) free() { + if n.pgid != 0 { + n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid)) + n.pgid = 0 + } +} + +// dump writes the contents of the node to STDERR for debugging purposes. +/* +func (n *node) dump() { + // Write node header. + var typ = "branch" + if n.isLeaf { + typ = "leaf" + } + warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) + + // Write out abbreviated version of each item. 
+ for _, item := range n.inodes { + if n.isLeaf { + if item.flags&bucketLeafFlag != 0 { + bucket := (*bucket)(unsafe.Pointer(&item.value[0])) + warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) + } else { + warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) + } + } else { + warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) + } + } + warn("") +} +*/ + +func compareKeys(left, right []byte) int { + return bytes.Compare(left, right) +} + +type nodes []*node + +func (s nodes) Len() int { return len(s) } +func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s nodes) Less(i, j int) bool { + return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1 +} diff --git a/node_test.go b/node_test.go new file mode 100644 index 0000000..87406a7 --- /dev/null +++ b/node_test.go @@ -0,0 +1,169 @@ +package bbolt + +import ( + "testing" + "unsafe" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// Ensure that a node can insert a key/value. 
+func TestNode_put(t *testing.T) { + m := &common.Meta{} + m.SetPgid(1) + n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{meta: m}}} + n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0) + n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0) + n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0) + n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, common.LeafPageFlag) + + if len(n.inodes) != 3 { + t.Fatalf("exp=3; got=%d", len(n.inodes)) + } + if k, v := n.inodes[0].Key(), n.inodes[0].Value(); string(k) != "bar" || string(v) != "1" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if k, v := n.inodes[1].Key(), n.inodes[1].Value(); string(k) != "baz" || string(v) != "2" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if k, v := n.inodes[2].Key(), n.inodes[2].Value(); string(k) != "foo" || string(v) != "3" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if n.inodes[2].Flags() != uint32(common.LeafPageFlag) { + t.Fatalf("not a leaf: %d", n.inodes[2].Flags()) + } +} + +// Ensure that a node can deserialize from a leaf page. +func TestNode_read_LeafPage(t *testing.T) { + // Create a page. + var buf [4096]byte + page := (*common.Page)(unsafe.Pointer(&buf[0])) + page.SetFlags(common.LeafPageFlag) + page.SetCount(2) + + // Insert 2 elements at the beginning. sizeof(leafPageElement) == 16 + nodes := page.LeafPageElements() + //nodes := (*[3]leafPageElement)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page))) + nodes[0] = *common.NewLeafPageElement(0, 32, 3, 4) // pos = sizeof(leafPageElement) * 2 + nodes[1] = *common.NewLeafPageElement(0, 23, 10, 3) // pos = sizeof(leafPageElement) + 3 + 4 + + // Write data for the nodes at the end. + const s = "barfoozhelloworldbye" + data := common.UnsafeByteSlice(unsafe.Pointer(uintptr(unsafe.Pointer(page))+unsafe.Sizeof(*page)+common.LeafPageElementSize*2), 0, 0, len(s)) + copy(data, s) + + // Deserialize page into a leaf. 
+ n := &node{} + n.read(page) + + // Check that there are two inodes with correct data. + if !n.isLeaf { + t.Fatal("expected leaf") + } + if len(n.inodes) != 2 { + t.Fatalf("exp=2; got=%d", len(n.inodes)) + } + if k, v := n.inodes[0].Key(), n.inodes[0].Value(); string(k) != "bar" || string(v) != "fooz" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if k, v := n.inodes[1].Key(), n.inodes[1].Value(); string(k) != "helloworld" || string(v) != "bye" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } +} + +// Ensure that a node can serialize into a leaf page. +func TestNode_write_LeafPage(t *testing.T) { + // Create a node. + m := &common.Meta{} + m.SetPgid(1) + n := &node{isLeaf: true, inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}} + n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0) + n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0) + n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0) + + // Write it to a page. + var buf [4096]byte + p := (*common.Page)(unsafe.Pointer(&buf[0])) + n.write(p) + + // Read the page back in. + n2 := &node{} + n2.read(p) + + // Check that the two pages are the same. + if len(n2.inodes) != 3 { + t.Fatalf("exp=3; got=%d", len(n2.inodes)) + } + if k, v := n2.inodes[0].Key(), n2.inodes[0].Value(); string(k) != "john" || string(v) != "johnson" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if k, v := n2.inodes[1].Key(), n2.inodes[1].Value(); string(k) != "ricki" || string(v) != "lake" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } + if k, v := n2.inodes[2].Key(), n2.inodes[2].Value(); string(k) != "susy" || string(v) != "que" { + t.Fatalf("exp=; got=<%s,%s>", k, v) + } +} + +// Ensure that a node can split into appropriate subgroups. +func TestNode_split(t *testing.T) { + // Create a node. 
+ m := &common.Meta{} + m.SetPgid(1) + n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}} + n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0) + + // Split between 2 & 3. + n.split(100) + + var parent = n.parent + if len(parent.children) != 2 { + t.Fatalf("exp=2; got=%d", len(parent.children)) + } + if len(parent.children[0].inodes) != 2 { + t.Fatalf("exp=2; got=%d", len(parent.children[0].inodes)) + } + if len(parent.children[1].inodes) != 3 { + t.Fatalf("exp=3; got=%d", len(parent.children[1].inodes)) + } +} + +// Ensure that a page with the minimum number of inodes just returns a single node. +func TestNode_split_MinKeys(t *testing.T) { + // Create a node. + m := &common.Meta{} + m.SetPgid(1) + n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}} + n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) + + // Split. + n.split(20) + if n.parent != nil { + t.Fatalf("expected nil parent") + } +} + +// Ensure that a node that has keys that all fit on a page just returns one leaf. +func TestNode_split_SinglePage(t *testing.T) { + // Create a node. 
+ m := &common.Meta{} + m.SetPgid(1) + n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}} + n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0) + n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0) + + // Split. + n.split(4096) + if n.parent != nil { + t.Fatalf("expected nil parent") + } +} diff --git a/quick_test.go b/quick_test.go new file mode 100644 index 0000000..13cd1fe --- /dev/null +++ b/quick_test.go @@ -0,0 +1,90 @@ +package bbolt_test + +import ( + "bytes" + "flag" + "fmt" + "math/rand" + "os" + "reflect" + "testing" + "testing/quick" + "time" +) + +// testing/quick defaults to 5 iterations and a random seed. +// You can override these settings from the command line: +// +// -quick.count The number of iterations to perform. +// -quick.seed The seed to use for randomizing. +// -quick.maxitems The maximum number of items to insert into a DB. +// -quick.maxksize The maximum size of a key. +// -quick.maxvsize The maximum size of a value. 
+// + +var qcount, qseed, qmaxitems, qmaxksize, qmaxvsize int + +func TestMain(m *testing.M) { + flag.IntVar(&qcount, "quick.count", 5, "") + flag.IntVar(&qseed, "quick.seed", int(time.Now().UnixNano())%100000, "") + flag.IntVar(&qmaxitems, "quick.maxitems", 1000, "") + flag.IntVar(&qmaxksize, "quick.maxksize", 1024, "") + flag.IntVar(&qmaxvsize, "quick.maxvsize", 1024, "") + flag.Parse() + fmt.Fprintln(os.Stderr, "seed:", qseed) + fmt.Fprintf(os.Stderr, "quick settings: count=%v, items=%v, ksize=%v, vsize=%v\n", qcount, qmaxitems, qmaxksize, qmaxvsize) + + os.Exit(m.Run()) +} + +func qconfig() *quick.Config { + return &quick.Config{ + MaxCount: qcount, + Rand: rand.New(rand.NewSource(int64(qseed))), + } +} + +type testdata []testdataitem + +func (t testdata) Len() int { return len(t) } +func (t testdata) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t testdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == -1 } + +func (t testdata) Generate(rand *rand.Rand, size int) reflect.Value { + n := rand.Intn(qmaxitems-1) + 1 + items := make(testdata, n) + used := make(map[string]bool) + for i := 0; i < n; i++ { + item := &items[i] + // Ensure that keys are unique by looping until we find one that we have not already used. 
// randByteSlice returns a slice of random bytes drawn from rand. Its length
// is chosen uniformly from the half-open range [minSize, maxSize).
//
// When the range is empty (maxSize <= minSize) the slice is exactly minSize
// bytes long instead of panicking inside Intn. Every byte value 0x00-0xFF can
// occur.
func randByteSlice(rand *rand.Rand, minSize, maxSize int) []byte {
	n := minSize
	if span := maxSize - minSize; span > 0 {
		n += rand.Intn(span)
	}

	b := make([]byte, n)
	// (*Rand).Read fills the whole slice and always returns a nil error, so
	// the results carry no information here.
	_, _ = rand.Read(b)
	return b
}
# main benchmarks the currently checked-out ref, then checks out
# BASE_TO_COMPARE and benchmarks it too, switches back to the previous ref and
# finally compares both result files with benchstat.
#
# Output goes to stdout unless BENCHSTAT_OUTPUT_FILE is set, in which case the
# benchstat report is written to that file. With BENCHSTAT_FORMAT=csv the
# report is produced in CSV form and benchstat's stderr is discarded.
function main() {
  # BENCHSTAT_OUTPUT_FILE is optional. Expanding it directly while unset would
  # abort the script under `set -u` after all benchmarks have already run, so
  # default it to the empty string here.
  local output_file="${BENCHSTAT_OUTPUT_FILE:-}"

  echo "### Benchmarking PR ${REF_CURRENT}"
  bench "${RESULT_CURRENT}"
  echo ""
  echo "### Done benchmarking ${REF_CURRENT}"

  echo "### Benchmarking base ${BASE_TO_COMPARE}"
  git checkout "${BASE_TO_COMPARE}"
  bench "${RESULT_TO_COMPARE}"
  echo ""
  echo "### Done benchmarking ${BASE_TO_COMPARE}"

  # Return to the ref that was checked out before the base benchmark.
  git checkout -

  echo ""
  echo "### Result"
  echo "BASE=${BASE_TO_COMPARE} HEAD=${REF_CURRENT}"

  if [[ "${BENCHSTAT_FORMAT}" == "csv" ]]; then
    benchstat -format=csv -confidence="${BENCHSTAT_CONFIDENCE_LEVEL}" BASE="${RESULT_TO_COMPARE}" HEAD="${RESULT_CURRENT}" 2>/dev/null 1>"${output_file}"
  else
    if [[ -z "${output_file}" ]]; then
      benchstat -confidence="${BENCHSTAT_CONFIDENCE_LEVEL}" BASE="${RESULT_TO_COMPARE}" HEAD="${RESULT_CURRENT}"
    else
      benchstat -confidence="${BENCHSTAT_CONFIDENCE_LEVEL}" BASE="${RESULT_TO_COMPARE}" HEAD="${RESULT_CURRENT}" 1>"${output_file}"
    fi
  fi
}
+ + +echo "${GOFILES}" "${TESTGOFILES}" "${XTESTGOFILES}"| xargs -n 100 go run golang.org/x/tools/cmd/goimports@latest -w -local go.etcd.io + +go fmt ./... +go mod tidy diff --git a/simulation_no_freelist_sync_test.go b/simulation_no_freelist_sync_test.go new file mode 100644 index 0000000..5770c68 --- /dev/null +++ b/simulation_no_freelist_sync_test.go @@ -0,0 +1,47 @@ +package bbolt_test + +import ( + "testing" + + bolt "github.com/tutus-one/tutus-bolt" +) + +func TestSimulateNoFreeListSync_1op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1, 1) +} +func TestSimulateNoFreeListSync_10op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10, 1) +} +func TestSimulateNoFreeListSync_100op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 1) +} +func TestSimulateNoFreeListSync_1000op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 1) +} +func TestSimulateNoFreeListSync_10000op_1p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 1) +} +func TestSimulateNoFreeListSync_10op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10, 10) +} +func TestSimulateNoFreeListSync_100op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 10) +} +func TestSimulateNoFreeListSync_1000op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 10) +} +func TestSimulateNoFreeListSync_10000op_10p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 10) +} +func TestSimulateNoFreeListSync_100op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 100, 100) +} +func TestSimulateNoFreeListSync_1000op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 1000, 100) +} +func TestSimulateNoFreeListSync_10000op_100p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: 
true}, 8, 10000, 100) +} +func TestSimulateNoFreeListSync_10000op_1000p(t *testing.T) { + testSimulate(t, &bolt.Options{NoFreelistSync: true}, 8, 10000, 1000) +} diff --git a/simulation_test.go b/simulation_test.go new file mode 100644 index 0000000..6a890f1 --- /dev/null +++ b/simulation_test.go @@ -0,0 +1,362 @@ +package bbolt_test + +import ( + "bytes" + "fmt" + "math/rand" + "sync" + "sync/atomic" + "testing" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +func TestSimulate_1op_1p(t *testing.T) { testSimulate(t, nil, 1, 1, 1) } +func TestSimulate_10op_1p(t *testing.T) { testSimulate(t, nil, 1, 10, 1) } +func TestSimulate_100op_1p(t *testing.T) { testSimulate(t, nil, 1, 100, 1) } +func TestSimulate_1000op_1p(t *testing.T) { testSimulate(t, nil, 1, 1000, 1) } +func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, nil, 1, 10000, 1) } + +func TestSimulate_10op_10p(t *testing.T) { testSimulate(t, nil, 1, 10, 10) } +func TestSimulate_100op_10p(t *testing.T) { testSimulate(t, nil, 1, 100, 10) } +func TestSimulate_1000op_10p(t *testing.T) { testSimulate(t, nil, 1, 1000, 10) } +func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, nil, 1, 10000, 10) } + +func TestSimulate_100op_100p(t *testing.T) { testSimulate(t, nil, 1, 100, 100) } +func TestSimulate_1000op_100p(t *testing.T) { testSimulate(t, nil, 1, 1000, 100) } +func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, nil, 1, 10000, 100) } + +func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, nil, 1, 10000, 1000) } + +// Randomly generate operations on a given database with multiple clients to ensure consistency and thread safety. +func testSimulate(t *testing.T, openOption *bolt.Options, round, threadCount, parallelism int) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + // A list of operations that readers and writers can perform. 
+ var readerHandlers = []simulateHandler{simulateGetHandler} + var writerHandlers = []simulateHandler{simulateGetHandler, simulatePutHandler} + + var versions = make(map[int]*QuickDB) + versions[1] = NewQuickDB() + + db := btesting.MustCreateDBWithOption(t, openOption) + + var mutex sync.Mutex + + for n := 0; n < round; n++ { + // Run n threads in parallel, each with their own operation. + var threads = make(chan bool, parallelism) + var wg sync.WaitGroup + + // counter for how many goroutines were fired + var opCount int64 + + // counter for ignored operations + var igCount int64 + + var errCh = make(chan error, threadCount) + + var i int + for { + // this buffered channel will keep accepting booleans + // until it hits the limit defined by the parallelism + // argument to testSimulate() + threads <- true + + // this wait group can only be marked "done" from inside + // the subsequent goroutine + wg.Add(1) + writable := ((rand.Int() % 100) < 20) // 20% writers + + // Choose an operation to execute. + var handler simulateHandler + if writable { + handler = writerHandlers[rand.Intn(len(writerHandlers))] + } else { + handler = readerHandlers[rand.Intn(len(readerHandlers))] + } + + // Execute a thread for the given operation. + go func(writable bool, handler simulateHandler) { + defer wg.Done() + atomic.AddInt64(&opCount, 1) + // Start transaction. + tx, err := db.Begin(writable) + if err != nil { + errCh <- fmt.Errorf("error tx begin: %v", err) + return + } + + // Obtain current state of the dataset. + mutex.Lock() + var qdb = versions[tx.ID()] + if writable { + qdb = versions[tx.ID()-1].Copy() + } + mutex.Unlock() + + // Make sure we commit/rollback the tx at the end and update the state. + if writable { + defer func() { + mutex.Lock() + versions[tx.ID()] = qdb + mutex.Unlock() + + if err := tx.Commit(); err != nil { + errCh <- err + return + } + }() + } else { + defer func() { _ = tx.Rollback() }() + } + + // Ignore operation if we don't have data yet. 
+ if qdb == nil { + atomic.AddInt64(&igCount, 1) + return + } + + // Execute handler. + handler(tx, qdb) + + // Release a thread back to the scheduling loop. + <-threads + }(writable, handler) + + i++ + if i >= threadCount { + break + } + } + + // Wait until all threads are done. + wg.Wait() + t.Logf("transactions:%d ignored:%d", opCount, igCount) + close(errCh) + for err := range errCh { + if err != nil { + t.Fatalf("error from inside goroutine: %v", err) + } + } + + db.MustClose() + // I have doubts the DB drop is indented here (as 'versions' is not being reset). + // But I'm preserving for now the original behavior. + db.MustDeleteFile() + db.MustReopen() + } + +} + +type simulateHandler func(tx *bolt.Tx, qdb *QuickDB) + +// Retrieves a key from the database and verifies that it is what is expected. +func simulateGetHandler(tx *bolt.Tx, qdb *QuickDB) { + // Randomly retrieve an existing exist. + keys := qdb.Rand() + if len(keys) == 0 { + return + } + + // Retrieve root bucket. + b := tx.Bucket(keys[0]) + if b == nil { + panic(fmt.Sprintf("bucket[0] expected: %08x\n", trunc(keys[0], 4))) + } + + // Drill into nested buckets. + for _, key := range keys[1 : len(keys)-1] { + b = b.Bucket(key) + if b == nil { + panic(fmt.Sprintf("bucket[n] expected: %v -> %v\n", keys, key)) + } + } + + // Verify key/value on the final bucket. + expected := qdb.Get(keys) + actual := b.Get(keys[len(keys)-1]) + if !bytes.Equal(actual, expected) { + fmt.Println("=== EXPECTED ===") + fmt.Println(expected) + fmt.Println("=== ACTUAL ===") + fmt.Println(actual) + fmt.Println("=== END ===") + panic("value mismatch") + } +} + +// Inserts a key into the database. +func simulatePutHandler(tx *bolt.Tx, qdb *QuickDB) { + var err error + keys, value := randKeys(), randValue() + + // Retrieve root bucket. + b := tx.Bucket(keys[0]) + if b == nil { + b, err = tx.CreateBucket(keys[0]) + if err != nil { + panic("create bucket: " + err.Error()) + } + } + + // Create nested buckets, if necessary. 
// QuickDB is an in-memory database that replicates the functionality of the
// Bolt DB type except that it is entirely in-memory. It is meant for testing
// that the Bolt database is consistent.
//
// Every entry of m is either a nested bucket (map[string]interface{}) or a
// simple value ([]byte). The embedded RWMutex guards m.
type QuickDB struct {
	sync.RWMutex
	m map[string]interface{}
}

// NewQuickDB returns an empty instance of QuickDB.
func NewQuickDB() *QuickDB {
	return &QuickDB{m: make(map[string]interface{})}
}

// Get retrieves the value at a key path.
//
// All keys but the last name nested buckets; the last key names the value.
// It returns nil when the path is empty, when a bucket on the path is missing
// or is a simple value, or when the final entry is not a simple value.
func (db *QuickDB) Get(keys [][]byte) []byte {
	// An empty path addresses nothing; slicing it below would panic.
	if len(keys) == 0 {
		return nil
	}

	db.RLock()
	defer db.RUnlock()

	last := len(keys) - 1
	m := db.m
	for _, key := range keys[:last] {
		child, ok := m[string(key)].(map[string]interface{})
		if !ok {
			// Missing entry, or a simple value where a bucket is needed.
			return nil
		}
		m = child
	}

	// Only return if it's a simple value.
	if value, ok := m[string(keys[last])].([]byte); ok {
		return value
	}
	return nil
}

// Put inserts a value into a key path, creating missing buckets on the way.
//
// It does nothing when the path is empty or when a bucket position on the
// path is already occupied by a simple value.
func (db *QuickDB) Put(keys [][]byte, value []byte) {
	// An empty path addresses nothing; slicing it below would panic.
	if len(keys) == 0 {
		return
	}

	db.Lock()
	defer db.Unlock()

	// Build buckets all the way down the key path.
	last := len(keys) - 1
	m := db.m
	for _, key := range keys[:last] {
		if _, ok := m[string(key)].([]byte); ok {
			return // Keypath intersects with a simple value. Do nothing.
		}

		if m[string(key)] == nil {
			m[string(key)] = make(map[string]interface{})
		}
		m = m[string(key)].(map[string]interface{})
	}

	// Insert value into the last key.
	m[string(keys[last])] = value
}

// Rand returns a random key path that points to a simple value, or nil when
// the database is empty.
func (db *QuickDB) Rand() [][]byte {
	db.RLock()
	defer db.RUnlock()
	if len(db.m) == 0 {
		return nil
	}
	var keys [][]byte
	db.rand(db.m, &keys)
	return keys
}

// rand appends the key of a randomly chosen entry of m to keys and, when that
// entry is a nested bucket, descends into it. m must not be empty.
func (db *QuickDB) rand(m map[string]interface{}, keys *[][]byte) {
	i, index := 0, rand.Intn(len(m))
	for k, v := range m {
		if i == index {
			*keys = append(*keys, []byte(k))
			if v, ok := v.(map[string]interface{}); ok {
				db.rand(v, keys)
			}
			return
		}
		i++
	}
	panic("quickdb rand: out-of-range")
}

// Copy copies the entire database.
func (db *QuickDB) Copy() *QuickDB {
	db.RLock()
	defer db.RUnlock()
	return &QuickDB{m: db.copy(db.m)}
}

// copy clones m recursively. Nested buckets are duplicated; simple values are
// shared between the original and the clone.
func (db *QuickDB) copy(m map[string]interface{}) map[string]interface{} {
	clone := make(map[string]interface{}, len(m))
	for k, v := range m {
		switch v := v.(type) {
		case map[string]interface{}:
			clone[k] = db.copy(v)
		default:
			clone[k] = v
		}
	}
	return clone
}
// Default values.
const (
	// defaultImgSize is the default size, in bytes, for filesystem image.
	defaultImgSize int64 = 10 << 30 // 10 GiB
	// defaultInterval is the default interval for the up time of feature.
	defaultInterval = 2 * time.Minute
)
And the filesystem +// image will be created at $dataStorePath/$flakeyDevice.img. By default, the +// device is available for 2 minutes and size is 10 GiB. +func InitFlakey(flakeyDevice, dataStorePath string, fsType FSType, mkfsOpt string) (_ Flakey, retErr error) { + imgPath := filepath.Join(dataStorePath, fmt.Sprintf("%s.img", flakeyDevice)) + if err := createEmptyFSImage(imgPath, fsType, mkfsOpt); err != nil { + return nil, err + } + defer func() { + if retErr != nil { + os.RemoveAll(imgPath) + } + }() + + loopDevice, err := attachToLoopDevice(imgPath) + if err != nil { + return nil, err + } + defer func() { + if retErr != nil { + _ = detachLoopDevice(loopDevice) + } + }() + + imgSize, err := getBlkSize(loopDevice) + if err != nil { + return nil, err + } + + if err := newFlakeyDevice(flakeyDevice, loopDevice, defaultInterval); err != nil { + return nil, err + } + + return &flakey{ + fsType: fsType, + imgPath: imgPath, + imgSize: imgSize, + + loopDevice: loopDevice, + flakeyDevice: flakeyDevice, + }, nil +} + +type flakey struct { + fsType FSType + imgPath string + imgSize int64 + + loopDevice string + flakeyDevice string +} + +// DevicePath returns the flakey device path. +func (f *flakey) DevicePath() string { + return fmt.Sprintf("/dev/mapper/%s", f.flakeyDevice) +} + +// Filesystem returns filesystem's type. +func (f *flakey) Filesystem() FSType { + return f.fsType +} + +// AllowWrites allows write I/O. +func (f *flakey) AllowWrites(opts ...FeatOpt) error { + var o = defaultFeatCfg + for _, opt := range opts { + opt(&o) + } + + // NOTE: Table parameters + // + // 0 imgSize flakey [ []] + // + // Mandatory parameters: + // + // : Full pathname to the underlying block-device, or a "major:minor" device-number. + // : Starting sector within the device. + // : Number of seconds device is available. + // : Number of seconds device returns errors. 
+ // + // Optional: + // + // If no feature parameters are present, during the periods of unreliability, all I/O returns errors. + // + // For AllowWrites, the device will handle data correctly in `interval` seconds. + // + // REF: https://docs.kernel.org/admin-guide/device-mapper/dm-flakey.html. + table := fmt.Sprintf("0 %d flakey %s 0 %d 0", + f.imgSize, f.loopDevice, int(o.interval.Seconds())) + + return reloadFlakeyDevice(f.flakeyDevice, o.syncFS, table) +} + +// DropWrites drops all write I/O silently. +func (f *flakey) DropWrites(opts ...FeatOpt) error { + var o = defaultFeatCfg + for _, opt := range opts { + opt(&o) + } + + // NOTE: Table parameters + // + // 0 imgSize flakey [ []] + // + // Mandatory parameters: + // + // : Full pathname to the underlying block-device, or a "major:minor" device-number. + // : Starting sector within the device. + // : Number of seconds device is available. + // : Number of seconds device returns errors. + // + // Optional: + // + // : How many arguments (length of ) + // + // For DropWrites, + // + // num_features: 1 (there is only one argument) + // feature_arguments: drop_writes + // + // The Device will drop all the writes into disk in `interval` seconds. + // Read I/O is handled correctly. + // + // For example, the application calls fsync, all the dirty pages will + // be flushed into disk ideally. But during DropWrites, device will + // ignore all the data and return successfully. It can be used to + // simulate data-loss after power failure. + // + // REF: https://docs.kernel.org/admin-guide/device-mapper/dm-flakey.html. + table := fmt.Sprintf("0 %d flakey %s 0 0 %d 1 drop_writes", + f.imgSize, f.loopDevice, int(o.interval.Seconds())) + + return reloadFlakeyDevice(f.flakeyDevice, o.syncFS, table) +} + +// ErrorWrites drops all write I/O and returns error. 
+func (f *flakey) ErrorWrites(opts ...FeatOpt) error { + var o = defaultFeatCfg + for _, opt := range opts { + opt(&o) + } + + // NOTE: Table parameters + // + // 0 imgSize flakey [ []] + // + // Mandatory parameters: + // + // : Full pathname to the underlying block-device, or a "major:minor" device-number. + // : Starting sector within the device. + // : Number of seconds device is available. + // : Number of seconds device returns errors. + // + // Optional: + // + // : How many arguments (length of ) + // + // For ErrorWrites, + // + // num_features: 1 (there is only one argument) + // feature_arguments: error_writes + // + // The Device will drop all the writes into disk in `interval` seconds + // and return failure to caller. Read I/O is handled correctly. + // + // REF: https://docs.kernel.org/admin-guide/device-mapper/dm-flakey.html. + table := fmt.Sprintf("0 %d flakey %s 0 0 %d 1 error_writes", + f.imgSize, f.loopDevice, int(o.interval.Seconds())) + + return reloadFlakeyDevice(f.flakeyDevice, o.syncFS, table) +} + +// Teardown releases the flakey device. +func (f *flakey) Teardown() error { + // FIXME(XXX): Even though we umount device successfully, it's still + // possible to run into `Device or resource busy` issue. It's easy to + // reproduce it in slow storage or 2-4 cores ARM64 host with xfs. We + // should retry it to fix transisent issue. + var derr error + for i := 0; i < 10; i++ { + derr = deleteFlakeyDevice(f.flakeyDevice) + if derr != nil { + if strings.Contains(derr.Error(), "Device or resource busy") { + time.Sleep(1 * time.Second) + continue + } + if strings.Contains(derr.Error(), "No such device or address") { + derr = nil + } + } + break + } + if derr != nil { + return derr + } + + if err := detachLoopDevice(f.loopDevice); err != nil { + if !errors.Is(err, unix.ENXIO) { + return err + } + } + return os.RemoveAll(f.imgPath) +} + +// createEmptyFSImage creates empty filesystem on dataStorePath folder with +// default size - 10 GiB. 
+func createEmptyFSImage(imgPath string, fsType FSType, mkfsOpt string) error { + if err := validateFSType(fsType); err != nil { + return err + } + + mkfs, err := exec.LookPath(fmt.Sprintf("mkfs.%s", fsType)) + if err != nil { + return fmt.Errorf("failed to ensure mkfs.%s: %w", fsType, err) + } + + if _, err := os.Stat(imgPath); err == nil { + return fmt.Errorf("failed to create image because %s already exists", imgPath) + } + + if err := os.MkdirAll(path.Dir(imgPath), 0600); err != nil { + return fmt.Errorf("failed to ensure parent directory %s: %w", path.Dir(imgPath), err) + } + + f, err := os.Create(imgPath) + if err != nil { + return fmt.Errorf("failed to create image %s: %w", imgPath, err) + } + + if err = func() error { + defer f.Close() + + return f.Truncate(defaultImgSize) + }(); err != nil { + return fmt.Errorf("failed to truncate image %s with %v bytes: %w", + imgPath, defaultImgSize, err) + } + + args := []string{imgPath} + if mkfsOpt != "" { + splitArgs := strings.Split(mkfsOpt, " ") + args = append(splitArgs, imgPath) + } + + output, err := exec.Command(mkfs, args...).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to mkfs on %s (%s %v) (out: %s): %w", + imgPath, mkfs, args, string(output), err) + } + return nil +} + +// validateFSType validates the fs type input. 
+func validateFSType(fsType FSType) error { + switch fsType { + case FSTypeEXT4, FSTypeXFS: + return nil + default: + return fmt.Errorf("unsupported filesystem %s", fsType) + } +} diff --git a/tests/dmflakey/dmflakey_test.go b/tests/dmflakey/dmflakey_test.go new file mode 100644 index 0000000..5a7dda7 --- /dev/null +++ b/tests/dmflakey/dmflakey_test.go @@ -0,0 +1,188 @@ +//go:build linux + +package dmflakey + +import ( + "errors" + "flag" + "fmt" + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + testutils "github.com/tutus-one/tutus-bolt/tests/utils" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" +) + +func TestMain(m *testing.M) { + flag.Parse() + testutils.RequiresRoot() + os.Exit(m.Run()) +} + +func TestBasic(t *testing.T) { + for _, fsType := range []FSType{FSTypeEXT4, FSTypeXFS} { + t.Run(string(fsType), func(t *testing.T) { + tmpDir := t.TempDir() + + flakey, err := InitFlakey("go-dmflakey", tmpDir, fsType, "") + require.NoError(t, err, "init flakey") + defer func() { + assert.NoError(t, flakey.Teardown()) + }() + + target := filepath.Join(tmpDir, "root") + require.NoError(t, os.MkdirAll(target, 0600)) + + require.NoError(t, mount(target, flakey.DevicePath(), "")) + defer func() { + assert.NoError(t, unmount(target)) + }() + + file := filepath.Join(target, "test") + assert.NoError(t, writeFile(file, []byte("hello, world"), 0600, true)) + + assert.NoError(t, unmount(target)) + + assert.NoError(t, flakey.Teardown()) + }) + } +} + +func TestDropWritesExt4(t *testing.T) { + flakey, root := initFlakey(t, FSTypeEXT4) + + // commit=1000 is to delay commit triggered by writeback thread + require.NoError(t, mount(root, flakey.DevicePath(), "commit=1000")) + + // ensure testdir/f1 is synced. 
// TestErrorWritesExt4 checks that writes fail with an I/O error while the
// ErrorWrites feature is active on an ext4 filesystem, and that data written
// after AllowWrites survives a remount.
func TestErrorWritesExt4(t *testing.T) {
	flakey, root := initFlakey(t, FSTypeEXT4)

	// commit=1000 is to delay commit triggered by writeback thread
	require.NoError(t, mount(root, flakey.DevicePath(), "commit=1000"))

	// inject IO failure on write
	assert.NoError(t, flakey.ErrorWrites())

	// The write is synced (last argument), so the injected failure is
	// expected to come back to the caller as an error.
	f1 := filepath.Join(root, "f1")
	err := writeFile(f1, []byte("hello, world during failpoint"), 0600, true)
	assert.ErrorContains(t, err, "input/output error")

	// resume
	assert.NoError(t, flakey.AllowWrites())
	err = writeFile(f1, []byte("hello, world"), 0600, true)
	assert.NoError(t, err)

	// Remount so that the content is read back through a fresh mount.
	assert.NoError(t, unmount(root))
	require.NoError(t, mount(root, flakey.DevicePath(), ""))

	data, err := os.ReadFile(f1)
	assert.NoError(t, err)
	assert.Equal(t, "hello, world", string(data))
}
// writeFile writes data to the named file, creating it with permissions perm
// if it does not exist and truncating it otherwise. When sync is true the
// file is fsynced before it is closed.
//
// It returns the first error encountered. That includes an error from Close
// when everything before it succeeded, so a failure that only surfaces on
// close is not reported as success.
func writeFile(name string, data []byte, perm os.FileMode, sync bool) (err error) {
	f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
	if err != nil {
		return err
	}
	defer func() {
		// Keep an earlier write/sync error; otherwise surface the close error.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	if _, err = f.Write(data); err != nil {
		return err
	}

	if sync {
		return f.Sync()
	}
	return nil
}
// reloadFlakeyDevice reloads the flakey device with feature table.
//
// The device is suspended, the new table is loaded and the device is resumed
// again. The resume is attempted even when loading fails; its error is only
// returned when nothing failed before it. When syncFS is false the suspend is
// issued with --nolockfs.
func reloadFlakeyDevice(flakeyDevice string, syncFS bool, table string) (retErr error) {
	suspendArgs := []string{"suspend", flakeyDevice}
	if !syncFS {
		suspendArgs = []string{"suspend", "--nolockfs", flakeyDevice}
	}

	if out, err := exec.Command("dmsetup", suspendArgs...).CombinedOutput(); err != nil {
		return fmt.Errorf("failed to suspend flakey device %s (out: %s): %w",
			flakeyDevice, string(out), err)
	}

	defer func() {
		out, resumeErr := exec.Command("dmsetup", "resume", flakeyDevice).CombinedOutput()
		if resumeErr != nil {
			resumeErr = fmt.Errorf("failed to resume flakey device %s (out: %s): %w",
				flakeyDevice, string(out), resumeErr)
		}

		// Never mask an earlier failure with the resume result.
		if retErr == nil {
			retErr = resumeErr
		}
	}()

	out, err := exec.Command("dmsetup", "load", flakeyDevice, "--table", table).CombinedOutput()
	if err != nil {
		return fmt.Errorf("failed to reload flakey device %s with table (%s) (out: %s): %w",
			flakeyDevice, table, string(out), err)
	}
	return nil
}
+func deleteFlakeyDevice(flakeyDevice string) error { + output, err := exec.Command("dmsetup", "remove", flakeyDevice).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to remove flakey device %s (out: %s): %w", + flakeyDevice, string(output), err) + } + return nil +} + +// getBlkSize64 gets device size in bytes (BLKGETSIZE64). +// +// REF: https://man7.org/linux/man-pages/man8/blockdev.8.html +func getBlkSize64(device string) (int64, error) { + deviceFd, err := os.Open(device) + if err != nil { + return 0, fmt.Errorf("failed to open device %s: %w", device, err) + } + defer deviceFd.Close() + + var size int64 + if _, _, err := unix.Syscall(unix.SYS_IOCTL, deviceFd.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&size))); err != 0 { + return 0, fmt.Errorf("failed to get block size: %w", err) + } + return size, nil +} + +// getBlkSize gets size in 512-byte sectors (BLKGETSIZE64 / 512). +// +// REF: https://man7.org/linux/man-pages/man8/blockdev.8.html +func getBlkSize(device string) (int64, error) { + size, err := getBlkSize64(device) + return size / 512, err +} diff --git a/tests/dmflakey/loopback.go b/tests/dmflakey/loopback.go new file mode 100644 index 0000000..7013363 --- /dev/null +++ b/tests/dmflakey/loopback.go @@ -0,0 +1,91 @@ +//go:build linux + +package dmflakey + +import ( + "errors" + "fmt" + "os" + "time" + + "golang.org/x/sys/unix" +) + +const ( + loopControlDevice = "/dev/loop-control" + loopDevicePattern = "/dev/loop%d" + + maxRetryToAttach = 50 +) + +// attachToLoopDevice associates free loop device with backing file. +// +// There might have race condition. It needs to retry when it runs into EBUSY. 
+// +// REF: https://man7.org/linux/man-pages/man4/loop.4.html +func attachToLoopDevice(backingFile string) (string, error) { + backingFd, err := os.OpenFile(backingFile, os.O_RDWR, 0) + if err != nil { + return "", fmt.Errorf("failed to open loop device's backing file %s: %w", + backingFile, err) + } + defer backingFd.Close() + + for i := 0; i < maxRetryToAttach; i++ { + loop, err := getFreeLoopDevice() + if err != nil { + return "", fmt.Errorf("failed to get free loop device: %w", err) + } + + err = func() error { + loopFd, err := os.OpenFile(loop, os.O_RDWR, 0) + if err != nil { + return err + } + defer loopFd.Close() + + return unix.IoctlSetInt(int(loopFd.Fd()), + unix.LOOP_SET_FD, int(backingFd.Fd())) + }() + if err != nil { + if errors.Is(err, unix.EBUSY) { + time.Sleep(500 * time.Millisecond) + continue + } + return "", err + } + return loop, nil + } + return "", fmt.Errorf("failed to associate free loop device with backing file %s after retry %v", + backingFile, maxRetryToAttach) +} + +// detachLoopDevice disassociates the loop device from any backing file. +// +// REF: https://man7.org/linux/man-pages/man4/loop.4.html +func detachLoopDevice(loopDevice string) error { + loopFd, err := os.Open(loopDevice) + if err != nil { + return fmt.Errorf("failed to open loop %s: %w", loopDevice, err) + } + defer loopFd.Close() + + return unix.IoctlSetInt(int(loopFd.Fd()), unix.LOOP_CLR_FD, 0) +} + +// getFreeLoopDevice allocates or finds a free loop device for use. 
+// +// REF: https://man7.org/linux/man-pages/man4/loop.4.html +func getFreeLoopDevice() (string, error) { + control, err := os.OpenFile(loopControlDevice, os.O_RDWR, 0) + if err != nil { + return "", fmt.Errorf("failed to open %s: %w", loopControlDevice, err) + } + + idx, err := unix.IoctlRetInt(int(control.Fd()), unix.LOOP_CTL_GET_FREE) + control.Close() + if err != nil { + return "", fmt.Errorf("failed to get free loop device number: %w", err) + } + return fmt.Sprintf(loopDevicePattern, idx), nil +} diff --git a/tests/failpoint/db_failpoint_test.go b/tests/failpoint/db_failpoint_test.go new file mode 100644 index 0000000..7890b95 --- /dev/null +++ b/tests/failpoint/db_failpoint_test.go @@ -0,0 +1,368 @@ +package failpoint + +import ( + crand "crypto/rand" + "fmt" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" + gofail "go.etcd.io/gofail/runtime" +) + +func TestFailpoint_MapFail(t *testing.T) { + err := gofail.Enable("mapError", `return("map somehow failed")`) + require.NoError(t, err) + defer func() { + err = gofail.Disable("mapError") + require.NoError(t, err) + }() + + f := filepath.Join(t.TempDir(), "db") + _, err = bolt.Open(f, 0600, nil) + require.Error(t, err) + require.ErrorContains(t, err, "map somehow failed") +} + +// ensures when munmap fails, the flock is unlocked +func TestFailpoint_UnmapFail_DbClose(t *testing.T) { + //unmap error on db close + //we need to open the db first, and then enable the error. + //otherwise the db cannot be opened. 
+ f := filepath.Join(t.TempDir(), "db") + + err := gofail.Enable("unmapError", `return("unmap somehow failed")`) + require.NoError(t, err) + _, err = bolt.Open(f, 0600, nil) + require.Error(t, err) + require.ErrorContains(t, err, "unmap somehow failed") + //disable the error, and try to reopen the db + err = gofail.Disable("unmapError") + require.NoError(t, err) + + db, err := bolt.Open(f, 0600, &bolt.Options{Timeout: 30 * time.Second}) + require.NoError(t, err) + err = db.Close() + require.NoError(t, err) +} + +func TestFailpoint_mLockFail(t *testing.T) { + err := gofail.Enable("mlockError", `return("mlock somehow failed")`) + require.NoError(t, err) + + f := filepath.Join(t.TempDir(), "db") + _, err = bolt.Open(f, 0600, &bolt.Options{Mlock: true}) + require.Error(t, err) + require.ErrorContains(t, err, "mlock somehow failed") + + // It should work after disabling the failpoint. + err = gofail.Disable("mlockError") + require.NoError(t, err) + + _, err = bolt.Open(f, 0600, &bolt.Options{Mlock: true}) + require.NoError(t, err) +} + +func TestFailpoint_mLockFail_When_remap(t *testing.T) { + db := btesting.MustCreateDB(t) + db.Mlock = true + + err := gofail.Enable("mlockError", `return("mlock somehow failed in allocate")`) + require.NoError(t, err) + + err = db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + ) + + require.Error(t, err) + require.ErrorContains(t, err, "mlock somehow failed in allocate") + + // It should work after disabling the failpoint. 
+ err = gofail.Disable("mlockError") + require.NoError(t, err) + db.MustClose() + db.MustReopen() + + err = db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + ) + + require.NoError(t, err) +} + +func TestFailpoint_ResizeFileFail(t *testing.T) { + db := btesting.MustCreateDB(t) + + err := gofail.Enable("resizeFileError", `return("resizeFile somehow failed")`) + require.NoError(t, err) + + err = db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + ) + + require.Error(t, err) + require.ErrorContains(t, err, "resizeFile somehow failed") + + // It should work after disabling the failpoint. + err = gofail.Disable("resizeFileError") + require.NoError(t, err) + db.MustClose() + db.MustReopen() + + err = db.Fill([]byte("data"), 1, 10000, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + ) + + require.NoError(t, err) +} + +func TestFailpoint_LackOfDiskSpace(t *testing.T) { + db := btesting.MustCreateDB(t) + + err := gofail.Enable("lackOfDiskSpace", `return("grow somehow failed")`) + require.NoError(t, err) + + tx, err := db.Begin(true) + require.NoError(t, err) + + err = tx.Commit() + require.Error(t, err) + require.ErrorContains(t, err, "grow somehow failed") + + err = tx.Rollback() + require.Error(t, err) + require.ErrorIs(t, err, errors.ErrTxClosed) + + // It should work after disabling the failpoint. + err = gofail.Disable("lackOfDiskSpace") + require.NoError(t, err) + + tx, err = db.Begin(true) + require.NoError(t, err) + + err = tx.Commit() + require.NoError(t, err) + + err = tx.Rollback() + require.Error(t, err) + require.ErrorIs(t, err, errors.ErrTxClosed) +} + +// TestIssue72 reproduces issue 72. 
+// +// When bbolt is processing a `Put` invocation, the key might be concurrently +// updated by the application which calls the `Put` API (although it shouldn't). +// It might lead to a situation that bbolt use an old key to find a proper +// position to insert the key/value pair, but actually inserts a new key. +// Eventually it might break the rule that all keys should be sorted. In a +// worse case, it might cause page elements to point to already freed pages. +// +// REF: https://github.com/etcd-io/bbolt/issues/72 +func TestIssue72(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + + bucketName := []byte(t.Name()) + err := db.Update(func(tx *bolt.Tx) error { + _, txerr := tx.CreateBucket(bucketName) + return txerr + }) + require.NoError(t, err) + + // The layout is like: + // + // +--+--+--+ + // +------+1 |3 |10+---+ + // | +-++--+--+ | + // | | | + // | | | + // +v-+--+ +v-+--+ +-v+--+--+ + // |1 |2 | |3 |4 | |10|11|12| + // +--+--+ +--+--+ +--+--+--+ + // + err = db.Update(func(tx *bolt.Tx) error { + bk := tx.Bucket(bucketName) + + for _, id := range []int{1, 2, 3, 4, 10, 11, 12} { + if txerr := bk.Put(idToBytes(id), make([]byte, 1000)); txerr != nil { + return txerr + } + } + return nil + }) + require.NoError(t, err) + + require.NoError(t, gofail.Enable("beforeBucketPut", `sleep(5000)`)) + + // +--+--+--+ + // +------+1 |3 |1 +---+ + // | +-++--+--+ | + // | | | + // | | | + // +v-+--+ +v-+--+ +-v+--+--+--+ + // |1 |2 | |3 |4 | |1 |10|11|12| + // +--+--+ +--+--+ +--+--+--+--+ + // + key := idToBytes(13) + updatedKey := idToBytes(1) + err = db.Update(func(tx *bolt.Tx) error { + bk := tx.Bucket(bucketName) + + go func() { + time.Sleep(3 * time.Second) + copy(key, updatedKey) + }() + return bk.Put(key, make([]byte, 100)) + }) + require.NoError(t, err) + + require.NoError(t, gofail.Disable("beforeBucketPut")) + + // bbolt inserts 100 into last branch page. 
Since there are two `1` + // keys in branch, spill operation will update first `1` pointer and + // then last one won't be updated and continues to point to freed page. + // + // + // +--+--+--+ + // +---------------+1 |3 |1 +---------+ + // | +--++-+--+ | + // | | | + // | | | + // | +--+--+ +v-+--+ +-----v-----+ + // | |1 |2 | |3 |4 | |freed page | + // | +--+--+ +--+--+ +-----------+ + // | + // +v-+--+--+--+---+ + // |1 |10|11|12|100| + // +--+--+--+--+---+ + err = db.Update(func(tx *bolt.Tx) error { + return tx.Bucket(bucketName).Put(idToBytes(100), make([]byte, 100)) + }) + require.NoError(t, err) + + defer func() { + if r := recover(); r != nil { + t.Logf("panic info:\n %v", r) + } + }() + + // Add more keys to ensure branch node to spill. + err = db.Update(func(tx *bolt.Tx) error { + bk := tx.Bucket(bucketName) + + for _, id := range []int{101, 102, 103, 104, 105} { + if txerr := bk.Put(idToBytes(id), make([]byte, 1000)); txerr != nil { + return txerr + } + } + return nil + }) + require.NoError(t, err) +} + +func TestTx_Rollback_Freelist(t *testing.T) { + db := btesting.MustCreateDBWithOption(t, &bolt.Options{PageSize: 4096}) + + bucketName := []byte("data") + + t.Log("Populate some data to have at least 5 leaf pages.") + var keys []string + err := db.Update(func(tx *bolt.Tx) error { + b, terr := tx.CreateBucket(bucketName) + if terr != nil { + return terr + } + for i := 0; i <= 10; i++ { + k := fmt.Sprintf("t1_k%02d", i) + keys = append(keys, k) + + v := make([]byte, 1500) + if _, terr := crand.Read(v); terr != nil { + return terr + } + + if terr := b.Put([]byte(k), v); terr != nil { + return terr + } + } + return nil + }) + require.NoError(t, err) + + t.Log("Remove some keys to have at least 3 more free pages.") + err = db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + for i := 0; i < 6; i++ { + if terr := b.Delete([]byte(keys[i])); terr != nil { + return terr + } + } + return nil + }) + require.NoError(t, err) + + t.Log("Close and then 
reopen the db to release all pending free pages.") + db.MustClose() + db.MustReopen() + + t.Log("Enable the `beforeWriteMetaError` failpoint.") + require.NoError(t, gofail.Enable("beforeWriteMetaError", `return("writeMeta somehow failed")`)) + defer func() { + t.Log("Disable the `beforeWriteMetaError` failpoint.") + require.NoError(t, gofail.Disable("beforeWriteMetaError")) + }() + + beforeFreelistPgids, err := readFreelistPageIds(db.Path()) + require.NoError(t, err) + require.Greater(t, len(beforeFreelistPgids), 0) + + t.Log("Simulate TXN rollback") + err = db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketName) + for i := 6; i < len(keys); i++ { + v := make([]byte, 1500) + if _, terr := crand.Read(v); terr != nil { + return terr + } + // update the keys + if terr := b.Put([]byte(keys[i]), v); terr != nil { + return terr + } + } + return nil + }) + require.Error(t, err) + + afterFreelistPgids, err := readFreelistPageIds(db.Path()) + require.NoError(t, err) + + require.Equal(t, beforeFreelistPgids, afterFreelistPgids) +} + +func idToBytes(id int) []byte { + return []byte(fmt.Sprintf("%010d", id)) +} + +func readFreelistPageIds(path string) ([]common.Pgid, error) { + m, _, err := guts_cli.GetActiveMetaPage(path) + if err != nil { + return nil, err + } + + p, _, err := guts_cli.ReadPage(path, uint64(m.Freelist())) + if err != nil { + return nil, err + } + + return p.FreelistPageIds(), nil +} diff --git a/tests/robustness/main_test.go b/tests/robustness/main_test.go new file mode 100644 index 0000000..53122e6 --- /dev/null +++ b/tests/robustness/main_test.go @@ -0,0 +1,17 @@ +//go:build linux + +package robustness + +import ( + "flag" + "os" + "testing" + + testutils "github.com/tutus-one/tutus-bolt/tests/utils" +) + +func TestMain(m *testing.M) { + flag.Parse() + testutils.RequiresRoot() + os.Exit(m.Run()) +} diff --git a/tests/robustness/powerfailure_test.go b/tests/robustness/powerfailure_test.go new file mode 100644 index 0000000..c7794ab --- /dev/null 
+++ b/tests/robustness/powerfailure_test.go @@ -0,0 +1,326 @@ +//go:build linux + +package robustness + +import ( + "bytes" + "crypto/rand" + "fmt" + "io" + "math" + "math/big" + "net/http" + "net/url" + "os" + "os/exec" + "path" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/tutus-one/tutus-bolt/tests/dmflakey" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" +) + +var panicFailpoints = []string{ + "beforeSyncDataPages", + "beforeSyncMetaPage", + "lackOfDiskSpace", + "mapError", + "resizeFileError", + "unmapError", +} + +// TestRestartFromPowerFailureExt4 is to test data after unexpected power failure on ext4. +func TestRestartFromPowerFailureExt4(t *testing.T) { + for _, tc := range []struct { + name string + du time.Duration + fsMountOpt string + useFailpoint bool + }{ + { + name: "fp_ext4_commit5s", + du: 5 * time.Second, + fsMountOpt: "commit=5", + useFailpoint: true, + }, + { + name: "fp_ext4_commit1s", + du: 10 * time.Second, + fsMountOpt: "commit=1", + useFailpoint: true, + }, + { + name: "fp_ext4_commit1000s", + du: 10 * time.Second, + fsMountOpt: "commit=1000", + useFailpoint: true, + }, + { + name: "kill_ext4_commit5s", + du: 5 * time.Second, + fsMountOpt: "commit=5", + }, + { + name: "kill_ext4_commit1s", + du: 10 * time.Second, + fsMountOpt: "commit=1", + }, + { + name: "kill_ext4_commit1000s", + du: 10 * time.Second, + fsMountOpt: "commit=1000", + }, + } { + t.Run(tc.name, func(t *testing.T) { + doPowerFailure(t, tc.du, dmflakey.FSTypeEXT4, "", tc.fsMountOpt, tc.useFailpoint) + }) + } +} + +func TestRestartFromPowerFailureXFS(t *testing.T) { + for _, tc := range []struct { + name string + mkfsOpt string + fsMountOpt string + useFailpoint bool + }{ + { + name: "xfs_no_opts", + mkfsOpt: "", + fsMountOpt: "", + useFailpoint: true, + }, + { + name: "lazy-log", + mkfsOpt: "-l lazy-count=1", + fsMountOpt: "", + useFailpoint: true, + }, + { + name: "odd-allocsize", + mkfsOpt: "", + 
fsMountOpt: "allocsize=" + fmt.Sprintf("%d", 4096*5), + useFailpoint: true, + }, + { + name: "nolargeio", + mkfsOpt: "", + fsMountOpt: "nolargeio", + useFailpoint: true, + }, + { + name: "odd-alignment", + mkfsOpt: "-d sunit=1024,swidth=1024", + fsMountOpt: "noalign", + useFailpoint: true, + }, + { + name: "openshift-sno-options", + mkfsOpt: "-m bigtime=1,finobt=1,rmapbt=0,reflink=1 -i sparse=1 -l lazy-count=1", + // openshift also supplies seclabel,relatime,prjquota on RHEL, but that's not supported on our CI + // prjquota is only unsupported on our ARM runners. + // You can find more information in either the man page with `man xfs` or `man mkfs.xfs`. + // Also refer to https://man7.org/linux/man-pages/man8/mkfs.xfs.8.html. + fsMountOpt: "rw,attr2,inode64,logbufs=8,logbsize=32k", + useFailpoint: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + t.Logf("mkfs opts: %s", tc.mkfsOpt) + t.Logf("mount opts: %s", tc.fsMountOpt) + doPowerFailure(t, 5*time.Second, dmflakey.FSTypeXFS, tc.mkfsOpt, tc.fsMountOpt, tc.useFailpoint) + }) + } +} + +func doPowerFailure(t *testing.T, du time.Duration, fsType dmflakey.FSType, mkfsOpt string, fsMountOpt string, useFailpoint bool) { + flakey := initFlakeyDevice(t, strings.ReplaceAll(t.Name(), "/", "_"), fsType, mkfsOpt, fsMountOpt) + root := flakey.RootFS() + + dbPath := filepath.Join(root, "boltdb") + + args := []string{"bbolt", "bench", + "-work", // keep the database + "-path", dbPath, + "-count=1000000000", + "-batch-size=5", // separate total count into multiple truncation + "-value-size=512", + } + + logPath := filepath.Join(t.TempDir(), fmt.Sprintf("%s.log", t.Name())) + require.NoError(t, os.MkdirAll(path.Dir(logPath), 0600)) + + logFd, err := os.Create(logPath) + require.NoError(t, err) + defer logFd.Close() + + fpURL := "127.0.0.1:12345" + + cmd := exec.Command(args[0], args[1:]...) 
+ cmd.Stdout = logFd + cmd.Stderr = logFd + cmd.Env = append(cmd.Env, "GOFAIL_HTTP="+fpURL) + t.Logf("start %s", strings.Join(args, " ")) + require.NoError(t, cmd.Start(), "args: %v", args) + + errCh := make(chan error, 1) + go func() { + errCh <- cmd.Wait() + }() + + defer func() { + if t.Failed() { + logData, err := os.ReadFile(logPath) + assert.NoError(t, err) + t.Logf("dump log:\n: %s", string(logData)) + } + }() + + time.Sleep(du) + t.Logf("simulate power failure") + + if useFailpoint { + fpURL = "http://" + fpURL + targetFp := panicFailpoints[randomInt(t, math.MaxInt32)%len(panicFailpoints)] + t.Logf("random pick failpoint: %s", targetFp) + activeFailpoint(t, fpURL, targetFp, "panic") + } else { + t.Log("kill bbolt") + assert.NoError(t, cmd.Process.Kill()) + } + + select { + case <-time.After(10 * time.Second): + t.Log("bbolt is supposed to be already stopped, but actually not yet; forcibly kill it") + assert.NoError(t, cmd.Process.Kill()) + case err := <-errCh: + require.Error(t, err) + } + require.NoError(t, flakey.PowerFailure("")) + + st, err := os.Stat(dbPath) + require.NoError(t, err) + t.Logf("db size: %d", st.Size()) + + t.Logf("verify data") + output, err := exec.Command("bbolt", "check", dbPath).CombinedOutput() + require.NoError(t, err, "bbolt check output: %s", string(output)) +} + +// activeFailpoint actives the failpoint by http. +func activeFailpoint(t *testing.T, targetUrl string, fpName, fpVal string) { + u, err := url.JoinPath(targetUrl, fpName) + require.NoError(t, err, "parse url %s", targetUrl) + + req, err := http.NewRequest("PUT", u, bytes.NewBuffer([]byte(fpVal))) + require.NoError(t, err) + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + data, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Equal(t, 204, resp.StatusCode, "response body: %s", string(data)) +} + +// FlakeyDevice extends dmflakey.Flakey interface. 
+type FlakeyDevice interface { + // RootFS returns root filesystem. + RootFS() string + + // PowerFailure simulates power failure with drop all the writes. + PowerFailure(mntOpt string) error + + dmflakey.Flakey +} + +// initFlakeyDevice returns FlakeyDevice instance with a given filesystem. +func initFlakeyDevice(t *testing.T, name string, fsType dmflakey.FSType, mkfsOpt string, mntOpt string) FlakeyDevice { + imgDir := t.TempDir() + + flakey, err := dmflakey.InitFlakey(name, imgDir, fsType, mkfsOpt) + require.NoError(t, err, "init flakey %s", name) + t.Cleanup(func() { + assert.NoError(t, flakey.Teardown()) + }) + + rootDir := t.TempDir() + err = unix.Mount(flakey.DevicePath(), rootDir, string(fsType), 0, mntOpt) + require.NoError(t, err, "init rootfs on %s", rootDir) + + t.Cleanup(func() { assert.NoError(t, unmountAll(rootDir)) }) + + return &flakeyT{ + Flakey: flakey, + + rootDir: rootDir, + mntOpt: mntOpt, + } +} + +type flakeyT struct { + dmflakey.Flakey + + rootDir string + mntOpt string +} + +// RootFS returns root filesystem. +func (f *flakeyT) RootFS() string { + return f.rootDir +} + +// PowerFailure simulates power failure with drop all the writes. 
+func (f *flakeyT) PowerFailure(mntOpt string) error { + if err := f.DropWrites(); err != nil { + return fmt.Errorf("failed to drop_writes: %w", err) + } + + if err := unmountAll(f.rootDir); err != nil { + return fmt.Errorf("failed to unmount rootfs %s: %w", f.rootDir, err) + } + + if mntOpt == "" { + mntOpt = f.mntOpt + } + + if err := f.AllowWrites(); err != nil { + return fmt.Errorf("failed to allow_writes: %w", err) + } + + if err := unix.Mount(f.DevicePath(), f.rootDir, string(f.Filesystem()), 0, mntOpt); err != nil { + return fmt.Errorf("failed to mount rootfs %s (%s): %w", f.rootDir, mntOpt, err) + } + return nil +} + +func unmountAll(target string) error { + for i := 0; i < 50; i++ { + if err := unix.Unmount(target, 0); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(500 * time.Millisecond) + continue + case unix.EINVAL: + return nil + default: + return fmt.Errorf("failed to umount %s: %w", target, err) + } + } + continue + } + return fmt.Errorf("failed to umount %s: %w", target, unix.EBUSY) +} + +func randomInt(t *testing.T, max int) int { + n, err := rand.Int(rand.Reader, big.NewInt(int64(max))) + assert.NoError(t, err) + return int(n.Int64()) +} diff --git a/tests/utils/helpers.go b/tests/utils/helpers.go new file mode 100644 index 0000000..f9c87f6 --- /dev/null +++ b/tests/utils/helpers.go @@ -0,0 +1,26 @@ +package utils + +import ( + "flag" + "fmt" + "os" +) + +var enableRoot bool + +func init() { + flag.BoolVar(&enableRoot, "test.root", false, "enable tests that require root") +} + +// RequiresRoot requires root and the test.root flag has been set. 
+func RequiresRoot() { + if !enableRoot { + fmt.Fprintln(os.Stderr, "Skip tests that require root") + os.Exit(0) + } + + if os.Getuid() != 0 { + fmt.Fprintln(os.Stderr, "This test must be run as root.") + os.Exit(1) + } +} diff --git a/tx.go b/tx.go new file mode 100644 index 0000000..e1f4fb1 --- /dev/null +++ b/tx.go @@ -0,0 +1,858 @@ +package bbolt + +import ( + "errors" + "fmt" + "io" + "os" + "runtime" + "sort" + "strings" + "sync/atomic" + "time" + "unsafe" + + berrors "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// Tx represents a read-only or read/write transaction on the database. +// Read-only transactions can be used for retrieving values for keys and creating cursors. +// Read/write transactions can create and remove buckets and create and remove keys. +// +// IMPORTANT: You must commit or rollback transactions when you are done with +// them. Pages can not be reclaimed by the writer until no more transactions +// are using them. A long running read transaction can cause the database to +// quickly grow. +type Tx struct { + writable bool + managed bool + db *DB + meta *common.Meta + root Bucket + pages map[common.Pgid]*common.Page + stats TxStats + commitHandlers []func() + + // WriteFlag specifies the flag for write-related methods like WriteTo(). + // Tx opens the database file with the specified flag to copy the data. + // + // By default, the flag is unset, which works well for mostly in-memory + // workloads. For databases that are much larger than available RAM, + // set the flag to syscall.O_DIRECT to avoid trashing the page cache. + WriteFlag int +} + +// init initializes the transaction. +func (tx *Tx) init(db *DB) { + tx.db = db + tx.pages = nil + + // Copy the meta page since it can be changed by the writer. + tx.meta = &common.Meta{} + db.meta().Copy(tx.meta) + + // Copy over the root bucket. 
+ tx.root = newBucket(tx) + tx.root.InBucket = &common.InBucket{} + *tx.root.InBucket = *(tx.meta.RootBucket()) + + // Increment the transaction id and add a page cache for writable transactions. + if tx.writable { + tx.pages = make(map[common.Pgid]*common.Page) + tx.meta.IncTxid() + } +} + +// ID returns the transaction id. +func (tx *Tx) ID() int { + if tx == nil || tx.meta == nil { + return -1 + } + return int(tx.meta.Txid()) +} + +// DB returns a reference to the database that created the transaction. +func (tx *Tx) DB() *DB { + return tx.db +} + +// Size returns current database size in bytes as seen by this transaction. +func (tx *Tx) Size() int64 { + return int64(tx.meta.Pgid()) * int64(tx.db.pageSize) +} + +// Writable returns whether the transaction can perform write operations. +func (tx *Tx) Writable() bool { + return tx.writable +} + +// Cursor creates a cursor associated with the root bucket. +// All items in the cursor will return a nil value because all root bucket keys point to buckets. +// The cursor is only valid as long as the transaction is open. +// Do not use a cursor after the transaction is closed. +func (tx *Tx) Cursor() *Cursor { + return tx.root.Cursor() +} + +// Stats retrieves a copy of the current transaction statistics. +func (tx *Tx) Stats() TxStats { + return tx.stats +} + +// Inspect returns the structure of the database. +func (tx *Tx) Inspect() BucketStructure { + return tx.root.Inspect() +} + +// Bucket retrieves a bucket by name. +// Returns nil if the bucket does not exist. +// The bucket instance is only valid for the lifetime of the transaction. +func (tx *Tx) Bucket(name []byte) *Bucket { + return tx.root.Bucket(name) +} + +// CreateBucket creates a new bucket. +// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. 
+func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) { + return tx.root.CreateBucket(name) +} + +// CreateBucketIfNotExists creates a new bucket if it doesn't already exist. +// Returns an error if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. +func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) { + return tx.root.CreateBucketIfNotExists(name) +} + +// DeleteBucket deletes a bucket. +// Returns an error if the bucket cannot be found or if the key represents a non-bucket value. +func (tx *Tx) DeleteBucket(name []byte) error { + return tx.root.DeleteBucket(name) +} + +// MoveBucket moves a sub-bucket from the source bucket to the destination bucket. +// Returns an error if +// 1. the sub-bucket cannot be found in the source bucket; +// 2. or the key already exists in the destination bucket; +// 3. the key represents a non-bucket value. +// +// If src is nil, it means moving a top level bucket into the target bucket. +// If dst is nil, it means converting the child bucket into a top level bucket. +func (tx *Tx) MoveBucket(child []byte, src *Bucket, dst *Bucket) error { + if src == nil { + src = &tx.root + } + if dst == nil { + dst = &tx.root + } + return src.MoveBucket(child, dst) +} + +// ForEach executes a function for each bucket in the root. +// If the provided function returns an error then the iteration is stopped and +// the error is returned to the caller. +func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error { + return tx.root.ForEach(func(k, v []byte) error { + return fn(k, tx.root.Bucket(k)) + }) +} + +// OnCommit adds a handler function to be executed after the transaction successfully commits. +func (tx *Tx) OnCommit(fn func()) { + tx.commitHandlers = append(tx.commitHandlers, fn) +} + +// Commit writes all changes to disk, updates the meta page and closes the transaction. 
+// Returns an error if a disk write error occurs, or if Commit is +// called on a read-only transaction. +func (tx *Tx) Commit() (err error) { + txId := tx.ID() + lg := tx.db.Logger() + if lg != discardLogger { + lg.Debugf("Committing transaction %d", txId) + defer func() { + if err != nil { + lg.Errorf("Committing transaction failed: %v", err) + } else { + lg.Debugf("Committing transaction %d successfully", txId) + } + }() + } + + common.Assert(!tx.managed, "managed tx commit not allowed") + if tx.db == nil { + return berrors.ErrTxClosed + } else if !tx.writable { + return berrors.ErrTxNotWritable + } + + // TODO(benbjohnson): Use vectorized I/O to write out dirty pages. + + // Rebalance nodes which have had deletions. + var startTime = time.Now() + tx.root.rebalance() + if tx.stats.GetRebalance() > 0 { + tx.stats.IncRebalanceTime(time.Since(startTime)) + } + + opgid := tx.meta.Pgid() + + // spill data onto dirty pages. + startTime = time.Now() + if err = tx.root.spill(); err != nil { + lg.Errorf("spilling data onto dirty pages failed: %v", err) + tx.rollback() + return err + } + tx.stats.IncSpillTime(time.Since(startTime)) + + // Free the old root bucket. + tx.meta.RootBucket().SetRootPage(tx.root.RootPage()) + + // Free the old freelist because commit writes out a fresh freelist. + if tx.meta.Freelist() != common.PgidNoFreelist { + tx.db.freelist.Free(tx.meta.Txid(), tx.db.page(tx.meta.Freelist())) + } + + if !tx.db.NoFreelistSync { + err = tx.commitFreelist() + if err != nil { + lg.Errorf("committing freelist failed: %v", err) + return err + } + } else { + tx.meta.SetFreelist(common.PgidNoFreelist) + } + + // If the high water mark has moved up then attempt to grow the database. 
+ if tx.meta.Pgid() > opgid { + _ = errors.New("") + // gofail: var lackOfDiskSpace string + // tx.rollback() + // return errors.New(lackOfDiskSpace) + if err = tx.db.grow(int(tx.meta.Pgid()+1) * tx.db.pageSize); err != nil { + lg.Errorf("growing db size failed, pgid: %d, pagesize: %d, error: %v", tx.meta.Pgid(), tx.db.pageSize, err) + tx.rollback() + return err + } + } + + // Write dirty pages to disk. + startTime = time.Now() + if err = tx.write(); err != nil { + lg.Errorf("writing data failed: %v", err) + tx.rollback() + return err + } + + // If strict mode is enabled then perform a consistency check. + if tx.db.StrictMode { + ch := tx.Check() + var errs []string + for { + chkErr, ok := <-ch + if !ok { + break + } + errs = append(errs, chkErr.Error()) + } + if len(errs) > 0 { + panic("check fail: " + strings.Join(errs, "\n")) + } + } + + // Write meta to disk. + if err = tx.writeMeta(); err != nil { + lg.Errorf("writeMeta failed: %v", err) + tx.rollback() + return err + } + tx.stats.IncWriteTime(time.Since(startTime)) + + // Finalize the transaction. + tx.close() + + // Execute commit handlers now that the locks have been removed. + for _, fn := range tx.commitHandlers { + fn() + } + + return nil +} + +func (tx *Tx) commitFreelist() error { + // Allocate new pages for the new free list. This will overestimate + // the size of the freelist but not underestimate the size (which would be bad). + p, err := tx.allocate((tx.db.freelist.EstimatedWritePageSize() / tx.db.pageSize) + 1) + if err != nil { + tx.rollback() + return err + } + + tx.db.freelist.Write(p) + tx.meta.SetFreelist(p.Id()) + + return nil +} + +// Rollback closes the transaction and ignores all previous updates. Read-only +// transactions must be rolled back and not committed. 
+func (tx *Tx) Rollback() error { + common.Assert(!tx.managed, "managed tx rollback not allowed") + if tx.db == nil { + return berrors.ErrTxClosed + } + tx.nonPhysicalRollback() + return nil +} + +// nonPhysicalRollback is called when user calls Rollback directly, in this case we do not need to reload the free pages from disk. +func (tx *Tx) nonPhysicalRollback() { + if tx.db == nil { + return + } + if tx.writable { + tx.db.freelist.Rollback(tx.meta.Txid()) + } + tx.close() +} + +// rollback needs to reload the free pages from disk in case some system error happens like fsync error. +func (tx *Tx) rollback() { + if tx.db == nil { + return + } + if tx.writable { + tx.db.freelist.Rollback(tx.meta.Txid()) + // When mmap fails, the `data`, `dataref` and `datasz` may be reset to + // zero values, and there is no way to reload free page IDs in this case. + if tx.db.data != nil { + if !tx.db.hasSyncedFreelist() { + // Reconstruct free page list by scanning the DB to get the whole free page list. + // Note: scanning the whole db is heavy if your db size is large in NoSyncFreeList mode. + tx.db.freelist.NoSyncReload(tx.db.freepages()) + } else { + // Read free page list from freelist page. + tx.db.freelist.Reload(tx.db.page(tx.db.meta().Freelist())) + } + } + } + tx.close() +} + +func (tx *Tx) close() { + if tx.db == nil { + return + } + if tx.writable { + // Grab freelist stats. + var freelistFreeN = tx.db.freelist.FreeCount() + var freelistPendingN = tx.db.freelist.PendingCount() + var freelistAlloc = tx.db.freelist.EstimatedWritePageSize() + + // Remove transaction ref & writer lock. + tx.db.rwtx = nil + tx.db.rwlock.Unlock() + + // Merge statistics. 
+ tx.db.statlock.Lock() + tx.db.stats.FreePageN = freelistFreeN + tx.db.stats.PendingPageN = freelistPendingN + tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize + tx.db.stats.FreelistInuse = freelistAlloc + tx.db.stats.TxStats.add(&tx.stats) + tx.db.statlock.Unlock() + } else { + tx.db.removeTx(tx) + } + + // Clear all references. + tx.db = nil + tx.meta = nil + tx.root = Bucket{tx: tx} + tx.pages = nil +} + +// Copy writes the entire database to a writer. +// This function exists for backwards compatibility. +// +// Deprecated: Use WriteTo() instead. +func (tx *Tx) Copy(w io.Writer) error { + _, err := tx.WriteTo(w) + return err +} + +// WriteTo writes the entire database to a writer. +// If err == nil then exactly tx.Size() bytes will be written into the writer. +func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { + // Attempt to open reader with WriteFlag + f, err := tx.db.openFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0) + if err != nil { + return 0, err + } + defer func() { + if cerr := f.Close(); err == nil { + err = cerr + } + }() + + // Generate a meta page. We use the same page data for both meta pages. + buf := make([]byte, tx.db.pageSize) + page := (*common.Page)(unsafe.Pointer(&buf[0])) + page.SetFlags(common.MetaPageFlag) + *page.Meta() = *tx.meta + + // Write meta 0. + page.SetId(0) + page.Meta().SetChecksum(page.Meta().Sum64()) + nn, err := w.Write(buf) + n += int64(nn) + if err != nil { + return n, fmt.Errorf("meta 0 copy: %s", err) + } + + // Write meta 1 with a lower transaction id. + page.SetId(1) + page.Meta().DecTxid() + page.Meta().SetChecksum(page.Meta().Sum64()) + nn, err = w.Write(buf) + n += int64(nn) + if err != nil { + return n, fmt.Errorf("meta 1 copy: %s", err) + } + + // Move past the meta pages in the file. + if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil { + return n, fmt.Errorf("seek: %s", err) + } + + // Copy data pages. 
+ wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2)) + n += wn + if err != nil { + return n, err + } + + return n, nil +} + +// CopyFile copies the entire database to file at the given path. +// A reader transaction is maintained during the copy so it is safe to continue +// using the database while a copy is in progress. +func (tx *Tx) CopyFile(path string, mode os.FileMode) error { + f, err := tx.db.openFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode) + if err != nil { + return err + } + + _, err = tx.WriteTo(f) + if err != nil { + _ = f.Close() + return err + } + return f.Close() +} + +// allocate returns a contiguous block of memory starting at a given page. +func (tx *Tx) allocate(count int) (*common.Page, error) { + lg := tx.db.Logger() + p, err := tx.db.allocate(tx.meta.Txid(), count) + if err != nil { + lg.Errorf("allocating failed, txid: %d, count: %d, error: %v", tx.meta.Txid(), count, err) + return nil, err + } + + // Save to our page cache. + tx.pages[p.Id()] = p + + // Update statistics. + tx.stats.IncPageCount(int64(count)) + tx.stats.IncPageAlloc(int64(count * tx.db.pageSize)) + + return p, nil +} + +// write writes any dirty pages to disk. +func (tx *Tx) write() error { + // Sort pages by id. + lg := tx.db.Logger() + pages := make(common.Pages, 0, len(tx.pages)) + for _, p := range tx.pages { + pages = append(pages, p) + } + // Clear out page cache early. + tx.pages = make(map[common.Pgid]*common.Page) + sort.Sort(pages) + + // Write pages to disk in order. + for _, p := range pages { + rem := (uint64(p.Overflow()) + 1) * uint64(tx.db.pageSize) + offset := int64(p.Id()) * int64(tx.db.pageSize) + var written uintptr + + // Write out page in "max allocation" sized chunks. 
+ for { + sz := rem + if sz > maxAllocSize-1 { + sz = maxAllocSize - 1 + } + buf := common.UnsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz)) + + if _, err := tx.db.ops.writeAt(buf, offset); err != nil { + lg.Errorf("writeAt failed, offset: %d: %w", offset, err) + return err + } + + // Update statistics. + tx.stats.IncWrite(1) + + // Exit inner for loop if we've written all the chunks. + rem -= sz + if rem == 0 { + break + } + + // Otherwise move offset forward and move pointer to next chunk. + offset += int64(sz) + written += uintptr(sz) + } + } + + // Ignore file sync if flag is set on DB. + if !tx.db.NoSync || common.IgnoreNoSync { + // gofail: var beforeSyncDataPages struct{} + if err := fdatasync(tx.db); err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] fdatasync failed: %w", runtime.GOOS, runtime.GOARCH, err) + return err + } + } + + // Put small pages back to page pool. + for _, p := range pages { + // Ignore page sizes over 1 page. + // These are allocated using make() instead of the page pool. + if int(p.Overflow()) != 0 { + continue + } + + buf := common.UnsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize) + + // See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1 + for i := range buf { + buf[i] = 0 + } + tx.db.pagePool.Put(buf) //nolint:staticcheck + } + + return nil +} + +// writeMeta writes the meta to the disk. +func (tx *Tx) writeMeta() error { + // gofail: var beforeWriteMetaError string + // return errors.New(beforeWriteMetaError) + + // Create a temporary buffer for the meta page. + lg := tx.db.Logger() + buf := make([]byte, tx.db.pageSize) + p := tx.db.pageInBuffer(buf, 0) + tx.meta.Write(p) + + // Write the meta page to file. 
+ if _, err := tx.db.ops.writeAt(buf, int64(p.Id())*int64(tx.db.pageSize)); err != nil { + lg.Errorf("writeAt failed, pgid: %d, pageSize: %d, error: %v", p.Id(), tx.db.pageSize, err) + return err + } + if !tx.db.NoSync || common.IgnoreNoSync { + // gofail: var beforeSyncMetaPage struct{} + if err := fdatasync(tx.db); err != nil { + lg.Errorf("[GOOS: %s, GOARCH: %s] fdatasync failed: %w", runtime.GOOS, runtime.GOARCH, err) + return err + } + } + + // Update statistics. + tx.stats.IncWrite(1) + + return nil +} + +// page returns a reference to the page with a given id. +// If page has been written to then a temporary buffered page is returned. +func (tx *Tx) page(id common.Pgid) *common.Page { + // Check the dirty pages first. + if tx.pages != nil { + if p, ok := tx.pages[id]; ok { + p.FastCheck(id) + return p + } + } + + // Otherwise return directly from the mmap. + p := tx.db.page(id) + p.FastCheck(id) + return p +} + +// forEachPage iterates over every page within a given page and executes a function. +func (tx *Tx) forEachPage(pgidnum common.Pgid, fn func(*common.Page, int, []common.Pgid)) { + stack := make([]common.Pgid, 10) + stack[0] = pgidnum + tx.forEachPageInternal(stack[:1], fn) +} + +func (tx *Tx) forEachPageInternal(pgidstack []common.Pgid, fn func(*common.Page, int, []common.Pgid)) { + p := tx.page(pgidstack[len(pgidstack)-1]) + + // Execute function. + fn(p, len(pgidstack)-1, pgidstack) + + // Recursively loop over children. + if p.IsBranchPage() { + for i := 0; i < int(p.Count()); i++ { + elem := p.BranchPageElement(uint16(i)) + tx.forEachPageInternal(append(pgidstack, elem.Pgid()), fn) + } + } +} + +// Page returns page information for a given page number. +// This is only safe for concurrent use when used by a writable transaction. 
+func (tx *Tx) Page(id int) (*common.PageInfo, error) { + if tx.db == nil { + return nil, berrors.ErrTxClosed + } else if common.Pgid(id) >= tx.meta.Pgid() { + return nil, nil + } + + if tx.db.freelist == nil { + return nil, berrors.ErrFreePagesNotLoaded + } + + // Build the page info. + p := tx.db.page(common.Pgid(id)) + info := &common.PageInfo{ + ID: id, + Count: int(p.Count()), + OverflowCount: int(p.Overflow()), + } + + // Determine the type (or if it's free). + if tx.db.freelist.Freed(common.Pgid(id)) { + info.Type = "free" + } else { + info.Type = p.Typ() + } + + return info, nil +} + +// TxStats represents statistics about the actions performed by the transaction. +type TxStats struct { + // Page statistics. + // + // DEPRECATED: Use GetPageCount() or IncPageCount() + PageCount int64 // number of page allocations + // DEPRECATED: Use GetPageAlloc() or IncPageAlloc() + PageAlloc int64 // total bytes allocated + + // Cursor statistics. + // + // DEPRECATED: Use GetCursorCount() or IncCursorCount() + CursorCount int64 // number of cursors created + + // Node statistics + // + // DEPRECATED: Use GetNodeCount() or IncNodeCount() + NodeCount int64 // number of node allocations + // DEPRECATED: Use GetNodeDeref() or IncNodeDeref() + NodeDeref int64 // number of node dereferences + + // Rebalance statistics. + // + // DEPRECATED: Use GetRebalance() or IncRebalance() + Rebalance int64 // number of node rebalances + // DEPRECATED: Use GetRebalanceTime() or IncRebalanceTime() + RebalanceTime time.Duration // total time spent rebalancing + + // Split/Spill statistics. + // + // DEPRECATED: Use GetSplit() or IncSplit() + Split int64 // number of nodes split + // DEPRECATED: Use GetSpill() or IncSpill() + Spill int64 // number of nodes spilled + // DEPRECATED: Use GetSpillTime() or IncSpillTime() + SpillTime time.Duration // total time spent spilling + + // Write statistics. 
+ // + // DEPRECATED: Use GetWrite() or IncWrite() + Write int64 // number of writes performed + // DEPRECATED: Use GetWriteTime() or IncWriteTime() + WriteTime time.Duration // total time spent writing to disk +} + +func (s *TxStats) add(other *TxStats) { + s.IncPageCount(other.GetPageCount()) + s.IncPageAlloc(other.GetPageAlloc()) + s.IncCursorCount(other.GetCursorCount()) + s.IncNodeCount(other.GetNodeCount()) + s.IncNodeDeref(other.GetNodeDeref()) + s.IncRebalance(other.GetRebalance()) + s.IncRebalanceTime(other.GetRebalanceTime()) + s.IncSplit(other.GetSplit()) + s.IncSpill(other.GetSpill()) + s.IncSpillTime(other.GetSpillTime()) + s.IncWrite(other.GetWrite()) + s.IncWriteTime(other.GetWriteTime()) +} + +// Sub calculates and returns the difference between two sets of transaction stats. +// This is useful when obtaining stats at two different points and time and +// you need the performance counters that occurred within that time span. +func (s *TxStats) Sub(other *TxStats) TxStats { + var diff TxStats + diff.PageCount = s.GetPageCount() - other.GetPageCount() + diff.PageAlloc = s.GetPageAlloc() - other.GetPageAlloc() + diff.CursorCount = s.GetCursorCount() - other.GetCursorCount() + diff.NodeCount = s.GetNodeCount() - other.GetNodeCount() + diff.NodeDeref = s.GetNodeDeref() - other.GetNodeDeref() + diff.Rebalance = s.GetRebalance() - other.GetRebalance() + diff.RebalanceTime = s.GetRebalanceTime() - other.GetRebalanceTime() + diff.Split = s.GetSplit() - other.GetSplit() + diff.Spill = s.GetSpill() - other.GetSpill() + diff.SpillTime = s.GetSpillTime() - other.GetSpillTime() + diff.Write = s.GetWrite() - other.GetWrite() + diff.WriteTime = s.GetWriteTime() - other.GetWriteTime() + return diff +} + +// GetPageCount returns PageCount atomically. +func (s *TxStats) GetPageCount() int64 { + return atomic.LoadInt64(&s.PageCount) +} + +// IncPageCount increases PageCount atomically and returns the new value. 
func (s *TxStats) IncPageCount(delta int64) int64 {
	return atomic.AddInt64(&s.PageCount, delta)
}

// GetPageAlloc atomically loads PageAlloc (total bytes allocated).
func (s *TxStats) GetPageAlloc() int64 {
	return atomic.LoadInt64(&s.PageAlloc)
}

// IncPageAlloc atomically adds delta to PageAlloc and returns the new value.
func (s *TxStats) IncPageAlloc(delta int64) int64 {
	return atomic.AddInt64(&s.PageAlloc, delta)
}

// GetCursorCount atomically loads CursorCount (number of cursors created).
func (s *TxStats) GetCursorCount() int64 {
	return atomic.LoadInt64(&s.CursorCount)
}

// IncCursorCount atomically adds delta to CursorCount and returns the new value.
func (s *TxStats) IncCursorCount(delta int64) int64 {
	return atomic.AddInt64(&s.CursorCount, delta)
}

// GetNodeCount atomically loads NodeCount (number of node allocations).
func (s *TxStats) GetNodeCount() int64 {
	return atomic.LoadInt64(&s.NodeCount)
}

// IncNodeCount atomically adds delta to NodeCount and returns the new value.
func (s *TxStats) IncNodeCount(delta int64) int64 {
	return atomic.AddInt64(&s.NodeCount, delta)
}

// GetNodeDeref atomically loads NodeDeref (number of node dereferences).
func (s *TxStats) GetNodeDeref() int64 {
	return atomic.LoadInt64(&s.NodeDeref)
}

// IncNodeDeref atomically adds delta to NodeDeref and returns the new value.
func (s *TxStats) IncNodeDeref(delta int64) int64 {
	return atomic.AddInt64(&s.NodeDeref, delta)
}

// GetRebalance atomically loads Rebalance (number of node rebalances).
func (s *TxStats) GetRebalance() int64 {
	return atomic.LoadInt64(&s.Rebalance)
}

// IncRebalance atomically adds delta to Rebalance and returns the new value.
func (s *TxStats) IncRebalance(delta int64) int64 {
	return atomic.AddInt64(&s.Rebalance, delta)
}

// GetRebalanceTime atomically loads RebalanceTime (total time spent
// rebalancing) via the atomicLoadDuration helper.
func (s *TxStats) GetRebalanceTime() time.Duration {
	return atomicLoadDuration(&s.RebalanceTime)
}

// IncRebalanceTime increases RebalanceTime atomically and returns the new value.
func (s *TxStats) IncRebalanceTime(delta time.Duration) time.Duration {
	return atomicAddDuration(&s.RebalanceTime, delta)
}

// GetSplit atomically loads Split (number of nodes split).
func (s *TxStats) GetSplit() int64 {
	return atomic.LoadInt64(&s.Split)
}

// IncSplit atomically adds delta to Split and returns the new value.
func (s *TxStats) IncSplit(delta int64) int64 {
	return atomic.AddInt64(&s.Split, delta)
}

// GetSpill atomically loads Spill (number of nodes spilled).
func (s *TxStats) GetSpill() int64 {
	return atomic.LoadInt64(&s.Spill)
}

// IncSpill atomically adds delta to Spill and returns the new value.
func (s *TxStats) IncSpill(delta int64) int64 {
	return atomic.AddInt64(&s.Spill, delta)
}

// GetSpillTime atomically loads SpillTime (total time spent spilling) via the
// atomicLoadDuration helper.
func (s *TxStats) GetSpillTime() time.Duration {
	return atomicLoadDuration(&s.SpillTime)
}

// IncSpillTime atomically adds delta to SpillTime via the atomicAddDuration
// helper and returns the new value.
func (s *TxStats) IncSpillTime(delta time.Duration) time.Duration {
	return atomicAddDuration(&s.SpillTime, delta)
}

// GetWrite atomically loads Write (number of writes performed).
func (s *TxStats) GetWrite() int64 {
	return atomic.LoadInt64(&s.Write)
}

// IncWrite atomically adds delta to Write and returns the new value.
func (s *TxStats) IncWrite(delta int64) int64 {
	return atomic.AddInt64(&s.Write, delta)
}

// GetWriteTime atomically loads WriteTime (total time spent writing to disk)
// via the atomicLoadDuration helper.
func (s *TxStats) GetWriteTime() time.Duration {
	return atomicLoadDuration(&s.WriteTime)
}

// IncWriteTime increases WriteTime atomically and returns the new value.
+func (s *TxStats) IncWriteTime(delta time.Duration) time.Duration { + return atomicAddDuration(&s.WriteTime, delta) +} + +func atomicAddDuration(ptr *time.Duration, du time.Duration) time.Duration { + return time.Duration(atomic.AddInt64((*int64)(unsafe.Pointer(ptr)), int64(du))) +} + +func atomicLoadDuration(ptr *time.Duration) time.Duration { + return time.Duration(atomic.LoadInt64((*int64)(unsafe.Pointer(ptr)))) +} diff --git a/tx_check.go b/tx_check.go new file mode 100644 index 0000000..998c20e --- /dev/null +++ b/tx_check.go @@ -0,0 +1,290 @@ +package bbolt + +import ( + "encoding/hex" + "fmt" + + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// Check performs several consistency checks on the database for this transaction. +// An error is returned if any inconsistency is found. +// +// It can be safely run concurrently on a writable transaction. However, this +// incurs a high cost for large databases and databases with a lot of subbuckets +// because of caching. This overhead can be removed if running on a read-only +// transaction, however, it is not safe to execute other writer transactions at +// the same time. +// +// It also allows users to provide a customized `KVStringer` implementation, +// so that bolt can generate human-readable diagnostic messages. +func (tx *Tx) Check(options ...CheckOption) <-chan error { + chkConfig := checkConfig{ + kvStringer: HexKVStringer(), + } + for _, op := range options { + op(&chkConfig) + } + + ch := make(chan error) + go func() { + // Close the channel to signal completion. + defer close(ch) + tx.check(chkConfig, ch) + }() + return ch +} + +func (tx *Tx) check(cfg checkConfig, ch chan error) { + // Force loading free list if opened in ReadOnly mode. + tx.db.loadFreelist() + + // Check if any pages are double freed. 
+ freed := make(map[common.Pgid]bool) + all := make([]common.Pgid, tx.db.freelist.Count()) + tx.db.freelist.Copyall(all) + for _, id := range all { + if freed[id] { + ch <- fmt.Errorf("page %d: already freed", id) + } + freed[id] = true + } + + // Track every reachable page. + reachable := make(map[common.Pgid]*common.Page) + reachable[0] = tx.page(0) // meta0 + reachable[1] = tx.page(1) // meta1 + if tx.meta.Freelist() != common.PgidNoFreelist { + for i := uint32(0); i <= tx.page(tx.meta.Freelist()).Overflow(); i++ { + reachable[tx.meta.Freelist()+common.Pgid(i)] = tx.page(tx.meta.Freelist()) + } + } + + if cfg.pageId == 0 { + // Check the whole db file, starting from the root bucket and + // recursively check all child buckets. + tx.recursivelyCheckBucket(&tx.root, reachable, freed, cfg.kvStringer, ch) + + // Ensure all pages below high water mark are either reachable or freed. + for i := common.Pgid(0); i < tx.meta.Pgid(); i++ { + _, isReachable := reachable[i] + if !isReachable && !freed[i] { + ch <- fmt.Errorf("page %d: unreachable unfreed", int(i)) + } + } + } else { + // Check the db file starting from a specified pageId. 
+ if cfg.pageId < 2 || cfg.pageId >= uint64(tx.meta.Pgid()) { + ch <- fmt.Errorf("page ID (%d) out of range [%d, %d)", cfg.pageId, 2, tx.meta.Pgid()) + return + } + + tx.recursivelyCheckPage(common.Pgid(cfg.pageId), reachable, freed, cfg.kvStringer, ch) + } +} + +func (tx *Tx) recursivelyCheckPage(pageId common.Pgid, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool, + kvStringer KVStringer, ch chan error) { + tx.checkInvariantProperties(pageId, reachable, freed, kvStringer, ch) + tx.recursivelyCheckBucketInPage(pageId, reachable, freed, kvStringer, ch) +} + +func (tx *Tx) recursivelyCheckBucketInPage(pageId common.Pgid, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool, + kvStringer KVStringer, ch chan error) { + p := tx.page(pageId) + + switch { + case p.IsBranchPage(): + for i := range p.BranchPageElements() { + elem := p.BranchPageElement(uint16(i)) + tx.recursivelyCheckBucketInPage(elem.Pgid(), reachable, freed, kvStringer, ch) + } + case p.IsLeafPage(): + for i := range p.LeafPageElements() { + elem := p.LeafPageElement(uint16(i)) + if elem.IsBucketEntry() { + inBkt := common.NewInBucket(pageId, 0) + tmpBucket := Bucket{ + InBucket: &inBkt, + rootNode: &node{isLeaf: p.IsLeafPage()}, + FillPercent: DefaultFillPercent, + tx: tx, + } + if child := tmpBucket.Bucket(elem.Key()); child != nil { + tx.recursivelyCheckBucket(child, reachable, freed, kvStringer, ch) + } + } + } + default: + ch <- fmt.Errorf("unexpected page type (flags: %x) for pgId:%d", p.Flags(), pageId) + } +} + +func (tx *Tx) recursivelyCheckBucket(b *Bucket, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool, + kvStringer KVStringer, ch chan error) { + // Ignore inline buckets. + if b.RootPage() == 0 { + return + } + + tx.checkInvariantProperties(b.RootPage(), reachable, freed, kvStringer, ch) + + // Check each bucket within this bucket. 
+ _ = b.ForEachBucket(func(k []byte) error { + if child := b.Bucket(k); child != nil { + tx.recursivelyCheckBucket(child, reachable, freed, kvStringer, ch) + } + return nil + }) +} + +func (tx *Tx) checkInvariantProperties(pageId common.Pgid, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool, + kvStringer KVStringer, ch chan error) { + tx.forEachPage(pageId, func(p *common.Page, _ int, stack []common.Pgid) { + verifyPageReachable(p, tx.meta.Pgid(), stack, reachable, freed, ch) + }) + + tx.recursivelyCheckPageKeyOrder(pageId, kvStringer.KeyToString, ch) +} + +func verifyPageReachable(p *common.Page, hwm common.Pgid, stack []common.Pgid, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool, ch chan error) { + if p.Id() > hwm { + ch <- fmt.Errorf("page %d: out of bounds: %d (stack: %v)", int(p.Id()), int(hwm), stack) + } + + // Ensure each page is only referenced once. + for i := common.Pgid(0); i <= common.Pgid(p.Overflow()); i++ { + var id = p.Id() + i + if _, ok := reachable[id]; ok { + ch <- fmt.Errorf("page %d: multiple references (stack: %v)", int(id), stack) + } + reachable[id] = p + } + + // We should only encounter un-freed leaf and branch pages. + if freed[p.Id()] { + ch <- fmt.Errorf("page %d: reachable freed", int(p.Id())) + } else if !p.IsBranchPage() && !p.IsLeafPage() { + ch <- fmt.Errorf("page %d: invalid type: %s (stack: %v)", int(p.Id()), p.Typ(), stack) + } +} + +// recursivelyCheckPageKeyOrder verifies database consistency with respect to b-tree +// key order constraints: +// - keys on pages must be sorted +// - keys on children pages are between 2 consecutive keys on the parent's branch page). 
+func (tx *Tx) recursivelyCheckPageKeyOrder(pgId common.Pgid, keyToString func([]byte) string, ch chan error) { + tx.recursivelyCheckPageKeyOrderInternal(pgId, nil, nil, nil, keyToString, ch) +} + +// recursivelyCheckPageKeyOrderInternal verifies that all keys in the subtree rooted at `pgid` are: +// - >=`minKeyClosed` (can be nil) +// - <`maxKeyOpen` (can be nil) +// - Are in right ordering relationship to their parents. +// `pagesStack` is expected to contain IDs of pages from the tree root to `pgid` for the clean debugging message. +func (tx *Tx) recursivelyCheckPageKeyOrderInternal( + pgId common.Pgid, minKeyClosed, maxKeyOpen []byte, pagesStack []common.Pgid, + keyToString func([]byte) string, ch chan error) (maxKeyInSubtree []byte) { + + p := tx.page(pgId) + pagesStack = append(pagesStack, pgId) + switch { + case p.IsBranchPage(): + // For branch page we navigate ranges of all subpages. + runningMin := minKeyClosed + for i := range p.BranchPageElements() { + elem := p.BranchPageElement(uint16(i)) + verifyKeyOrder(elem.Pgid(), "branch", i, elem.Key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack) + + maxKey := maxKeyOpen + if i < len(p.BranchPageElements())-1 { + maxKey = p.BranchPageElement(uint16(i + 1)).Key() + } + maxKeyInSubtree = tx.recursivelyCheckPageKeyOrderInternal(elem.Pgid(), elem.Key(), maxKey, pagesStack, keyToString, ch) + runningMin = maxKeyInSubtree + } + return maxKeyInSubtree + case p.IsLeafPage(): + runningMin := minKeyClosed + for i := range p.LeafPageElements() { + elem := p.LeafPageElement(uint16(i)) + verifyKeyOrder(pgId, "leaf", i, elem.Key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack) + runningMin = elem.Key() + } + if p.Count() > 0 { + return p.LeafPageElement(p.Count() - 1).Key() + } + default: + ch <- fmt.Errorf("unexpected page type (flags: %x) for pgId:%d", p.Flags(), pgId) + } + return maxKeyInSubtree +} + +/*** + * verifyKeyOrder checks whether an entry with given #index on pgId (pageType: "branch|leaf") that 
has given "key", + * is within range determined by (previousKey..maxKeyOpen) and reports found violations to the channel (ch). + */ +func verifyKeyOrder(pgId common.Pgid, pageType string, index int, key []byte, previousKey []byte, maxKeyOpen []byte, ch chan error, keyToString func([]byte) string, pagesStack []common.Pgid) { + if index == 0 && previousKey != nil && compareKeys(previousKey, key) > 0 { + ch <- fmt.Errorf("the first key[%d]=(hex)%s on %s page(%d) needs to be >= the key in the ancestor (%s). Stack: %v", + index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack) + } + if index > 0 { + cmpRet := compareKeys(previousKey, key) + if cmpRet > 0 { + ch <- fmt.Errorf("key[%d]=(hex)%s on %s page(%d) needs to be > (found <) than previous element (hex)%s. Stack: %v", + index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack) + } + if cmpRet == 0 { + ch <- fmt.Errorf("key[%d]=(hex)%s on %s page(%d) needs to be > (found =) than previous element (hex)%s. Stack: %v", + index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack) + } + } + if maxKeyOpen != nil && compareKeys(key, maxKeyOpen) >= 0 { + ch <- fmt.Errorf("key[%d]=(hex)%s on %s page(%d) needs to be < than key of the next element in ancestor (hex)%s. Pages stack: %v", + index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack) + } +} + +// =========================================================================================== + +type checkConfig struct { + kvStringer KVStringer + pageId uint64 +} + +type CheckOption func(options *checkConfig) + +func WithKVStringer(kvStringer KVStringer) CheckOption { + return func(c *checkConfig) { + c.kvStringer = kvStringer + } +} + +// WithPageId sets a page ID from which the check command starts to check +func WithPageId(pageId uint64) CheckOption { + return func(c *checkConfig) { + c.pageId = pageId + } +} + +// KVStringer allows to prepare human-readable diagnostic messages. 
+type KVStringer interface { + KeyToString([]byte) string + ValueToString([]byte) string +} + +// HexKVStringer serializes both key & value to hex representation. +func HexKVStringer() KVStringer { + return hexKvStringer{} +} + +type hexKvStringer struct{} + +func (hexKvStringer) KeyToString(key []byte) string { + return hex.EncodeToString(key) +} + +func (hexKvStringer) ValueToString(value []byte) string { + return hex.EncodeToString(value) +} diff --git a/tx_check_test.go b/tx_check_test.go new file mode 100644 index 0000000..adffbd4 --- /dev/null +++ b/tx_check_test.go @@ -0,0 +1,166 @@ +package bbolt_test + +import ( + "fmt" + "math/rand" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" + "github.com/tutus-one/tutus-bolt/internal/common" + "github.com/tutus-one/tutus-bolt/internal/guts_cli" +) + +func TestTx_Check_CorruptPage(t *testing.T) { + bucketName := []byte("data") + + t.Log("Creating db file.") + db := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + + // Each page can hold roughly 20 key/values pair, so 100 such + // key/value pairs will consume about 5 leaf pages. 
+ err := db.Fill(bucketName, 1, 100, + func(tx int, k int) []byte { return []byte(fmt.Sprintf("%04d", k)) }, + func(tx int, k int) []byte { return make([]byte, 100) }, + ) + require.NoError(t, err) + + t.Log("Corrupting a random leaf page.") + victimPageId, validPageIds := corruptRandomLeafPageInBucket(t, db.DB, bucketName) + + t.Log("Running consistency check.") + vErr := db.View(func(tx *bbolt.Tx) error { + var cErrs []error + + t.Log("Check corrupted page.") + errChan := tx.Check(bbolt.WithPageId(uint64(victimPageId))) + for cErr := range errChan { + cErrs = append(cErrs, cErr) + } + require.Greater(t, len(cErrs), 0) + + t.Log("Check valid pages.") + cErrs = cErrs[:0] + for _, pgId := range validPageIds { + errChan = tx.Check(bbolt.WithPageId(uint64(pgId))) + for cErr := range errChan { + cErrs = append(cErrs, cErr) + } + require.Equal(t, 0, len(cErrs)) + } + return nil + }) + require.NoError(t, vErr) + t.Log("All check passed") + + // Manually close the db, otherwise the PostTestCleanup will + // check the db again and accordingly fail the test. 
+ db.MustClose() +} + +func TestTx_Check_WithNestBucket(t *testing.T) { + parentBucketName := []byte("parentBucket") + + t.Log("Creating db file.") + db := btesting.MustCreateDBWithOption(t, &bbolt.Options{PageSize: 4096}) + + err := db.Update(func(tx *bbolt.Tx) error { + pb, bErr := tx.CreateBucket(parentBucketName) + if bErr != nil { + return bErr + } + + t.Log("put some key/values under the parent bucket directly") + for i := 0; i < 10; i++ { + k, v := fmt.Sprintf("%04d", i), fmt.Sprintf("value_%4d", i) + if pErr := pb.Put([]byte(k), []byte(v)); pErr != nil { + return pErr + } + } + + t.Log("create a nested bucket and put some key/values under the nested bucket") + cb, bErr := pb.CreateBucket([]byte("nestedBucket")) + if bErr != nil { + return bErr + } + + for i := 0; i < 2000; i++ { + k, v := fmt.Sprintf("%04d", i), fmt.Sprintf("value_%4d", i) + if pErr := cb.Put([]byte(k), []byte(v)); pErr != nil { + return pErr + } + } + + return nil + }) + require.NoError(t, err) + + // Get the bucket's root page. + bucketRootPageId := mustGetBucketRootPage(t, db.DB, parentBucketName) + + t.Logf("Running consistency check starting from pageId: %d", bucketRootPageId) + vErr := db.View(func(tx *bbolt.Tx) error { + var cErrs []error + + errChan := tx.Check(bbolt.WithPageId(uint64(bucketRootPageId))) + for cErr := range errChan { + cErrs = append(cErrs, cErr) + } + require.Equal(t, 0, len(cErrs)) + + return nil + }) + require.NoError(t, vErr) + t.Log("All check passed") + + // Manually close the db, otherwise the PostTestCleanup will + // check the db again and accordingly fail the test. + db.MustClose() +} + +// corruptRandomLeafPage corrupts one random leaf page. 
+func corruptRandomLeafPageInBucket(t testing.TB, db *bbolt.DB, bucketName []byte) (victimPageId common.Pgid, validPageIds []common.Pgid) { + bucketRootPageId := mustGetBucketRootPage(t, db, bucketName) + bucketRootPage, _, err := guts_cli.ReadPage(db.Path(), uint64(bucketRootPageId)) + require.NoError(t, err) + require.True(t, bucketRootPage.IsBranchPage()) + + // Retrieve all the leaf pages included in the branch page, and pick up random one from them. + var bucketPageIds []common.Pgid + for _, bpe := range bucketRootPage.BranchPageElements() { + bucketPageIds = append(bucketPageIds, bpe.Pgid()) + } + randomIdx := rand.Intn(len(bucketPageIds)) + victimPageId = bucketPageIds[randomIdx] + validPageIds = append(bucketPageIds[:randomIdx], bucketPageIds[randomIdx+1:]...) + + victimPage, victimBuf, err := guts_cli.ReadPage(db.Path(), uint64(victimPageId)) + require.NoError(t, err) + require.True(t, victimPage.IsLeafPage()) + require.True(t, victimPage.Count() > 1) + + // intentionally make the second key < the first key. + element := victimPage.LeafPageElement(1) + key := element.Key() + key[0] = 0 + + // Write the corrupt page to db file. + err = guts_cli.WritePage(db.Path(), victimBuf) + require.NoError(t, err) + return victimPageId, validPageIds +} + +// mustGetBucketRootPage returns the root page for the provided bucket. 
+func mustGetBucketRootPage(t testing.TB, db *bbolt.DB, bucketName []byte) common.Pgid { + var rootPageId common.Pgid + _ = db.View(func(tx *bbolt.Tx) error { + b := tx.Bucket(bucketName) + require.NotNil(t, b) + rootPageId = b.RootPage() + return nil + }) + + return rootPageId +} diff --git a/tx_stats_test.go b/tx_stats_test.go new file mode 100644 index 0000000..e0cbbd4 --- /dev/null +++ b/tx_stats_test.go @@ -0,0 +1,54 @@ +package bbolt + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestTxStats_add(t *testing.T) { + statsA := TxStats{ + PageCount: 1, + PageAlloc: 2, + CursorCount: 3, + NodeCount: 100, + NodeDeref: 101, + Rebalance: 1000, + RebalanceTime: 1001 * time.Second, + Split: 10000, + Spill: 10001, + SpillTime: 10001 * time.Second, + Write: 100000, + WriteTime: 100001 * time.Second, + } + + statsB := TxStats{ + PageCount: 2, + PageAlloc: 3, + CursorCount: 4, + NodeCount: 101, + NodeDeref: 102, + Rebalance: 1001, + RebalanceTime: 1002 * time.Second, + Split: 11001, + Spill: 11002, + SpillTime: 11002 * time.Second, + Write: 110001, + WriteTime: 110010 * time.Second, + } + + statsB.add(&statsA) + assert.Equal(t, int64(3), statsB.GetPageCount()) + assert.Equal(t, int64(5), statsB.GetPageAlloc()) + assert.Equal(t, int64(7), statsB.GetCursorCount()) + assert.Equal(t, int64(201), statsB.GetNodeCount()) + assert.Equal(t, int64(203), statsB.GetNodeDeref()) + assert.Equal(t, int64(2001), statsB.GetRebalance()) + assert.Equal(t, 2003*time.Second, statsB.GetRebalanceTime()) + assert.Equal(t, int64(21001), statsB.GetSplit()) + assert.Equal(t, int64(21003), statsB.GetSpill()) + assert.Equal(t, 21003*time.Second, statsB.GetSpillTime()) + assert.Equal(t, int64(210001), statsB.GetWrite()) + assert.Equal(t, 210011*time.Second, statsB.GetWriteTime()) +} diff --git a/tx_test.go b/tx_test.go new file mode 100644 index 0000000..4de7340 --- /dev/null +++ b/tx_test.go @@ -0,0 +1,1056 @@ +package bbolt_test + +import ( + "bytes" + "errors" + 
"fmt" + "log" + "os" + "runtime" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + bolt "github.com/tutus-one/tutus-bolt" + berrors "github.com/tutus-one/tutus-bolt/errors" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +// TestTx_Check_ReadOnly tests consistency checking on a ReadOnly database. +func TestTx_Check_ReadOnly(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + if err := db.Close(); err != nil { + t.Fatal(err) + } + + readOnlyDB, err := bolt.Open(db.Path(), 0600, &bolt.Options{ReadOnly: true}) + if err != nil { + t.Fatal(err) + } + defer readOnlyDB.Close() + + tx, err := readOnlyDB.Begin(false) + if err != nil { + t.Fatal(err) + } + // ReadOnly DB will load freelist on Check call. + numChecks := 2 + errc := make(chan error, numChecks) + check := func() { + errc <- <-tx.Check() + } + // Ensure the freelist is not reloaded and does not race. + for i := 0; i < numChecks; i++ { + go check() + } + for i := 0; i < numChecks; i++ { + if err := <-errc; err != nil { + t.Fatal(err) + } + } + // Close the view transaction + err = tx.Rollback() + if err != nil { + t.Fatal(err) + } +} + +// Ensure that committing a closed transaction returns an error. +func TestTx_Commit_ErrTxClosed(t *testing.T) { + db := btesting.MustCreateDB(t) + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + + if _, err := tx.CreateBucket([]byte("foo")); err != nil { + t.Fatal(err) + } + + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + + if err := tx.Commit(); err != berrors.ErrTxClosed { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that rolling back a closed transaction returns an error. 
+func TestTx_Rollback_ErrTxClosed(t *testing.T) { + db := btesting.MustCreateDB(t) + + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + + if err := tx.Rollback(); err != nil { + t.Fatal(err) + } + if err := tx.Rollback(); err != berrors.ErrTxClosed { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that committing a read-only transaction returns an error. +func TestTx_Commit_ErrTxNotWritable(t *testing.T) { + db := btesting.MustCreateDB(t) + tx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + if err := tx.Commit(); err != berrors.ErrTxNotWritable { + t.Fatal(err) + } + // Close the view transaction + err = tx.Rollback() + if err != nil { + t.Fatal(err) + } +} + +// Ensure that a transaction can retrieve a cursor on the root bucket. +func TestTx_Cursor(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + + if _, err := tx.CreateBucket([]byte("woojits")); err != nil { + t.Fatal(err) + } + + c := tx.Cursor() + if k, v := c.First(); !bytes.Equal(k, []byte("widgets")) { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("unexpected value: %v", v) + } + + if k, v := c.Next(); !bytes.Equal(k, []byte("woojits")) { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("unexpected value: %v", v) + } + + if k, v := c.Next(); k != nil { + t.Fatalf("unexpected key: %v", k) + } else if v != nil { + t.Fatalf("unexpected value: %v", k) + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that creating a bucket with a read-only transaction returns an error. 
+func TestTx_CreateBucket_ErrTxNotWritable(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.View(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("foo")) + if err != berrors.ErrTxNotWritable { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that creating a bucket on a closed transaction returns an error. +func TestTx_CreateBucket_ErrTxClosed(t *testing.T) { + db := btesting.MustCreateDB(t) + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + + if _, err := tx.CreateBucket([]byte("foo")); err != berrors.ErrTxClosed { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that a Tx can retrieve a bucket. +func TestTx_Bucket(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a Tx retrieving a non-existent key returns nil. +func TestTx_Get_NotFound(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if b.Get([]byte("no_such_key")) != nil { + t.Fatal("expected nil value") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can be created and retrieved. +func TestTx_CreateBucket(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Create a bucket. 
+ if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } else if b == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Read the bucket through a separate transaction. + if err := db.View(func(tx *bolt.Tx) error { + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can be created if it doesn't already exist. +func TestTx_CreateBucketIfNotExists(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + // Create bucket. + if b, err := tx.CreateBucketIfNotExists([]byte("widgets")); err != nil { + t.Fatal(err) + } else if b == nil { + t.Fatal("expected bucket") + } + + // Create bucket again. + if b, err := tx.CreateBucketIfNotExists([]byte("widgets")); err != nil { + t.Fatal(err) + } else if b == nil { + t.Fatal("expected bucket") + } + + return nil + }); err != nil { + t.Fatal(err) + } + + // Read the bucket through a separate transaction. + if err := db.View(func(tx *bolt.Tx) error { + if tx.Bucket([]byte("widgets")) == nil { + t.Fatal("expected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure transaction returns an error if creating an unnamed bucket. +func TestTx_CreateBucketIfNotExists_ErrBucketNameRequired(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucketIfNotExists([]byte{}); err != berrors.ErrBucketNameRequired { + t.Fatalf("unexpected error: %s", err) + } + + if _, err := tx.CreateBucketIfNotExists(nil); err != berrors.ErrBucketNameRequired { + t.Fatalf("unexpected error: %s", err) + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket cannot be created twice. 
+func TestTx_CreateBucket_ErrBucketExists(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Create a bucket. + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Create the same bucket again. + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket([]byte("widgets")); err != berrors.ErrBucketExists { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket is created with a non-blank name. +func TestTx_CreateBucket_ErrBucketNameRequired(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + if _, err := tx.CreateBucket(nil); err != berrors.ErrBucketNameRequired { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that a bucket can be deleted. +func TestTx_DeleteBucket(t *testing.T) { + db := btesting.MustCreateDB(t) + + // Create a bucket and add a value. + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + // Delete the bucket and make sure we can't get the value. + if err := db.Update(func(tx *bolt.Tx) error { + if err := tx.DeleteBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + if tx.Bucket([]byte("widgets")) != nil { + t.Fatal("unexpected bucket") + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.Update(func(tx *bolt.Tx) error { + // Create the bucket again and make sure there's not a phantom value. 
+ b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if v := b.Get([]byte("foo")); v != nil { + t.Fatalf("unexpected phantom value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that deleting a bucket on a closed transaction returns an error. +func TestTx_DeleteBucket_ErrTxClosed(t *testing.T) { + db := btesting.MustCreateDB(t) + tx, err := db.Begin(true) + if err != nil { + t.Fatal(err) + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + if err := tx.DeleteBucket([]byte("foo")); err != berrors.ErrTxClosed { + t.Fatalf("unexpected error: %s", err) + } +} + +// Ensure that deleting a bucket with a read-only transaction returns an error. +func TestTx_DeleteBucket_ReadOnly(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.View(func(tx *bolt.Tx) error { + if err := tx.DeleteBucket([]byte("foo")); err != berrors.ErrTxNotWritable { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that nothing happens when deleting a bucket that doesn't exist. +func TestTx_DeleteBucket_NotFound(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + if err := tx.DeleteBucket([]byte("widgets")); err != berrors.ErrBucketNotFound { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that no error is returned when a tx.ForEach function does not return +// an error. 
+func TestTx_ForEach_NoError(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + + if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error { + return nil + }); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that an error is returned when a tx.ForEach function returns an error. +func TestTx_ForEach_WithError(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + + marker := errors.New("marker") + if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error { + return marker + }); err != marker { + t.Fatalf("unexpected error: %s", err) + } + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Ensure that Tx commit handlers are called after a transaction successfully commits. +func TestTx_OnCommit(t *testing.T) { + db := btesting.MustCreateDB(t) + + var x int + if err := db.Update(func(tx *bolt.Tx) error { + tx.OnCommit(func() { x += 1 }) + tx.OnCommit(func() { x += 2 }) + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } else if x != 3 { + t.Fatalf("unexpected x: %d", x) + } +} + +// Ensure that Tx commit handlers are NOT called after a transaction rolls back. 
+func TestTx_OnCommit_Rollback(t *testing.T) { + db := btesting.MustCreateDB(t) + + var x int + if err := db.Update(func(tx *bolt.Tx) error { + tx.OnCommit(func() { x += 1 }) + tx.OnCommit(func() { x += 2 }) + if _, err := tx.CreateBucket([]byte("widgets")); err != nil { + t.Fatal(err) + } + return errors.New("rollback this commit") + }); err == nil || err.Error() != "rollback this commit" { + t.Fatalf("unexpected error: %s", err) + } else if x != 0 { + t.Fatalf("unexpected x: %d", x) + } +} + +// Ensure that the database can be copied to a file path. +func TestTx_CopyFile(t *testing.T) { + db := btesting.MustCreateDB(t) + + path := tempfile() + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte("bat")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + return tx.CopyFile(path, 0600) + }); err != nil { + t.Fatal(err) + } + + db2, err := bolt.Open(path, 0600, nil) + if err != nil { + t.Fatal(err) + } + + if err := db2.View(func(tx *bolt.Tx) error { + if v := tx.Bucket([]byte("widgets")).Get([]byte("foo")); !bytes.Equal(v, []byte("bar")) { + t.Fatalf("unexpected value: %v", v) + } + if v := tx.Bucket([]byte("widgets")).Get([]byte("baz")); !bytes.Equal(v, []byte("bat")) { + t.Fatalf("unexpected value: %v", v) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db2.Close(); err != nil { + t.Fatal(err) + } +} + +type failWriterError struct{} + +func (failWriterError) Error() string { + return "error injected for tests" +} + +type failWriter struct { + // fail after this many bytes + After int +} + +func (f *failWriter) Write(p []byte) (n int, err error) { + n = len(p) + if n > f.After { + n = f.After + err = failWriterError{} + } + f.After -= n + return n, err +} + +// Ensure 
that Copy handles write errors right. +func TestTx_CopyFile_Error_Meta(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte("bat")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + return tx.Copy(&failWriter{}) + }); err == nil || err.Error() != "meta 0 copy: error injected for tests" { + t.Fatalf("unexpected error: %v", err) + } +} + +// Ensure that Copy handles write errors right. +func TestTx_CopyFile_Error_Normal(t *testing.T) { + db := btesting.MustCreateDB(t) + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + t.Fatal(err) + } + if err := b.Put([]byte("baz"), []byte("bat")); err != nil { + t.Fatal(err) + } + return nil + }); err != nil { + t.Fatal(err) + } + + if err := db.View(func(tx *bolt.Tx) error { + return tx.Copy(&failWriter{3 * db.Info().PageSize}) + }); err == nil || err.Error() != "error injected for tests" { + t.Fatalf("unexpected error: %v", err) + } +} + +// TestTx_Rollback ensures there is no error when tx rollback whether we sync freelist or not. +func TestTx_Rollback(t *testing.T) { + for _, isSyncFreelist := range []bool{false, true} { + // Open the database. 
+ db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + db.NoFreelistSync = isSyncFreelist + + tx, err := db.Begin(true) + if err != nil { + t.Fatalf("Error starting tx: %v", err) + } + bucket := []byte("mybucket") + if _, err := tx.CreateBucket(bucket); err != nil { + t.Fatalf("Error creating bucket: %v", err) + } + if err := tx.Commit(); err != nil { + t.Fatalf("Error on commit: %v", err) + } + + tx, err = db.Begin(true) + if err != nil { + t.Fatalf("Error starting tx: %v", err) + } + b := tx.Bucket(bucket) + if err := b.Put([]byte("k"), []byte("v")); err != nil { + t.Fatalf("Error on put: %v", err) + } + // Imagine there is an error and tx needs to be rolled-back + if err := tx.Rollback(); err != nil { + t.Fatalf("Error on rollback: %v", err) + } + + tx, err = db.Begin(false) + if err != nil { + t.Fatalf("Error starting tx: %v", err) + } + b = tx.Bucket(bucket) + if v := b.Get([]byte("k")); v != nil { + t.Fatalf("Value for k should not have been stored") + } + if err := tx.Rollback(); err != nil { + t.Fatalf("Error on rollback: %v", err) + } + + } +} + +// TestTx_releaseRange ensures db.freePages handles page releases +// correctly when there are transaction that are no longer reachable +// via any read/write transactions and are "between" ongoing read +// transactions, which requires they must be freed by +// freelist.releaseRange. +func TestTx_releaseRange(t *testing.T) { + // Set initial mmap size well beyond the limit we will hit in this + // test, since we are testing with long running read transactions + // and will deadlock if db.grow is triggered. 
+ db := btesting.MustCreateDBWithOption(t, &bolt.Options{InitialMmapSize: os.Getpagesize() * 100}) + + bucket := "bucket" + + put := func(key, value string) { + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte(bucket)) + if err != nil { + t.Fatal(err) + } + return b.Put([]byte(key), []byte(value)) + }); err != nil { + t.Fatal(err) + } + } + + del := func(key string) { + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte(bucket)) + if err != nil { + t.Fatal(err) + } + return b.Delete([]byte(key)) + }); err != nil { + t.Fatal(err) + } + } + + getWithTxn := func(txn *bolt.Tx, key string) []byte { + return txn.Bucket([]byte(bucket)).Get([]byte(key)) + } + + openReadTxn := func() *bolt.Tx { + readTx, err := db.Begin(false) + if err != nil { + t.Fatal(err) + } + return readTx + } + + checkWithReadTxn := func(txn *bolt.Tx, key string, wantValue []byte) { + value := getWithTxn(txn, key) + if !bytes.Equal(value, wantValue) { + t.Errorf("Wanted value to be %s for key %s, but got %s", wantValue, key, string(value)) + } + } + + rollback := func(txn *bolt.Tx) { + if err := txn.Rollback(); err != nil { + t.Fatal(err) + } + } + + put("k1", "v1") + rtx1 := openReadTxn() + put("k2", "v2") + hold1 := openReadTxn() + put("k3", "v3") + hold2 := openReadTxn() + del("k3") + rtx2 := openReadTxn() + del("k1") + hold3 := openReadTxn() + del("k2") + hold4 := openReadTxn() + put("k4", "v4") + hold5 := openReadTxn() + + // Close the read transactions we established to hold a portion of the pages in pending state. + rollback(hold1) + rollback(hold2) + rollback(hold3) + rollback(hold4) + rollback(hold5) + + // Execute a write transaction to trigger a releaseRange operation in the db + // that will free multiple ranges between the remaining open read transactions, now that the + // holds have been rolled back. + put("k4", "v4") + + // Check that all long running reads still read correct values. 
+ checkWithReadTxn(rtx1, "k1", []byte("v1")) + checkWithReadTxn(rtx2, "k2", []byte("v2")) + rollback(rtx1) + rollback(rtx2) + + // Check that the final state is correct. + rtx7 := openReadTxn() + checkWithReadTxn(rtx7, "k1", nil) + checkWithReadTxn(rtx7, "k2", nil) + checkWithReadTxn(rtx7, "k3", nil) + checkWithReadTxn(rtx7, "k4", []byte("v4")) + rollback(rtx7) +} + +func ExampleTx_Rollback() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Create a bucket. + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }); err != nil { + log.Fatal(err) + } + + // Set a value for a key. + if err := db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + }); err != nil { + log.Fatal(err) + } + + // Update the key but rollback the transaction so it never saves. + tx, err := db.Begin(true) + if err != nil { + log.Fatal(err) + } + b := tx.Bucket([]byte("widgets")) + if err := b.Put([]byte("foo"), []byte("baz")); err != nil { + log.Fatal(err) + } + if err := tx.Rollback(); err != nil { + log.Fatal(err) + } + + // Ensure that our original value is still set. + if err := db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value for 'foo' is still: %s\n", value) + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // The value for 'foo' is still: bar +} + +func ExampleTx_CopyFile() { + // Open the database. + db, err := bolt.Open(tempfile(), 0600, nil) + if err != nil { + log.Fatal(err) + } + defer os.Remove(db.Path()) + + // Create a bucket and a key. 
+ if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + if err != nil { + return err + } + if err := b.Put([]byte("foo"), []byte("bar")); err != nil { + return err + } + return nil + }); err != nil { + log.Fatal(err) + } + + // Copy the database to another file. + toFile := tempfile() + if err := db.View(func(tx *bolt.Tx) error { + return tx.CopyFile(toFile, 0666) + }); err != nil { + log.Fatal(err) + } + defer os.Remove(toFile) + + // Open the cloned database. + db2, err := bolt.Open(toFile, 0600, nil) + if err != nil { + log.Fatal(err) + } + + // Ensure that the key exists in the copy. + if err := db2.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value for 'foo' in the clone is: %s\n", value) + return nil + }); err != nil { + log.Fatal(err) + } + + // Close database to release file lock. + if err := db.Close(); err != nil { + log.Fatal(err) + } + + if err := db2.Close(); err != nil { + log.Fatal(err) + } + + // Output: + // The value for 'foo' in the clone is: bar +} + +func TestTxStats_GetAndIncAtomically(t *testing.T) { + var stats bolt.TxStats + + stats.IncPageCount(1) + assert.Equal(t, int64(1), stats.GetPageCount()) + + stats.IncPageAlloc(2) + assert.Equal(t, int64(2), stats.GetPageAlloc()) + + stats.IncCursorCount(3) + assert.Equal(t, int64(3), stats.GetCursorCount()) + + stats.IncNodeCount(100) + assert.Equal(t, int64(100), stats.GetNodeCount()) + + stats.IncNodeDeref(101) + assert.Equal(t, int64(101), stats.GetNodeDeref()) + + stats.IncRebalance(1000) + assert.Equal(t, int64(1000), stats.GetRebalance()) + + stats.IncRebalanceTime(1001 * time.Second) + assert.Equal(t, 1001*time.Second, stats.GetRebalanceTime()) + + stats.IncSplit(10000) + assert.Equal(t, int64(10000), stats.GetSplit()) + + stats.IncSpill(10001) + assert.Equal(t, int64(10001), stats.GetSpill()) + + stats.IncSpillTime(10001 * time.Second) + assert.Equal(t, 10001*time.Second, 
stats.GetSpillTime()) + + stats.IncWrite(100000) + assert.Equal(t, int64(100000), stats.GetWrite()) + + stats.IncWriteTime(100001 * time.Second) + assert.Equal(t, 100001*time.Second, stats.GetWriteTime()) + + assert.Equal(t, + bolt.TxStats{ + PageCount: 1, + PageAlloc: 2, + CursorCount: 3, + NodeCount: 100, + NodeDeref: 101, + Rebalance: 1000, + RebalanceTime: 1001 * time.Second, + Split: 10000, + Spill: 10001, + SpillTime: 10001 * time.Second, + Write: 100000, + WriteTime: 100001 * time.Second, + }, + stats, + ) +} + +func TestTxStats_Sub(t *testing.T) { + statsA := bolt.TxStats{ + PageCount: 1, + PageAlloc: 2, + CursorCount: 3, + NodeCount: 100, + NodeDeref: 101, + Rebalance: 1000, + RebalanceTime: 1001 * time.Second, + Split: 10000, + Spill: 10001, + SpillTime: 10001 * time.Second, + Write: 100000, + WriteTime: 100001 * time.Second, + } + + statsB := bolt.TxStats{ + PageCount: 2, + PageAlloc: 3, + CursorCount: 4, + NodeCount: 101, + NodeDeref: 102, + Rebalance: 1001, + RebalanceTime: 1002 * time.Second, + Split: 11001, + Spill: 11002, + SpillTime: 11002 * time.Second, + Write: 110001, + WriteTime: 110010 * time.Second, + } + + diff := statsB.Sub(&statsA) + assert.Equal(t, int64(1), diff.GetPageCount()) + assert.Equal(t, int64(1), diff.GetPageAlloc()) + assert.Equal(t, int64(1), diff.GetCursorCount()) + assert.Equal(t, int64(1), diff.GetNodeCount()) + assert.Equal(t, int64(1), diff.GetNodeDeref()) + assert.Equal(t, int64(1), diff.GetRebalance()) + assert.Equal(t, time.Second, diff.GetRebalanceTime()) + assert.Equal(t, int64(1001), diff.GetSplit()) + assert.Equal(t, int64(1001), diff.GetSpill()) + assert.Equal(t, 1001*time.Second, diff.GetSpillTime()) + assert.Equal(t, int64(10001), diff.GetWrite()) + assert.Equal(t, 10009*time.Second, diff.GetWriteTime()) +} + +// TestTx_TruncateBeforeWrite ensures the file is truncated ahead whether we sync freelist or not. 
+func TestTx_TruncateBeforeWrite(t *testing.T) { + if runtime.GOOS == "windows" { + return + } + for _, isSyncFreelist := range []bool{false, true} { + t.Run(fmt.Sprintf("isSyncFreelist:%v", isSyncFreelist), func(t *testing.T) { + // Open the database. + db := btesting.MustCreateDBWithOption(t, &bolt.Options{ + NoFreelistSync: isSyncFreelist, + }) + + bigvalue := make([]byte, db.AllocSize/100) + count := 0 + for { + count++ + tx, err := db.Begin(true) + require.NoError(t, err) + b, err := tx.CreateBucketIfNotExists([]byte("bucket")) + require.NoError(t, err) + err = b.Put([]byte{byte(count)}, bigvalue) + require.NoError(t, err) + err = tx.Commit() + require.NoError(t, err) + + size := fileSize(db.Path()) + + if size > int64(db.AllocSize) && size < int64(db.AllocSize)*2 { + // db.grow expands the file aggresively, that double the size while smaller than db.AllocSize, + // or increase with a step of db.AllocSize if larger, by which we can test if db.grow has run. + t.Fatalf("db.grow doesn't run when file size changes. 
file size: %d", size) + } + if size > int64(db.AllocSize) { + break + } + } + db.MustClose() + db.MustDeleteFile() + }) + } +} diff --git a/unix_test.go b/unix_test.go new file mode 100644 index 0000000..96b5045 --- /dev/null +++ b/unix_test.go @@ -0,0 +1,115 @@ +//go:build !windows + +package bbolt_test + +import ( + "fmt" + "testing" + + "golang.org/x/sys/unix" + + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/btesting" +) + +func TestMlock_DbOpen(t *testing.T) { + // 32KB + skipOnMemlockLimitBelow(t, 32*1024) + + btesting.MustCreateDBWithOption(t, &bolt.Options{Mlock: true}) +} + +// Test change between "empty" (16KB) and "non-empty" db +func TestMlock_DbCanGrow_Small(t *testing.T) { + // 32KB + skipOnMemlockLimitBelow(t, 32*1024) + + db := btesting.MustCreateDBWithOption(t, &bolt.Options{Mlock: true}) + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("bucket")) + if err != nil { + t.Fatal(err) + } + + key := []byte("key") + value := []byte("value") + if err := b.Put(key, value); err != nil { + t.Fatal(err) + } + + return nil + }); err != nil { + t.Fatal(err) + } + +} + +// Test crossing of 16MB (AllocSize) of db size +func TestMlock_DbCanGrow_Big(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode") + } + + // 32MB + skipOnMemlockLimitBelow(t, 32*1024*1024) + + chunksBefore := 64 + chunksAfter := 64 + + db := btesting.MustCreateDBWithOption(t, &bolt.Options{Mlock: true}) + + for chunk := 0; chunk < chunksBefore; chunk++ { + insertChunk(t, db, chunk) + } + dbSize := fileSize(db.Path()) + + for chunk := 0; chunk < chunksAfter; chunk++ { + insertChunk(t, db, chunksBefore+chunk) + } + newDbSize := fileSize(db.Path()) + + if newDbSize <= dbSize { + t.Errorf("db didn't grow: %v <= %v", newDbSize, dbSize) + } +} + +func insertChunk(t *testing.T, db *btesting.DB, chunkId int) { + chunkSize := 1024 + + if err := db.Update(func(tx *bolt.Tx) error { + b, err := 
tx.CreateBucketIfNotExists([]byte("bucket")) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < chunkSize; i++ { + key := []byte(fmt.Sprintf("key-%d-%d", chunkId, i)) + value := []byte("value") + if err := b.Put(key, value); err != nil { + t.Fatal(err) + } + } + + return nil + }); err != nil { + t.Fatal(err) + } +} + +// Main reason for this check is travis limiting mlockable memory to 64KB +// https://github.com/travis-ci/travis-ci/issues/2462 +func skipOnMemlockLimitBelow(t *testing.T, memlockLimitRequest uint64) { + var info unix.Rlimit + if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &info); err != nil { + t.Fatal(err) + } + + if info.Cur < memlockLimitRequest { + t.Skipf( + "skipping as RLIMIT_MEMLOCK is insufficient: %v < %v", + info.Cur, + memlockLimitRequest, + ) + } +} diff --git a/utils_test.go b/utils_test.go new file mode 100644 index 0000000..7159217 --- /dev/null +++ b/utils_test.go @@ -0,0 +1,47 @@ +package bbolt_test + +import ( + bolt "github.com/tutus-one/tutus-bolt" + "github.com/tutus-one/tutus-bolt/internal/common" +) + +// `dumpBucket` dumps all the data, including both key/value data +// and child buckets, from the source bucket into the target db file. 
+func dumpBucket(srcBucketName []byte, srcBucket *bolt.Bucket, dstFilename string) error { + common.Assert(len(srcBucketName) != 0, "source bucket name can't be empty") + common.Assert(srcBucket != nil, "the source bucket can't be nil") + common.Assert(len(dstFilename) != 0, "the target file path can't be empty") + + dstDB, err := bolt.Open(dstFilename, 0600, nil) + if err != nil { + return err + } + defer dstDB.Close() + + return dstDB.Update(func(tx *bolt.Tx) error { + dstBucket, err := tx.CreateBucket(srcBucketName) + if err != nil { + return err + } + return cloneBucket(srcBucket, dstBucket) + }) +} + +func cloneBucket(src *bolt.Bucket, dst *bolt.Bucket) error { + return src.ForEach(func(k, v []byte) error { + if v == nil { + srcChild := src.Bucket(k) + dstChild, err := dst.CreateBucket(k) + if err != nil { + return err + } + if err = dstChild.SetSequence(srcChild.Sequence()); err != nil { + return err + } + + return cloneBucket(srcChild, dstChild) + } + + return dst.Put(k, v) + }) +} diff --git a/version/version.go b/version/version.go new file mode 100644 index 0000000..af945d7 --- /dev/null +++ b/version/version.go @@ -0,0 +1,6 @@ +package version + +var ( + // Version shows the last bbolt binary version released. + Version = "1.4.0-alpha.0" +)