ci: use bencher.dev for continuous benchmarking (#5915)

Currently, we have a homegrown benchmark suite that reports the results
of the iperf runs in CI by comparing a run on `main` with a run on the
current branch.

The resulting PR comments are noisy because they are posted on every
PR, regardless of the performance results. As a result, devs tend to
skim over them rather than act on them. To properly track performance,
we need to record benchmark results over time and use statistics to
detect regressions.

https://bencher.dev does exactly that. It supports various benchmark
harnesses for collecting results automatically. In our case, we simply
use the generic JSON adapter: we extract the relevant metrics from the
iperf results and report them to the Bencher backend.
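
For example, each iperf3 JSON report is reduced to a single throughput
metric in Bencher Metric Format (BMF). A minimal sketch of that
extraction, with the test name hard-coded and the output value made up
for illustration:

```bash
# Sketch of the per-test extraction step; assumes iperf3 wrote its JSON
# report to <test-name>.json. The field path comes from iperf3's TCP
# summary output.
TEST_NAME=direct-tcp-client2server

jq "{ \"$TEST_NAME\": { \"throughput\": { \"value\": .end.sum_received.bits_per_second } } }" \
  "$TEST_NAME.json" > "$TEST_NAME.bmf.json"

# $TEST_NAME.bmf.json then looks roughly like:
# {
#   "direct-tcp-client2server": {
#     "throughput": { "value": 9410000000.0 }
#   }
# }
```

Bencher's generic JSON adapter (`--adapter json`) accepts this shape
directly, so no dedicated benchmark-harness integration is needed.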

With these metrics in place, Bencher can plot the results over time and
alert us to regressions using thresholds based on statistical tests.
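
As an illustration of the idea behind such a threshold (this is not
Bencher's actual implementation, and the numbers are made up), a new
measurement could be flagged when it falls well outside the spread of
recent results:

```bash
# Illustration only: flag a regression if the latest throughput is more
# than two standard deviations below the mean of recent runs. Bencher
# performs the real statistical test server-side; all values are made up.
history="9.40e9 9.55e9 9.48e9 9.51e9 9.39e9"  # recent bits/s (fabricated)
latest="8.70e9"                               # this branch's run (fabricated)

echo "$history $latest" | awk '{
  n = NF - 1
  for (i = 1; i <= n; i++) { sum += $i; sumsq += $i * $i }
  mean = sum / n
  sd   = sqrt(sumsq / n - mean * mean)
  if ($NF < mean - 2 * sd) printf "regression: %.2e < %.2e\n", $NF, mean - 2 * sd
  else                     print "within threshold"
}'
```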

Resolves: #5818.

---------

Signed-off-by: Thomas Eizinger <thomas@eizinger.io>
Co-authored-by: Reactor Scram <ReactorScram@users.noreply.github.com>
Thomas Eizinger authored on 2024-07-24 11:22:17 +10:00, committed by GitHub
parent 50d6b865a1, commit a8aafc9e14
2 changed files with 28 additions and 220 deletions


@@ -82,27 +82,15 @@ jobs:
profile: ${{ inputs.profile || 'debug' }}
stage: ${{ inputs.stage || 'debug' }}
build-base-perf-artifacts:
build-perf-artifacts:
needs: update-release-draft
if: ${{ github.event_name == 'pull_request' }}
uses: ./.github/workflows/_build_artifacts.yml
secrets: inherit
with:
sha: ${{ github.event.pull_request.base.sha }}
image_prefix: 'perf'
profile: 'release'
stage: 'debug'
build-head-perf-artifacts:
needs: update-release-draft
if: ${{ github.event_name == 'pull_request' }}
uses: ./.github/workflows/_build_artifacts.yml
secrets: inherit
with:
sha: ${{ github.sha }}
image_prefix: 'perf'
profile: 'release'
stage: 'debug'
stage: 'debug' # Only the debug images have perf tooling
integration-tests:
uses: ./.github/workflows/_integration_tests.yml
@@ -172,12 +160,8 @@ jobs:
client_tag: ${{ matrix.client.tag }}
perf-tests:
# Only the debug images have perf tooling
if: ${{ github.event_name == 'pull_request' }}
name: perf-tests-${{ matrix.version.prefix }}-${{ matrix.test_name }}
needs:
- build-base-perf-artifacts
- build-head-perf-artifacts
name: perf-tests
needs: build-perf-artifacts
runs-on: ubuntu-22.04
permissions:
contents: read
@@ -185,25 +169,20 @@ jobs:
pull-requests: write
env:
API_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/api'
API_TAG: ${{ matrix.version.sha }}
API_TAG: ${{ github.sha }}
WEB_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/web'
WEB_TAG: ${{ matrix.version.sha }}
WEB_TAG: ${{ github.sha }}
ELIXIR_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/elixir'
ELIXIR_TAG: ${{ matrix.version.sha }}
ELIXIR_TAG: ${{ github.sha }}
GATEWAY_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/gateway'
GATEWAY_TAG: ${{ matrix.version.sha }}
GATEWAY_TAG: ${{ github.sha }}
CLIENT_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/client'
CLIENT_TAG: ${{ matrix.version.sha }}
CLIENT_TAG: ${{ github.sha }}
RELAY_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/relay'
RELAY_TAG: ${{ matrix.version.sha }}
RELAY_TAG: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
version:
- sha: ${{ github.sha }}
prefix: head
- sha: ${{ github.event.pull_request.base.sha }}
prefix: base
test_name:
- direct-tcp-client2server
- direct-tcp-server2client
@@ -215,8 +194,6 @@ jobs:
- relayed-udp-server2client
steps:
- uses: actions/checkout@v4
with:
ref: ${{ matrix.version.sha }}
- uses: ./.github/actions/gcp-docker-login
id: login
with:
@@ -241,13 +218,15 @@ jobs:
timeout-minutes: 5
env:
TEST_NAME: ${{ matrix.test_name }}
run: ./scripts/tests/perf/${{ matrix.test_name }}.sh
run: |
./scripts/tests/perf/${{ matrix.test_name }}.sh
jq '{ "${{ matrix.test_name }}": { "throughput": { "value": .end.sum_received.bits_per_second } } }' ./${{ matrix.test_name }}.json > ./${{ matrix.test_name }}.bmf.json
- name: 'Save performance test results: ${{ matrix.test_name }}'
uses: actions/upload-artifact@v4
with:
overwrite: true
name: ${{ matrix.test_name }}-${{ matrix.version.sha }}-iperf3results
path: ./${{ matrix.test_name }}.json
name: ${{ matrix.test_name }}-${{ github.sha }}-iperf3results
path: ./${{ matrix.test_name }}.bmf.json
- name: Show Client logs
if: "!cancelled()"
run: docker compose logs client
@@ -273,8 +252,7 @@ jobs:
if: "!cancelled()"
run: docker compose logs iperf3
compare-results:
if: github.event_name == 'pull_request'
upload-bencher:
needs: perf-tests
runs-on: ubuntu-22.04
permissions:
@@ -283,31 +261,20 @@ jobs:
pull-requests: write
steps:
- uses: actions/checkout@v4
- name: Download base ref performance test results
uses: actions/download-artifact@v4
with:
pattern: '*-${{ github.event.pull_request.base.sha }}-iperf3results'
merge-multiple: true
path: ./${{ github.event.pull_request.base.sha }}
- name: Download head ref performance test results
- uses: bencherdev/bencher@main
- name: Download performance test results
uses: actions/download-artifact@v4
with:
pattern: '*-${{ github.sha }}-iperf3results'
merge-multiple: true
path: ./${{ github.sha }}
- name: Update PR with results
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const { script } = require('./scripts/tests/perf/results.js');
script(github, context, '${{ github.event.pull_request.base.sha }}', '${{ github.sha }}', [
'direct-tcp-client2server',
'direct-tcp-server2client',
'direct-udp-client2server',
'direct-udp-server2client',
'relayed-tcp-client2server',
'relayed-tcp-server2client',
'relayed-udp-client2server',
'relayed-udp-server2client'
]);
- name: Merge benchmarks results into one report
run: jq -s 'reduce .[] as $item ({}; . * $item)' ./${{ github.sha }}/*.bmf.json > bmf.json
- name: Report results to bencher
run: bencher run --file bmf.json --adapter json --github-actions ${{ secrets.GITHUB_TOKEN }}
env:
BENCHER_PROJECT: firezone-1l75jv1z
BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }}
BENCHER_BRANCH: ${{ github.head_ref || github.ref_name }}
BENCHER_BRANCH_START: ${{ github.base_ref }}
BENCHER_TESTBED: github-actions
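
(For reference: the per-test `*.bmf.json` artifacts are deep-merged into
a single report before being uploaded, roughly as follows; the paths and
values here are illustrative.)

```bash
# Deep-merge all per-test BMF files into one report, as the workflow
# step above does for ./<sha>/*.bmf.json.
jq -s 'reduce .[] as $item ({}; . * $item)' ./*.bmf.json > bmf.json

# bmf.json then contains one entry per test, e.g. (values made up):
# {
#   "direct-tcp-client2server":  { "throughput": { "value": 9.41e9 } },
#   "relayed-udp-server2client": { "throughput": { "value": 7.80e8 } }
# }
#
# `bencher run --file bmf.json --adapter json` ingests this as one run.
```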

scripts/tests/perf/results.js (deleted)

@@ -1,159 +0,0 @@
const fs = require("fs");
const path = require("path");
function getDiffPercents(base, head) {
let diff = -1 * (100 - head / (base / 100));
if (diff > 0) {
return "+" + diff.toFixed(0) + "%";
} else if (diff == 0) {
return "0%";
} else {
return diff.toFixed(0) + "%";
}
}
function humanFileSize(bytes, dp = 1) {
const thresh = 1000;
if (Math.abs(bytes) < thresh) {
return bytes + " B";
}
const units = ["KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"];
let u = -1;
const r = 10 ** dp;
do {
bytes /= thresh;
++u;
} while (
Math.round(Math.abs(bytes) * r) / r >= thresh &&
u < units.length - 1
);
return bytes.toFixed(dp) + " " + units[u];
}
exports.script = async function (
github,
context,
base_sha,
head_sha,
test_names
) {
let output = `### Performance Test Results
`;
let tcp_output = `#### TCP
| Test Name | Received/s | Sent/s | Retransmits |
|---|---|---|---|
`;
let udp_output = `#### UDP
| Test Name | Total/s | Jitter | Lost |
|---|---|---|---|
`;
for (const test_name of test_names) {
// 1. Read the head ref results
const results = JSON.parse(
fs.readFileSync(path.join(head_sha, test_name + ".json"))
).end;
// 2. Read the base ref results
const results_base = JSON.parse(
fs.readFileSync(path.join(base_sha, test_name + ".json"))
).end;
if (test_name.includes("tcp")) {
const tcp_sum_received_bits_per_second =
humanFileSize(results.sum_received.bits_per_second) +
" (" +
getDiffPercents(
results_base.sum_received.bits_per_second,
results.sum_received.bits_per_second
) +
")";
const tcp_sum_sent_bits_per_second =
humanFileSize(results.sum_sent.bits_per_second) +
" (" +
getDiffPercents(
results_base.sum_sent.bits_per_second,
results.sum_sent.bits_per_second
) +
")";
const tcp_sum_sent_retransmits =
results.sum_sent.retransmits +
" (" +
getDiffPercents(
results_base.sum_sent.retransmits,
results.sum_sent.retransmits
) +
")";
tcp_output += `| ${test_name} | ${tcp_sum_received_bits_per_second} | ${tcp_sum_sent_bits_per_second} | ${tcp_sum_sent_retransmits} |\n`;
} else if (test_name.includes("udp")) {
const udp_sum_bits_per_second =
humanFileSize(results.sum.bits_per_second) +
" (" +
getDiffPercents(
results_base.sum.bits_per_second,
results.sum.bits_per_second
) +
")";
const udp_sum_jitter_ms =
results.sum.jitter_ms.toFixed(2) +
"ms (" +
getDiffPercents(results_base.sum.jitter_ms, results.sum.jitter_ms) +
")";
const udp_sum_lost_percent =
results.sum.lost_percent.toFixed(2) +
"% (" +
getDiffPercents(
results_base.sum.lost_percent,
results.sum.lost_percent
) +
")";
udp_output += `| ${test_name} | ${udp_sum_bits_per_second} | ${udp_sum_jitter_ms} | ${udp_sum_lost_percent} |\n`;
} else {
throw new Error("Unknown test type");
}
}
output += tcp_output + "\n" + udp_output;
// Retrieve existing bot comments for the PR
const { data: comments } = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const botComment = comments.find((comment) => {
return (
comment.user.type === "Bot" &&
comment.body.includes("Performance Test Results")
);
});
// 3. Update previous comment or create new one
if (botComment) {
github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: botComment.id,
body: output,
});
} else {
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: output,
});
}
};