From a8aafc9e14d96579d8e2b3bcc925c0ec94cc12a4 Mon Sep 17 00:00:00 2001
From: Thomas Eizinger
Date: Wed, 24 Jul 2024 11:22:17 +1000
Subject: [PATCH] ci: use bencher.dev for continuous benchmarking (#5915)

Currently, we have a homegrown benchmark suite that reports the results
of the iperf runs within CI as a PR comment comparing a run on `main`
with the current branch. These comments are noisy because they happen on
every PR, regardless of the performance results. As a result, they tend
to be skimmed over by devs and not actually considered.

To properly track performance, we need to record benchmark results over
time and use statistics to detect regressions. https://bencher.dev does
exactly that: it supports various benchmark harnesses to collect
benchmark results automatically. In our case, we simply use the generic
JSON adapter to extract the relevant metrics from the iperf results and
report them to the bencher backend.

With these metrics in place, bencher can plot the results over time and
alert us to regressions using thresholds based on statistical tests.

Resolves: #5818.

---------

Signed-off-by: Thomas Eizinger
Co-authored-by: Reactor Scram
---
 .github/workflows/ci.yml      |  89 ++++++-------------
 scripts/tests/perf/results.js | 159 ----------------------------------
 2 files changed, 28 insertions(+), 220 deletions(-)
 delete mode 100644 scripts/tests/perf/results.js
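
Note: the per-test CI step below pipes each iperf3 JSON result through
jq into Bencher Metric Format (BMF): an object keyed by benchmark name,
with a "throughput" measure holding the received bits per second. For a
direct-tcp-client2server run, the generated .bmf.json would look roughly
like this (the value is illustrative):

    {
      "direct-tcp-client2server": {
        "throughput": {
          "value": 243531161.0
        }
      }
    }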

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eb45f9106..1910818c9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -82,27 +82,15 @@ jobs:
       profile: ${{ inputs.profile || 'debug' }}
       stage: ${{ inputs.stage || 'debug' }}
 
-  build-base-perf-artifacts:
+  build-perf-artifacts:
     needs: update-release-draft
-    if: ${{ github.event_name == 'pull_request' }}
-    uses: ./.github/workflows/_build_artifacts.yml
-    secrets: inherit
-    with:
-      sha: ${{ github.event.pull_request.base.sha }}
-      image_prefix: 'perf'
-      profile: 'release'
-      stage: 'debug'
-
-  build-head-perf-artifacts:
-    needs: update-release-draft
-    if: ${{ github.event_name == 'pull_request' }}
     uses: ./.github/workflows/_build_artifacts.yml
     secrets: inherit
     with:
       sha: ${{ github.sha }}
       image_prefix: 'perf'
       profile: 'release'
-      stage: 'debug'
+      stage: 'debug' # Only the debug images have perf tooling
 
   integration-tests:
     uses: ./.github/workflows/_integration_tests.yml
@@ -172,12 +160,8 @@ jobs:
       client_tag: ${{ matrix.client.tag }}
 
   perf-tests:
-    # Only the debug images have perf tooling
-    if: ${{ github.event_name == 'pull_request' }}
-    name: perf-tests-${{ matrix.version.prefix }}-${{ matrix.test_name }}
-    needs:
-      - build-base-perf-artifacts
-      - build-head-perf-artifacts
+    name: perf-tests
+    needs: build-perf-artifacts
     runs-on: ubuntu-22.04
     permissions:
       contents: read
       id-token: write
       pull-requests: write
     env:
       API_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/api'
-      API_TAG: ${{ matrix.version.sha }}
+      API_TAG: ${{ github.sha }}
       WEB_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/web'
-      WEB_TAG: ${{ matrix.version.sha }}
+      WEB_TAG: ${{ github.sha }}
       ELIXIR_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/elixir'
-      ELIXIR_TAG: ${{ matrix.version.sha }}
+      ELIXIR_TAG: ${{ github.sha }}
       GATEWAY_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/gateway'
-      GATEWAY_TAG: ${{ matrix.version.sha }}
+      GATEWAY_TAG: ${{ github.sha }}
       CLIENT_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/client'
-      CLIENT_TAG: ${{ matrix.version.sha }}
+      CLIENT_TAG: ${{ github.sha }}
       RELAY_IMAGE: 'us-east1-docker.pkg.dev/firezone-staging/firezone/perf/relay'
-      RELAY_TAG: ${{ matrix.version.sha }}
+      RELAY_TAG: ${{ github.sha }}
     strategy:
       fail-fast: false
       matrix:
-        version:
-          - sha: ${{ github.sha }}
-            prefix: head
-          - sha: ${{ github.event.pull_request.base.sha }}
-            prefix: base
         test_name:
           - direct-tcp-client2server
           - direct-tcp-server2client
           - direct-udp-client2server
           - direct-udp-server2client
           - relayed-tcp-client2server
           - relayed-tcp-server2client
           - relayed-udp-client2server
           - relayed-udp-server2client
     steps:
       - uses: actions/checkout@v4
-        with:
-          ref: ${{ matrix.version.sha }}
       - uses: ./.github/actions/gcp-docker-login
         id: login
         with:
           project: firezone-staging
         timeout-minutes: 5
         env:
           TEST_NAME: ${{ matrix.test_name }}
-        run: ./scripts/tests/perf/${{ matrix.test_name }}.sh
+        run: |
+          ./scripts/tests/perf/${{ matrix.test_name }}.sh
+          jq '{ "${{ matrix.test_name }}": { "throughput": { "value": .end.sum_received.bits_per_second } } }' ./${{ matrix.test_name }}.json > ./${{ matrix.test_name }}.bmf.json
       - name: 'Save performance test results: ${{ matrix.test_name }}'
         uses: actions/upload-artifact@v4
         with:
           overwrite: true
-          name: ${{ matrix.test_name }}-${{ matrix.version.sha }}-iperf3results
-          path: ./${{ matrix.test_name }}.json
+          name: ${{ matrix.test_name }}-${{ github.sha }}-iperf3results
+          path: ./${{ matrix.test_name }}.bmf.json
       - name: Show Client logs
         if: "!cancelled()"
         run: docker compose logs client
       - name: Show iperf3 logs
         if: "!cancelled()"
         run: docker compose logs iperf3
 
-  compare-results:
-    if: github.event_name == 'pull_request'
+  upload-bencher:
     needs: perf-tests
     runs-on: ubuntu-22.04
     permissions:
       contents: read
       id-token: write
       pull-requests: write
     steps:
       - uses: actions/checkout@v4
-      - name: Download base ref performance test results
-        uses: actions/download-artifact@v4
-        with:
-          pattern: '*-${{ github.event.pull_request.base.sha }}-iperf3results'
-          merge-multiple: true
-          path: ./${{ github.event.pull_request.base.sha }}
-      - name: Download head ref performance test results
+      - uses: bencherdev/bencher@main
+      - name: Download performance test results
         uses: actions/download-artifact@v4
         with:
           pattern: '*-${{ github.sha }}-iperf3results'
           merge-multiple: true
           path: ./${{ github.sha }}
-      - name: Update PR with results
-        uses: actions/github-script@v7
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const { script } = require('./scripts/tests/perf/results.js');
-            script(github, context, '${{ github.event.pull_request.base.sha }}', '${{ github.sha }}', [
-              'direct-tcp-client2server',
-              'direct-tcp-server2client',
-              'direct-udp-client2server',
-              'direct-udp-server2client',
-              'relayed-tcp-client2server',
-              'relayed-tcp-server2client',
-              'relayed-udp-client2server',
-              'relayed-udp-server2client'
-            ]);
+      - name: Merge benchmark results into one report
+        run: jq -s 'reduce .[] as $item ({}; . * $item)' ./${{ github.sha }}/*.bmf.json > bmf.json
+      - name: Report results to bencher
+        run: bencher run --file bmf.json --adapter json --github-actions ${{ secrets.GITHUB_TOKEN }}
+        env:
+          BENCHER_PROJECT: firezone-1l75jv1z
+          BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }}
+          BENCHER_BRANCH: ${{ github.head_ref || github.ref_name }}
+          BENCHER_BRANCH_START: ${{ github.base_ref }}
+          BENCHER_TESTBED: github-actions
diff --git a/scripts/tests/perf/results.js b/scripts/tests/perf/results.js
deleted file mode 100644
index e93b06787..000000000
--- a/scripts/tests/perf/results.js
+++ /dev/null
@@ -1,159 +0,0 @@
-const fs = require("fs");
-const path = require("path");
-
-function getDiffPercents(base, head) {
-  let diff = -1 * (100 - head / (base / 100));
-
-  if (diff > 0) {
-    return "+" + diff.toFixed(0) + "%";
-  } else if (diff == 0) {
-    return "0%";
-  } else {
-    return diff.toFixed(0) + "%";
-  }
-}
-
-function humanFileSize(bytes, dp = 1) {
-  const thresh = 1000;
-
-  if (Math.abs(bytes) < thresh) {
-    return bytes + " B";
-  }
-
-  const units = ["KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"];
-  let u = -1;
-  const r = 10 ** dp;
-
-  do {
-    bytes /= thresh;
-    ++u;
-  } while (
-    Math.round(Math.abs(bytes) * r) / r >= thresh &&
-    u < units.length - 1
-  );
-
-  return bytes.toFixed(dp) + " " + units[u];
-}
-
-exports.script = async function (
-  github,
-  context,
-  base_sha,
-  head_sha,
-  test_names
-) {
-  let output = `### Performance Test Results
-
-  `;
-  let tcp_output = `#### TCP
-
-| Test Name | Received/s | Sent/s | Retransmits |
-|---|---|---|---|
-`;
-
-  let udp_output = `#### UDP
-
-| Test Name | Total/s | Jitter | Lost |
-|---|---|---|---|
-`;
-
-  for (const test_name of test_names) {
-    // 1. Read the head ref results
-    const results = JSON.parse(
-      fs.readFileSync(path.join(head_sha, test_name + ".json"))
-    ).end;
-
-    // 2. Read the base ref results
-    const results_base = JSON.parse(
-      fs.readFileSync(path.join(base_sha, test_name + ".json"))
-    ).end;
-
-    if (test_name.includes("tcp")) {
-      const tcp_sum_received_bits_per_second =
-        humanFileSize(results.sum_received.bits_per_second) +
-        " (" +
-        getDiffPercents(
-          results_base.sum_received.bits_per_second,
-          results.sum_received.bits_per_second
-        ) +
-        ")";
-      const tcp_sum_sent_bits_per_second =
-        humanFileSize(results.sum_sent.bits_per_second) +
-        " (" +
-        getDiffPercents(
-          results_base.sum_sent.bits_per_second,
-          results.sum_sent.bits_per_second
-        ) +
-        ")";
-      const tcp_sum_sent_retransmits =
-        results.sum_sent.retransmits +
-        " (" +
-        getDiffPercents(
-          results_base.sum_sent.retransmits,
-          results.sum_sent.retransmits
-        ) +
-        ")";
-
-      tcp_output += `| ${test_name} | ${tcp_sum_received_bits_per_second} | ${tcp_sum_sent_bits_per_second} | ${tcp_sum_sent_retransmits} |\n`;
-    } else if (test_name.includes("udp")) {
-      const udp_sum_bits_per_second =
-        humanFileSize(results.sum.bits_per_second) +
-        " (" +
-        getDiffPercents(
-          results_base.sum.bits_per_second,
-          results.sum.bits_per_second
-        ) +
-        ")";
-      const udp_sum_jitter_ms =
-        results.sum.jitter_ms.toFixed(2) +
-        "ms (" +
-        getDiffPercents(results_base.sum.jitter_ms, results.sum.jitter_ms) +
-        ")";
-      const udp_sum_lost_percent =
-        results.sum.lost_percent.toFixed(2) +
-        "% (" +
-        getDiffPercents(
-          results_base.sum.lost_percent,
-          results.sum.lost_percent
-        ) +
-        ")";
-
-      udp_output += `| ${test_name} | ${udp_sum_bits_per_second} | ${udp_sum_jitter_ms} | ${udp_sum_lost_percent} |\n`;
-    } else {
-      throw new Error("Unknown test type");
-    }
-  }
-
-  output += tcp_output + "\n" + udp_output;
-
-  // Retrieve existing bot comments for the PR
-  const { data: comments } = await github.rest.issues.listComments({
-    owner: context.repo.owner,
-    repo: context.repo.repo,
-    issue_number: context.issue.number,
-  });
-
-  const botComment = comments.find((comment) => {
-    return (
-      comment.user.type === "Bot" &&
-      comment.body.includes("Performance Test Results")
-    );
-  });
-
-  // 3. Update previous comment or create new one
-  if (botComment) {
-    github.rest.issues.updateComment({
-      owner: context.repo.owner,
-      repo: context.repo.repo,
-      comment_id: botComment.id,
-      body: output,
-    });
-  } else {
-    github.rest.issues.createComment({
-      issue_number: context.issue.number,
-      owner: context.repo.owner,
-      repo: context.repo.repo,
-      body: output,
-    });
-  }
-};
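
Note: the "Merge benchmark results into one report" step above slurps
(jq -s) all downloaded per-test .bmf.json files into an array and
recursively merges them into a single object, which the `bencher run
--adapter json` invocation then ingests as one report. A sketch of the
behaviour with two hypothetical result files:

    $ jq -s 'reduce .[] as $item ({}; . * $item)' a.bmf.json b.bmf.json
    {
      "direct-tcp-client2server": { "throughput": { "value": 243531161.0 } },
      "relayed-udp-server2client": { "throughput": { "value": 52428800.0 } }
    }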