From 746641e523fecca2eaf74d357353ae79f5d4b065 Mon Sep 17 00:00:00 2001
From: Timofei Larkin
Date: Tue, 17 Jun 2025 11:19:32 +0300
Subject: [PATCH] Split testing job into several

This patch separates the Test job of the PR workflow into several
smaller jobs: 1) create a testing sandbox and deploy Talos, 2) install
Cozystack and configure it, 3) install managed applications and run
e2e tests. This lets developers shorten the feedback loop when tests
are merely flaky rather than genuinely broken. It's not the ideal
solution, but it's the 80/20 trade-off.

Signed-off-by: Timofei Larkin
---
 .github/workflows/pull-requests.yaml |  83 +++++++++-
 Makefile                             |   4 +
 hack/e2e-apps.bats                   |  32 +++-
 hack/e2e-install-cozystack.bats      | 157 ++++++++++++++++++
 hack/e2e-prepare-cluster.bats        | 235 +++++++++++++++++++++++++++
 packages/core/testing/Makefile       |   8 +
 6 files changed, 506 insertions(+), 13 deletions(-)
 create mode 100644 hack/e2e-install-cozystack.bats
 create mode 100644 hack/e2e-prepare-cluster.bats

diff --git a/.github/workflows/pull-requests.yaml b/.github/workflows/pull-requests.yaml
index 5210c1fd..e2996516 100644
--- a/.github/workflows/pull-requests.yaml
+++ b/.github/workflows/pull-requests.yaml
@@ -56,8 +56,8 @@ jobs:
           name: talos-image
           path: _out/assets/nocloud-amd64.raw.xz
 
-  test:
-    name: Test
+  prepare_env:
+    name: Prepare environment
     runs-on: [self-hosted]
     needs: build
 
@@ -66,6 +66,12 @@ jobs:
       !contains(github.event.pull_request.labels.*.name, 'release')
 
     steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
       - name: Download installer
         uses: actions/download-artifact@v4
         with:
@@ -78,5 +84,74 @@ jobs:
           name: talos-image
           path: _out/assets/
 
-      - name: Test
-        run: make test
+      - name: Set sandbox ID
+        run: echo "SANDBOX_NAME=cozy-e2e-sandbox-$(echo "${GITHUB_REPOSITORY}:${GITHUB_WORKFLOW}:${GITHUB_REF}" | sha256sum | cut -c1-10)" >> $GITHUB_ENV
+
+      - name: Prepare environment
+        run: make SANDBOX_NAME=$SANDBOX_NAME prepare-env
+
+  install_cozystack:
+    name: Install Cozystack
+    runs-on: [self-hosted]
+    needs: prepare_env
+
+    # Never run when the PR carries the "release" label.
+    if: |
+      !contains(github.event.pull_request.labels.*.name, 'release')
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
+      - name: Set sandbox ID
+        run: echo "SANDBOX_NAME=cozy-e2e-sandbox-$(echo "${GITHUB_REPOSITORY}:${GITHUB_WORKFLOW}:${GITHUB_REF}" | sha256sum | cut -c1-10)" >> $GITHUB_ENV
+
+      - name: Install Cozystack
+        run: make -C packages/core/testing SANDBOX_NAME=$SANDBOX_NAME install-cozystack
+
+  test_apps:
+    name: Test applications
+    runs-on: [self-hosted]
+    needs: install_cozystack
+
+    # Never run when the PR carries the "release" label.
+    if: |
+      !contains(github.event.pull_request.labels.*.name, 'release')
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
+      - name: Set sandbox ID
+        run: echo "SANDBOX_NAME=cozy-e2e-sandbox-$(echo "${GITHUB_REPOSITORY}:${GITHUB_WORKFLOW}:${GITHUB_REF}" | sha256sum | cut -c1-10)" >> $GITHUB_ENV
+
+      - name: E2E Apps
+        run: make -C packages/core/testing SANDBOX_NAME=$SANDBOX_NAME test-apps
+
+  cleanup:
+    name: Tear down environment
+    runs-on: [self-hosted]
+    needs: test_apps
+
+    # Never run when the PR carries the "release" label.
+    if: |
+      !contains(github.event.pull_request.labels.*.name, 'release')
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
+      - name: Set sandbox ID
+        run: echo "SANDBOX_NAME=cozy-e2e-sandbox-$(echo "${GITHUB_REPOSITORY}:${GITHUB_WORKFLOW}:${GITHUB_REF}" | sha256sum | cut -c1-10)" >> $GITHUB_ENV
+
+      - name: Tear down sandbox
+        run: make -C packages/core/testing SANDBOX_NAME=$SANDBOX_NAME delete
diff --git a/Makefile b/Makefile
index e9263669..fa9a04ef 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,10 @@ test:
 	make -C packages/core/testing apply
 	make -C packages/core/testing test
 
+prepare-env:
+	make -C packages/core/testing apply
+	make -C packages/core/testing prepare-cluster
+
 generate:
 	hack/update-codegen.sh
 
diff --git a/hack/e2e-apps.bats b/hack/e2e-apps.bats
index 646f22e7..75777609 100755
--- a/hack/e2e-apps.bats
+++ b/hack/e2e-apps.bats
@@ -5,6 +5,7 @@
 # -----------------------------------------------------------------------------
 
 @test "Create tenant with isolated mode enabled" {
+  kubectl -n tenant-root get tenants.apps.cozystack.io test || kubectl create -f - <1 {print "kubectl wait --timeout=15m --for=condition=ready -n "$1" hr/"$2" &"} END {print "wait"}' | sh -ex
+
+  # Fail the test if any HelmRelease is not Ready
+  if kubectl get hr -A | grep -v " True " | grep -v NAME; then
+    kubectl get hr -A
+    fail "Some HelmReleases failed to reconcile"
+  fi
+}
+
+@test "Wait for Cluster-API provider deployments" {
+  # Wait for Cluster-API provider deployments
+  timeout 60 sh -ec 'until kubectl get deploy -n cozy-cluster-api capi-controller-manager capi-kamaji-controller-manager capi-kubeadm-bootstrap-controller-manager capi-operator-cluster-api-operator capk-controller-manager >/dev/null 2>&1; do sleep 1; done'
+  kubectl wait deployment/capi-controller-manager deployment/capi-kamaji-controller-manager deployment/capi-kubeadm-bootstrap-controller-manager deployment/capi-operator-cluster-api-operator deployment/capk-controller-manager -n cozy-cluster-api --timeout=1m --for=condition=available
+}
+
+@test "Wait for LINSTOR and configure storage" {
+  # Linstor controller and nodes
+  kubectl wait deployment/linstor-controller -n cozy-linstor --timeout=5m --for=condition=available
+  timeout 60 sh -ec 'until [ $(kubectl exec -n cozy-linstor deploy/linstor-controller -- linstor node list | grep -c Online) -eq 3 ]; do sleep 1; done'
+
+  for node in srv1 srv2 srv3; do
+    kubectl exec -n cozy-linstor deploy/linstor-controller -- linstor ps cdp zfs ${node} /dev/vdc --pool-name data --storage-pool data
+  done
+
+  # Storage classes
+  kubectl apply -f - <<'EOF'
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: local
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "true"
+provisioner: linstor.csi.linbit.com
+parameters:
+  linstor.csi.linbit.com/storagePool: "data"
+  linstor.csi.linbit.com/layerList: "storage"
+  linstor.csi.linbit.com/allowRemoteVolumeAccess: "false"
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: replicated
+provisioner: linstor.csi.linbit.com
+parameters:
+  linstor.csi.linbit.com/storagePool: "data"
+  linstor.csi.linbit.com/autoPlace: "3"
+  linstor.csi.linbit.com/layerList: "drbd storage"
+  linstor.csi.linbit.com/allowRemoteVolumeAccess: "true"
+  property.linstor.csi.linbit.com/DrbdOptions/auto-quorum: suspend-io
+  property.linstor.csi.linbit.com/DrbdOptions/Resource/on-no-data-accessible: suspend-io
+  property.linstor.csi.linbit.com/DrbdOptions/Resource/on-suspended-primary-outdated: force-secondary
+  property.linstor.csi.linbit.com/DrbdOptions/Net/rr-conflict: retry-connect
+volumeBindingMode: Immediate
+allowVolumeExpansion: true
+EOF
+}
+
+@test "Wait for MetalLB and configure address pool" {
+  # MetalLB address pool
+  kubectl apply -f - <<'EOF'
+---
+apiVersion: metallb.io/v1beta1
+kind: L2Advertisement
+metadata:
+  name: cozystack
+  namespace: cozy-metallb
+spec:
+  ipAddressPools: [cozystack]
+---
+apiVersion: metallb.io/v1beta1
+kind: IPAddressPool
+metadata:
+  name: cozystack
+  namespace: cozy-metallb
+spec:
+  addresses: [192.168.123.200-192.168.123.250]
+  autoAssign: true
+  avoidBuggyIPs: false
+EOF
+}
+
+@test "Check Cozystack API service" {
+  kubectl wait --for=condition=Available apiservices/v1alpha1.apps.cozystack.io --timeout=2m
+}
+
+@test "Configure Tenant and wait for applications" {
+  # Patch root tenant and wait for its releases
+  kubectl patch tenants/root -n tenant-root --type merge -p '{"spec":{"host":"example.org","ingress":true,"monitoring":true,"etcd":true,"isolated":true}}'
+
+  timeout 60 sh -ec 'until kubectl get hr -n tenant-root etcd ingress monitoring tenant-root >/dev/null 2>&1; do sleep 1; done'
+  kubectl wait hr/etcd hr/ingress hr/tenant-root -n tenant-root --timeout=2m --for=condition=ready
+
+  if ! kubectl wait hr/monitoring -n tenant-root --timeout=2m --for=condition=ready; then
+    flux reconcile hr monitoring -n tenant-root --force
+    kubectl wait hr/monitoring -n tenant-root --timeout=2m --for=condition=ready
+  fi
+
+  # Expose Cozystack services through ingress
+  kubectl patch configmap/cozystack -n cozy-system --type merge -p '{"data":{"expose-services":"api,dashboard,cdi-uploadproxy,vm-exportproxy,keycloak"}}'
+
+  # NGINX ingress controller
+  timeout 60 sh -ec 'until kubectl get deploy root-ingress-controller -n tenant-root >/dev/null 2>&1; do sleep 1; done'
+  kubectl wait deploy/root-ingress-controller -n tenant-root --timeout=5m --for=condition=available
+
+  # etcd statefulset
+  kubectl wait sts/etcd -n tenant-root --for=jsonpath='{.status.readyReplicas}'=3 --timeout=5m
+
+  # VictoriaMetrics components
+  kubectl wait vmalert/vmalert-shortterm vmalertmanager/alertmanager -n tenant-root --for=jsonpath='{.status.updateStatus}'=operational --timeout=5m
+  kubectl wait vlogs/generic -n tenant-root --for=jsonpath='{.status.updateStatus}'=operational --timeout=5m
+  kubectl wait vmcluster/shortterm vmcluster/longterm -n tenant-root --for=jsonpath='{.status.clusterStatus}'=operational --timeout=5m
+
+  # Grafana
+  kubectl wait clusters.postgresql.cnpg.io/grafana-db -n tenant-root --for=condition=ready --timeout=5m
+  kubectl wait deploy/grafana-deployment -n tenant-root --for=condition=available --timeout=5m
+
+  # Verify Grafana via ingress
+  ingress_ip=$(kubectl get svc root-ingress-controller -n tenant-root -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+  if ! curl -sS -k "https://${ingress_ip}" -H 'Host: grafana.example.org' --max-time 30 | grep -q Found; then
+    echo "Failed to access Grafana via ingress at ${ingress_ip}" >&2
+    exit 1
+  fi
+}
+
+@test "Keycloak OIDC stack is healthy" {
+  kubectl patch configmap/cozystack -n cozy-system --type merge -p '{"data":{"oidc-enabled":"true"}}'
+
+  timeout 120 sh -ec 'until kubectl get hr -n cozy-keycloak keycloak keycloak-configure keycloak-operator >/dev/null 2>&1; do sleep 1; done'
+  kubectl wait hr/keycloak hr/keycloak-configure hr/keycloak-operator -n cozy-keycloak --timeout=10m --for=condition=ready
+}
diff --git a/hack/e2e-prepare-cluster.bats b/hack/e2e-prepare-cluster.bats
new file mode 100644
index 00000000..1c668c91
--- /dev/null
+++ b/hack/e2e-prepare-cluster.bats
@@ -0,0 +1,235 @@
+#!/usr/bin/env bats
+# -----------------------------------------------------------------------------
+# Cozystack end-to-end provisioning test (Bats)
+# -----------------------------------------------------------------------------
+
+@test "Required installer assets exist" {
+  if [ ! -f _out/assets/cozystack-installer.yaml ]; then
+    echo "Missing: _out/assets/cozystack-installer.yaml" >&2
+    exit 1
+  fi
+
+  if [ ! -f _out/assets/nocloud-amd64.raw.xz ]; then
+    echo "Missing: _out/assets/nocloud-amd64.raw.xz" >&2
+    exit 1
+  fi
+}
+
+@test "IPv4 forwarding is enabled" {
+  if [ "$(cat /proc/sys/net/ipv4/ip_forward)" != 1 ]; then
+    echo "IPv4 forwarding is disabled!" >&2
+    echo >&2
+    echo "Enable it with:" >&2
+    echo "  echo 1 > /proc/sys/net/ipv4/ip_forward" >&2
+    exit 1
+  fi
+}
+
+@test "Clean previous VMs" {
+  kill $(cat srv1/qemu.pid srv2/qemu.pid srv3/qemu.pid 2>/dev/null) 2>/dev/null || true
+  rm -rf srv1 srv2 srv3
+}
+
+@test "Prepare networking and masquerading" {
+  ip link del cozy-br0 2>/dev/null || true
+  ip link add cozy-br0 type bridge
+  ip link set cozy-br0 up
+  ip address add 192.168.123.1/24 dev cozy-br0
+
+  # Masquerading rule - idempotent (delete first, then add)
+  iptables -t nat -D POSTROUTING -s 192.168.123.0/24 ! -d 192.168.123.0/24 -j MASQUERADE 2>/dev/null || true
+  iptables -t nat -A POSTROUTING -s 192.168.123.0/24 ! -d 192.168.123.0/24 -j MASQUERADE
+}
+
+@test "Prepare cloud-init drive for VMs" {
+  mkdir -p srv1 srv2 srv3
+
+  # Generate cloud-init ISOs
+  for i in 1 2 3; do
+    echo "hostname: srv${i}" > "srv${i}/meta-data"
+
+    cat > "srv${i}/user-data" <<'EOF'
+#cloud-config
+EOF
+
+    cat > "srv${i}/network-config" <&1
+    exit 1
+  fi
+
+  rm -f nocloud-amd64.raw
+  cp _out/assets/nocloud-amd64.raw.xz .
+  xz --decompress nocloud-amd64.raw.xz
+}
+
+@test "Prepare VM disks" {
+  for i in 1 2 3; do
+    cp nocloud-amd64.raw srv${i}/system.img
+    qemu-img resize srv${i}/system.img 50G
+    qemu-img create srv${i}/data.img 100G
+  done
+}
+
+@test "Create tap devices" {
+  for i in 1 2 3; do
+    ip link del cozy-srv${i} 2>/dev/null || true
+    ip tuntap add dev cozy-srv${i} mode tap
+    ip link set cozy-srv${i} up
+    ip link set cozy-srv${i} master cozy-br0
+  done
+}
+
+@test "Boot QEMU VMs" {
+  for i in 1 2 3; do
+    qemu-system-x86_64 -machine type=pc,accel=kvm -cpu host -smp 8 -m 24576 \
+      -device virtio-net,netdev=net0,mac=52:54:00:12:34:5${i} \
+      -netdev tap,id=net0,ifname=cozy-srv${i},script=no,downscript=no \
+      -drive file=srv${i}/system.img,if=virtio,format=raw \
+      -drive file=srv${i}/seed.img,if=virtio,format=raw \
+      -drive file=srv${i}/data.img,if=virtio,format=raw \
+      -display none -daemonize -pidfile srv${i}/qemu.pid
+  done
+
+  # Give qemu a few seconds to start up networking
+  sleep 5
+}
+
+@test "Wait until Talos API port 50000 is reachable on all machines" {
+  timeout 60 sh -ec 'until nc -nz 192.168.123.11 50000 && nc -nz 192.168.123.12 50000 && nc -nz 192.168.123.13 50000; do sleep 1; done'
+}
+
+@test "Generate Talos cluster configuration" {
+  # Cluster-wide patches
+  cat > patch.yaml <<'EOF'
+machine:
+  kubelet:
+    nodeIP:
+      validSubnets:
+        - 192.168.123.0/24
+    extraConfig:
+      maxPods: 512
+  kernel:
+    modules:
+      - name: openvswitch
+      - name: drbd
+        parameters:
+          - usermode_helper=disabled
+      - name: zfs
+      - name: spl
+  registries:
+    mirrors:
+      docker.io:
+        endpoints:
+          - https://mirror.gcr.io
+  files:
+    - content: |
+        [plugins]
+          [plugins."io.containerd.cri.v1.runtime"]
+            device_ownership_from_security_context = true
+      path: /etc/cri/conf.d/20-customization.part
+      op: create
+
+cluster:
+  apiServer:
+    extraArgs:
+      oidc-issuer-url: "https://keycloak.example.org/realms/cozy"
+      oidc-client-id: "kubernetes"
+      oidc-username-claim: "preferred_username"
+      oidc-groups-claim: "groups"
+  network:
+    cni:
+      name: none
+    dnsDomain: cozy.local
+    podSubnets:
+      - 10.244.0.0/16
+    serviceSubnets:
+      - 10.96.0.0/16
+EOF
+
+  # Control-plane-only patches
+  cat > patch-controlplane.yaml <<'EOF'
+machine:
+  nodeLabels:
+    node.kubernetes.io/exclude-from-external-load-balancers:
+      $patch: delete
+  network:
+    interfaces:
+      - interface: eth0
+        vip:
+          ip: 192.168.123.10
+cluster:
+  allowSchedulingOnControlPlanes: true
+  controllerManager:
+    extraArgs:
+      bind-address: 0.0.0.0
+  scheduler:
+    extraArgs:
+      bind-address: 0.0.0.0
+  apiServer:
+    certSANs:
+      - 127.0.0.1
+  proxy:
+    disabled: true
+  discovery:
+    enabled: false
+  etcd:
+    advertisedSubnets:
+      - 192.168.123.0/24
+EOF
+
+  # Generate secrets once
+  if [ ! -f secrets.yaml ]; then
+    talosctl gen secrets
+  fi
+
+  rm -f controlplane.yaml worker.yaml talosconfig kubeconfig
+  talosctl gen config --with-secrets secrets.yaml cozystack https://192.168.123.10:6443 \
+    --config-patch=@patch.yaml --config-patch-control-plane @patch-controlplane.yaml
+}
+
+@test "Apply Talos configuration to the nodes" {
+  # Apply the configuration to all three nodes
+  for node in 11 12 13; do
+    talosctl apply -f controlplane.yaml -n 192.168.123.${node} -e 192.168.123.${node} -i
+  done
+
+  # Wait for Talos services to come up again
+  timeout 60 sh -ec 'until nc -nz 192.168.123.11 50000 && nc -nz 192.168.123.12 50000 && nc -nz 192.168.123.13 50000; do sleep 1; done'
+}
+
+@test "Bootstrap Talos cluster" {
+  # Bootstrap etcd on the first node
+  timeout 10 sh -ec 'until talosctl bootstrap -n 192.168.123.11 -e 192.168.123.11; do sleep 1; done'
+
+  # Wait until etcd is healthy
+  timeout 180 sh -ec 'until talosctl etcd members -n 192.168.123.11,192.168.123.12,192.168.123.13 -e 192.168.123.10 >/dev/null 2>&1; do sleep 1; done'
+  timeout 60 sh -ec 'while talosctl etcd members -n 192.168.123.11,192.168.123.12,192.168.123.13 -e 192.168.123.10 2>&1 | grep -q "rpc error"; do sleep 1; done'
+
+  # Retrieve kubeconfig
+  rm -f kubeconfig
+  talosctl kubeconfig kubeconfig -e 192.168.123.10 -n 192.168.123.10
+
+  # Wait until all three nodes register in Kubernetes
+  timeout 60 sh -ec 'until [ $(kubectl get node --no-headers | wc -l) -eq 3 ]; do sleep 1; done'
+}
diff --git a/packages/core/testing/Makefile b/packages/core/testing/Makefile
index e81d8d53..6fb23191 100755
--- a/packages/core/testing/Makefile
+++ b/packages/core/testing/Makefile
@@ -32,6 +32,14 @@ image-e2e-sandbox:
 
 test: test-cluster test-apps ## Run the end-to-end tests in existing sandbox
 
+prepare-cluster:
+	docker cp ../../../_out/assets/cozystack-installer.yaml "${SANDBOX_NAME}":/workspace/_out/assets/cozystack-installer.yaml
+	docker cp ../../../_out/assets/nocloud-amd64.raw.xz "${SANDBOX_NAME}":/workspace/_out/assets/nocloud-amd64.raw.xz
+	docker exec "${SANDBOX_NAME}" sh -c 'cd /workspace && hack/cozytest.sh hack/e2e-prepare-cluster.bats'
+
+install-cozystack:
+	docker exec "${SANDBOX_NAME}" sh -c 'cd /workspace && hack/cozytest.sh hack/e2e-install-cozystack.bats'
+
 test-cluster: ## Run the end-to-end for creating a cluster
 	docker cp ../../../_out/assets/cozystack-installer.yaml "${SANDBOX_NAME}":/workspace/_out/assets/cozystack-installer.yaml
 	docker cp ../../../_out/assets/nocloud-amd64.raw.xz "${SANDBOX_NAME}":/workspace/_out/assets/nocloud-amd64.raw.xz