firezone/docker-compose.yml
Thomas Eizinger 9865e03343 ci: fix double symmetric NAT test failure (#10410)
As it turns out, the flaky test was caused by a bug in the eBPF kernel where we read the old channel data header from the wrong offset. This essentially made us read garbage data for the channel number (see the ChannelData framing sketched below), causing us to:

a. Compute a bad checksum
b. Send the packet on a completely wrong channel
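For reference, the TURN ChannelData framing (RFC 8656) is simply:

    | Channel Number (16 bits) | Length (16 bits) | Application Data ... |

Valid channel numbers are confined to 0x4000-0x7FFF, so a header read at a shifted offset yields an essentially arbitrary channel number.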

The reason this caused a flaky test is that triggering it requires one side to pick IPv4 to talk to the relay and the other side to pick IPv6. The happy-eyeballs approach of the `allocation` module made that choice non-deterministic, only exposing this bug occasionally.

To ensure these kinds of issues are detected earlier in the future, I am adding an additional CI step that checks all packets emitted by the eBPF kernel for checksum errors.
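A check along these lines can be sketched with tshark's built-in checksum validation (illustrative only; the capture file name and exact filter are placeholders, not the actual CI step):

    # Force checksum validation and count packets with a bad IPv4/UDP/TCP checksum.
    BAD=$(tshark -r capture.pcap \
            -o ip.check_checksum:TRUE -o udp.check_checksum:TRUE -o tcp.check_checksum:TRUE \
            -Y 'ip.checksum.status == 0 || udp.checksum.status == 0 || tcp.checksum.status == 0' \
          | wc -l)
    [ "$BAD" -eq 0 ] || { echo "found $BAD packets with bad checksums"; exit 1; }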

Fixes: #10404

Co-authored-by: Jamil Bou Kheir <jamilbk@users.noreply.github.com>
2025-09-25 17:53:17 +10:00


# Run with DOCKER_BUILD_TARGET=dev to build Rust inside Docker

include:
  - scripts/compose/resources.yml
  - scripts/compose/edgeshark.yml
  - scripts/compose/portal.yml

x-erlang-cluster: &erlang-cluster
  ERLANG_CLUSTER_ADAPTER: "Elixir.Cluster.Strategy.Epmd"
  ERLANG_CLUSTER_ADAPTER_CONFIG: '{"hosts":["api@api.cluster.local","web@web.cluster.local","domain@domain.cluster.local"]}'

x-portal-urls: &portal-urls
  WEB_EXTERNAL_URL: http://localhost:8080/
  API_EXTERNAL_URL: http://localhost:8081/

x-phoenix-config: &phoenix-config
  PHOENIX_HTTP_WEB_PORT: "8080"
  PHOENIX_HTTP_API_PORT: "8081"
  PHOENIX_SECURE_COOKIES: "false"

x-health-check: &health-check
  interval: 1s
  retries: 15
  timeout: 1s

services:
  web:
    extends:
      file: scripts/compose/portal.yml
      service: common
    build:
      context: elixir
      args:
        APPLICATION_NAME: web
    image: ${WEB_IMAGE:-ghcr.io/firezone/web}:${WEB_TAG:-main}
    hostname: web.cluster.local
    ports:
      - 8080:8080/tcp
    environment:
      <<: [*portal-urls, *erlang-cluster, *phoenix-config]
      RELEASE_HOSTNAME: "web.cluster.local"
      RELEASE_NAME: "web"
      LOG_LEVEL: "debug"
    healthcheck:
      test: ["CMD-SHELL", "curl -f localhost:8080/healthz"]
      <<: *health-check
    depends_on:
      vault:
        condition: "service_healthy"
      postgres:
        condition: "service_healthy"

  api:
    extends:
      file: scripts/compose/portal.yml
      service: common
    build:
      context: elixir
      args:
        APPLICATION_NAME: api
    image: ${API_IMAGE:-ghcr.io/firezone/api}:${API_TAG:-main}
    hostname: api.cluster.local
    ports:
      - 8081:8081/tcp
    environment:
      <<: [*portal-urls, *erlang-cluster, *phoenix-config]
      RELEASE_HOSTNAME: "api.cluster.local"
      RELEASE_NAME: "api"
      LOG_LEVEL: "debug"
    user: root # Needed to run `ip route` commands
    cap_add:
      - NET_ADMIN # Needed to run `tc` commands to add simulated delay
    command:
      - sh
      - -c
      - |
        set -e
        # Add static route to internet subnet via router
        ip -4 route add 203.0.113.0/24 via 172.28.0.254
        ip -6 route add 203:0:113::/64 via 172:28:0::254
        exec su default -c "bin/server"
    depends_on:
      vault:
        condition: "service_healthy"
      postgres:
        condition: "service_healthy"
      api-router:
        condition: "service_healthy"
    healthcheck:
      test: ["CMD-SHELL", "curl -f localhost:8081/healthz"]
      <<: *health-check
    networks:
      app-internal:
        ipv4_address: 172.28.0.100
        ipv6_address: 172:28:0::100

  api-router:
    extends:
      file: scripts/compose/router.yml
      service: router
    environment:
      PORT_FORWARDS: |
        8081 172.28.0.100 tcp
        8081 172:28:0::100 tcp
      MASQUERADE_TYPE: ""
      NETWORK_LATENCY_MS: 50
    networks:
      app-internal:
        ipv4_address: 172.28.0.254
        ipv6_address: 172:28:0::254
        interface_name: internal
      internet:
        ipv4_address: 203.0.113.10
        ipv6_address: 203:0:113::10
        interface_name: internet

  domain:
    extends:
      file: scripts/compose/portal.yml
      service: common
    build:
      context: elixir
      args:
        APPLICATION_NAME: domain
    image: ${DOMAIN_IMAGE:-ghcr.io/firezone/domain}:${DOMAIN_TAG:-main}
    hostname: domain.cluster.local
    environment:
      <<: *erlang-cluster
      RELEASE_HOSTNAME: "domain.cluster.local"
      RELEASE_NAME: "domain"
      LOG_LEVEL: "debug"
    healthcheck:
      test: ["CMD-SHELL", "curl -f localhost:4000/healthz"]
      <<: *health-check
    depends_on:
      vault:
        condition: "service_healthy"
      postgres:
        condition: "service_healthy"

  # This is a service container which allows to run mix tasks for local development
  # without having to install Elixir and Erlang on the host machine.
  elixir:
    extends:
      file: scripts/compose/portal.yml
      service: common
    build:
      context: elixir
      target: compiler
      args:
        APPLICATION_NAME: api
    image: ${ELIXIR_IMAGE:-ghcr.io/firezone/elixir}:${ELIXIR_TAG:-main}
    hostname: elixir
    environment:
      <<: *portal-urls
      RELEASE_HOSTNAME: "mix.cluster.local"
      RELEASE_NAME: "mix"
      # Higher log level not to make seeds output too verbose
      LOG_LEVEL: "info"
      # Mix env should be set to prod to use secrets declared above,
      # otherwise seeds will generate invalid tokens
      MIX_ENV: "prod"
    depends_on:
      postgres:
        condition: "service_healthy"

  # Run with DOCKER_BUILD_TARGET=dev to build Rust inside Docker
  client:
    healthcheck:
      test: ["CMD-SHELL", "ip link | grep tun-firezone"]
      <<: *health-check
    environment:
      FIREZONE_DNS_CONTROL: "${FIREZONE_DNS_CONTROL:-etc-resolv-conf}"
      FIREZONE_TOKEN: "n.SFMyNTY.g2gDaANtAAAAJGM4OWJjYzhjLTkzOTItNGRhZS1hNDBkLTg4OGFlZjZkMjhlMG0AAAAkN2RhN2QxY2QtMTExYy00NGE3LWI1YWMtNDAyN2I5ZDIzMGU1bQAAACtBaUl5XzZwQmstV0xlUkFQenprQ0ZYTnFJWktXQnMyRGR3XzJ2Z0lRdkZnbgYAR_ywiZQBYgABUYA.PLNlzyqMSgZlbQb1QX5EzZgYNuY9oeOddP0qDkTwtGg"
      RUST_LOG: ${RUST_LOG:-wire=trace,debug}
      FIREZONE_API_URL: ws://api:8081
      FIREZONE_ID: EFC7A9E3-3576-4633-B633-7D47BA9E14AC
    command:
      - sh
      - -c
      - |
        set -e
        # Add static route to internet subnet via router
        ip -4 route add 203.0.113.0/24 via 172.30.0.254
        ip -6 route add 203:0:113::/64 via 172:30:0::254
        # Disable checksum offloading so that checksums are correct when they reach the relay
        apk add --no-cache ethtool
        ethtool -K eth0 tx off
        exec firezone-headless-client
    init: true
    build:
      target: ${DOCKER_BUILD_TARGET:-debug}
      context: rust
      dockerfile: Dockerfile
      args:
        PACKAGE: firezone-headless-client
    image: ${CLIENT_IMAGE:-ghcr.io/firezone/debug/client}:${CLIENT_TAG:-main}
    privileged: true # Needed to tune `sysctl` inside container.
    cap_add:
      - NET_ADMIN
    sysctls:
      - net.ipv6.conf.all.disable_ipv6=0
      - net.ipv6.conf.default.disable_ipv6=0
    devices:
      - "/dev/net/tun:/dev/net/tun"
    depends_on:
      client-router:
        condition: "service_healthy"
      api:
        condition: "service_healthy"
    networks:
      client-internal:
        ipv4_address: 172.30.0.100
        ipv6_address: 172:30:0::100
    extra_hosts:
      - "api:203.0.113.10"
      - "api:203:0:113::10"

  client-router:
    extends:
      file: scripts/compose/router.yml
      service: router
    environment:
      MASQUERADE_TYPE: ${CLIENT_MASQUERADE:-}
      NETWORK_LATENCY_MS: 10
    networks:
      client-internal:
        ipv4_address: 172.30.0.254
        ipv6_address: 172:30:0::254
        interface_name: internal
      internet:
        interface_name: internet

  gateway:
    healthcheck:
      test: ["CMD-SHELL", "ip link | grep tun-firezone"]
      <<: *health-check
    environment:
      FIREZONE_TOKEN: ".SFMyNTY.g2gDaANtAAAAJGM4OWJjYzhjLTkzOTItNGRhZS1hNDBkLTg4OGFlZjZkMjhlMG0AAAAkMjI3NDU2MGItZTk3Yi00NWU0LThiMzQtNjc5Yzc2MTdlOThkbQAAADhPMDJMN1VTMkozVklOT01QUjlKNklMODhRSVFQNlVPOEFRVk82VTVJUEwwVkpDMjJKR0gwPT09PW4GAAH8sImUAWIAAVGA.tAm2O9FcyF67VAF3rZdwQpeADrYOIs3S2l2K51G26OM"
      RUST_LOG: ${RUST_LOG:-wire=trace,debug}
      FIREZONE_API_URL: ws://api:8081
      FIREZONE_ID: 4694E56C-7643-4A15-9DF3-638E5B05F570
    command:
      - sh
      - -c
      - |
        set -e
        # Add static route to internet subnet via router
        ip -4 route add 203.0.113.0/24 via 172.31.0.254
        ip -6 route add 203:0:113::/64 via 172:31:0::254
        # Disable checksum offloading so that checksums are correct when they reach the relay
        apk add --no-cache ethtool
        ethtool -K eth0 tx off
        ethtool -K eth1 tx off
        ethtool -K eth2 tx off
        exec firezone-gateway
    init: true
    build:
      target: ${DOCKER_BUILD_TARGET:-debug}
      context: rust
      dockerfile: Dockerfile
      args:
        PACKAGE: firezone-gateway
    image: ${GATEWAY_IMAGE:-ghcr.io/firezone/debug/gateway}:${GATEWAY_TAG:-main}
    cap_add:
      - NET_ADMIN
    sysctls:
      - net.ipv4.ip_forward=1
      - net.ipv6.conf.all.disable_ipv6=0
      - net.ipv6.conf.default.disable_ipv6=0
      - net.ipv6.conf.all.forwarding=1
      - net.ipv6.conf.default.forwarding=1
    devices:
      - "/dev/net/tun:/dev/net/tun"
    depends_on:
      gateway-router:
        condition: "service_healthy"
      api:
        condition: "service_healthy"
    networks:
      gateway-internal:
        ipv4_address: 172.31.0.100
        ipv6_address: 172:31:0::100
      dns_resources:
      resources:
    extra_hosts:
      - "api:203.0.113.10"
      - "api:203:0:113::10"

  gateway-router:
    extends:
      file: scripts/compose/router.yml
      service: router
    environment:
      MASQUERADE_TYPE: ${GATEWAY_MASQUERADE:-}
      NETWORK_LATENCY_MS: 10
    networks:
      gateway-internal:
        ipv4_address: 172.31.0.254
        ipv6_address: 172:31:0::254
        interface_name: internal
      internet:
        interface_name: internet

  relay-1:
    extends:
      file: scripts/compose/relay.yml
      service: relay
    environment:
      PUBLIC_IP4_ADDR: 203.0.113.101
      PUBLIC_IP6_ADDR: 203:0:113::101
    command:
      - sh
      - -c
      - |
        set -e
        # Add static route to internet subnet via router
        ip -4 route add 203.0.113.0/24 via 172.29.1.254
        ip -6 route add 203:0:113::/64 via 172:29:1::254
        apk add --no-cache ethtool
        ethtool -K eth0 tx off
        firezone-relay
    depends_on:
      relay-1-router:
        condition: "service_healthy"
    networks:
      relay-1-internal:
        ipv4_address: 172.29.1.100
        ipv6_address: 172:29:1::100

  relay-1-router:
    extends:
      file: scripts/compose/router.yml
      service: router
    environment:
      PORT_FORWARDS: |
        3478 172.29.1.100 udp
        49152-65535 172.29.1.100 udp
        3478 172:29:1::100 udp
        49152-65535 172:29:1::100 udp
      NETWORK_LATENCY_MS: 30
    networks:
      relay-1-internal:
        ipv4_address: 172.29.1.254
        ipv6_address: 172:29:1::254
        interface_name: internal
      internet:
        ipv4_address: 203.0.113.101
        ipv6_address: 203:0:113::101
        interface_name: internet

  relay-2:
    extends:
      file: scripts/compose/relay.yml
      service: relay
    environment:
      PUBLIC_IP4_ADDR: 203.0.113.102
      PUBLIC_IP6_ADDR: 203:0:113::102
    command:
      - sh
      - -c
      - |
        set -e
        # Add static route to internet subnet via router
        ip -4 route add 203.0.113.0/24 via 172.29.2.254
        ip -6 route add 203:0:113::/64 via 172:29:2::254
        apk add --no-cache ethtool
        ethtool -K eth0 tx off
        firezone-relay
    depends_on:
      relay-2-router:
        condition: "service_healthy"
    networks:
      relay-2-internal:
        ipv4_address: 172.29.2.100
        ipv6_address: 172:29:2::100

  relay-2-router:
    extends:
      file: scripts/compose/router.yml
      service: router
    environment:
      PORT_FORWARDS: |
        3478 172.29.2.100 udp
        49152-65535 172.29.2.100 udp
        3478 172:29:2::100 udp
        49152-65535 172:29:2::100 udp
      NETWORK_LATENCY_MS: 30
    networks:
      relay-2-internal:
        ipv4_address: 172.29.2.254
        ipv6_address: 172:29:2::254
        interface_name: internal
      internet:
        ipv4_address: 203.0.113.102
        ipv6_address: 203:0:113::102
        interface_name: internet

  # The veth driver uses a pair of interfaces to connect the docker bridge to the container namespace.
  # For containers that have an eBPF program attached and do XDP_TX, we need to attach a dummy program
  # to the corresponding veth interface on the host to be able to receive the XDP_TX traffic and pass
  # it up to the docker bridge successfully.
  #
  # The "recommended" way to do this is to set both veth interfaces' GRO to on, or attach an XDP program
  # that does XDP_PASS to the host side veth interface. The GRO method is not reliable and was shown to
  # only pass packets in large bursts every 15-20 seconds which breaks ICE setup, so we use the XDP method.
  #
  # For correct behaviour, we also disable any kind of offloading for all veth and bridge devices.
  # This forces the kernel to calculate all checksums in software.
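  #
  # Note: /xdp/xdp_pass.o (shipped in the xdp-pass image) is assumed to be a trivial eBPF object along
  # the lines of `SEC("xdp") int xdp_pass(struct xdp_md *ctx) { return XDP_PASS; }`: it passes every
  # frame up the stack unmodified and exists only so that the host-side veth has an XDP program attached.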
  network-config:
    image: ghcr.io/firezone/xdp-pass
    pid: host
    network_mode: host
    privileged: true
    restart: on-failure
    command:
      - sh
      - -c
      - |
        set -e
        VETHS=$$(ip -json link show type veth | jq -r '.[].ifname')
        for dev in $$VETHS; do
          echo "Attaching XDP to: $$dev"
          ip link set dev $$dev xdpdrv off # Clear any existing XDP program.
          ip link set dev $$dev xdpdrv obj /xdp/xdp_pass.o sec xdp
          ethtool -K $$dev tx off # Disable offloading.
        done
        echo "Done configuring $$(echo "$$VETHS" | wc -w) veth interfaces"
        BRIDGES=$$(ip -json link show type bridge | jq -r '.[].ifname')
        for dev in $$BRIDGES; do
          ethtool -K $$dev tx off # Disable offloading.
        done
        echo "Done configuring $$(echo "$$BRIDGES" | wc -w) bridge interfaces"
    depends_on:
      relay-1:
        condition: "service_healthy"
      relay-2:
        condition: "service_healthy"
      relay-1-router:
        condition: "service_healthy"
      relay-2-router:
        condition: "service_healthy"
      gateway-router:
        condition: "service_healthy"
      client-router:
        condition: "service_healthy"
      gateway:
        condition: "service_healthy"
      client:
        condition: "service_healthy"

  otel:
    image: otel/opentelemetry-collector:latest
    networks:
      app-internal:

networks:
  # Internet network - where all public IPs live
  internet:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 203.0.113.0/24
        - subnet: 203:0:113::/64
  app-internal:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 172.28.0.0/24
        - subnet: 172:28:0::/64
  relay-1-internal:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 172.29.1.0/24
        - subnet: 172:29:1::/64
  relay-2-internal:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 172.29.2.0/24
        - subnet: 172:29:2::/64
  client-internal:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 172.30.0.0/24
        - subnet: 172:30:0::/64
  gateway-internal:
    enable_ipv6: true
    ipam:
      config:
        - subnet: 172.31.0.0/24
        - subnet: 172:31:0::/64