From b11adfcfe432f2864fe2cd654b137bfd0b407385 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Tue, 30 Sep 2025 08:23:39 +0000 Subject: [PATCH] feat(connlib): create flow on ICMP error "prohibited" (#10462) In Firezone, a Client requests an "access authorization" for a Resource on the fly when it sees the first packet for said Resource going through the tunnel. If we don't have a connection to the Gateway yet, this is also where we will establish a connection and create the WireGuard tunnel. In order for this to work, the access authorization state between the Client and the Gateway MUST NOT get out of sync. If the Client thinks it has access to a Resource, it will just route the traffic to the Gateway. If the access authorization on the Gateway has expired or vanished otherwise, the packets will be black-holed. Starting with #9816, the Gateway sends ICMP errors back to the application whenever it filters a packet. This can happen either because the access authorization is gone or because the traffic wasn't allowed by the specific filter rules on the Resource. With this patch, the Client will attempt to create a new flow (i.e. re-authorize) traffic for this resource whenever it sees such an ICMP error, therefore acting as a way of synchronizing the view of the world between Client and Gateway should they ever run out of sync. Testing turned out to be a bit tricky. If we let the authorization on the Gateway lapse naturally, we portal will also toggle the Resource off and on on the Client, resulting in "flushing" the current authorizations. Additionally, it the Client had only access to one Resource, then the Gateway will gracefully close the connection, also resulting in the Client creating a new flow for the next packet. To actually trigger this new behaviour we need to: - Access at least two resources via the same Gateway - Directly send `reject_access` to the Gateway for this particular resource To achieve this, we dynamically eval some code on the API node and instruct the Gateway channel to send `reject_access`. The connection stays intact because there is still another active access authorization but packets for the other resource are answered with ICMP errors. To achieve a safe roll-out, the new behaviour is feature-flagged. In order to still test it, we now also allow feature flags to be set via env variables. Resolves: #10074 --------- Co-authored-by: Mariusz Klochowicz --- .github/workflows/_integration_tests.yml | 13 ++++ docker-compose.yml | 1 + elixir/apps/api/lib/api/gateway/channel.ex | 9 +++ rust/connlib/ip-packet/src/icmp_error.rs | 11 ++++ rust/connlib/ip-packet/src/make.rs | 32 ++++++---- rust/connlib/tunnel/src/client.rs | 36 ++++++++++- rust/connlib/tunnel/src/peer.rs | 18 +----- rust/telemetry/src/feature_flags.rs | 67 +++++++++++++++++++- scripts/tests/create-flow-from-icmp-error.sh | 22 +++++++ scripts/tests/lib.sh | 27 ++++++++ 10 files changed, 205 insertions(+), 31 deletions(-) create mode 100755 scripts/tests/create-flow-from-icmp-error.sh diff --git a/.github/workflows/_integration_tests.yml b/.github/workflows/_integration_tests.yml index 97a0c682d..2cbbec9b1 100644 --- a/.github/workflows/_integration_tests.yml +++ b/.github/workflows/_integration_tests.yml @@ -101,6 +101,8 @@ jobs: fail-fast: false matrix: test: + - script: create-flow-from-icmp-error + min_client_version: 1.5.4 - script: curl-api-down - script: curl-api-restart - script: curl-ecn @@ -128,6 +130,15 @@ jobs: - uses: ./.github/actions/ghcr-docker-login with: github_token: ${{ secrets.GITHUB_TOKEN }} + - name: Check minimum client version + id: version_check + if: ${{ matrix.test.min_client_version }} + continue-on-error: true + run: | + ACTUAL_VERSION=$(docker run ${{ inputs.client_image }}:${{ inputs.client_tag }} firezone-headless-client --version | awk '{print $2}') + MIN_VERSION="${{ matrix.test.min_client_version }}" + + [ "$(printf '%s\n' "$MIN_VERSION" "$ACTUAL_VERSION" | sort --version-sort | head -n1)" == "$MIN_VERSION" ] # We need at least Docker v28.1 which is not yet available on GitHub actions runners - uses: docker/setup-docker-action@b60f85385d03ac8acfca6d9996982511d8620a19 # v4.3.0 - name: Seed database @@ -178,6 +189,8 @@ jobs: sudo ethtool -K docker0 tx off - run: ./scripts/tests/${{ matrix.test.script }}.sh + if: ${{ steps.version_check.outcome != 'failure' }} # Run the script if version check succeeds or is skipped + - name: Ensure Client emitted no warnings if: "!cancelled()" run: | diff --git a/docker-compose.yml b/docker-compose.yml index 433909006..5585c8a56 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -176,6 +176,7 @@ services: RUST_LOG: ${RUST_LOG:-wire=trace,debug} FIREZONE_API_URL: ws://api:8081 FIREZONE_ID: EFC7A9E3-3576-4633-B633-7D47BA9E14AC + FZFF_ICMP_ERROR_UNREACHABLE_PROHIBITED_CREATE_NEW_FLOW: true command: - sh - -c diff --git a/elixir/apps/api/lib/api/gateway/channel.ex b/elixir/apps/api/lib/api/gateway/channel.ex index e1d5b1e76..5676c81ff 100644 --- a/elixir/apps/api/lib/api/gateway/channel.ex +++ b/elixir/apps/api/lib/api/gateway/channel.ex @@ -351,6 +351,15 @@ defmodule API.Gateway.Channel do end end + # Helper to directly send reject_access in integration tests + def handle_info( + {{:reject_access, gateway_id}, client_id, resource_id}, + %{assigns: %{gateway: %{id: gateway_id}}} = socket + ) do + push(socket, "reject_access", %{client_id: client_id, resource_id: resource_id}) + {:noreply, socket} + end + # Catch-all for messages we don't handle def handle_info(_message, socket), do: {:noreply, socket} diff --git a/rust/connlib/ip-packet/src/icmp_error.rs b/rust/connlib/ip-packet/src/icmp_error.rs index 202bd783f..79680991c 100644 --- a/rust/connlib/ip-packet/src/icmp_error.rs +++ b/rust/connlib/ip-packet/src/icmp_error.rs @@ -46,6 +46,17 @@ impl IcmpError { IcmpError::V6TimeExceeded(code) => Ok(Icmpv6Type::TimeExceeded(code)), } } + + pub fn is_unreachable_prohibited(&self) -> bool { + use IcmpError::*; + use icmpv4::DestUnreachableHeader::*; + use icmpv6::DestUnreachableCode::*; + + matches!( + self, + V4Unreachable(FilterProhibited) | V6Unreachable(Prohibited) + ) + } } /// A packet that failed to route to its destination, extracted from the payload of an ICMP/ICMP6 error message. diff --git a/rust/connlib/ip-packet/src/make.rs b/rust/connlib/ip-packet/src/make.rs index 6c570df93..f92a22bec 100644 --- a/rust/connlib/ip-packet/src/make.rs +++ b/rust/connlib/ip-packet/src/make.rs @@ -172,7 +172,23 @@ where } } -pub fn icmp_dest_unreachable( +pub fn icmp_dest_unreachable_prohibited(original_packet: &IpPacket) -> Result { + icmp_dest_unreachable( + original_packet, + icmpv4::DestUnreachableHeader::FilterProhibited, + icmpv6::DestUnreachableCode::Prohibited, + ) +} + +pub fn icmp_dest_unreachable_network(original_packet: &IpPacket) -> Result { + icmp_dest_unreachable( + original_packet, + icmpv4::DestUnreachableHeader::Network, + icmpv6::DestUnreachableCode::Address, + ) +} + +fn icmp_dest_unreachable( original_packet: &IpPacket, icmpv4: icmpv4::DestUnreachableHeader, icmpv6: icmpv6::DestUnreachableCode, @@ -271,12 +287,7 @@ mod tests { ) .unwrap(); - let icmp_error = icmp_dest_unreachable( - &unreachable_packet, - icmpv4::DestUnreachableHeader::Network, - icmpv6::DestUnreachableCode::Address, - ) - .unwrap(); + let icmp_error = icmp_dest_unreachable_network(&unreachable_packet).unwrap(); assert_eq!( icmp_error.destination(), @@ -299,12 +310,7 @@ mod tests { ) .unwrap(); - let icmp_error = icmp_dest_unreachable( - &unreachable_packet, - icmpv4::DestUnreachableHeader::Network, - icmpv6::DestUnreachableCode::Address, - ) - .unwrap(); + let icmp_error = icmp_dest_unreachable_network(&unreachable_packet).unwrap(); assert_eq!( icmp_error.destination(), diff --git a/rust/connlib/tunnel/src/client.rs b/rust/connlib/tunnel/src/client.rs index a1ff98d59..4413ffa25 100644 --- a/rust/connlib/tunnel/src/client.rs +++ b/rust/connlib/tunnel/src/client.rs @@ -3,6 +3,7 @@ mod resource; use dns_resource_nat::DnsResourceNat; use dns_types::ResponseCode; +use firezone_telemetry::{analytics, feature_flags}; pub(crate) use resource::{CidrResource, Resource}; #[cfg(all(feature = "proptest", test))] pub(crate) use resource::{DnsResource, InternetResource}; @@ -183,6 +184,7 @@ impl PendingFlow { } ConnectionTrigger::UdpDnsQueryForSite(packet) => self.udp_dns_queries.push(packet), ConnectionTrigger::TcpDnsQueryForSite(query) => self.tcp_dns_queries.push(query), + ConnectionTrigger::IcmpDestinationUnreachableProhibited => {} } } } @@ -472,6 +474,20 @@ impl ClientState { &mut self.udp_dns_sockets_by_upstream_and_query_id, ); + if feature_flags::icmp_error_unreachable_prohibited_create_new_flow() + && let Ok(Some((failed_packet, error))) = packet.icmp_error() + && error.is_unreachable_prohibited() + && let Some(resource) = self.get_resource_by_destination(failed_packet.dst()) + { + analytics::feature_flag_called("icmp-error-unreachable-prohibited-create-new-flow"); + + self.on_not_connected_resource( + resource, + ConnectionTrigger::IcmpDestinationUnreachableProhibited, + now, + ); + } + Some(packet) } @@ -818,6 +834,7 @@ impl ClientState { use std::collections::hash_map::Entry; let trigger = trigger.into(); + let trigger_name = trigger.name(); debug_assert!(self.resources_by_id.contains_key(&rid)); @@ -840,7 +857,7 @@ impl ClientState { } } - tracing::debug!("Sending connection intent"); + tracing::debug!(trigger = %trigger_name, "Sending connection intent"); self.buffered_events .push_back(ClientEvent::ConnectionIntent { @@ -2013,6 +2030,23 @@ enum ConnectionTrigger { UdpDnsQueryForSite(IpPacket), /// A TCP DNS query that needs to be resolved within a particular site that we aren't connected to yet. TcpDnsQueryForSite(dns_over_tcp::Query), + /// We have received an ICMP error that is marked as "access prohibited". + /// + /// Most likely, the Gateway is filtering these packets because the Client doesn't have access (anymore). + IcmpDestinationUnreachableProhibited, +} + +impl ConnectionTrigger { + fn name(&self) -> &'static str { + match self { + ConnectionTrigger::PacketForResource(_) => "packet-for-resource", + ConnectionTrigger::UdpDnsQueryForSite(_) => "udp-dns-query-for-site", + ConnectionTrigger::TcpDnsQueryForSite(_) => "tcp-dns-query-for-site", + ConnectionTrigger::IcmpDestinationUnreachableProhibited => { + "icmp-destination-unreachable-prohibited" + } + } + } } impl From for ConnectionTrigger { diff --git a/rust/connlib/tunnel/src/peer.rs b/rust/connlib/tunnel/src/peer.rs index 57f698681..d23576f1e 100644 --- a/rust/connlib/tunnel/src/peer.rs +++ b/rust/connlib/tunnel/src/peer.rs @@ -343,11 +343,7 @@ impl ClientOnGateway { if let Err(e) = self.ensure_allowed_src_and_dst(&packet) { tracing::debug!(filtered_packet = ?packet, "{e:#}"); return Ok(TranslateOutboundResult::Filtered( - ip_packet::make::icmp_dest_unreachable( - &packet, - ip_packet::icmpv4::DestUnreachableHeader::FilterProhibited, - ip_packet::icmpv6::DestUnreachableCode::Prohibited, - )?, + ip_packet::make::icmp_dest_unreachable_prohibited(&packet)?, )); } @@ -421,11 +417,7 @@ impl ClientOnGateway { tracing::debug!(%dst, "No translation entry"); return Ok(TranslateOutboundResult::DestinationUnreachable( - ip_packet::make::icmp_dest_unreachable( - &packet, - ip_packet::icmpv4::DestUnreachableHeader::Network, - ip_packet::icmpv6::DestUnreachableCode::Address, - )?, + ip_packet::make::icmp_dest_unreachable_network(&packet)?, )); }; @@ -437,11 +429,7 @@ impl ClientOnGateway { ); return Ok(TranslateOutboundResult::DestinationUnreachable( - ip_packet::make::icmp_dest_unreachable( - &packet, - ip_packet::icmpv4::DestUnreachableHeader::Network, - ip_packet::icmpv6::DestUnreachableCode::Address, - )?, + ip_packet::make::icmp_dest_unreachable_network(&packet)?, )); } diff --git a/rust/telemetry/src/feature_flags.rs b/rust/telemetry/src/feature_flags.rs index a10fb021a..a671c102d 100644 --- a/rust/telemetry/src/feature_flags.rs +++ b/rust/telemetry/src/feature_flags.rs @@ -21,8 +21,15 @@ pub(crate) const RE_EVAL_DURATION: Duration = Duration::from_secs(5 * 60); // Process-wide storage of enabled feature flags. // -// Defaults to everything off. -static FEATURE_FLAGS: LazyLock = LazyLock::new(FeatureFlags::default); +// Defaults to everything off unless the env variables say otherwise. +static FEATURE_FLAGS: LazyLock = LazyLock::new(|| { + let flags = FeatureFlags::default(); + let from_env = update_from_env(FeatureFlagsResponse::default()); + + flags.update(from_env, FeatureFlagPayloadsResponse::default()); + + flags +}); pub fn icmp_unreachable_instead_of_nat64() -> bool { FEATURE_FLAGS.icmp_unreachable_instead_of_nat64() @@ -44,6 +51,10 @@ pub fn gateway_userspace_dns_a_aaaa_records() -> bool { FEATURE_FLAGS.gateway_userspace_dns_a_aaaa_records() } +pub fn icmp_error_unreachable_prohibited_create_new_flow() -> bool { + FEATURE_FLAGS.icmp_error_unreachable_prohibited_create_new_flow() +} + pub fn export_metrics() -> bool { false // Placeholder until we actually deploy an OTEL collector. } @@ -64,6 +75,8 @@ pub(crate) async fn evaluate_now(user_id: String, env: Env) { .inspect_err(|e| tracing::debug!("Failed to evaluate feature flags: {e:#}")) .unwrap_or_default(); + let flags = update_from_env(flags); + FEATURE_FLAGS.update(flags, payloads); sentry::Hub::main().configure_scope(|scope| { @@ -166,6 +179,8 @@ struct FeatureFlagsResponse { map_enobufs_to_wouldblock: bool, #[serde(default)] gateway_userspace_dns_a_aaaa_records: bool, + #[serde(default)] + icmp_error_unreachable_prohibited_create_new_flow: bool, } #[derive(Debug, Deserialize, Default, Clone)] @@ -182,6 +197,7 @@ struct FeatureFlags { stream_logs: RwLock, map_enobufs_to_wouldblock: AtomicBool, gateway_userspace_dns_a_aaaa_records: AtomicBool, + icmp_error_unreachable_prohibited_create_new_flow: AtomicBool, } /// Accessors to the actual feature flags. @@ -199,6 +215,7 @@ impl FeatureFlags { stream_logs, map_enobufs_to_wouldblock, gateway_userspace_dns_a_aaaa_records, + icmp_error_unreachable_prohibited_create_new_flow, }: FeatureFlagsResponse, payloads: FeatureFlagPayloadsResponse, ) { @@ -210,6 +227,11 @@ impl FeatureFlags { .store(map_enobufs_to_wouldblock, Ordering::Relaxed); self.gateway_userspace_dns_a_aaaa_records .store(gateway_userspace_dns_a_aaaa_records, Ordering::Relaxed); + self.icmp_error_unreachable_prohibited_create_new_flow + .store( + icmp_error_unreachable_prohibited_create_new_flow, + Ordering::Relaxed, + ); let log_filter = if stream_logs { LogFilter::parse(payloads.stream_logs) @@ -241,6 +263,44 @@ impl FeatureFlags { self.gateway_userspace_dns_a_aaaa_records .load(Ordering::Relaxed) } + + fn icmp_error_unreachable_prohibited_create_new_flow(&self) -> bool { + self.icmp_error_unreachable_prohibited_create_new_flow + .load(Ordering::Relaxed) + } +} + +fn update_from_env(flags: FeatureFlagsResponse) -> FeatureFlagsResponse { + FeatureFlagsResponse { + icmp_unreachable_instead_of_nat64: env_or( + "FZFF_ICMP_UNREACHABLE_INSTEAD_OF_NAT64", + flags.icmp_unreachable_instead_of_nat64, + ), + drop_llmnr_nxdomain_responses: env_or( + "FZFF_DROP_LLMNR_NXDOMAIN_RESPONSES", + flags.drop_llmnr_nxdomain_responses, + ), + stream_logs: env_or("FZFF_stream_logs", flags.stream_logs), + map_enobufs_to_wouldblock: env_or( + "FZFF_MAP_ENOBUFS_TO_WOULDBLOCK", + flags.map_enobufs_to_wouldblock, + ), + gateway_userspace_dns_a_aaaa_records: env_or( + "FZFF_GATEWAY_USERSPACE_DNS_A_AAAA_RECORDS", + flags.gateway_userspace_dns_a_aaaa_records, + ), + icmp_error_unreachable_prohibited_create_new_flow: env_or( + "FZFF_ICMP_ERROR_UNREACHABLE_PROHIBITED_CREATE_NEW_FLOW", + flags.icmp_error_unreachable_prohibited_create_new_flow, + ), + } +} + +fn env_or(key: &str, fallback: bool) -> bool { + std::env::var(key) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(fallback) } fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context { @@ -252,6 +312,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context StreamLogs { result: bool }, MapENOBUFSToWouldBlock { result: bool }, GatewayUserspaceDnsAAaaaRecords { result: bool }, + IcmpErrorUnreachableProhibitedCreateNewFlow { result: bool }, } // Exhaustive destruction so we don't forget to update this when we add a flag. @@ -261,6 +322,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context stream_logs, map_enobufs_to_wouldblock, gateway_userspace_dns_a_aaaa_records, + icmp_error_unreachable_prohibited_create_new_flow, } = flags; let value = serde_json::json!({ @@ -272,6 +334,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context SentryFlag::StreamLogs { result: stream_logs }, SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock }, SentryFlag::GatewayUserspaceDnsAAaaaRecords { result: gateway_userspace_dns_a_aaaa_records }, + SentryFlag::IcmpErrorUnreachableProhibitedCreateNewFlow { result: icmp_error_unreachable_prohibited_create_new_flow }, ] }); diff --git a/scripts/tests/create-flow-from-icmp-error.sh b/scripts/tests/create-flow-from-icmp-error.sh new file mode 100755 index 000000000..7bbb6b893 --- /dev/null +++ b/scripts/tests/create-flow-from-icmp-error.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +source "./scripts/tests/lib.sh" + +# Authorize resource 1 +client_curl_resource "172.20.0.100/get" +client_curl_resource "[172:20:0::100]/get" + +# Authorize resource 2 (important, otherwise the Gateway will close the connection on the last resource being removed) +client_ping_resource download.httpbin + +# Revoke access to resource 1 +api_send_reject_access "mycro-aws-gws" "MyCorp Network" # This is the 172.20.0.1/16 network +api_send_reject_access "mycro-aws-gws" "MyCorp Network (IPv6)" # This is the 172:20:0::1/64 network + +# Try to access resource 1 again +# First one for each IP will fail because we get an ICMP error. +expect_error client_curl_resource "172.20.0.100/get" +expect_error client_curl_resource "[172:20:0::100]/get" + +client_curl_resource "172.20.0.100/get" +client_curl_resource "[172:20:0::100]/get" diff --git a/scripts/tests/lib.sh b/scripts/tests/lib.sh index c557e6d9d..69acbe147 100755 --- a/scripts/tests/lib.sh +++ b/scripts/tests/lib.sh @@ -33,6 +33,23 @@ function client_nslookup() { client timeout 30 sh -c "nslookup $1 | tee >(cat 1>&2) | tail -n +4" } +function api_send_reject_access() { + local site_name="$1" + local resource_name="$2" + + docker compose exec -T api bin/api rpc " +Application.ensure_all_started(:domain) +account_id = \"c89bcc8c-9392-4dae-a40d-888aef6d28e0\" + +[gateway_group] = Domain.Gateways.Group.Query.not_deleted() |> Domain.Gateways.Group.Query.by_account_id(account_id) |> Domain.Gateways.Group.Query.by_name(\"$site_name\") |> Domain.Repo.all() +[gateway_id | _] = Domain.Gateways.Presence.Group.list(gateway_group.id) |> Map.keys() +[client_id | _] = Domain.Clients.Presence.Account.list(account_id) |> Map.keys() +[resource] = Domain.Resources.Resource.Query.not_deleted() |> Domain.Resources.Resource.Query.by_account_id(account_id) |> Domain.Repo.all() |> Enum.filter(&(&1.name == \"$resource_name\")) + +Domain.PubSub.Account.broadcast(account_id, {{:reject_access, gateway_id}, client_id, resource.id}) +" +} + function assert_equals() { local actual="$1" local expected="$2" @@ -69,3 +86,13 @@ function create_token_file { # cut into a release. sudo cp "$TOKEN_PATH" "$TOKEN_PATH.txt" } + +# Expects a command to fail (non-zero exit code) +# Usage: expect_error your_command arg1 arg2 +function expect_error() { + if "$@"; then + return 1 + else + return 0 + fi +}