//go:build linux
// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nftables

//
// NOTE: this needs to be tested in e2e since it uses nftables for everything.
//

import (
	"context"
	"crypto/sha256"
	"encoding/base32"
	"fmt"
	"net"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/danwinship/knftables"

	v1 "k8s.io/api/core/v1"
	discovery "k8s.io/api/discovery/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/events"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/proxy"
	"k8s.io/kubernetes/pkg/proxy/conntrack"
	"k8s.io/kubernetes/pkg/proxy/healthcheck"
	"k8s.io/kubernetes/pkg/proxy/metaproxier"
	"k8s.io/kubernetes/pkg/proxy/metrics"
	proxyutil "k8s.io/kubernetes/pkg/proxy/util"
	proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
	"k8s.io/kubernetes/pkg/util/async"
	utilexec "k8s.io/utils/exec"
	netutils "k8s.io/utils/net"
	"k8s.io/utils/ptr"
)

const (
	// Our nftables table. All of our chains/sets/maps are created inside this table,
	// so they don't need any "kube-" or "kube-proxy-" prefix of their own.
	kubeProxyTable = "kube-proxy"

	// base chains
	filterPreroutingChain     = "filter-prerouting"
	filterInputChain          = "filter-input"
	filterForwardChain        = "filter-forward"
	filterOutputChain         = "filter-output"
	filterOutputPostDNATChain = "filter-output-post-dnat"
	natPreroutingChain        = "nat-prerouting"
	natOutputChain            = "nat-output"
	natPostroutingChain       = "nat-postrouting"

	// service dispatch
	servicesChain       = "services"
	serviceIPsMap       = "service-ips"
	serviceNodePortsMap = "service-nodeports"

	// set of IPs that accept NodePort traffic
	nodePortIPsSet = "nodeport-ips"

	// set of active ClusterIPs.
	clusterIPsSet = "cluster-ips"

	// handling for services with no endpoints
	serviceEndpointsCheckChain  = "service-endpoints-check"
	nodePortEndpointsCheckChain = "nodeport-endpoints-check"
	noEndpointServicesMap       = "no-endpoint-services"
	noEndpointNodePortsMap      = "no-endpoint-nodeports"
	rejectChain                 = "reject-chain"

	// handling traffic to unallocated ClusterIPs and undefined ports of ClusterIPs
	clusterIPsCheckChain = "cluster-ips-check"

	// LoadBalancerSourceRanges handling
	firewallIPsMap     = "firewall-ips"
	firewallCheckChain = "firewall-check"

	// masquerading
	markMasqChain     = "mark-for-masquerade"
	masqueradingChain = "masquerading"
)

// internal struct for service port information
type servicePortInfo struct {
	*proxy.BaseServicePortInfo
	// The following fields are computed and stored for performance reasons.
	nameString             string
	clusterPolicyChainName string
	localPolicyChainName   string
	externalChainName      string
	firewallChainName      string
}

// returns a new proxy.ServicePort which abstracts a serviceInfo
func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *proxy.BaseServicePortInfo) proxy.ServicePort {
	svcPort := &servicePortInfo{BaseServicePortInfo: bsvcPortInfo}

	// Store the following for performance reasons.
	svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
	svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
	svcPort.nameString = svcPortName.String()

	chainNameBase := servicePortChainNameBase(&svcPortName, strings.ToLower(string(svcPort.Protocol())))
	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
	svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase

	return svcPort
}

// internal struct for endpoints information
type endpointInfo struct {
	*proxy.BaseEndpointInfo

	chainName       string
	affinitySetName string
}

// returns a new proxy.Endpoint which abstracts an endpointInfo
func newEndpointInfo(baseInfo *proxy.BaseEndpointInfo, svcPortName *proxy.ServicePortName) proxy.Endpoint {
	chainNameBase := servicePortEndpointChainNameBase(svcPortName, strings.ToLower(string(svcPortName.Protocol)), baseInfo.String())
	return &endpointInfo{
		BaseEndpointInfo: baseInfo,
		chainName:        servicePortEndpointChainNamePrefix + chainNameBase,
		affinitySetName:  servicePortEndpointAffinityNamePrefix + chainNameBase,
	}
}
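
// For example (hypothetical names), the endpoint "10.180.0.1:80" of service port
// "ns1/svc1:p80" over TCP would get a chain name like
// "endpoint-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80" and an affinity set name like
// "affinity-5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80"; see
// servicePortEndpointChainNameBase below for how the shared base name is built.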

// Proxier is an nftables-based proxy
type Proxier struct {
	// ipFamily defines the IP family which this proxier is tracking.
	ipFamily v1.IPFamily

	// endpointsChanges and serviceChanges contain all changes to endpoints and
	// services that happened since nftables was synced. For a single object,
	// changes are accumulated, i.e. previous is state from before all of them,
	// current is state after applying all of those.
	endpointsChanges *proxy.EndpointsChangeTracker
	serviceChanges   *proxy.ServiceChangeTracker

	mu           sync.Mutex // protects the following fields
	svcPortMap   proxy.ServicePortMap
	endpointsMap proxy.EndpointsMap
	nodeLabels   map[string]string
	// endpointSlicesSynced and servicesSynced are set to true
	// when corresponding objects are synced after startup. This is used to avoid
	// updating nftables with some partial data after kube-proxy restart.
	endpointSlicesSynced bool
	servicesSynced       bool
	initialized          int32
	syncRunner           *async.BoundedFrequencyRunner // governs calls to syncProxyRules
	syncPeriod           time.Duration

	// These are effectively const and do not need the mutex to be held.
	nftables       knftables.Interface
	masqueradeAll  bool
	masqueradeMark string
	conntrack      conntrack.Interface
	localDetector  proxyutiliptables.LocalTrafficDetector
	hostname       string
	nodeIP         net.IP
	recorder       events.EventRecorder

	serviceHealthServer healthcheck.ServiceHealthServer
	healthzServer       *healthcheck.ProxierHealthServer

	// nodePortAddresses selects the interfaces where nodePort works.
	nodePortAddresses *proxyutil.NodePortAddresses
	// networkInterfacer defines an interface for several net library functions.
	// Injected for test purposes.
	networkInterfacer proxyutil.NetworkInterfacer

	// staleChains contains information about chains to be deleted later
	staleChains map[string]time.Time

	// serviceCIDRs is a comma-separated list of the ServiceCIDRs belonging to the
	// IP family this proxier is operating on; it can be consumed directly by knftables.
	serviceCIDRs string
}

// Proxier implements proxy.Provider
var _ proxy.Provider = &Proxier{}

// NewProxier returns a new nftables Proxier. Once a proxier is created, it will keep
// nftables up to date in the background and will not terminate if a particular nftables
// call fails.
func NewProxier(ipFamily v1.IPFamily,
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetector proxyutiliptables.LocalTrafficDetector,
	hostname string,
	nodeIP net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddressStrings []string,
	initOnly bool,
) (*Proxier, error) {
	nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings, nodeIP)

	if initOnly {
		klog.InfoS("System initialized and --init-only specified")
		return nil, nil
	}

	// Generate the masquerade mark to use for SNAT rules.
	masqueradeValue := 1 << uint(masqueradeBit)
	masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
	klog.V(2).InfoS("Using nftables mark for masquerade", "ipFamily", ipFamily, "mark", masqueradeMark)
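
	// For example, with the kube-proxy default masqueradeBit of 14, masqueradeValue
	// is 1<<14 = 0x4000, so all of the mark tests and updates below operate on that
	// single bit.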

	serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer)

	var nftablesFamily knftables.Family
	if ipFamily == v1.IPv4Protocol {
		nftablesFamily = knftables.IPv4Family
	} else {
		nftablesFamily = knftables.IPv6Family
	}
	nft, err := knftables.New(nftablesFamily, kubeProxyTable)
	if err != nil {
		return nil, err
	}

	proxier := &Proxier{
		ipFamily:            ipFamily,
		svcPortMap:          make(proxy.ServicePortMap),
		serviceChanges:      proxy.NewServiceChangeTracker(newServiceInfo, ipFamily, recorder, nil),
		endpointsMap:        make(proxy.EndpointsMap),
		endpointsChanges:    proxy.NewEndpointsChangeTracker(hostname, newEndpointInfo, ipFamily, recorder, nil),
		syncPeriod:          syncPeriod,
		nftables:            nft,
		masqueradeAll:       masqueradeAll,
		masqueradeMark:      masqueradeMark,
		conntrack:           conntrack.NewExec(utilexec.New()),
		localDetector:       localDetector,
		hostname:            hostname,
		nodeIP:              nodeIP,
		recorder:            recorder,
		serviceHealthServer: serviceHealthServer,
		healthzServer:       healthzServer,
		nodePortAddresses:   nodePortAddresses,
		networkInterfacer:   proxyutil.RealNetwork{},
		staleChains:         make(map[string]time.Time),
	}

	burstSyncs := 2
	klog.V(2).InfoS("NFTables sync params", "ipFamily", ipFamily, "minSyncPeriod", minSyncPeriod, "syncPeriod", syncPeriod, "burstSyncs", burstSyncs)
	proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)

	return proxier, nil
}

// NewDualStackProxier creates a MetaProxier instance, with IPv4 and IPv6 proxies.
func NewDualStackProxier(
	sysctl utilsysctl.Interface,
	syncPeriod time.Duration,
	minSyncPeriod time.Duration,
	masqueradeAll bool,
	masqueradeBit int,
	localDetectors [2]proxyutiliptables.LocalTrafficDetector,
	hostname string,
	nodeIPs map[v1.IPFamily]net.IP,
	recorder events.EventRecorder,
	healthzServer *healthcheck.ProxierHealthServer,
	nodePortAddresses []string,
	initOnly bool,
) (proxy.Provider, error) {
	// Create an ipv4 instance of the single-stack proxier
	ipv4Proxier, err := NewProxier(v1.IPv4Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[0], hostname,
		nodeIPs[v1.IPv4Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv4 proxier: %v", err)
	}

	ipv6Proxier, err := NewProxier(v1.IPv6Protocol, sysctl,
		syncPeriod, minSyncPeriod, masqueradeAll, masqueradeBit, localDetectors[1], hostname,
		nodeIPs[v1.IPv6Protocol], recorder, healthzServer, nodePortAddresses, initOnly)
	if err != nil {
		return nil, fmt.Errorf("unable to create ipv6 proxier: %v", err)
	}
	if initOnly {
		return nil, nil
	}
	return metaproxier.NewMetaProxier(ipv4Proxier, ipv6Proxier), nil
}

// nftablesBaseChains lists our "base chains"; those that are directly connected to the
// netfilter hooks (e.g., "postrouting", "input", etc.), as opposed to "regular" chains,
// which are only run when a rule jumps to them. See
// https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains.
//
// These are set up from setupNFTables() and then not directly referenced by
// syncProxyRules().
//
// All of our base chains have names that are just "${type}-${hook}". e.g., "nat-prerouting".
type nftablesBaseChain struct {
	name      string
	chainType knftables.BaseChainType
	hook      knftables.BaseChainHook
	priority  knftables.BaseChainPriority
}

var nftablesBaseChains = []nftablesBaseChain{
	// We want our filtering rules to operate on pre-DNAT dest IPs, so our filter
	// chains have to run before DNAT.
	{filterPreroutingChain, knftables.FilterType, knftables.PreroutingHook, knftables.DNATPriority + "-10"},
	{filterInputChain, knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-10"},
	{filterForwardChain, knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-10"},
	{filterOutputChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-10"},
	{filterOutputPostDNATChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "+10"},
	{natPreroutingChain, knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
	{natOutputChain, knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
	{natPostroutingChain, knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
}
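
// As an illustration of the priority arithmetic (assuming knftables renders
// DNATPriority as the standard "dstnat" priority, -100): "dstnat-10" places the
// filter-* chains at priority -110, just before DNAT, while "dstnat+10" places
// filter-output-post-dnat at -90, just after DNAT has rewritten the destination.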

// nftablesJumpChains lists our top-level "regular chains" that are jumped to directly
// from one of the base chains. These are set up from setupNFTables(), and some of them
// are also referenced in syncProxyRules().
type nftablesJumpChain struct {
	dstChain  string
	srcChain  string
	extraArgs string
}

var nftablesJumpChains = []nftablesJumpChain{
	// We can't jump to endpointsCheckChain from filter-prerouting like
	// firewallCheckChain because reject action is only valid in chains using the
	// input, forward or output hooks.
	{nodePortEndpointsCheckChain, filterInputChain, "ct state new"},
	{serviceEndpointsCheckChain, filterInputChain, "ct state new"},
	{serviceEndpointsCheckChain, filterForwardChain, "ct state new"},
	{serviceEndpointsCheckChain, filterOutputChain, "ct state new"},

	{firewallCheckChain, filterPreroutingChain, "ct state new"},
	{firewallCheckChain, filterOutputChain, "ct state new"},

	{servicesChain, natOutputChain, ""},
	{servicesChain, natPreroutingChain, ""},
	{masqueradingChain, natPostroutingChain, ""},

	{clusterIPsCheckChain, filterForwardChain, "ct state new"},
	{clusterIPsCheckChain, filterOutputPostDNATChain, "ct state new"},
}
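
// For example, the {firewallCheckChain, filterPreroutingChain, "ct state new"} entry
// above makes setupNFTables() emit a rule equivalent to
// "ct state new jump firewall-check" in the filter-prerouting base chain.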

// ensureChain adds commands to tx to ensure that chain exists and doesn't contain
// anything from before this transaction (using createdChains to ensure that we don't
// Flush a chain more than once and lose *new* rules as well.)
func ensureChain(chain string, tx *knftables.Transaction, createdChains sets.Set[string]) {
	if createdChains.Has(chain) {
		return
	}
	tx.Add(&knftables.Chain{
		Name: chain,
	})
	tx.Flush(&knftables.Chain{
		Name: chain,
	})
	createdChains.Insert(chain)
}

func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	noLocalhost := "ip daddr != 127.0.0.0/8"
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
		noLocalhost = "ip6 daddr != ::1"
	}

	tx.Add(&knftables.Table{
		Comment: ptr.To("rules for kube-proxy"),
	})

	// Create and flush base chains
	for _, bc := range nftablesBaseChains {
		chain := &knftables.Chain{
			Name:     bc.name,
			Type:     ptr.To(bc.chainType),
			Hook:     ptr.To(bc.hook),
			Priority: ptr.To(bc.priority),
		}
		tx.Add(chain)
		tx.Flush(chain)
	}

	// Create and flush ordinary chains and add rules jumping to them
	createdChains := sets.New[string]()
	for _, c := range nftablesJumpChains {
		ensureChain(c.dstChain, tx, createdChains)
		tx.Add(&knftables.Rule{
			Chain: c.srcChain,
			Rule: knftables.Concat(
				c.extraArgs,
				"jump", c.dstChain,
			),
		})
	}

	// Ensure all of our other "top-level" chains exist
	for _, chain := range []string{servicesChain, clusterIPsCheckChain, masqueradingChain, markMasqChain} {
		ensureChain(chain, tx, createdChains)
	}

	// Add the rules in the mark-for-masquerade and masquerading chains
	tx.Add(&knftables.Rule{
		Chain: markMasqChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "or", proxier.masqueradeMark,
		),
	})

	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule: knftables.Concat(
			"mark", "and", proxier.masqueradeMark, "==", "0",
			"return",
		),
	})
	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule: knftables.Concat(
			"mark", "set", "mark", "xor", proxier.masqueradeMark,
		),
	})
	tx.Add(&knftables.Rule{
		Chain: masqueradingChain,
		Rule:  "masquerade fully-random",
	})
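
	// Assuming the default masquerade bit, the masquerading chain thus reads roughly:
	//   mark and 0x4000 == 0 return
	//   mark set mark xor 0x4000
	//   masquerade fully-random
	// i.e. packets without the mark are left alone, and marked packets have the bit
	// cleared and are then masqueraded.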

	// add cluster-ips set.
	tx.Add(&knftables.Set{
		Name:    clusterIPsSet,
		Type:    ipvX_addr,
		Comment: ptr.To("Active ClusterIPs"),
	})

	// reject traffic to invalid ports of ClusterIPs.
	tx.Add(&knftables.Rule{
		Chain: clusterIPsCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", "@", clusterIPsSet, "reject",
		),
		Comment: ptr.To("Reject traffic to invalid ports of ClusterIPs"),
	})

	// drop traffic to unallocated ClusterIPs.
	if len(proxier.serviceCIDRs) > 0 {
		tx.Add(&knftables.Rule{
			Chain: clusterIPsCheckChain,
			Rule: knftables.Concat(
				ipX, "daddr", "{", proxier.serviceCIDRs, "}",
				"drop",
			),
			Comment: ptr.To("Drop traffic to unallocated ClusterIPs"),
		})
	}
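
	// For example, in the IPv4 case with a (hypothetical) service CIDR of
	// 10.96.0.0/16, the cluster-ips-check chain ends up roughly as:
	//   ip daddr @cluster-ips reject
	//   ip daddr { 10.96.0.0/16 } drop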

	// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
	// rather than just "delete" when we want to ensure the set doesn't exist, because
	// doing just "delete" would return an error if the set didn't exist.)
	tx.Add(&knftables.Set{
		Name:    nodePortIPsSet,
		Type:    ipvX_addr,
		Comment: ptr.To("IPs that accept NodePort traffic"),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Delete(&knftables.Set{
			Name: nodePortIPsSet,
		})
	} else {
		tx.Flush(&knftables.Set{
			Name: nodePortIPsSet,
		})
		nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
		if err != nil {
			klog.ErrorS(err, "Failed to get node ip address matching nodeport cidrs, services with nodeport may not work as intended", "CIDRs", proxier.nodePortAddresses)
		}
		for _, ip := range nodeIPs {
			if ip.IsLoopback() {
				klog.ErrorS(nil, "--nodeport-addresses includes localhost but localhost NodePorts are not supported", "address", ip.String())
				continue
			}
			tx.Add(&knftables.Element{
				Set: nodePortIPsSet,
				Key: []string{
					ip.String(),
				},
			})
		}
	}
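
	// For example, if kube-proxy is run with --nodeport-addresses=192.168.1.0/24
	// (hypothetical value), the nodeport-ips set will contain whichever of this
	// node's addresses fall inside that CIDR, and NodePort traffic addressed to any
	// other local IP will be ignored.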

	// Set up "no endpoints" drop/reject handling
	tx.Add(&knftables.Map{
		Name:    noEndpointServicesMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"),
	})
	tx.Add(&knftables.Map{
		Name:    noEndpointNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"),
	})

	tx.Add(&knftables.Chain{
		Name:    rejectChain,
		Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"),
	})
	tx.Flush(&knftables.Chain{
		Name: rejectChain,
	})
	tx.Add(&knftables.Rule{
		Chain: rejectChain,
		Rule:  "reject",
	})
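
	// syncProxyRules() fills these vmaps with one element per unservable service
	// port; e.g. a ClusterIP service with no endpoints at all would get an element
	// like (hypothetical address)
	//   10.96.5.12 . tcp . 80 : goto reject-chain
	// while one whose internalTrafficPolicy is "Local" but which has no local
	// endpoints gets a "drop" verdict instead.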

	tx.Add(&knftables.Rule{
		Chain: serviceEndpointsCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", noEndpointServicesMap,
		),
	})

	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: nodePortEndpointsCheckChain,
			Rule: knftables.Concat(
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", noEndpointNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: nodePortEndpointsCheckChain,
			Rule: knftables.Concat(
				ipX, "daddr", "@", nodePortIPsSet,
				"meta l4proto . th dport",
				"vmap", "@", noEndpointNodePortsMap,
			),
		})
	}

	// Set up LoadBalancerSourceRanges firewalling
	tx.Add(&knftables.Map{
		Name:    firewallIPsMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
	})

	ensureChain(firewallCheckChain, tx, createdChains)
	tx.Add(&knftables.Rule{
		Chain: firewallCheckChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", firewallIPsMap,
		),
	})

	// Set up service dispatch
	tx.Add(&knftables.Map{
		Name:    serviceIPsMap,
		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
		Comment: ptr.To("ClusterIP, ExternalIP and LoadBalancer IP traffic"),
	})
	tx.Add(&knftables.Map{
		Name:    serviceNodePortsMap,
		Type:    "inet_proto . inet_service : verdict",
		Comment: ptr.To("NodePort traffic"),
	})
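
	// Each serviceable service port becomes one element in these vmaps; e.g.
	// (hypothetical names and address) a ClusterIP service might contribute
	//   10.96.5.12 . tcp . 80 : goto service-ULMVA6XW-ns1/svc1/tcp/p80
	// to service-ips, and a NodePort service
	//   tcp . 30080 : goto external-ULMVA6XW-ns1/svc1/tcp/p80
	// to service-nodeports; see syncProxyRules().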
	tx.Add(&knftables.Rule{
		Chain: servicesChain,
		Rule: knftables.Concat(
			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", serviceIPsMap,
		),
	})
	if proxier.nodePortAddresses.MatchAll() {
		tx.Add(&knftables.Rule{
			Chain: servicesChain,
			Rule: knftables.Concat(
				"fib daddr type local",
				noLocalhost,
				"meta l4proto . th dport",
				"vmap", "@", serviceNodePortsMap,
			),
		})
	} else {
		tx.Add(&knftables.Rule{
			Chain: servicesChain,
			Rule: knftables.Concat(
				ipX, "daddr @nodeport-ips",
				"meta l4proto . th dport",
				"vmap", "@", serviceNodePortsMap,
			),
		})
	}
}

// CleanupLeftovers removes all nftables rules and chains created by the Proxier
// It returns true if an error was encountered. Errors are logged.
func CleanupLeftovers() bool {
	var encounteredError bool

	for _, family := range []knftables.Family{knftables.IPv4Family, knftables.IPv6Family} {
		nft, err := knftables.New(family, kubeProxyTable)
		if err == nil {
			tx := nft.NewTransaction()
			tx.Delete(&knftables.Table{})
			err = nft.Run(context.TODO(), tx)
		}
		if err != nil && !knftables.IsNotFound(err) {
			klog.ErrorS(err, "Error cleaning up nftables rules")
			encounteredError = true
		}
	}

	return encounteredError
}

// Sync is called to synchronize the proxier state to nftables as soon as possible.
func (proxier *Proxier) Sync() {
	if proxier.healthzServer != nil {
		proxier.healthzServer.QueuedUpdate(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Run()
}

// SyncLoop runs periodic work.  This is expected to run as a goroutine or as the main loop of the app.  It does not return.
func (proxier *Proxier) SyncLoop() {
	// Update healthz timestamp at beginning in case Sync() never succeeds.
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}

	// synthesize "last change queued" time as the informers are syncing.
	metrics.SyncProxyRulesLastQueuedTimestamp.SetToCurrentTime()
	proxier.syncRunner.Loop(wait.NeverStop)
}

func (proxier *Proxier) setInitialized(value bool) {
	var initialized int32
	if value {
		initialized = 1
	}
	atomic.StoreInt32(&proxier.initialized, initialized)
}

func (proxier *Proxier) isInitialized() bool {
	return atomic.LoadInt32(&proxier.initialized) > 0
}

// OnServiceAdd is called whenever creation of new service object
// is observed.
func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
	proxier.OnServiceUpdate(nil, service)
}

// OnServiceUpdate is called whenever modification of an existing
// service object is observed.
func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
	if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnServiceDelete is called whenever deletion of an existing service
// object is observed.
func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
	proxier.OnServiceUpdate(service, nil)
}

// OnServiceSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnServiceSynced() {
	proxier.mu.Lock()
	proxier.servicesSynced = true
	proxier.setInitialized(proxier.endpointSlicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}

// OnEndpointSliceAdd is called whenever creation of a new endpoint slice object
// is observed.
func (proxier *Proxier) OnEndpointSliceAdd(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceUpdate is called whenever modification of an existing endpoint
// slice object is observed.
func (proxier *Proxier) OnEndpointSliceUpdate(_, endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, false) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSliceDelete is called whenever deletion of an existing endpoint slice
// object is observed.
func (proxier *Proxier) OnEndpointSliceDelete(endpointSlice *discovery.EndpointSlice) {
	if proxier.endpointsChanges.EndpointSliceUpdate(endpointSlice, true) && proxier.isInitialized() {
		proxier.Sync()
	}
}

// OnEndpointSlicesSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnEndpointSlicesSynced() {
	proxier.mu.Lock()
	proxier.endpointSlicesSynced = true
	proxier.setInitialized(proxier.servicesSynced)
	proxier.mu.Unlock()

	// Sync unconditionally - this is called once per lifetime.
	proxier.syncProxyRules()
}

// OnNodeAdd is called whenever creation of new node object
// is observed.
func (proxier *Proxier) OnNodeAdd(node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeUpdate is called whenever modification of an existing
// node object is observed.
func (proxier *Proxier) OnNodeUpdate(oldNode, node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	if reflect.DeepEqual(proxier.nodeLabels, node.Labels) {
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = map[string]string{}
	for k, v := range node.Labels {
		proxier.nodeLabels[k] = v
	}
	proxier.mu.Unlock()
	klog.V(4).InfoS("Updated proxier node labels", "labels", node.Labels)

	proxier.Sync()
}

// OnNodeDelete is called whenever deletion of an existing node
// object is observed.
func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
	if node.Name != proxier.hostname {
		klog.ErrorS(nil, "Received a watch event for a node that doesn't match the current node",
			"eventNode", node.Name, "currentNode", proxier.hostname)
		return
	}

	proxier.mu.Lock()
	proxier.nodeLabels = nil
	proxier.mu.Unlock()

	proxier.Sync()
}

// OnNodeSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
func (proxier *Proxier) OnNodeSynced() {
}

// OnServiceCIDRsChanged is called whenever a change is observed
// in any of the ServiceCIDRs, and provides complete list of service cidrs.
func (proxier *Proxier) OnServiceCIDRsChanged(cidrs []string) {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	cidrsForProxier := make([]string, 0)
	for _, cidr := range cidrs {
		isIPv4CIDR := netutils.IsIPv4CIDRString(cidr)
		if proxier.ipFamily == v1.IPv4Protocol && isIPv4CIDR {
			cidrsForProxier = append(cidrsForProxier, cidr)
		}

		if proxier.ipFamily == v1.IPv6Protocol && !isIPv4CIDR {
			cidrsForProxier = append(cidrsForProxier, cidr)
		}
	}
	proxier.serviceCIDRs = strings.Join(cidrsForProxier, ",")
}
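
// For example, given cidrs = ["10.96.0.0/16", "fd00:10:96::/112"] (hypothetical
// values), an IPv4 proxier ends up with serviceCIDRs == "10.96.0.0/16" and an IPv6
// proxier with "fd00:10:96::/112", ready to be interpolated into the "drop traffic
// to unallocated ClusterIPs" rule in setupNFTables().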

const (
	// Maximum length for one of our chain name prefixes, including the trailing
	// hyphen.
	chainNamePrefixLengthMax = 16

	// Maximum length of the string returned from servicePortChainNameBase or
	// servicePortEndpointChainNameBase.
	chainNameBaseLengthMax = knftables.NameLengthMax - chainNamePrefixLengthMax
)

const (
	servicePortPolicyClusterChainNamePrefix = "service-"
	servicePortPolicyLocalChainNamePrefix   = "local-"
	serviceExternalChainNamePrefix          = "external-"
	servicePortEndpointChainNamePrefix      = "endpoint-"
	servicePortEndpointAffinityNamePrefix   = "affinity-"
	servicePortFirewallChainNamePrefix      = "firewall-"
)

// hashAndTruncate prefixes name with a hash of itself and then truncates to
// chainNameBaseLengthMax. The hash ensures that (a) the name is still unique if we have
// to truncate the end, and (b) it's visually distinguishable from other chains that would
// otherwise have nearly identical names (e.g., different endpoint chains for a given
// service that differ in only a single digit).
func hashAndTruncate(name string) string {
	hash := sha256.Sum256([]byte(name))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	name = encoded[:8] + "-" + name
	if len(name) > chainNameBaseLengthMax {
		name = name[:chainNameBaseLengthMax-3] + "..."
	}
	return name
}
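
// For example, hashAndTruncate("ns1/svc1/tcp/p80") produces a name of the form
// "ULMVA6XW-ns1/svc1/tcp/p80": the first 8 characters of the base32-encoded SHA-256
// of the input, a hyphen, and then the input itself (truncated with a trailing "..."
// only if the result would exceed chainNameBaseLengthMax).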

// servicePortChainNameBase returns the base name for a chain for the given ServicePort.
// This is something like "HASH-namespace/serviceName/protocol/portName", e.g.,
// "ULMVA6XW-ns1/svc1/tcp/p80".
func servicePortChainNameBase(servicePortName *proxy.ServicePortName, protocol string) string {
	// nftables chains can contain the characters [A-Za-z0-9_./-] (but must start with
	// a letter, underscore, or dot).
	//
	// Namespace, Service, and Port names can contain [a-z0-9-] (with some additional
	// restrictions that aren't relevant here).
	//
	// Protocol is /(tcp|udp|sctp)/.
	//
	// Thus, we can safely use all Namespace names, Service names, protocol values,
	// and Port names directly in nftables chain names (though note that this assumes
	// that the chain name won't *start* with any of those strings, since that might
	// be illegal). We use "/" to separate the parts of the name, which is one of the
	// two characters allowed in a chain name that isn't allowed in our input strings.

	name := fmt.Sprintf("%s/%s/%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
	)

	// The namespace, service, and port name can each be up to 63 characters, protocol
	// can be up to 4, plus 8 for the hash and 4 additional punctuation characters.
	// That's a total of 205, which is less than chainNameBaseLengthMax (240). So this
	// will never actually return a truncated name.
	return hashAndTruncate(name)
}

// servicePortEndpointChainNameBase returns the suffix for chain names for the given
// endpoint. This is something like
// "HASH-namespace/serviceName/protocol/portName__endpointIP/endpointport", e.g.,
// "5OJB2KTY-ns1/svc1/tcp/p80__10.180.0.1/80".
func servicePortEndpointChainNameBase(servicePortName *proxy.ServicePortName, protocol, endpoint string) string {
	// As above in servicePortChainNameBase: Namespace, Service, Port, Protocol, and
	// EndpointPort are all safe to copy into the chain name directly. But if
	// EndpointIP is IPv6 then it will contain colons, which aren't allowed in a chain
	// name. IPv6 IPs are also quite long, but we can't safely truncate them (e.g. to
	// only the final segment) because (especially for manually-created external
	// endpoints), we can't know for sure that any part of them is redundant.

	endpointIP, endpointPort, _ := net.SplitHostPort(endpoint)
	if strings.Contains(endpointIP, ":") {
		endpointIP = strings.ReplaceAll(endpointIP, ":", ".")
	}

	// As above, we use "/" to separate parts of the name, and "__" to separate the
	// "service" part from the "endpoint" part.
	name := fmt.Sprintf("%s/%s/%s/%s__%s/%s",
		servicePortName.NamespacedName.Namespace,
		servicePortName.NamespacedName.Name,
		protocol,
		servicePortName.Port,
		endpointIP,
		endpointPort,
	)

	// The part of name before the "__" can be up to 205 characters (as with
	// servicePortChainNameBase above). An IPv6 address can be up to 39 characters, and
	// a port can be up to 5 digits, plus 3 punctuation characters gives a max total
	// length of 252, well over chainNameBaseLengthMax (240), so truncation is
	// theoretically possible (though incredibly unlikely).
	return hashAndTruncate(name)
}

func isServiceChainName(chainString string) bool {
	// The chains returned from servicePortChainNameBase and
	// servicePortEndpointChainNameBase will always have at least one "/" in them.
	// Since none of our "stock" chain names use slashes, we can distinguish them this
	// way.
	return strings.Contains(chainString, "/")
}

func isAffinitySetName(set string) bool {
	return strings.HasPrefix(set, servicePortEndpointAffinityNamePrefix)
}

// This is where all of the nftables calls happen.
// This assumes proxier.mu is NOT held
func (proxier *Proxier) syncProxyRules() {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	// don't sync rules till we've received services and endpoints
	if !proxier.isInitialized() {
		klog.V(2).InfoS("Not syncing nftables until Services and Endpoints have been received from master")
		return
	}

	//
	// Below this point we will not return until we try to write the nftables rules.
	//

	// Keep track of how long syncs take.
	start := time.Now()
	defer func() {
		metrics.SyncProxyRulesLatency.Observe(metrics.SinceInSeconds(start))
		klog.V(2).InfoS("SyncProxyRules complete", "elapsed", time.Since(start))
	}()

	serviceUpdateResult := proxier.svcPortMap.Update(proxier.serviceChanges)
	endpointUpdateResult := proxier.endpointsMap.Update(proxier.endpointsChanges)

	klog.V(2).InfoS("Syncing nftables rules")

	success := false
	defer func() {
		if !success {
			klog.InfoS("Sync failed", "retryingTime", proxier.syncPeriod)
			proxier.syncRunner.RetryAfter(proxier.syncPeriod)
		}
	}()

	// If there are sufficiently-stale chains left over from previous transactions,
	// try to delete them now.
	if len(proxier.staleChains) > 0 {
		oneSecondAgo := start.Add(-time.Second)
		tx := proxier.nftables.NewTransaction()
		deleted := 0
		for chain, modtime := range proxier.staleChains {
			if modtime.Before(oneSecondAgo) {
				tx.Delete(&knftables.Chain{
					Name: chain,
				})
				delete(proxier.staleChains, chain)
				deleted++
			}
		}
		if deleted > 0 {
			klog.InfoS("Deleting stale nftables chains", "numChains", deleted)
			err := proxier.nftables.Run(context.TODO(), tx)
			if err != nil {
				// We already deleted the entries from staleChains, but if
				// the chains still exist, they'll just get added back
				// (with a later timestamp) at the end of the sync.
				klog.ErrorS(err, "Unable to delete stale chains; will retry later")
				// FIXME: metric
			}
		}
	}

	// Now start the actual syncing transaction
	tx := proxier.nftables.NewTransaction()
	proxier.setupNFTables(tx)

	// We need to use, eg, "ip daddr" for IPv4 but "ip6 daddr" for IPv6
	ipX := "ip"
	ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value
	if proxier.ipFamily == v1.IPv6Protocol {
		ipX = "ip6"
		ipvX_addr = "ipv6_addr"
	}

	// We currently fully-rebuild our sets and maps on each resync
	tx.Flush(&knftables.Set{
		Name: clusterIPsSet,
	})
	tx.Flush(&knftables.Map{
		Name: firewallIPsMap,
	})
	tx.Flush(&knftables.Map{
		Name: noEndpointServicesMap,
	})
	tx.Flush(&knftables.Map{
		Name: noEndpointNodePortsMap,
	})
	tx.Flush(&knftables.Map{
		Name: serviceIPsMap,
	})
	tx.Flush(&knftables.Map{
		Name: serviceNodePortsMap,
	})

	// Accumulate service/endpoint chains and affinity sets to keep.
	activeChains := sets.New[string]()
	activeAffinitySets := sets.New[string]()

	// Compute total number of endpoint chains across all services
	// to get a sense of how big the cluster is.
	totalEndpoints := 0
	for svcName := range proxier.svcPortMap {
		totalEndpoints += len(proxier.endpointsMap[svcName])
	}

	// These two variables are used to publish the sync_proxy_rules_no_endpoints_total
	// metric.
	serviceNoLocalEndpointsTotalInternal := 0
	serviceNoLocalEndpointsTotalExternal := 0

	// Build rules for each service-port.
	for svcName, svc := range proxier.svcPortMap {
		svcInfo, ok := svc.(*servicePortInfo)
		if !ok {
			klog.ErrorS(nil, "Failed to cast serviceInfo", "serviceName", svcName)
			continue
		}
		protocol := strings.ToLower(string(svcInfo.Protocol()))
		svcPortNameString := svcInfo.nameString

		// Figure out the endpoints for Cluster and Local traffic policy.
		// allLocallyReachableEndpoints is the set of all endpoints that can be routed to
		// from this node, given the service's traffic policies. hasEndpoints is true
		// if the service has any usable endpoints on any node, not just this one.
		allEndpoints := proxier.endpointsMap[svcName]
		clusterEndpoints, localEndpoints, allLocallyReachableEndpoints, hasEndpoints := proxy.CategorizeEndpoints(allEndpoints, svcInfo, proxier.nodeLabels)

		// Note the endpoint chains that will be used
		for _, ep := range allLocallyReachableEndpoints {
			if epInfo, ok := ep.(*endpointInfo); ok {
				ensureChain(epInfo.chainName, tx, activeChains)
			}
		}

		// clusterPolicyChain contains the endpoints used with "Cluster" traffic policy
		clusterPolicyChain := svcInfo.clusterPolicyChainName
		usesClusterPolicyChain := len(clusterEndpoints) > 0 && svcInfo.UsesClusterEndpoints()
		if usesClusterPolicyChain {
			ensureChain(clusterPolicyChain, tx, activeChains)
		}

		// localPolicyChain contains the endpoints used with "Local" traffic policy
		localPolicyChain := svcInfo.localPolicyChainName
		usesLocalPolicyChain := len(localEndpoints) > 0 && svcInfo.UsesLocalEndpoints()
		if usesLocalPolicyChain {
			ensureChain(localPolicyChain, tx, activeChains)
		}

		// internalPolicyChain is the chain containing the endpoints for
		// "internal" (ClusterIP) traffic. internalTrafficChain is the chain that
		// internal traffic is routed to (which is always the same as
		// internalPolicyChain). hasInternalEndpoints is true if we should
		// generate rules pointing to internalTrafficChain, or false if there are
		// no available internal endpoints.
		internalPolicyChain := clusterPolicyChain
		hasInternalEndpoints := hasEndpoints
		if svcInfo.InternalPolicyLocal() {
			internalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasInternalEndpoints = false
			}
		}
		internalTrafficChain := internalPolicyChain

		// Similarly, externalPolicyChain is the chain containing the endpoints
		// for "external" (NodePort, LoadBalancer, and ExternalIP) traffic.
		// externalTrafficChain is the chain that external traffic is routed to
		// (which is always the service's "EXT" chain). hasExternalEndpoints is
		// true if there are endpoints that will be reached by external traffic.
		// (But we may still have to generate externalTrafficChain even if there
		// are no external endpoints, to ensure that the short-circuit rules for
		// local traffic are set up.)
		externalPolicyChain := clusterPolicyChain
		hasExternalEndpoints := hasEndpoints
		if svcInfo.ExternalPolicyLocal() {
			externalPolicyChain = localPolicyChain
			if len(localEndpoints) == 0 {
				hasExternalEndpoints = false
			}
		}
		externalTrafficChain := svcInfo.externalChainName // eventually jumps to externalPolicyChain

		// usesExternalTrafficChain is based on hasEndpoints, not hasExternalEndpoints,
		// because we need the local-traffic-short-circuiting rules even when there
		// are no externally-usable endpoints.
		usesExternalTrafficChain := hasEndpoints && svcInfo.ExternallyAccessible()
		if usesExternalTrafficChain {
			ensureChain(externalTrafficChain, tx, activeChains)
		}

		var internalTrafficFilterVerdict, externalTrafficFilterVerdict string
		if !hasEndpoints {
			// The service has no endpoints at all; hasInternalEndpoints and
			// hasExternalEndpoints will also be false, and we will not
			// generate any chains in the "nat" table for the service; only
			// rules in the "filter" table rejecting incoming packets for
			// the service's IPs.
			internalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
			externalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
		} else {
			if !hasInternalEndpoints {
				// The internalTrafficPolicy is "Local" but there are no local
				// endpoints. Traffic to the clusterIP will be dropped, but
				// external traffic may still be accepted.
				internalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalInternal++
			}
			if !hasExternalEndpoints {
				// The externalTrafficPolicy is "Local" but there are no
				// local endpoints. Traffic to "external" IPs from outside
				// the cluster will be dropped, but traffic from inside
				// the cluster may still be accepted.
				externalTrafficFilterVerdict = "drop"
				serviceNoLocalEndpointsTotalExternal++
			}
		}
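
		// For example, a service with externalTrafficPolicy: Local but no local
		// endpoints on this node ends up with externalTrafficFilterVerdict ==
		// "drop" (rather than a reject, since another node may still be able to
		// serve the traffic), while, if its internalTrafficPolicy is Cluster,
		// ClusterIP traffic is still routed to the cluster policy chain.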

		// Capture the clusterIP.
		tx.Add(&knftables.Element{
			Set: clusterIPsSet,
			Key: []string{svcInfo.ClusterIP().String()},
		})
		if hasInternalEndpoints {
			tx.Add(&knftables.Element{
				Map: serviceIPsMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					fmt.Sprintf("goto %s", internalTrafficChain),
				},
			})
		} else {
			// No endpoints.
			tx.Add(&knftables.Element{
				Map: noEndpointServicesMap,
				Key: []string{
					svcInfo.ClusterIP().String(),
					protocol,
					strconv.Itoa(svcInfo.Port()),
				},
				Value: []string{
					internalTrafficFilterVerdict,
				},
				Comment: &svcPortNameString,
			})
		}

		// Capture externalIPs.
		for _, externalIP := range svcInfo.ExternalIPs() {
			if hasEndpoints {
				// Send traffic bound for external IPs to the "external
				// destinations" chain.
				tx.Add(&knftables.Element{
					Map: serviceIPsMap,
					Key: []string{
						externalIP.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}
			if !hasExternalEndpoints {
				// Either no endpoints at all (REJECT) or no endpoints for
				// external traffic (DROP anything that didn't get
				// short-circuited by the EXT chain.)
				tx.Add(&knftables.Element{
					Map: noEndpointServicesMap,
					Key: []string{
						externalIP.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		usesFWChain := len(svcInfo.LoadBalancerVIPs()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
		fwChain := svcInfo.firewallChainName
		if usesFWChain {
			ensureChain(fwChain, tx, activeChains)
			var sources []string
			allowFromNode := false
			for _, cidr := range svcInfo.LoadBalancerSourceRanges() {
				if len(sources) > 0 {
					sources = append(sources, ",")
				}
				sources = append(sources, cidr.String())
				if cidr.Contains(proxier.nodeIP) {
					allowFromNode = true
				}
			}
			// For VIP-like LBs, the VIP is often added as a local
			// address (via an IP route rule).  In that case, a request
			// from a node to the VIP will not hit the loadbalancer but
			// will loop back with the source IP set to the VIP.  We
			// need the following rules to allow requests from this node.
			if allowFromNode {
				for _, lbip := range svcInfo.LoadBalancerVIPs() {
					sources = append(sources, ",", lbip.String())
				}
			}
			tx.Add(&knftables.Rule{
				Chain: fwChain,
				Rule: knftables.Concat(
					ipX, "saddr", "!=", "{", sources, "}",
					"drop",
				),
			})
		}

		// Capture load-balancer ingress.
		for _, lbip := range svcInfo.LoadBalancerVIPs() {
			if hasEndpoints {
				tx.Add(&knftables.Element{
					Map: serviceIPsMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", externalTrafficChain),
					},
				})
			}

			if usesFWChain {
				tx.Add(&knftables.Element{
					Map: firewallIPsMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						fmt.Sprintf("goto %s", fwChain),
					},
					Comment: &svcPortNameString,
				})
			}
		}
		if !hasExternalEndpoints {
			// Either no endpoints at all (REJECT) or no endpoints for
			// external traffic (DROP anything that didn't get short-circuited
			// by the EXT chain.)
			for _, lbip := range svcInfo.LoadBalancerVIPs() {
				tx.Add(&knftables.Element{
					Map: noEndpointServicesMap,
					Key: []string{
						lbip.String(),
						protocol,
						strconv.Itoa(svcInfo.Port()),
					},
					Value: []string{
						externalTrafficFilterVerdict,
					},
					Comment: &svcPortNameString,
				})
			}
		}

		// Capture nodeports.
		if svcInfo.NodePort() != 0 {
			if hasEndpoints {
				// Jump to the external destination chain.  For better or for
				// worse, nodeports are not subject to loadBalancerSourceRanges,
| 				// and we can't change that.
 | |
| 				tx.Add(&knftables.Element{
 | |
| 					Map: serviceNodePortsMap,
 | |
| 					Key: []string{
 | |
| 						protocol,
 | |
| 						strconv.Itoa(svcInfo.NodePort()),
 | |
| 					},
 | |
| 					Value: []string{
 | |
| 						fmt.Sprintf("goto %s", externalTrafficChain),
 | |
| 					},
 | |
| 				})
 | |
| 			}
 | |
| 			if !hasExternalEndpoints {
 | |
| 				// Either no endpoints at all (REJECT) or no endpoints for
 | |
| 				// external traffic (DROP anything that didn't get
 | |
| 				// short-circuited by the EXT chain.)
 | |
| 				tx.Add(&knftables.Element{
 | |
| 					Map: noEndpointNodePortsMap,
 | |
| 					Key: []string{
 | |
| 						protocol,
 | |
| 						strconv.Itoa(svcInfo.NodePort()),
 | |
| 					},
 | |
| 					Value: []string{
 | |
| 						externalTrafficFilterVerdict,
 | |
| 					},
 | |
| 					Comment: &svcPortNameString,
 | |
| 				})
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// Set up internal traffic handling.
 | |
| 		if hasInternalEndpoints {
 | |
| 			if proxier.masqueradeAll {
 | |
| 				tx.Add(&knftables.Rule{
 | |
| 					Chain: internalTrafficChain,
 | |
| 					Rule: knftables.Concat(
 | |
| 						ipX, "daddr", svcInfo.ClusterIP(),
 | |
| 						protocol, "dport", svcInfo.Port(),
 | |
| 						"jump", markMasqChain,
 | |
| 					),
 | |
| 				})
 | |
| 			} else if proxier.localDetector.IsImplemented() {
 | |
| 				// This masquerades off-cluster traffic to a service VIP. The
 | |
| 				// idea is that you can establish a static route for your
 | |
| 				// Service range, routing to any node, and that node will
 | |
| 				// bridge into the Service for you. Since that might bounce
 | |
| 				// off-node, we masquerade here.
				tx.Add(&knftables.Rule{
					Chain: internalTrafficChain,
					Rule: knftables.Concat(
						ipX, "daddr", svcInfo.ClusterIP(),
						protocol, "dport", svcInfo.Port(),
						proxier.localDetector.IfNotLocalNFT(),
						"jump", markMasqChain,
					),
				})
			}
		}

		// Set up external traffic handling (if any "external" destinations are
		// enabled). All captured traffic for all external destinations should
		// jump to externalTrafficChain, which will handle some special cases and
		// then jump to externalPolicyChain.
		if usesExternalTrafficChain {
			if !svcInfo.ExternalPolicyLocal() {
				// If we are using non-local endpoints we need to masquerade,
				// in case we cross nodes.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"jump", markMasqChain,
					),
				})
			} else {
				// If we are only using same-node endpoints, we can retain the
				// source IP in most cases.

				if proxier.localDetector.IsImplemented() {
					// Treat all locally-originated pod -> external destination
					// traffic as a special-case.  It is subject to neither
					// form of traffic policy, which simulates going up-and-out
					// to an external load-balancer and coming back in.
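					// With, say, a cluster-CIDR-based local detector, the rule
					// added below renders roughly as:
					//   ip saddr 10.0.0.0/8 goto <cluster-policy-chain>
					// (the CIDR and chain name are illustrative).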
					tx.Add(&knftables.Rule{
						Chain: externalTrafficChain,
						Rule: knftables.Concat(
							proxier.localDetector.IfLocalNFT(),
							"goto", clusterPolicyChain,
						),
						Comment: ptr.To("short-circuit pod traffic"),
					})
				}

				// Locally originated traffic (not a pod, but the host node)
				// still needs masquerade because the LBIP itself is a local
				// address, so that will be the chosen source IP.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"jump", markMasqChain,
					),
					Comment: ptr.To("masquerade local traffic"),
				})

				// Redirect all src-type=LOCAL -> external destination to the
				// policy=cluster chain. This allows traffic originating
				// from the host to be redirected to the service correctly.
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"fib", "saddr", "type", "local",
						"goto", clusterPolicyChain,
					),
					Comment: ptr.To("short-circuit local traffic"),
				})
			}

			// Anything else falls thru to the appropriate policy chain.
			if hasExternalEndpoints {
				tx.Add(&knftables.Rule{
					Chain: externalTrafficChain,
					Rule: knftables.Concat(
						"goto", externalPolicyChain,
					),
				})
			}
		}

		if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
			// Generate the per-endpoint affinity sets
			for _, ep := range allLocallyReachableEndpoints {
				epInfo, ok := ep.(*endpointInfo)
				if !ok {
					klog.ErrorS(nil, "Failed to cast endpointsInfo", "endpointsInfo", ep)
					continue
				}

				// Create a set to store current affinity mappings. As
				// with the iptables backend, endpoint affinity is
				// recorded for connections from a particular source IP
				// (without regard to source port) to a particular
				// ServicePort (without regard to which service IP was
				// used to reach the service). This may be changed in the
				// future.
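				// The resulting set definition looks roughly like this (the
				// set name and timeout are illustrative):
				//   add set ip kube-proxy <affinity-set-name> { type ipv4_addr ; flags dynamic,timeout ; timeout 3h ; }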
				tx.Add(&knftables.Set{
					Name: epInfo.affinitySetName,
					Type: ipvX_addr,
					Flags: []knftables.SetFlag{
						// The nft docs say "dynamic" is only
						// needed for sets containing stateful
						// objects (eg counters), but (at least on
						// RHEL8) if we create the set without
						// "dynamic", it later gets mutated to
						// have it, and then the next attempt to
						// tx.Add() it here fails because it looks
						// like we're trying to change the flags.
						knftables.DynamicFlag,
						knftables.TimeoutFlag,
					},
					Timeout: ptr.To(time.Duration(svcInfo.StickyMaxAgeSeconds()) * time.Second),
				})
				activeAffinitySets.Insert(epInfo.affinitySetName)
			}
		}

		// If Cluster policy is in use, create the chain and create rules jumping
		// from clusterPolicyChain to the clusterEndpoints
		if usesClusterPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, clusterPolicyChain, clusterEndpoints)
		}

		// If Local policy is in use, create rules jumping from localPolicyChain
		// to the localEndpoints
		if usesLocalPolicyChain {
			proxier.writeServiceToEndpointRules(tx, svcPortNameString, svcInfo, localPolicyChain, localEndpoints)
		}

		// Generate the per-endpoint chains
		for _, ep := range allLocallyReachableEndpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				klog.ErrorS(nil, "Failed to cast endpointInfo", "endpointInfo", ep)
				continue
			}

			endpointChain := epInfo.chainName

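			// Taken together, the rules added below make each endpoint chain
			// look roughly like this (endpoint 10.180.0.1:80 with ClientIP
			// affinity; names and addresses are illustrative):
			//   ip saddr 10.180.0.1 jump mark-for-masquerade
			//   update @<affinity-set-name> { ip saddr }
			//   meta l4proto tcp dnat to 10.180.0.1:80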
			// Handle traffic that loops back to the originator with SNAT.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					ipX, "saddr", epInfo.IP(),
					"jump", markMasqChain,
				),
			})

			// Handle session affinity
			if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
				tx.Add(&knftables.Rule{
					Chain: endpointChain,
					Rule: knftables.Concat(
						"update", "@", epInfo.affinitySetName,
						"{", ipX, "saddr", "}",
					),
				})
			}

			// DNAT to final destination.
			tx.Add(&knftables.Rule{
				Chain: endpointChain,
				Rule: knftables.Concat(
					"meta l4proto", protocol,
					"dnat to", epInfo.String(),
				),
			})
		}
	}

	// Figure out which chains are now stale. Unfortunately, we can't delete them
	// right away, because with kernels before 6.2, if there is a map element pointing
	// to a chain, and you delete that map element, the kernel doesn't notice until a
	// short amount of time later that the chain is now unreferenced. So we flush them
	// now, and record the time that they become stale in staleChains so they can be
	// deleted later.
	existingChains, err := proxier.nftables.List(context.TODO(), "chains")
	if err == nil {
		for _, chain := range existingChains {
			if isServiceChainName(chain) {
				if !activeChains.Has(chain) {
					tx.Flush(&knftables.Chain{
						Name: chain,
					})
					proxier.staleChains[chain] = start
				} else {
					delete(proxier.staleChains, chain)
				}
			}
		}
	} else if !knftables.IsNotFound(err) {
		klog.ErrorS(err, "Failed to list nftables chains: stale chains will not be deleted")
	}

	// OTOH, we can immediately delete any stale affinity sets
	existingSets, err := proxier.nftables.List(context.TODO(), "sets")
	if err == nil {
		for _, set := range existingSets {
			if isAffinitySetName(set) && !activeAffinitySets.Has(set) {
				tx.Delete(&knftables.Set{
					Name: set,
				})
			}
		}
	} else if !knftables.IsNotFound(err) {
		klog.ErrorS(err, "Failed to list nftables sets: stale affinity sets will not be deleted")
	}

	// Sync rules.
	klog.V(2).InfoS("Reloading service nftables data",
		"numServices", len(proxier.svcPortMap),
		"numEndpoints", totalEndpoints,
	)

	// FIXME
	// klog.V(9).InfoS("Running nftables transaction", "transaction", tx.Bytes())

	err = proxier.nftables.Run(context.TODO(), tx)
	if err != nil {
		klog.ErrorS(err, "nftables sync failed")
		metrics.IptablesRestoreFailuresTotal.Inc()
		return
	}
	success = true

	for name, lastChangeTriggerTimes := range endpointUpdateResult.LastChangeTriggerTimes {
		for _, lastChangeTriggerTime := range lastChangeTriggerTimes {
			latency := metrics.SinceInSeconds(lastChangeTriggerTime)
			metrics.NetworkProgrammingLatency.Observe(latency)
			klog.V(4).InfoS("Network programming", "endpoint", klog.KRef(name.Namespace, name.Name), "elapsed", latency)
		}
	}

	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("internal").Set(float64(serviceNoLocalEndpointsTotalInternal))
	metrics.SyncProxyRulesNoLocalEndpointsTotal.WithLabelValues("external").Set(float64(serviceNoLocalEndpointsTotalExternal))
	if proxier.healthzServer != nil {
		proxier.healthzServer.Updated(proxier.ipFamily)
	}
	metrics.SyncProxyRulesLastTimestamp.SetToCurrentTime()

	// Update service healthchecks.  The endpoints list might include services that are
	// not "OnlyLocal", but the services list will not, and the serviceHealthServer
	// will just drop those endpoints.
	if err := proxier.serviceHealthServer.SyncServices(proxier.svcPortMap.HealthCheckNodePorts()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck services")
	}
	if err := proxier.serviceHealthServer.SyncEndpoints(proxier.endpointsMap.LocalReadyEndpoints()); err != nil {
		klog.ErrorS(err, "Error syncing healthcheck endpoints")
	}

	// Finish housekeeping: clear stale conntrack entries for UDP Services.
	conntrack.CleanStaleEntries(proxier.conntrack, proxier.svcPortMap, serviceUpdateResult, endpointUpdateResult)
}

func (proxier *Proxier) writeServiceToEndpointRules(tx *knftables.Transaction, svcPortNameString string, svcInfo *servicePortInfo, svcChain string, endpoints []proxy.Endpoint) {
	// First write session affinity rules, if applicable.
	if svcInfo.SessionAffinityType() == v1.ServiceAffinityClientIP {
		ipX := "ip"
		if proxier.ipFamily == v1.IPv6Protocol {
			ipX = "ip6"
		}

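		// Each affinity rule added below renders roughly as (set and chain
		// names are illustrative):
		//   ip saddr @<affinity-set-name> goto <endpoint-chain-name>
		// so a client with an existing affinity entry goes straight back to
		// the endpoint it was using before.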
		for _, ep := range endpoints {
			epInfo, ok := ep.(*endpointInfo)
			if !ok {
				continue
			}

			tx.Add(&knftables.Rule{
				Chain: svcChain,
				Rule: knftables.Concat(
					ipX, "saddr", "@", epInfo.affinitySetName,
					"goto", epInfo.chainName,
				),
			})
		}
	}

	// Now write the load-balancing rule.
	var elements []string
	for i, ep := range endpoints {
		epInfo, ok := ep.(*endpointInfo)
		if !ok {
			continue
		}

		elements = append(elements,
			strconv.Itoa(i), ":", "goto", epInfo.chainName,
		)
		if i != len(endpoints)-1 {
			elements = append(elements, ",")
		}
	}
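	// For a service with three endpoints, the rule added below renders
	// roughly as (chain names are illustrative):
	//   numgen random mod 3 vmap { 0 : goto endpoint-chain-a , 1 : goto endpoint-chain-b , 2 : goto endpoint-chain-c }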
	tx.Add(&knftables.Rule{
		Chain: svcChain,
		Rule: knftables.Concat(
			"numgen random mod", len(endpoints), "vmap",
			"{", elements, "}",
		),
	})
}