Add experimental Cilium CNI provider

* Accept experimental CNI `networking` mode "cilium" (see the usage sketch below)
* Run Cilium v1.8.0-rc4 with overlay vxlan tunnels and a minimal set of features. We're interested in:
  * IPAM: Divide pod_cidr into /24 subnets per node
  * CNI networking: pod-to-pod and pod-to-external
  * BPF masquerade
  * NetworkPolicy as defined by Kubernetes (no L7)
* Continue using kube-proxy, with Cilium's kube-proxy-replacement in "probe" mode
* Firewall changes:
  * Require UDP 8472 for vxlan (Linux kernel default) between nodes
  * Optional ICMP echo (type 8) between nodes for host reachability (health checks)
  * Optional TCP 4240 between nodes for host reachability (health checks)
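
As a usage sketch for the bullet above: a consuming configuration would opt into the new mode roughly as follows. Only `networking` and `pod_cidr` are variables touched or used by this change; the module path and everything else here are illustrative assumptions, not part of the commit.

```hcl
# Hedged sketch of selecting the experimental Cilium provider.
module "bootstrap" {
  source = "./terraform-render-bootstrap" # illustrative path

  networking = "cilium"      # experimental; the default remains "flannel"
  pod_cidr   = "10.2.0.0/16" # carved into /24 blocks per node by Cilium's cluster-pool IPAM

  # Host firewalls must additionally allow UDP 8472 (vxlan) between nodes;
  # ICMP echo and TCP 4240 between nodes are optional health checks.
}
```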
This commit is contained in:
Dalton Hubble
2020-06-14 20:09:12 -07:00
parent e75697ce35
commit af36c53936
9 changed files with 624 additions and 2 deletions

View File

@@ -40,6 +40,22 @@ locals {
if var.networking == "calico"
}
# cilium manifests map
# { manifests-networking/manifest.yaml => content }
cilium_manifests = {
for name in fileset("${path.module}/resources/cilium", "**/*.yaml") :
"manifests-networking/${name}" => templatefile(
"${path.module}/resources/cilium/${name}",
{
cilium_agent_image = var.container_images["cilium_agent"]
cilium_operator_image = var.container_images["cilium_operator"]
pod_cidr = var.pod_cidr
daemonset_tolerations = var.daemonset_tolerations
}
)
if var.networking == "cilium"
}
# kube-router manifests map
# { manifests-networking/manifest.yaml => content }
kube_router_manifests = {
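
The cilium_manifests map above follows the same shape as the flannel, calico, and kube-router maps: each template under resources/cilium is rendered with templatefile and keyed under manifests-networking/. A hedged sketch of the template inputs and of what the DaemonSet's `%{ for }` tolerations directive expands to; the concrete values are examples, not defaults set by this commit.

```hcl
# Example inputs for the templatefile() calls above (illustrative values only).
locals {
  example_template_vars = {
    cilium_agent_image    = "docker.io/cilium/cilium:v1.8.0-rc4"
    cilium_operator_image = "docker.io/cilium/operator-generic:v1.8.0-rc4"
    pod_cidr              = "10.2.0.0/16"
    daemonset_tolerations = ["node-role.kubernetes.io/master"] # placeholder taint key
  }
}

# With that list, the DaemonSet template's %{ for key in daemonset_tolerations }
# directive renders one extra toleration per element:
#
#   - key: node-role.kubernetes.io/master
#     operator: Exists
#
# An empty list renders no extra tolerations.
```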

View File

@@ -29,6 +29,7 @@ output "assets_dist" {
local.flannel_manifests,
local.calico_manifests,
local.kube_router_manifests,
local.cilium_manifests,
)
sensitive = true
}
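
Since assets_dist is a flat map of target path to rendered content, a consumer can write every asset (now including the Cilium manifests) to disk with a for_each resource. This is a hedged sketch assuming the hashicorp/local provider and a hypothetical `module.bootstrap` reference; none of it is part of this commit.

```hcl
# Sketch: materialize the distributed assets on disk.
# Keys take the form "manifests-networking/<name>.yaml".
resource "local_file" "assets" {
  for_each = module.bootstrap.assets_dist # may need nonsensitive() on newer Terraform,
                                          # since the output is marked sensitive

  filename = "${path.root}/assets/${each.key}"
  content  = each.value
}
```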

View File

@@ -0,0 +1,27 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cilium-operator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cilium-operator
subjects:
- kind: ServiceAccount
name: cilium-operator
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cilium-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cilium-agent
subjects:
- kind: ServiceAccount
name: cilium-agent
namespace: kube-system

View File

@@ -0,0 +1,136 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cilium-operator
rules:
- apiGroups:
- ""
resources:
# to automatically delete [core|kube]dns pods so that they start being
# managed by Cilium
- pods
verbs:
- get
- list
- watch
- delete
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
# to perform the translation of a CNP that contains `ToGroup` to its endpoints
- services
- endpoints
# to check apiserver connectivity
- namespaces
verbs:
- get
- list
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
- watch
- apiGroups:
- cilium.io
resources:
- ciliumnetworkpolicies
- ciliumnetworkpolicies/status
- ciliumclusterwidenetworkpolicies
- ciliumclusterwidenetworkpolicies/status
- ciliumendpoints
- ciliumendpoints/status
- ciliumnodes
- ciliumnodes/status
- ciliumidentities
- ciliumidentities/status
verbs:
- '*'
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cilium-agent
rules:
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- namespaces
- services
- nodes
- endpoints
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- pods
- nodes
verbs:
- get
- list
- watch
- update
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- patch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- create
- get
- list
- watch
- update
- apiGroups:
- cilium.io
resources:
- ciliumnetworkpolicies
- ciliumnetworkpolicies/status
- ciliumclusterwidenetworkpolicies
- ciliumclusterwidenetworkpolicies/status
- ciliumendpoints
- ciliumendpoints/status
- ciliumnodes
- ciliumnodes/status
- ciliumidentities
- ciliumidentities/status
verbs:
- '*'
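
The agent's access to networking.k8s.io/networkpolicies above is what backs the commit's "NetworkPolicy as defined by Kubernetes (no L7)" goal. As a hedged illustration of the kind of policy Cilium would then enforce, expressed in Terraform via the hashicorp/kubernetes provider to stay in the module's language (not something this commit adds):

```hcl
# Illustrative policy only: Cilium enforces standard Kubernetes NetworkPolicy
# (L3/L4); Cilium-specific L7 policy is out of scope for this setup.
resource "kubernetes_network_policy" "default_deny_ingress" {
  metadata {
    name      = "default-deny-ingress"
    namespace = "default"
  }

  spec {
    pod_selector {}            # empty selector: all pods in the namespace
    policy_types = ["Ingress"] # with no ingress rules listed, all ingress is denied
  }
}
```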

View File

@@ -0,0 +1,175 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: cilium
namespace: kube-system
data:
# Identity allocation mode selects how identities are shared between cilium
# nodes by setting how they are stored. The options are "crd" or "kvstore".
# - "crd" stores identities in kubernetes as CRDs (custom resource definition).
# These can be queried with:
# kubectl get ciliumid
# - "kvstore" stores identities in a kvstore, etcd or consul, that is
# configured below. Cilium versions before 1.6 supported only the kvstore
# backend. Upgrades from these older cilium versions should continue using
# the kvstore by commenting out the identity-allocation-mode below, or
# setting it to "kvstore".
identity-allocation-mode: crd
# identity-change-grace-period is the grace period that needs to pass
# before an endpoint that has changed its identity will start using
# that new identity. During the grace period, the new identity has
# already been allocated and other nodes in the cluster have a chance
# to whitelist the new upcoming identity of the endpoint.
identity-change-grace-period: "5s"
# If you want to run cilium in debug mode change this value to true
debug: "false"
# TCP liveness and readiness probes (prefer exec probe for now)
agent-health-port: "9876"
# Prometheus
# enable-metrics: "true"
# prometheus-serve-addr: ":foo"
# operator-prometheus-serve-addr: ":bar"
# Enable IPv4 addressing. If enabled, all endpoints are allocated an IPv4
# address.
enable-ipv4: "true"
# Enable IPv6 addressing. If enabled, all endpoints are allocated an IPv6
# address.
enable-ipv6: "false"
# Enable probing for a more efficient clock source for the BPF datapath
enable-bpf-clock-probe: "true"
# If you want cilium monitor to aggregate tracing for packets, set this level
# to "low", "medium", or "maximum". The higher the level, the less packets
# that will be seen in monitor output.
monitor-aggregation: medium
# The monitor aggregation interval governs the typical time between monitor
# notification events for each allowed connection.
#
# Only effective when monitor aggregation is set to "medium" or higher.
monitor-aggregation-interval: 5s
# The monitor aggregation flags determine which TCP flags, upon their first
# observation, cause monitor notifications to be generated.
#
# Only effective when monitor aggregation is set to "medium" or higher.
monitor-aggregation-flags: all
# bpf-policy-map-max specifies the maximum number of entries in the endpoint
# policy map (per endpoint)
bpf-policy-map-max: "16384"
# Specifies the ratio (0.0-1.0) of total system memory to use for dynamic
# sizing of the TCP CT, non-TCP CT, NAT and policy BPF maps.
bpf-map-dynamic-size-ratio: "0.0025"
# Pre-allocation of map entries allows per-packet latency to be reduced, at
# the expense of up-front memory allocation for the entries in the maps. The
# default value below will minimize memory usage in the default installation;
# users who are sensitive to latency may consider setting this to "true".
#
# This option was introduced in Cilium 1.4. Cilium 1.3 and earlier ignore
# this option and behave as though it is set to "true".
#
# If this value is modified, then during the next Cilium startup the restore
# of existing endpoints and tracking of ongoing connections may be disrupted.
# This may lead to policy drops or a change in loadbalancing decisions for a
# connection for some time. Endpoints may need to be recreated to restore
# connectivity.
#
# If this option is set to "false" during an upgrade from 1.3 or earlier to
# 1.4 or later, then it may cause one-time disruptions during the upgrade.
preallocate-bpf-maps: "false"
# Encapsulation mode for communication between nodes
# Possible values:
# - disabled
# - vxlan (default)
# - geneve
tunnel: vxlan
# Name of the cluster. Only relevant when building a mesh of clusters.
cluster-name: default
# DNS Polling periodically issues a DNS lookup for each `matchName` from
# cilium-agent. The result is used to regenerate endpoint policy.
# DNS lookups are repeated with an interval of 5 seconds, and are made for
# A(IPv4) and AAAA(IPv6) addresses. Should a lookup fail, the most recent IP
# data is used instead. An IP change will trigger a regeneration of the Cilium
# policy for each endpoint and increment the per cilium-agent policy
# repository revision.
#
# This option is disabled by default starting from version 1.4.x in favor
# of a more powerful DNS proxy-based implementation, see [0] for details.
# Enable this option if you want to use FQDN policies but do not want to use
# the DNS proxy.
#
# To ease upgrade, users may opt to set this option to "true".
# Otherwise please refer to the Upgrade Guide [1] which explains how to
# prepare policy rules for upgrade.
#
# [0] http://docs.cilium.io/en/stable/policy/language/#dns-based
# [1] http://docs.cilium.io/en/stable/install/upgrade/#changes-that-may-require-action
tofqdns-enable-poller: "false"
# wait-bpf-mount makes the init container wait until the bpf filesystem is mounted
wait-bpf-mount: "false"
auto-direct-node-routes: "false"
# enable-xt-socket-fallback enables the fallback compatibility solution
# when the xt_socket kernel module is missing and it is needed for
# the datapath L7 redirection to work properly. See documentation
# for details on when this can be disabled:
# http://docs.cilium.io/en/latest/install/system_requirements/#admin-kernel-version.
enable-xt-socket-fallback: "true"
# install-iptables-rules enables installation of iptables rules to allow for
# TPROXY (L7 proxy injection), iptables-based masquerading and compatibility
# with kube-proxy. See documentation for details on when this can be
# disabled.
install-iptables-rules: "true"
# masquerade traffic leaving the node that is destined outside the cluster
masquerade: "true"
# enable-bpf-masquerade enables masquerading with BPF instead of iptables
enable-bpf-masquerade: "true"
# kube-proxy
kube-proxy-replacement: "probe"
enable-session-affinity: "false"
# ClusterIPs from host namespace
enable-host-reachable-services: "false"
# NodePort
enable-node-port: "true"
node-port-bind-protection: "true"
enable-auto-protect-node-port-range: "true"
# IPAM
ipam: "cluster-pool"
disable-cnp-status-updates: "true"
k8s-require-ipv4-pod-cidr: "true"
k8s-require-ipv6-pod-cidr: "false"
cluster-pool-ipv4-cidr: "${pod_cidr}"
cluster-pool-ipv4-mask-size: "24"
# Health
enable-endpoint-health-checking: "true"
# Identity
enable-well-known-identities: "false"
enable-remote-node-identity: "true"
# Misc
# enable-l7-proxy: "false"
policy-audit-mode: "false"
operator-api-serve-addr: "127.0.0.1:9234"
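
The cluster-pool IPAM keys above implement the "divide pod_cidr into /24 subnets per node" bullet: the operator carves cluster-pool-ipv4-cidr into blocks of cluster-pool-ipv4-mask-size and assigns one to each node. That allocation is done by the Cilium operator itself; the sketch below only illustrates the arithmetic using Terraform's cidrsubnet and an assumed /16 pod CIDR.

```hcl
# Assumed pod_cidr of 10.2.0.0/16; with a /24 per-node mask there are
# 2^(24-16) = 256 node blocks, each with 254 usable pod addresses.
locals {
  pod_cidr = "10.2.0.0/16"
  newbits  = 24 - 16 # cluster-pool-ipv4-mask-size minus the pod CIDR prefix length

  node_0 = cidrsubnet(local.pod_cidr, local.newbits, 0) # "10.2.0.0/24"
  node_2 = cidrsubnet(local.pod_cidr, local.newbits, 2) # "10.2.2.0/24"
}
```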

View File

@@ -0,0 +1,175 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: cilium-agent
namespace: kube-system
labels:
k8s-app: cilium-agent
spec:
selector:
matchLabels:
k8s-app: cilium-agent
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
k8s-app: cilium-agent
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
spec:
hostNetwork: true
priorityClassName: system-node-critical
serviceAccountName: cilium-agent
tolerations:
- key: node-role.kubernetes.io/controller
operator: Exists
- key: node.kubernetes.io/not-ready
operator: Exists
%{~ for key in daemonset_tolerations ~}
- key: ${key}
operator: Exists
%{~ endfor ~}
initContainers:
- name: clean-cilium-state
image: ${cilium_agent_image}
command:
- /init-container.sh
env:
- name: CILIUM_WAIT_BPF_MOUNT
valueFrom:
configMapKeyRef:
name: cilium
key: wait-bpf-mount
optional: true
securityContext:
capabilities:
add:
- NET_ADMIN
privileged: true
volumeMounts:
- name: sys-fs-bpf
mountPath: /sys/fs/bpf
mountPropagation: HostToContainer
- name: var-run-cilium
mountPath: /var/run/cilium
containers:
- name: cilium-agent
image: ${cilium_agent_image}
command:
- cilium-agent
args:
- --config-dir=/tmp/cilium/config-map
env:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: CILIUM_K8S_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
ports:
# Not yet used; exec probes are preferred for now
- name: health
protocol: TCP
containerPort: 9876
lifecycle:
# Install Cilium CNI binary and CNI network config
postStart:
exec:
command:
- "/cni-install.sh"
- "--enable-debug=false"
preStop:
exec:
command:
- /cni-uninstall.sh
securityContext:
capabilities:
add:
- NET_ADMIN
- SYS_MODULE
privileged: true
livenessProbe:
exec:
command:
- cilium
- status
- --brief
periodSeconds: 30
initialDelaySeconds: 120
successThreshold: 1
failureThreshold: 10
timeoutSeconds: 5
readinessProbe:
exec:
command:
- cilium
- status
- --brief
periodSeconds: 20
initialDelaySeconds: 5
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 5
volumeMounts:
# Load kernel modules
- name: lib-modules
mountPath: /lib/modules
readOnly: true
- name: xtables-lock
mountPath: /run/xtables.lock
# Keep state between restarts
- name: var-run-cilium
mountPath: /var/run/cilium
- name: sys-fs-bpf
mountPath: /sys/fs/bpf
# Configuration
- name: config
mountPath: /tmp/cilium/config-map
readOnly: true
# Install CNI plugin and config on host
- name: cni-bin-dir
mountPath: /host/opt/cni/bin
- name: cni-conf-dir
mountPath: /host/etc/cni/net.d
terminationGracePeriodSeconds: 1
volumes:
# Load kernel modules
- name: lib-modules
hostPath:
path: /lib/modules
# Access iptables concurrently with other processes (e.g. kube-proxy)
- name: xtables-lock
hostPath:
type: FileOrCreate
path: /run/xtables.lock
# Keep state between restarts
- name: var-run-cilium
hostPath:
path: /var/run/cilium
type: DirectoryOrCreate
# Keep state between restarts for bpf maps
- name: sys-fs-bpf
hostPath:
path: /sys/fs/bpf
type: DirectoryOrCreate
# Read configuration
- name: config
configMap:
name: cilium
# Install CNI plugin and config on host
- name: cni-bin-dir
hostPath:
type: DirectoryOrCreate
path: /opt/cni/bin
- name: cni-conf-dir
hostPath:
type: DirectoryOrCreate
path: /etc/kubernetes/cni/net.d

View File

@@ -0,0 +1,77 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: cilium-operator
namespace: kube-system
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
selector:
matchLabels:
name: cilium-operator
template:
metadata:
labels:
name: cilium-operator
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
spec:
hostNetwork: true
priorityClassName: system-cluster-critical
serviceAccountName: cilium-operator
tolerations:
- key: node-role.kubernetes.io/controller
operator: Exists
- key: node.kubernetes.io/not-ready
operator: Exists
containers:
- name: cilium-operator
image: ${cilium_operator_image}
command:
- cilium-operator-generic
args:
- --config-dir=/tmp/cilium/config-map
- --debug=$(CILIUM_DEBUG)
env:
- name: K8S_NODE_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: CILIUM_K8S_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
- name: CILIUM_DEBUG
valueFrom:
configMapKeyRef:
name: cilium
key: debug
optional: true
ports:
- name: health
protocol: TCP
containerPort: 9234
livenessProbe:
httpGet:
scheme: HTTP
host: 127.0.0.1
port: 9234
path: /healthz
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 3
volumeMounts:
- name: config
mountPath: /tmp/cilium/config-map
readOnly: true
volumes:
# Read configuration
- name: config
configMap:
name: cilium

View File

@@ -0,0 +1,13 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: cilium-operator
namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: cilium-agent
namespace: kube-system

View File

@@ -27,7 +27,7 @@ variable "cloud_provider" {
variable "networking" {
type = string
description = "Choice of networking provider (flannel or calico or kube-router)"
description = "Choice of networking provider (flannel or calico or kube-router or cilium)"
default = "flannel"
}
@@ -80,7 +80,9 @@ variable "container_images" {
kube_scheduler = "k8s.gcr.io/kube-scheduler:v1.18.4"
kube_proxy = "k8s.gcr.io/kube-proxy:v1.18.4"
# experimental
kube_router = "cloudnativelabs/kube-router:v0.3.2"
kube_router = "cloudnativelabs/kube-router:v0.3.2"
cilium_agent = "docker.io/cilium/cilium:v1.8.0-rc4"
cilium_operator = "docker.io/cilium/operator-generic:v1.8.0-rc4"
}
}
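
The two new image entries sit in the container_images default map, so a consumer can pin other Cilium tags. Note that Terraform does not merge map defaults key-by-key: overriding container_images means supplying the whole map. A hedged sketch with a placeholder module source and elided keys follows.

```hcl
# Hypothetical override; every key from the default map must be repeated,
# only the Cilium entries are shown here.
module "bootstrap" {
  source     = "./terraform-render-bootstrap" # illustrative path
  networking = "cilium"

  container_images = {
    # ... kube_scheduler, kube_proxy, kube_router, and the other defaults ...
    cilium_agent    = "docker.io/cilium/cilium:v1.8.0"
    cilium_operator = "docker.io/cilium/operator-generic:v1.8.0"
  }
}
```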