Install new cluster for OWLS testing

Signed-off-by: Carsten Schafer <Carsten.Schafer@kinarasystems.com>
Carsten Schafer
2024-04-10 09:39:49 -04:00
parent 43276f524a
commit e616156663
10 changed files with 347 additions and 30 deletions

View File

@@ -7,16 +7,18 @@ The scripts should work on MacOS and Linux (as of yet untested).
## Requirements
### MacOS
- Homebrew (Mac)
- Homebrew
- gettext for envsubst (via Homebrew v0.21.1)
### General
- eksctl (v0.157.0+)
- aws-cli (v2.13.19)
- eksctl (v0.175.0+)
- aws-cli (v2.15.34+)
## Setup
- Prepare an environment file (see [env\_example](./env_example).
- Prepare an environment file - see [env\_example](./env_example).
- Make sure all required utilities are installed.
- Make sure that you can run "aws --version" and "eksctl version"
- Make sure that any AWS SSO environment variables are set.
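For example, a quick preflight check before running any of the scripts might look like this (a sketch; `env_qa` is a placeholder for whatever env file you created):
```bash
# Confirm the required CLIs are installed and on PATH
aws --version
eksctl version
# Load the cluster environment and verify the AWS credentials resolve to an account
source env_qa
aws sts get-caller-identity --query Account --output text
```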
@@ -34,10 +36,10 @@ The scripts should work on MacOS and Linux (as of yet untested).
## Scaling nodegroups
Set CLUSTER_NODES in your env\_FILE and also set the desiredCapacity for the nodegroup in cluster.CLUSTER_NAME.yaml and run:
Set CLUSTER\_NODES in your env\_FILE and run:
```bash
source env\_FILE
eksctl scale nodegroup -f cluster.$CLUSTER_NAME.yaml
source env_FILE
./scaleit
```
## Next Steps
@@ -50,8 +52,7 @@ shared services.
- Run "source env\_FILE ; ./cleaner" (using the env file you created above)
Note that sometimes AWS has trouble cleaning up when things are, or appear to be, in use. The eksctl
command to delete the cluster may thus fail requiring chasing down the noted rewsources. One of the
command to delete the cluster may thus fail, requiring you to chase down the resources it reports. The
resources that seem to linger most often are load balancers (LBs). Deleting these manually and restarting cleanup
sometimes works. Other times, inspecting the cluster's CloudFormation stack for errors
will lead you to the problematic resources. After you delete those resources manually, you may retry deletion of the CloudFormation stack; that should take care of deleting any remaining resources.
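A sketch of how you might track down the stragglers (assuming eksctl's default stack naming, `eksctl-$CLUSTER_NAME-cluster`):
```bash
# Load balancers often block VPC/subnet deletion; list what is still around
aws elbv2 describe-load-balancers \
  --query 'LoadBalancers[].{Name:LoadBalancerName,VPC:VpcId}' --output table
# Look for DELETE_FAILED events on the cluster's CloudFormation stack
aws cloudformation describe-stack-events \
  --stack-name "eksctl-$CLUSTER_NAME-cluster" \
  --query 'StackEvents[?ResourceStatus==`DELETE_FAILED`].[LogicalResourceId,ResourceStatusReason]' \
  --output table
```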

View File

@@ -0,0 +1,150 @@
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
name: tip-wlan-qa
region: us-east-2
version: "1.27"
iam:
withOIDC: true
serviceAccounts:
- metadata:
name: aws-load-balancer-controller
namespace: kube-system
wellKnownPolicies:
awsLoadBalancerController: true
#- metadata:
# name: ebs-csi-controller-sa
# namespace: kube-system
# wellKnownPolicies:
# ebsCSIController: true
#- metadata:
# name: efs-csi-controller-sa
# namespace: kube-system
# wellKnownPolicies:
# efsCSIController: true
#- metadata:
# name: external-dns
# namespace: kube-system
# wellKnownPolicies:
# externalDNS: true
#- metadata:
# name: cert-manager
# namespace: cert-manager
# wellKnownPolicies:
# certManager: true
- metadata:
name: cluster-autoscaler
namespace: kube-system
labels: {aws-usage: "cluster-ops"}
wellKnownPolicies:
autoScaler: true
- metadata:
name: autoscaler-service
namespace: kube-system
attachPolicy: # inline policy can be defined along with `attachPolicyARNs`
Version: "2012-10-17"
Statement:
- Effect: Allow
Action:
- "autoscaling:DescribeAutoScalingGroups"
- "autoscaling:DescribeAutoScalingInstances"
- "autoscaling:DescribeLaunchConfigurations"
- "autoscaling:DescribeTags"
- "autoscaling:SetDesiredCapacity"
- "autoscaling:TerminateInstanceInAutoScalingGroup"
- "ec2:DescribeLaunchTemplateVersions"
Resource: '*'
availabilityZones:
- us-east-2a
- us-east-2b
- us-east-2c
vpc:
cidr: 10.10.0.0/16
clusterEndpoints:
publicAccess: true
privateAccess: true
#managedNodeGroups:
#- name: def
# instanceType: m5.2xlarge
# amiFamily: AmazonLinux2
# #Try this next time with unsafe-sysctls:
# #ami: ami-0c92ea9c7c0380b66
# #ami: ami-03a6eaae9938c858c
# minSize: 2
# maxSize: 8
# volumeSize: 100
# ssh: # import public key from file
# allow: true
# publicKeyPath: id_rsa_tip-wlan-qa.pub
# # This does not work for managed node groups:
# #overrideBootstrapCommand: |
# # #!/bin/bash
# # /etc/eks/bootstrap.sh tip-wlan-qa --kubelet-extra-args "--allowed-unsafe-sysctls 'net.*'"
# tags:
# # EC2 tags required for cluster-autoscaler auto-discovery
# k8s.io/cluster-autoscaler/enabled: "true"
# k8s.io/cluster-autoscaler/tip-wlan-qa: "owned"
# kubernetes.io/cluster-autoscaler/enabled: "true"
# kubernetes.io/cluster-autoscaler/tip-wlan-qa: "owned"
nodeGroups:
- name: def
instanceType: m5.2xlarge
amiFamily: AmazonLinux2
minSize: 2
maxSize: 8
desiredCapacity: 2
volumeSize: 100
ssh: # import public key from file
allow: true
publicKeyPath: id_rsa_tip-wlan-qa.pub
kubeletExtraConfig:
allowedUnsafeSysctls:
- "net.ipv4.tcp_keepalive_intvl"
- "net.ipv4.tcp_keepalive_probes"
- "net.ipv4.tcp_keepalive_time"
tags:
# EC2 tags required for cluster-autoscaler auto-discovery
k8s.io/cluster-autoscaler/enabled: "true"
k8s.io/cluster-autoscaler/tip-wlan-qa: "owned"
kubernetes.io/cluster-autoscaler/enabled: "true"
kubernetes.io/cluster-autoscaler/tip-wlan-qa: "owned"
iamIdentityMappings:
- arn: arn:aws:iam::289708231103:user/gha-wlan-testing
username: gha-wlan-testing
noDuplicateARNs: true # prevents shadowing of ARNs
groups:
- system:masters
- arn: arn:aws:iam::289708231103:user/gha-toolsmith
username: gha-toolsmith
noDuplicateARNs: true
groups:
- system:masters
- arn: arn:aws:iam::289708231103:user/gha-wlan-cloud-helm
username: gha-wlan-cloud-helm
noDuplicateARNs: true
groups:
- system:masters
- arn: arn:aws:iam::289708231103:role/AWSReservedSSO_SystemAdministrator_622371b0ceece6f8
groups:
- system:masters
username: admin
noDuplicateARNs: true
addons:
- name: vpc-cni # no version is specified so it deploys the default version
attachPolicyARNs:
- arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
- name: coredns
version: latest # auto discovers the latest available
- name: kube-proxy
version: latest
#- name: aws-ebs-csi-driver
# wellKnownPolicies: # add IAM and service account
# ebsCSIController: true
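To sanity-check this config before creating or recreating the cluster, eksctl can render it without provisioning anything (a sketch; the file name is assumed to be the checked-in cluster.tip-wlan-qa.yaml):
```bash
# Print the fully resolved ClusterConfig; nothing is created
eksctl create cluster -f cluster.tip-wlan-qa.yaml --dry-run
```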

View File

@@ -7,6 +7,7 @@ unset AWS_PROFILE
#export AWS_SECRET_ACCESS_KEY="z6bl3..."
#export AWS_SESSION_TOKEN="Igo..."
export AWS_DEFAULT_REGION="ap-south-1"
export AWS_REGION="$AWS_DEFAULT_REGION"
export AWS_ACCOUNT_ID="289708231103"
export CLUSTER_DOMAIN="lab.wlan.tip.build"
export CLUSTER_ZONE_ID="Z213ADJASKDA1345" # zone id of $CLUSTER_DOMAIN zone

View File

@@ -1,5 +1,6 @@
unset AWS_PROFILE
export AWS_DEFAULT_REGION="ap-south-1"
export AWS_REGION="$AWS_DEFAULT_REGION"
export AWS_ACCOUNT_ID="289708231103"
export CLUSTER_DOMAIN="lab.wlan.tip.build"
export CLUSTER_ZONE_ID="Z09534373UTXT2L1YL912"

View File

@@ -0,0 +1,13 @@
unset AWS_PROFILE
export AWS_DEFAULT_REGION="us-east-2"
export AWS_REGION="$AWS_DEFAULT_REGION"
export AWS_ACCOUNT_ID="289708231103"
export CLUSTER_DOMAIN="lab.wlan.tip.build"
export CLUSTER_ZONE_ID="Z09534373UTXT2L1YL912"
export CLUSTER_INSTANCE_TYPE="c5.xlarge"
export CLUSTER_INSTANCE_TYPE="m5.2xlarge"
export CLUSTER_NAME="tip-wlan-qa"
export CLUSTER_NODES=2
export CLUSTER_MIN_NODES=2
export CLUSTER_MAX_NODES=8
export CLUSTER_VOLUME_SIZE=100

View File

@@ -98,6 +98,8 @@ if enabled ; then
fi
#----------------------------------
nextstep "Creating $CLUSTER_NAME EKS cluster in 10 seconds..."
sleep 10
nextstep "Creating $CLUSTER_NAME EKS cluster"
${steps[$cstep]} eksctl create cluster -f $config_file
@@ -146,8 +148,7 @@ arn="arn:aws:iam::${account_id}:policy/${role_name}"
envsubst < route53policy.json.tpl > route53policy.json
${steps[$cstep]} aws iam create-policy \
--policy-name $role_name \
--policy-document file://route53policy.json \
--no-cli-pager
--policy-document file://route53policy.json
${steps[$cstep]} eksctl create iamserviceaccount \
--name $sa_name \
--namespace kube-system \
@@ -164,8 +165,7 @@ sa_name="${role_name}-sa"
arn="arn:aws:iam::${account_id}:policy/${role_name}"
${steps[$cstep]} aws iam create-policy \
--policy-name $role_name \
--policy-document file://alb_ingress_policy.json \
--no-cli-pager
--policy-document file://alb_ingress_policy.json
${steps[$cstep]} eksctl create iamserviceaccount \
--cluster $CLUSTER_NAME \
--namespace kube-system \

View File

@@ -0,0 +1,46 @@
#!/bin/bash
. ./utils.sh
check_env
echo "Scaling cluster:"
show_env
echo "Press ENTER to continue [or CTRL-C to exit]"
read enter
declare -a steps
max_steps=10
for ((i=0; i < $max_steps; i++)) ; do
steps[$i]=""
done
if [ -n "$1" ] ; then
for ((i=0; i < $1; i++)) ; do
steps[$i]="echo"
done
fi
cstep=1
logv startscale "$(date)"
#set -x
echo "Determine caller identity"
if [ -n "$AWS_PROFILE" ] ; then
account_id=$(aws sts get-caller-identity --query Account --output text --profile $AWS_PROFILE)
else
account_id=$(aws sts get-caller-identity --query Account --output text)
fi
logv accountid $account_id
if [ -z "$account_id" ] ; then
echo "Unable to determine caller-identity!"
exit 1
fi
config_file="cluster.$CLUSTER_NAME.yaml"
nextstep "Generating cluster.yml file -> $config_file"
envsubst < cluster.yaml > $config_file
eksctl scale nodegroup -f cluster.$CLUSTER_NAME.yaml
logv scaled $CLUSTER_NAME
logv endscale "$(date)"
#----------------------------------
#set +x
cstep=-1

View File

@@ -36,9 +36,13 @@ function check_env()
echo "Default CLUSTER_MIN_NODES to $CLUSTER_MIN_NODES"
fi
if [ -z "$CLUSTER_MAX_NODES" ] ; then
export CLUSTER_MAX_NODES="3"
export CLUSTER_MAX_NODES="$(expr $CLUSTER_NODES + 2)"
echo "Default CLUSTER_MAX_NODES to $CLUSTER_MAX_NODES"
fi
if [ "$CLUSTER_MAX_NODES" -lt "$CLUSTER_NODES" ] ; then
echo "CLUSTER_MAX_NODES [$CLUSTER_MAX_NODES] is < CLUSTER_NODES [$CLUSTER_NODES]"
exit 1
fi
if [ -z "$CLUSTER_VOLUME_SIZE" ] ; then
export CLUSTER_VOLUME_SIZE="100"
echo "Default CLUSTER_VOLUME_SIZE to $CLUSTER_VOLUME_SIZE"

View File

@@ -67,4 +67,33 @@ The current password is encoded in secrets/dashboard.yaml.
4. Run `helmfile --environment $ENVIRONMENT diff` to see changes that would be applied
5. If everything is correct, run `helmfile --environment $ENVIRONMENT apply` to apply the changes
If you would like to limit releasae that you would like to affect, you may use labels. For example, if you want to see changes that would be done only to **influxdb** release in **amazon-cicd** environment, you may run `helmfile --environment amazon-cicd --selector app=influxdb diff`
If you would like to limit the releases that are affected, you may use labels. For example, to see changes that would be made only to the **influxdb** release in the **amazon-cicd** environment, run `helmfile --environment amazon-cicd --selector app=influxdb diff`
### First install
Upon first install there is usually a back and forth with dependencies, so it helps to install
things in a certain order. This is the suggested order:
```
./predeploy.sh
helmfile --environment $ENVIRONMENT -l app=aws-load-balancer-controller apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=metrics-server apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=node-termination-handler apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=cert-manager apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=external-dns apply
helmfile --environment $ENVIRONMENT --skip-deps -l crd=prometheus-operator-crd apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=prometheus-operator apply
helmfile --environment $ENVIRONMENT --skip-deps -l app=ingress apply
# anything not covered previously
helmfile --environment $ENVIRONMENT --skip-deps apply
```
## First time - post installation activities
```
kubectl edit sc gp2
# add at root level: allowVolumeExpansion: true
kubectl edit ingressclass alb
# add under: metadata.annotations:
# ingressclass.kubernetes.io/is-default-class: "true"
```
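If you prefer to script these one-time edits instead of editing interactively, a non-interactive equivalent might be (a sketch; the same resource names are assumed):
```
kubectl patch storageclass gp2 -p '{"allowVolumeExpansion": true}'
kubectl annotate ingressclass alb ingressclass.kubernetes.io/is-default-class="true" --overwrite
```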

View File

@@ -19,7 +19,9 @@ environments:
- monitoring:
namespace: monitoring
publicNamespaces: ['openwifi-qa01', 'openwifi-qa02', 'openwifi-dev01', 'openwifi-demo']
clusterName: TIP WLAN CI/CD
- domain: lab.wlan.tip.build
- hostSuffix: ".lab.wlan.tip.build"
- storageClass: gp2
- autoscaler:
#TODO:
@@ -65,6 +67,75 @@ environments:
- core-dump-handler:
#skip this one now
enabled: false
amazon-qa2:
secrets:
- secrets/influxdb.yaml
- secrets/sso.yaml
- secrets/alertmanager.yaml
- secrets/actions-runner-controller.yaml
- secrets/ucentral-ap-firmware-logstash.yaml
- secrets/core-dump-handler.yaml
values:
- eks:
clusterName: tip-wlan-qa
#region: ap-south-1
region: us-east-2
dnsRegion: us-east-1
accountID: 289708231103
hostedZoneId: cicd
certificateARNCICD: arn:aws:acm:ap-south-1:289708231103:certificate/2cc8c764-11fd-411d-bf7d-a93f488f3f6c
certificateARNLab: arn:aws:acm:ap-south-1:289708231103:certificate/5281be14-c5e6-45c5-8d5c-e8660c76fbcb
- monitoring:
namespace: monitoring
publicNamespaces: ['openwifi-qa02']
clusterName: TIP QA
- domain: lab.wlan.tip.build
- hostSuffix: "-qa02.lab.wlan.tip.build"
- storageClass: gp2
- autoscaler:
#TODO:
#enabled: true
enabled: false
- ingress:
enabled: true
- elastic:
#TODO:
#enabled: true
enabled: false
- kibana:
#TODO:
#enabled: true
enabled: false
- logstash:
#TODO:
#enabled: true
enabled: false
- prometheus:
enabled: true
- k8s-dashboard:
enabled: false
- portainer:
enabled: false
- metrics-server:
enabled: true
- external-dns:
enabled: true
- alb-ingress:
enabled: true
- node-termination-handler:
enabled: false
- influxdb:
#skip this one for now
enabled: false
- actions-runner-controller:
enabled: false
- cert-manager:
enabled: true
- calico:
enabled: false
- core-dump-handler:
#skip this one now
enabled: false
---
@@ -191,6 +262,7 @@ releases:
role: setup
group: monitoring
app: prometheus-operator
crd: prometheus-operator-crd
- name: prometheus-operator
condition: prometheus.enabled
@@ -231,7 +303,7 @@ releases:
#nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
#kubernetes.io/ingress.class: nginx-sso
hosts:
- prometheus.{{ .Environment.Values.domain }}
- prometheus{{ .Environment.Values.hostSuffix }}
- additionalPrometheusRulesMap:
testbed:
groups:
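The switch from `domain` to `hostSuffix` lets each environment carry its own host pattern; with the environment values defined above, this host template renders roughly as:
```
# amazon-cicd (hostSuffix ".lab.wlan.tip.build"):      prometheus.lab.wlan.tip.build
# amazon-qa2  (hostSuffix "-qa02.lab.wlan.tip.build"): prometheus-qa02.lab.wlan.tip.build
```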
@@ -303,7 +375,7 @@ releases:
title: Pod *{{`{{ $labels.namespace }}/{{ $labels.pod }}`}}* is crash looping
description: "The pod *{{`{{ $labels.namespace }}/{{ $labels.pod }}`}}* is crash looping. Please check its logs to see what is going on."
- alert: Pod restarted many times
expr: kube_pod_container_status_restarts_total > 5
expr: (sum(increase(kube_pod_container_status_restarts_total[10m])) by (container)) > 0
labels:
severity: warning
area: testbed
@@ -360,7 +432,7 @@ releases:
#nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
#kubernetes.io/ingress.class: nginx-sso
hosts:
- grafana.{{ .Environment.Values.domain }}
- grafana{{ .Environment.Values.hostSuffix }}
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
@@ -394,7 +466,7 @@ releases:
# - name: InfluxDB
# type: influxdb
# access: proxy
# url: https://influx.cicd.{{ .Environment.Values.domain }}
# url: https://influx.cicd{{ .Environment.Values.hostSuffix }}
# user: tip
# secureJsonData:
# token: {{ .Environment.Values.influxdb.adminUser.token }}
@@ -606,7 +678,7 @@ releases:
nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
kubernetes.io/ingress.class: nginx-sso
hosts:
- kibana.{{ .Environment.Values.domain }}
- kibana{{ .Environment.Values.hostSuffix }}
- lifecycle:
postStart:
exec:
@@ -849,18 +921,18 @@ releases:
annotations:
nginx.ingress.kubernetes.io/backend-protocol: HTTPS
hosts:
- host: k8s-admin.{{ .Environment.Values.domain }}
- host: k8s-admin{{ .Environment.Values.hostSuffix }}
paths:
- path: "/"
- name: k8s-dashboard-roles
condition: k8s-dashboard.enabled
- name: simple-auth-roles
condition: ingress.enabled
namespace: {{ .Environment.Values.monitoring.namespace }}
chart: charts/k8s-dashboard-roles
labels:
role: setup
group: monitoring
app: k8s-dashboard
group: system
app: ingress
- name: k8s-dashboard
condition: k8s-dashboard.enabled
@@ -876,10 +948,10 @@ releases:
defaultNamespace: {{ index .Environment.Values.monitoring.publicNamespaces 0 }}
namespaceFallbackList: {{ .Environment.Values.monitoring.publicNamespaces | toJson }}
itemsPerPage: 25
clusterName: TIP WLAN CI/CD
clusterName: {{ .Environment.Values.monitoring.clusterName }}
- extraArgs:
- --enable-skip-login
- --system-banner=Welcome to the TIP WLAN CI/CD Kubernetes cluster. If you are missing your namespace in the above select box, please <a href="https://telecominfraproject.atlassian.net/browse/WIFI">create a ticket</a>.
- --system-banner=Welcome to the {{ .Environment.Values.monitoring.clusterName}} Kubernetes cluster. If you are missing your namespace in the above select box, please <a href="https://telecominfraproject.atlassian.net/browse/WIFI">create a ticket</a>.
- rbac:
create: false
clusterRoleMetrics: true
@@ -901,7 +973,7 @@ releases:
nginx.ingress.kubernetes.io/auth-secret: k8s-dashboard-basic-auth
nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
hosts:
- k8s-dashboard.{{ .Environment.Values.domain }}
- k8s-dashboard{{ .Environment.Values.hostSuffix }}
- name: metrics-server
condition: metrics-server.enabled
@@ -993,8 +1065,8 @@ releases:
alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS": 443}]'
alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_302"}}'
alb.ingress.kubernetes.io/healthcheck-path: /health
external-dns.alpha.kubernetes.io/hostname: influx.cicd.{{ .Environment.Values.domain }}
hostname: influx.cicd.{{ .Environment.Values.domain }}
external-dns.alpha.kubernetes.io/hostname: influx.cicd{{ .Environment.Values.hostSuffix }}
hostname: influx.cicd{{ .Environment.Values.hostSuffix }}
path: "/*"
- name: actions-runner-controller