Log verbose state for DRBD devices that are not healthy.

This will help troubleshoot issues that occurred in the past but have already been resolved.

Signed-off-by: Denis Seleznev <kto.3decb@gmail.com>
This commit is contained in:
Denis Seleznev
2025-04-08 14:01:41 +02:00
parent 49b5b510ee
commit aed184f6ef
2 changed files with 61 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
#!/bin/bash
#
# Logs the verbose state of DRBD resources that are not healthy.
#
# Every 30 seconds the script queries `drbdsetup status --json`, keeps only
# the resources that match at least one of the conditions in UNHEALTHY_FILTER
# and prints them to stdout. Nothing is printed while no resource matches.
# The loop runs until SIGINT, SIGQUIT or SIGTERM is received.
#
# Requires: drbdsetup, jq.
set -e

# jq program applied to the JSON array printed by `drbdsetup status --json`.
# It outputs the array of resources considered unhealthy ("[]" when none).
readonly UNHEALTHY_FILTER='
map(select(
  .suspended != false or
  ."force-io-failures" != false or
  # Diskless can be legit when allowRemoteVolumeAccess is set to "true"
  # TODO how does forced-diskless look?
  ([.devices[]."disk-state"] | inside(["UpToDate", "Consistent", "Diskless"]) | not) or
  (.connections[]."connection-state" != "Connected") or
  # congested is not an alarm but an indicator
  (.connections[]."congested" != false) or
  (.connections[].peer_devices[]."replication-state" != "Established")
))
# The .connections[] and .peer_devices[] generators can make select() emit
# the same resource several times; unique collapses the duplicates.
| unique
'

# PID of the background sleep of the current iteration; empty when none.
sleep_pid=''

# Signal handler: stop the pending sleep (if any) and exit successfully.
terminate() {
  echo "Caught signal, terminating"
  if [[ -n "${sleep_pid}" ]]; then
    # The sleep may already be gone; that is not an error.
    kill "${sleep_pid}" 2>/dev/null || true
  fi
  exit 0
}
trap terminate SIGINT SIGQUIT SIGTERM

echo "Started logger of bad DRBD statuses"
while true; do
  # Failures are handled explicitly instead of letting `set -e` kill the
  # logger without any diagnostic (stderr of drbdsetup is discarded).
  if ! all_devices="$(drbdsetup status --json 2>/dev/null)"; then
    echo "Failed to query DRBD status, retrying on the next poll" >&2
  elif ! unhealthy_devices="$(printf '%s\n' "${all_devices}" \
    | jq -r "${UNHEALTHY_FILTER}")"; then
    echo "Failed to evaluate DRBD status, retrying on the next poll" >&2
  elif [[ -n "${unhealthy_devices}" && "${unhealthy_devices}" != '[]' ]]; then
    # printf '%s' prints the JSON verbatim; `echo -e` would interpret
    # backslash escapes inside it and corrupt the logged document.
    printf 'Unhealthy devices:\n%s\n' "${unhealthy_devices}"
  fi

  # Sleep in the background and wait on it so that a trapped signal is
  # handled right away instead of after the sleep finishes.
  sleep 30 &
  sleep_pid=$!
  wait "${sleep_pid}"
  sleep_pid=''
done

View File

@@ -47,6 +47,29 @@ spec:
- mountPath: /scripts
name: script-volume
readOnly: true
# Additional container that runs /scripts/plunger-drbd-logger.sh
# (presumably the DRBD status logger script provided through script-volume).
- name: drbd-logger
image: {{ include "cozy.linstor.version.satellite" . }}
command:
- "/scripts/plunger-drbd-logger.sh"
securityContext:
# NOTE(review): privileged is set to true below, so the add/drop capability
# lists are likely redundant for this container - confirm.
capabilities:
add:
- NET_ADMIN
- SYS_ADMIN
drop:
- ALL
privileged: true
readOnlyRootFilesystem: false
volumeMounts:
# NOTE(review): the dev, var-lib-drbd and var-lib-linstor-d volumes are not
# declared in the visible part of this file - presumably defined elsewhere.
- mountPath: /dev
name: dev
- mountPath: /var/lib/drbd
name: var-lib-drbd
- mountPath: /var/lib/linstor.d
name: var-lib-linstor-d
# The scripts are mounted read-only; the container command lives here.
- mountPath: /scripts
name: script-volume
readOnly: true
volumes:
- name: script-volume
configMap: