mirror of
				https://github.com/optim-enterprises-bv/kubernetes.git
				synced 2025-11-04 04:08:16 +00:00 
			
		
		
		
	Periodically fetch logexported nodes instead of sleeping
This commit is contained in:
		@@ -281,6 +281,24 @@ function dump_nodes() {
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Collect names of nodes which didn't run logexporter successfully.
 | 
			
		||||
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
 | 
			
		||||
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
 | 
			
		||||
# Assumes:
 | 
			
		||||
#   NODE_NAMES
 | 
			
		||||
# Sets:
 | 
			
		||||
#   NON_LOGEXPORTED_NODES
 | 
			
		||||
function find_non_logexported_nodes() {
 | 
			
		||||
  succeeded_nodes=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry) || return 1
 | 
			
		||||
  echo "Successfully listed marker files for successful nodes"
 | 
			
		||||
  NON_LOGEXPORTED_NODES=()
 | 
			
		||||
  for node in "${NODE_NAMES[@]}"; do
 | 
			
		||||
    if [[ ! "${succeeded_nodes}" =~ "${node}" ]]; then
 | 
			
		||||
      NON_LOGEXPORTED_NODES+=("${node}")
 | 
			
		||||
    fi
 | 
			
		||||
  done
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
function dump_nodes_with_logexporter() {
 | 
			
		||||
  echo "Detecting nodes in the cluster"
 | 
			
		||||
  detect-node-names &> /dev/null
 | 
			
		||||
@@ -312,14 +330,27 @@ function dump_nodes_with_logexporter() {
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  # Give some time for the pods to finish uploading logs.
 | 
			
		||||
  sleep "${logexport_sleep_seconds}"
 | 
			
		||||
  # Periodically fetch list of already logexported nodes to verify
 | 
			
		||||
  # if we aren't already done.
 | 
			
		||||
  start="$(date +%s)"
 | 
			
		||||
  while true; do
 | 
			
		||||
    now="$(date +%s)"
 | 
			
		||||
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
 | 
			
		||||
      echo "Waiting for all nodes to be logexported timed out."
 | 
			
		||||
      break
 | 
			
		||||
    fi
 | 
			
		||||
    if find_non_logexported_nodes; then
 | 
			
		||||
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
 | 
			
		||||
        break
 | 
			
		||||
      fi
 | 
			
		||||
    fi
 | 
			
		||||
    sleep 15
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
 | 
			
		||||
  local nodes_succeeded
 | 
			
		||||
  for retry in {1..10}; do
 | 
			
		||||
    if nodes_succeeded=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry); then
 | 
			
		||||
      echo "Successfully listed marker files for successful nodes"
 | 
			
		||||
    if find_non_logexported_nodes; then
 | 
			
		||||
      break
 | 
			
		||||
    else
 | 
			
		||||
      echo "Attempt ${retry} failed to list marker files for succeessful nodes"
 | 
			
		||||
@@ -333,15 +364,10 @@ function dump_nodes_with_logexporter() {
 | 
			
		||||
    fi
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  # Collect names of nodes which didn't run logexporter successfully.
 | 
			
		||||
  # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
 | 
			
		||||
  # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
 | 
			
		||||
  failed_nodes=()
 | 
			
		||||
  for node in "${NODE_NAMES[@]}"; do
 | 
			
		||||
    if [[ ! "${nodes_succeeded}" =~ "${node}" ]]; then
 | 
			
		||||
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
 | 
			
		||||
      failed_nodes+=("${node}")
 | 
			
		||||
    fi
 | 
			
		||||
  for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
 | 
			
		||||
    echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
 | 
			
		||||
    failed_nodes+=("${node}")
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user