mirror of
				https://github.com/optim-enterprises-bv/kubernetes.git
				synced 2025-11-03 19:58:17 +00:00 
			
		
		
		
	Reduce logspam for crash looping containers
This commit is contained in:
		@@ -394,7 +394,8 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
 | 
			
		||||
	for i, c := range containers {
 | 
			
		||||
		status, err := m.runtimeService.ContainerStatus(c.Id)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			klog.Errorf("ContainerStatus for %s error: %v", c.Id, err)
 | 
			
		||||
			// Merely log this here; GetPodStatus will actually report the error out.
 | 
			
		||||
			klog.V(4).Infof("ContainerStatus for %s error: %v", c.Id, err)
 | 
			
		||||
			return nil, err
 | 
			
		||||
		}
 | 
			
		||||
		cStatus := toKubeContainerStatus(status, m.runtimeName)
 | 
			
		||||
 
 | 
			
		||||
@@ -59,6 +59,8 @@ const (
 | 
			
		||||
 | 
			
		||||
	// The expiration time of version cache.
 | 
			
		||||
	versionCacheTTL = 60 * time.Second
 | 
			
		||||
	// How frequently to report identical errors
 | 
			
		||||
	identicalErrorDelay = 1 * time.Minute
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var (
 | 
			
		||||
@@ -123,6 +125,12 @@ type kubeGenericRuntimeManager struct {
 | 
			
		||||
 | 
			
		||||
	// Manage RuntimeClass resources.
 | 
			
		||||
	runtimeClassManager *runtimeclass.Manager
 | 
			
		||||
 | 
			
		||||
	// Cache last per-pod error message (keyed by pod full name) to reduce log spam
 | 
			
		||||
	lastError map[string]string
 | 
			
		||||
 | 
			
		||||
	// Time the last per-pod error message (keyed by pod full name) was printed
 | 
			
		||||
	errorPrinted map[string]time.Time
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// KubeGenericRuntime is an interface that contains interfaces for container runtime and command.
 | 
			
		||||
@@ -177,6 +185,8 @@ func NewKubeGenericRuntimeManager(
 | 
			
		||||
		internalLifecycle:   internalLifecycle,
 | 
			
		||||
		legacyLogProvider:   legacyLogProvider,
 | 
			
		||||
		runtimeClassManager: runtimeClassManager,
 | 
			
		||||
		lastError:           make(map[string]string),
 | 
			
		||||
		errorPrinted:        make(map[string]time.Time),
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion)
 | 
			
		||||
@@ -819,6 +829,15 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo
 | 
			
		||||
	return
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (m *kubeGenericRuntimeManager) cleanupErrorTimeouts() {
 | 
			
		||||
	for name, timeout := range m.errorPrinted {
 | 
			
		||||
		if time.Now().Sub(timeout) >= identicalErrorDelay {
 | 
			
		||||
			delete(m.errorPrinted, name)
 | 
			
		||||
			delete(m.lastError, name)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetPodStatus retrieves the status of the pod, including the
 | 
			
		||||
// information of all containers in the pod that are visible in Runtime.
 | 
			
		||||
func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
 | 
			
		||||
@@ -868,9 +887,16 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp
 | 
			
		||||
	// Get statuses of all containers visible in the pod.
 | 
			
		||||
	containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
 | 
			
		||||
		lastMsg, ok := m.lastError[podFullName]
 | 
			
		||||
		if !ok || err.Error() != lastMsg || time.Now().Sub(m.errorPrinted[podFullName]) >= identicalErrorDelay {
 | 
			
		||||
			klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
 | 
			
		||||
			m.errorPrinted[podFullName] = time.Now()
 | 
			
		||||
			m.lastError[podFullName] = err.Error()
 | 
			
		||||
		}
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	delete(m.errorPrinted, podFullName)
 | 
			
		||||
	delete(m.lastError, podFullName)
 | 
			
		||||
 | 
			
		||||
	return &kubecontainer.PodStatus{
 | 
			
		||||
		ID:                uid,
 | 
			
		||||
 
 | 
			
		||||
@@ -244,7 +244,8 @@ func (g *GenericPLEG) relist() {
 | 
			
		||||
			// serially may take a while. We should be aware of this and
 | 
			
		||||
			// parallelize if needed.
 | 
			
		||||
			if err := g.updateCache(pod, pid); err != nil {
 | 
			
		||||
				klog.Errorf("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
 | 
			
		||||
				// Rely on updateCache calling GetPodStatus to log the actual error.
 | 
			
		||||
				klog.V(4).Infof("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
 | 
			
		||||
 | 
			
		||||
				// make sure we try to reinspect the pod during the next relisting
 | 
			
		||||
				needsReinspection[pid] = pod
 | 
			
		||||
@@ -274,7 +275,8 @@ func (g *GenericPLEG) relist() {
 | 
			
		||||
			klog.V(5).Infof("GenericPLEG: Reinspecting pods that previously failed inspection")
 | 
			
		||||
			for pid, pod := range g.podsToReinspect {
 | 
			
		||||
				if err := g.updateCache(pod, pid); err != nil {
 | 
			
		||||
					klog.Errorf("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
 | 
			
		||||
					// Rely on updateCache calling GetPodStatus to log the actual error.
 | 
			
		||||
					klog.V(5).Infof("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
 | 
			
		||||
					needsReinspection[pid] = pod
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
 
 | 
			
		||||
@@ -36,8 +36,17 @@ import (
 | 
			
		||||
type RemoteRuntimeService struct {
 | 
			
		||||
	timeout       time.Duration
 | 
			
		||||
	runtimeClient runtimeapi.RuntimeServiceClient
 | 
			
		||||
	// Cache last per-container error message to reduce log spam
 | 
			
		||||
	lastError map[string]string
 | 
			
		||||
	// Time last per-container error message was printed
 | 
			
		||||
	errorPrinted map[string]time.Time
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	// How frequently to report identical errors
 | 
			
		||||
	identicalErrorDelay = 1 * time.Minute
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// NewRemoteRuntimeService creates a new internalapi.RuntimeService.
 | 
			
		||||
func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) {
 | 
			
		||||
	klog.V(3).Infof("Connecting to runtime service %s", endpoint)
 | 
			
		||||
@@ -57,6 +66,8 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (
 | 
			
		||||
	return &RemoteRuntimeService{
 | 
			
		||||
		timeout:       connectionTimeout,
 | 
			
		||||
		runtimeClient: runtimeapi.NewRuntimeServiceClient(conn),
 | 
			
		||||
		lastError:     make(map[string]string),
 | 
			
		||||
		errorPrinted:  make(map[string]time.Time),
 | 
			
		||||
	}, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -225,6 +236,8 @@ func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64)
 | 
			
		||||
	ctx, cancel := getContextWithTimeout(t)
 | 
			
		||||
	defer cancel()
 | 
			
		||||
 | 
			
		||||
	delete(r.lastError, containerID)
 | 
			
		||||
	delete(r.errorPrinted, containerID)
 | 
			
		||||
	_, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{
 | 
			
		||||
		ContainerId: containerID,
 | 
			
		||||
		Timeout:     timeout,
 | 
			
		||||
@@ -243,6 +256,8 @@ func (r *RemoteRuntimeService) RemoveContainer(containerID string) error {
 | 
			
		||||
	ctx, cancel := getContextWithTimeout(r.timeout)
 | 
			
		||||
	defer cancel()
 | 
			
		||||
 | 
			
		||||
	delete(r.lastError, containerID)
 | 
			
		||||
	delete(r.errorPrinted, containerID)
 | 
			
		||||
	_, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
 | 
			
		||||
		ContainerId: containerID,
 | 
			
		||||
	})
 | 
			
		||||
@@ -270,6 +285,16 @@ func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter
 | 
			
		||||
	return resp.Containers, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Clean up any expired last-error timers
 | 
			
		||||
func (r *RemoteRuntimeService) cleanupErrorTimeouts() {
 | 
			
		||||
	for ID, timeout := range r.errorPrinted {
 | 
			
		||||
		if time.Now().Sub(timeout) >= identicalErrorDelay {
 | 
			
		||||
			delete(r.lastError, ID)
 | 
			
		||||
			delete(r.errorPrinted, ID)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// ContainerStatus returns the container status.
 | 
			
		||||
func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) {
 | 
			
		||||
	ctx, cancel := getContextWithTimeout(r.timeout)
 | 
			
		||||
@@ -278,10 +303,19 @@ func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.
 | 
			
		||||
	resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{
 | 
			
		||||
		ContainerId: containerID,
 | 
			
		||||
	})
 | 
			
		||||
	r.cleanupErrorTimeouts()
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
 | 
			
		||||
		// Don't spam the log with endless messages about the same failure.
 | 
			
		||||
		lastMsg, ok := r.lastError[containerID]
 | 
			
		||||
		if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
 | 
			
		||||
			klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
 | 
			
		||||
			r.errorPrinted[containerID] = time.Now()
 | 
			
		||||
			r.lastError[containerID] = err.Error()
 | 
			
		||||
		}
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	delete(r.lastError, containerID)
 | 
			
		||||
	delete(r.errorPrinted, containerID)
 | 
			
		||||
 | 
			
		||||
	if resp.Status != nil {
 | 
			
		||||
		if err := verifyContainerStatus(resp.Status); err != nil {
 | 
			
		||||
@@ -456,10 +490,18 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C
 | 
			
		||||
	resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{
 | 
			
		||||
		ContainerId: containerID,
 | 
			
		||||
	})
 | 
			
		||||
	r.cleanupErrorTimeouts()
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
 | 
			
		||||
		lastMsg, ok := r.lastError[containerID]
 | 
			
		||||
		if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
 | 
			
		||||
			klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
 | 
			
		||||
			r.errorPrinted[containerID] = time.Now()
 | 
			
		||||
			r.lastError[containerID] = err.Error()
 | 
			
		||||
		}
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	delete(r.lastError, containerID)
 | 
			
		||||
	delete(r.errorPrinted, containerID)
 | 
			
		||||
 | 
			
		||||
	return resp.GetStats(), nil
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user