	Expose information about scheduling latency in scalability tests.
@@ -31,6 +31,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "e2e_scheduling_latency_microseconds",
 			Help:      "E2e scheduling latency (scheduling algorithm + binding)",
+			MaxAge:    time.Hour,
 		},
 	)
 	SchedulingAlgorithmLatency = prometheus.NewSummary(
@@ -38,6 +39,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "scheduling_algorithm_latency_microseconds",
 			Help:      "Scheduling algorithm latency",
+			MaxAge:    time.Hour,
 		},
 	)
 	BindingLatency = prometheus.NewSummary(
@@ -45,6 +47,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "binding_latency_microseconds",
 			Help:      "Binding latency",
+			MaxAge:    time.Hour,
 		},
 	)
 )
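These three hunks make the same one-line change to each scheduler summary: setting MaxAge. client_golang summaries report their quantiles over a sliding time window (10 minutes by default), so a longer window is needed for the percentiles to reflect a whole scalability run rather than just its tail. Below is a minimal, self-contained sketch of the same pattern; the package name, the registration call, and the explicit Objectives (required by recent client_golang versions, whereas the 2015 code relied on the then-default 0.5/0.9/0.99 quantiles) are assumptions, not part of the diff.

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical standalone summary; "scheduler" stands in for schedulerSubsystem.
var bindingLatency = prometheus.NewSummary(prometheus.SummaryOpts{
	Subsystem: "scheduler",
	Name:      "binding_latency_microseconds",
	Help:      "Binding latency",
	// The diff's only functional change: quantiles are computed over a sliding
	// window, and raising MaxAge from the library default keeps samples from a
	// whole test run inside that window.
	MaxAge: time.Hour,
	// Assumption for current client_golang: objectives must be set explicitly.
	Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
})

func main() {
	prometheus.MustRegister(bindingLatency)

	start := time.Now()
	// ... the work being timed (binding a pod, in the scheduler's case) ...
	bindingLatency.Observe(float64(time.Since(start) / time.Microsecond))
}
```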
@@ -104,10 +104,15 @@ var _ = Describe("Density [Skipped]", func() {
 
 		expectNoError(writePerfData(c, fmt.Sprintf(testContext.OutputDir+"/%s", uuid), "after"))
 
-		// Verify latency metrics
+		// Verify latency metrics.
 		highLatencyRequests, err := HighLatencyRequests(c)
 		expectNoError(err)
 		Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests")
+
+		// Verify scheduler metrics.
+		// TODO: Reset metrics at the beginning of the test.
+		// We should do something similar to how we do it for APIserver.
+		expectNoError(VerifySchedulerLatency())
 	})
 
 	framework := NewFramework("density")
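The density test verifies metrics only after the workload has finished: first API responsiveness via HighLatencyRequests, now also scheduler latency via VerifySchedulerLatency. A compilable sketch of that ordering in the same Ginkgo/Gomega style is below; checkLatencies is a hypothetical stand-in for the framework helpers, and the import paths assume the ginkgo/gomega versions the e2e suite used at the time.

```go
package density

import (
	"fmt"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// checkLatencies stands in for HighLatencyRequests / VerifySchedulerLatency;
// the real helpers are defined in the e2e metrics utilities, not here.
func checkLatencies() (highLatencyRequests int, err error) { return 0, nil }

var _ = Describe("Density sketch", func() {
	It("verifies latency metrics only after the load has finished", func() {
		// Mirror the hunk's ordering: run the workload, then inspect metrics.
		high, err := checkLatencies()
		Expect(err).NotTo(HaveOccurred())
		Expect(high).NotTo(BeNumerically(">", 0), fmt.Sprintf("%d high-latency requests", high))
	})
})
```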
@@ -57,6 +57,12 @@ type PodStartupLatency struct {
 	Latency LatencyMetric `json:"latency"`
 }
 
+type SchedulingLatency struct {
+	Scheduling LatencyMetric `json:"scheduling"`
+	Binding    LatencyMetric `json:"binding"`
+	Total      LatencyMetric `json:"total"`
+}
+
 type APICall struct {
 	Resource string        `json:"resource"`
 	Verb     string        `json:"verb"`
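The new SchedulingLatency struct groups three LatencyMetric percentile sets, and it is what VerifySchedulerLatency later pretty-prints as JSON. A small sketch of how it serializes is below; the LatencyMetric definition (including its field tags) is an assumption reproduced only so the example compiles on its own.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Assumed shape of LatencyMetric; the real definition lives elsewhere in metrics_util.go.
type LatencyMetric struct {
	Perc50 time.Duration `json:"Perc50"`
	Perc90 time.Duration `json:"Perc90"`
	Perc99 time.Duration `json:"Perc99"`
}

type SchedulingLatency struct {
	Scheduling LatencyMetric `json:"scheduling"`
	Binding    LatencyMetric `json:"binding"`
	Total      LatencyMetric `json:"total"`
}

func main() {
	l := SchedulingLatency{
		Scheduling: LatencyMetric{Perc50: 2 * time.Millisecond, Perc90: 5 * time.Millisecond, Perc99: 9 * time.Millisecond},
	}
	out, _ := json.MarshalIndent(l, "", "  ")
	fmt.Println(string(out)) // top-level keys come from the struct tags: scheduling, binding, total
}
```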
@@ -78,26 +84,31 @@ func (a APIResponsiveness) Less(i, j int) bool {
 func (a *APIResponsiveness) addMetric(resource, verb string, quantile float64, latency time.Duration) {
 	for i, apicall := range a.APICalls {
 		if apicall.Resource == resource && apicall.Verb == verb {
-			a.APICalls[i] = setQuantile(apicall, quantile, latency)
+			a.APICalls[i] = setQuantileAPICall(apicall, quantile, latency)
 			return
 		}
 	}
-	apicall := setQuantile(APICall{Resource: resource, Verb: verb}, quantile, latency)
+	apicall := setQuantileAPICall(APICall{Resource: resource, Verb: verb}, quantile, latency)
 	a.APICalls = append(a.APICalls, apicall)
 }
 
 // 0 <= quantile <=1 (e.g. 0.95 is 95%tile, 0.5 is median)
 // Only 0.5, 0.9 and 0.99 quantiles are supported.
-func setQuantile(apicall APICall, quantile float64, latency time.Duration) APICall {
+func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall {
+	setQuantile(&apicall.Latency, quantile, latency)
+	return apicall
+}
+
+// Only 0.5, 0.9 and 0.99 quantiles are supported.
+func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
 	switch quantile {
 	case 0.5:
-		apicall.Latency.Perc50 = latency
+		metric.Perc50 = latency
 	case 0.9:
-		apicall.Latency.Perc90 = latency
+		metric.Perc90 = latency
 	case 0.99:
-		apicall.Latency.Perc99 = latency
+		metric.Perc99 = latency
 	}
-	return apicall
 }
 
 func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) {
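This refactor pulls the quantile switch out of the APICall-specific helper into a generic setQuantile that writes into any *LatencyMetric, leaving setQuantileAPICall as a thin wrapper for the API-responsiveness path; that is what lets the new scheduler-metrics code below reuse it. A minimal sketch of the resulting helper in isolation, with the types simplified so it runs on its own:

```go
package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for the LatencyMetric defined in metrics_util.go.
type LatencyMetric struct{ Perc50, Perc90, Perc99 time.Duration }

// Same shape as the generic helper introduced by the hunk above:
// it writes into whichever metric the caller points it at.
func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
	switch quantile {
	case 0.5:
		metric.Perc50 = latency
	case 0.9:
		metric.Perc90 = latency
	case 0.99:
		metric.Perc99 = latency
	}
}

func main() {
	// With the pointer-based helper, any LatencyMetric can be filled directly,
	// which is what getSchedulingLatency needs; previously the switch was tied to APICall.
	var binding LatencyMetric
	for q, v := range map[float64]time.Duration{0.5: 1200 * time.Microsecond, 0.99: 9800 * time.Microsecond} {
		setQuantile(&binding, q, v)
	}
	fmt.Printf("%+v\n", binding)
}
```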
@@ -233,6 +244,56 @@ func getMetrics(c *client.Client) (string, error) {
 	return string(body), nil
 }
 
+// Retrieves scheduler metrics information.
+func getSchedulingLatency() (SchedulingLatency, error) {
+	result := SchedulingLatency{}
+
+	cmd := "curl http://localhost:10251/metrics"
+	sshResult, err := SSH(cmd, getMasterHost()+":22", testContext.Provider)
+	if err != nil || sshResult.Code != 0 {
+		return result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
+	}
+	samples, err := extractMetricSamples(sshResult.Stdout)
+	if err != nil {
+		return result, err
+	}
+
+	for _, sample := range samples {
+		var metric *LatencyMetric = nil
+		switch sample.Metric[model.MetricNameLabel] {
+		case "scheduler_scheduling_algorithm_latency_microseconds":
+			metric = &result.Scheduling
+		case "scheduler_binding_latency_microseconds":
+			metric = &result.Binding
+		case "scheduler_e2e_scheduling_latency_microseconds":
+			metric = &result.Total
+		}
+		if metric == nil {
+			continue
+		}
+
+		latency := sample.Value
+		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
+		if err != nil {
+			return result, err
+		}
+		setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond)
+	}
+	return result, nil
+}
+
+// Verifies (currently just by logging them) the scheduling latencies.
+func VerifySchedulerLatency() error {
+	latency, err := getSchedulingLatency()
+	if err != nil {
+		return err
+	}
+	Logf("Scheduling latency: %s", prettyPrintJSON(latency))
+
+	// TODO: Add some reasonable checks once we know more about the values.
+	return nil
+}
+
 func prettyPrintJSON(metrics interface{}) string {
 	output := &bytes.Buffer{}
 	if err := json.NewEncoder(output).Encode(metrics); err != nil {
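getSchedulingLatency scrapes the scheduler's /metrics endpoint over SSH and folds the summary quantile samples into the SchedulingLatency struct via setQuantile. A self-contained sketch of the parsing half is below, using github.com/prometheus/common; the literal metrics text, and the two-value ExtractSamples signature of current releases, are assumptions, while the real test delegates this step to its extractMetricSamples helper.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"
)

// A small slice of what the scheduler's /metrics endpoint returns; the
// curl-over-SSH step from the hunk above is replaced by a literal here.
const metricsText = `# TYPE scheduler_binding_latency_microseconds summary
scheduler_binding_latency_microseconds{quantile="0.5"} 1200
scheduler_binding_latency_microseconds{quantile="0.99"} 9800
scheduler_binding_latency_microseconds_sum 22000
scheduler_binding_latency_microseconds_count 17
`

func main() {
	// Parse the text exposition format into metric families, then flatten them
	// into samples, roughly what a helper like extractMetricSamples must do.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(metricsText))
	if err != nil {
		panic(err)
	}
	for _, family := range families {
		samples, err := expfmt.ExtractSamples(&expfmt.DecodeOptions{Timestamp: model.Now()}, family)
		if err != nil {
			panic(err)
		}
		for _, sample := range samples {
			q, ok := sample.Metric[model.QuantileLabel]
			if !ok {
				continue // skip the _sum and _count series
			}
			quantile, _ := strconv.ParseFloat(string(q), 64)
			fmt.Printf("%s q=%.2f value=%vus\n", sample.Metric[model.MetricNameLabel], quantile, sample.Value)
		}
	}
}
```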