
Commit

chore: added cluster utilization metric and a couple of node lifetime metrics (#1603)
Youssef-Beltagy authored Aug 29, 2024
1 parent ee2f7d5 commit 14f12dd
Showing 6 changed files with 200 additions and 40 deletions.
133 changes: 109 additions & 24 deletions pkg/controllers/metrics/node/controller.go
@@ -49,67 +49,91 @@ const (
var (
allocatable = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "allocatable",
Help: "Node allocatable are the resources allocatable by nodes.",
},
nodeLabelNames(),
nodeLabelNamesWithResourceType(),
)
totalPodRequests = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "total_pod_requests",
Help: "Node total pod requests are the resources requested by pods bound to nodes, including the DaemonSet pods.",
},
nodeLabelNames(),
nodeLabelNamesWithResourceType(),
)
totalPodLimits = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "total_pod_limits",
Help: "Node total pod limits are the resources specified by pod limits, including the DaemonSet pods.",
},
nodeLabelNames(),
nodeLabelNamesWithResourceType(),
)
totalDaemonRequests = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "total_daemon_requests",
Help: "Node total daemon requests are the resource requested by DaemonSet pods bound to nodes.",
},
nodeLabelNames(),
nodeLabelNamesWithResourceType(),
)
totalDaemonLimits = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "total_daemon_limits",
Help: "Node total daemon limits are the resources specified by DaemonSet pod limits.",
},
nodeLabelNames(),
nodeLabelNamesWithResourceType(),
)
systemOverhead = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: "nodes",
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "system_overhead",
Help: "Node system daemon overhead are the resources reserved for system overhead, the difference between the node's capacity and allocatable values are reported by the status.",
},
nodeLabelNamesWithResourceType(),
)
lifetimeGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "current_lifetime_seconds",
Help: "Node age in seconds",
},
nodeLabelNames(),
)
clusterUtilizationGaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: "cluster",
Name: "utilization_percent",
Help: "Utilization of allocatable resources by pod requests",
},
[]string{resourceType},
)
wellKnownLabels = getWellKnownLabels()
)

func nodeLabelNamesWithResourceType() []string {
return append(
nodeLabelNames(),
resourceType,
)
}

func nodeLabelNames() []string {
return append(
// WellKnownLabels includes the nodepool label, so we don't need to add it as its own item here.
// If we do, prometheus will panic since there would be duplicate labels.
sets.New(lo.Values(wellKnownLabels)...).UnsortedList(),
resourceType,
nodeName,
nodePhase,
)
@@ -123,6 +147,8 @@ func init() {
totalDaemonRequests,
totalDaemonLimits,
systemOverhead,
lifetimeGaugeVec,
clusterUtilizationGaugeVec,
)
}

@@ -144,9 +170,17 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) {
nodes := lo.Reject(c.cluster.Nodes(), func(n *state.StateNode, _ int) bool {
return n.Node == nil
})
c.metricStore.ReplaceAll(lo.SliceToMap(nodes, func(n *state.StateNode) (string, []*metrics.StoreMetric) {

// Build per-node metrics
metricsMap := lo.SliceToMap(nodes, func(n *state.StateNode) (string, []*metrics.StoreMetric) {
return client.ObjectKeyFromObject(n.Node).String(), buildMetrics(n)
}))
})

// Build cluster level metric
metricsMap["clusterUtilization"] = buildClusterUtilizationMetric(nodes)

c.metricStore.ReplaceAll(metricsMap)

return reconcile.Result{RequeueAfter: time.Second * 5}, nil
}

@@ -157,6 +191,43 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error {
Complete(singleton.AsReconciler(c))
}

func buildClusterUtilizationMetric(nodes state.StateNodes) []*metrics.StoreMetric {

// Aggregate resources allocated/utilized for all the nodes and pods inside the nodes
allocatableAggregate, utilizedAggregate := corev1.ResourceList{}, corev1.ResourceList{}

for _, node := range nodes {
resources.MergeInto(allocatableAggregate, node.Allocatable())
resources.MergeInto(utilizedAggregate, node.PodRequests())
}

res := make([]*metrics.StoreMetric, 0, len(allocatableAggregate))

for resourceName, allocatableResource := range allocatableAggregate {

if allocatableResource.Value() == 0 {
// This zero check may be unnecessary. I'm erring towards caution.
continue
}

utilizedResource := utilizedAggregate[resourceName]

// Typecast to float before the calculation to maximize resolution
utilizationPercentage := 100 * lo.Ternary(
resourceName == corev1.ResourceCPU,
float64(utilizedResource.MilliValue())/float64(allocatableResource.MilliValue()),
float64(utilizedResource.Value())/float64(allocatableResource.Value()))

res = append(res, &metrics.StoreMetric{
GaugeVec: clusterUtilizationGaugeVec,
Value: utilizationPercentage,
Labels: prometheus.Labels{resourceType: resourceNameToString(resourceName)},
})
}

return res
}

func buildMetrics(n *state.StateNode) (res []*metrics.StoreMetric) {
for gaugeVec, resourceList := range map[*prometheus.GaugeVec]corev1.ResourceList{
systemOverhead: resources.Subtract(n.Node.Status.Capacity, n.Node.Status.Allocatable),
@@ -170,16 +241,26 @@ func buildMetrics(n *state.StateNode) (res []*metrics.StoreMetric) {
res = append(res, &metrics.StoreMetric{
GaugeVec: gaugeVec,
Value: lo.Ternary(resourceName == corev1.ResourceCPU, float64(quantity.MilliValue())/float64(1000), float64(quantity.Value())),
Labels: getNodeLabels(n.Node, strings.ReplaceAll(strings.ToLower(string(resourceName)), "-", "_")),
Labels: getNodeLabelsWithResourceType(n.Node, resourceNameToString(resourceName)),
})
}
}
return res
return append(res,
&metrics.StoreMetric{
GaugeVec: lifetimeGaugeVec,
Value: time.Since(n.Node.GetCreationTimestamp().Time).Seconds(),
Labels: getNodeLabels(n.Node),
})
}

func getNodeLabels(node *corev1.Node, resourceTypeName string) prometheus.Labels {
metricLabels := prometheus.Labels{}
func getNodeLabelsWithResourceType(node *corev1.Node, resourceTypeName string) prometheus.Labels {
metricLabels := getNodeLabels(node)
metricLabels[resourceType] = resourceTypeName
return metricLabels
}

func getNodeLabels(node *corev1.Node) prometheus.Labels {
metricLabels := prometheus.Labels{}
metricLabels[nodeName] = node.Name
metricLabels[nodePhase] = string(node.Status.Phase)

@@ -202,3 +283,7 @@ func getWellKnownLabels() map[string]string {
}
return labels
}

func resourceNameToString(resourceName corev1.ResourceName) string {
return strings.ReplaceAll(strings.ToLower(string(resourceName)), "-", "_")
}
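
The heart of the controller change is the utilization calculation in buildClusterUtilizationMetric: allocatable capacity and pod requests are summed across all tracked nodes, and each resource's requested-to-allocatable ratio is exposed as a percentage, with CPU compared in millicores to preserve sub-core precision. The standalone sketch below is not part of the commit; the resource quantities are invented for illustration, but the arithmetic mirrors the code above using plain client-go types.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Hypothetical cluster-wide aggregates, summed over every node's
	// allocatable capacity and every node's bound pod requests.
	allocatable := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("8"),
		corev1.ResourceMemory: resource.MustParse("32Gi"),
	}
	requested := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("1500m"),
		corev1.ResourceMemory: resource.MustParse("8Gi"),
	}

	for name, alloc := range allocatable {
		if alloc.Value() == 0 {
			continue // no allocatable capacity; avoid dividing by zero
		}
		used := requested[name]
		// CPU is compared in millicores so fractional cores aren't truncated;
		// other resources are compared on their whole values.
		var pct float64
		if name == corev1.ResourceCPU {
			pct = 100 * float64(used.MilliValue()) / float64(alloc.MilliValue())
		} else {
			pct = 100 * float64(used.Value()) / float64(alloc.Value())
		}
		fmt.Printf("karpenter_cluster_utilization_percent{resource_type=%q} %.2f\n", name, pct)
	}
}
```

With these illustrative values the gauge would report roughly 18.75 for CPU (1500m of 8000m) and 25.00 for memory (8Gi of 32Gi).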
39 changes: 28 additions & 11 deletions pkg/controllers/metrics/node/suite_test.go
@@ -78,14 +78,18 @@ var _ = AfterSuite(func() {
})

var _ = Describe("Node Metrics", func() {
It("should update the allocatable metric", func() {
resources := corev1.ResourceList{
var node *corev1.Node
var resources corev1.ResourceList

BeforeEach(func() {
resources = corev1.ResourceList{
corev1.ResourcePods: resource.MustParse("100"),
corev1.ResourceCPU: resource.MustParse("5000"),
corev1.ResourceMemory: resource.MustParse("32Gi"),
}

node := test.Node(test.NodeOptions{Allocatable: resources})
node = test.Node(test.NodeOptions{Allocatable: resources})
})
It("should update the allocatable metric", func() {
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
ExpectSingletonReconciled(ctx, metricsStateController)
@@ -99,14 +103,27 @@ var _ = Describe("Node Metrics", func() {
Expect(metric.GetGauge().GetValue()).To(BeNumerically("~", v.AsApproximateFloat64()))
}
})
It("should remove the node metric gauge when the node is deleted", func() {
resources := corev1.ResourceList{
corev1.ResourcePods: resource.MustParse("100"),
corev1.ResourceCPU: resource.MustParse("5000"),
corev1.ResourceMemory: resource.MustParse("32Gi"),
}
It("should update the node lifetime and cluster utilization metrics", func() {

node := test.Node(test.NodeOptions{Allocatable: resources})
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
ExpectSingletonReconciled(ctx, metricsStateController)

metric, found := FindMetricWithLabelValues("karpenter_nodes_current_lifetime_seconds", map[string]string{
"node_name": node.GetName(),
})
Expect(found).To(BeTrue())
Expect(metric.GetGauge().GetValue()).To(BeNumerically(">=", 0))

for resourceName := range resources {
metric, found := FindMetricWithLabelValues("karpenter_cluster_utilization_percent", map[string]string{
"resource_type": resourceName.String(),
})
Expect(found).To(BeTrue())
Expect(metric.GetGauge().GetValue()).To(BeNumerically("==", 0))
}
})
It("should remove the node metric gauge when the node is deleted", func() {
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
ExpectSingletonReconciled(ctx, metricsStateController)
7 changes: 7 additions & 0 deletions pkg/controllers/node/termination/controller.go
@@ -234,13 +234,20 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error
if err := c.kubeClient.Patch(ctx, n, client.StrategicMergeFrom(stored)); err != nil {
return client.IgnoreNotFound(fmt.Errorf("patching node, %w", err))
}

metrics.NodesTerminatedTotal.With(prometheus.Labels{
metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey],
}).Inc()

// We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp
TerminationDurationSeconds.With(prometheus.Labels{
metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey],
}).Observe(time.Since(stored.DeletionTimestamp.Time).Seconds())

NodeLifetimeDurationSeconds.With(map[string]string{
metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey],
}).Observe(time.Since(n.CreationTimestamp.Time).Seconds())

log.FromContext(ctx).Info("deleted node")
}
return nil
43 changes: 42 additions & 1 deletion pkg/controllers/node/termination/metrics.go
@@ -17,16 +17,22 @@ limitations under the License.
package termination

import (
"time"

"github.com/prometheus/client_golang/prometheus"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"

"sigs.k8s.io/karpenter/pkg/metrics"
)

func init() {
crmetrics.Registry.MustRegister(TerminationDurationSeconds)
crmetrics.Registry.MustRegister(
TerminationDurationSeconds,
NodeLifetimeDurationSeconds)
}

const dayDuration = time.Hour * 24

var (
TerminationDurationSeconds = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
@@ -38,4 +44,39 @@ var (
},
[]string{metrics.NodePoolLabel},
)
NodeLifetimeDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "lifetime_duration_seconds",
Help: "The lifetime duration of the nodes since creation.",
Buckets: []float64{

(time.Minute * 15).Seconds(),
(time.Minute * 30).Seconds(),
(time.Minute * 45).Seconds(),

time.Hour.Seconds(),
(time.Hour * 2).Seconds(),
(time.Hour * 4).Seconds(),
(time.Hour * 6).Seconds(),
(time.Hour * 8).Seconds(),
(time.Hour * 10).Seconds(),
(time.Hour * 12).Seconds(),
(time.Hour * 16).Seconds(),
(time.Hour * 20).Seconds(),

dayDuration.Seconds(),
(dayDuration * 2).Seconds(),
(dayDuration * 3).Seconds(),
(dayDuration * 5).Seconds(),
(dayDuration * 10).Seconds(),
(dayDuration * 15).Seconds(),
(dayDuration * 20).Seconds(),
(dayDuration * 25).Seconds(),
(dayDuration * 30).Seconds(),
},
},
[]string{metrics.NodePoolLabel},
)
)
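
The bucket list for the new NodeLifetimeDurationSeconds histogram grades from 15-minute steps below one hour, through selected hour marks, to daily marks out to 30 days. The sketch below is not the commit's code: it uses a trimmed, illustrative bucket list and a hypothetical "default" nodepool label to show how a single lifetime observation, made the way removeFinalizer does it, lands in the cumulative buckets.

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// A stand-in for NodeLifetimeDurationSeconds with a trimmed bucket list;
	// the names and buckets here are illustrative, not the commit's exact values.
	lifetime := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "karpenter",
		Subsystem: "nodes",
		Name:      "lifetime_duration_seconds",
		Help:      "The lifetime duration of the nodes since creation.",
		Buckets: []float64{
			(15 * time.Minute).Seconds(),
			time.Hour.Seconds(),
			(24 * time.Hour).Seconds(),
		},
	}, []string{"nodepool"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(lifetime)

	// Simulate the observation made when a node's finalizer is removed:
	// this node was created ~90 minutes ago.
	created := time.Now().Add(-90 * time.Minute)
	lifetime.With(prometheus.Labels{"nodepool": "default"}).
		Observe(time.Since(created).Seconds())

	// Dump cumulative bucket counts to show where the sample landed.
	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			for _, b := range m.GetHistogram().GetBucket() {
				fmt.Printf("le=%.0f count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
		}
	}
}
```

Running it shows the 90-minute observation accumulating only in the one-day bucket (and the implicit +Inf bucket), which is the kind of distribution the graded bucket list above is meant to surface on dashboards.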
13 changes: 13 additions & 0 deletions pkg/controllers/node/termination/suite_test.go
@@ -97,6 +97,7 @@ var _ = Describe("Termination", func() {
// Reset the metrics collectors
metrics.NodesTerminatedTotal.Reset()
termination.TerminationDurationSeconds.Reset()
termination.NodeLifetimeDurationSeconds.Reset()
})

Context("Reconciliation", func() {
@@ -846,6 +847,18 @@ var _ = Describe("Termination", func() {
Expect(ok).To(BeTrue())
Expect(lo.FromPtr(m.GetCounter().Value)).To(BeNumerically("==", 1))
})
It("should fire the lifetime duration histogram metric when deleting nodes", func() {
ExpectApplied(ctx, env.Client, node, nodeClaim)
Expect(env.Client.Delete(ctx, node)).To(Succeed())
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

m, ok := FindMetricWithLabelValues("karpenter_nodes_lifetime_duration_seconds", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]})
Expect(ok).To(BeTrue())
Expect(lo.FromPtr(m.GetHistogram().SampleCount)).To(BeNumerically("==", 1))
})
It("should update the eviction queueDepth metric when reconciling pods", func() {
minAvailable := intstr.FromInt32(0)
labelSelector := map[string]string{test.RandomName(): test.RandomName()}