diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md
index ff1d265a76..1d0a7ef9d9 100644
--- a/operator/CHANGELOG.md
+++ b/operator/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## Main
 
+- [9942](https://github.com/grafana/loki/pull/9942) **btaani**: Use a condition to warn when there are no nodes with matching labels for zone-awareness
+
 ## 0.4.0 (2023-07-27)
 
 - [10019](https://github.com/grafana/loki/pull/10019) **periklis**: Update Loki operand to v2.8.3
diff --git a/operator/apis/loki/v1/lokistack_types.go b/operator/apis/loki/v1/lokistack_types.go
index 3e4e34ae21..dbc597d481 100644
--- a/operator/apis/loki/v1/lokistack_types.go
+++ b/operator/apis/loki/v1/lokistack_types.go
@@ -990,6 +990,8 @@ const (
 	ReasonFailedCertificateRotation LokiStackConditionReason = "FailedCertificateRotation"
 	// ReasonQueryTimeoutInvalid when the QueryTimeout can not be parsed.
 	ReasonQueryTimeoutInvalid LokiStackConditionReason = "ReasonQueryTimeoutInvalid"
+	// ReasonNoZoneAwareNodes when the cluster does not contain any nodes with the labels needed for zone-awareness.
+	ReasonNoZoneAwareNodes LokiStackConditionReason = "ReasonNoZoneAwareNodes"
 )
 
 // PodStatusMap defines the type for mapping pod status to pod name.
diff --git a/operator/internal/status/lokistack.go b/operator/internal/status/lokistack.go
index 1a34d78658..838ff0c6ca 100644
--- a/operator/internal/status/lokistack.go
+++ b/operator/internal/status/lokistack.go
@@ -13,12 +13,14 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 const (
-	messageReady   = "All components ready"
-	messageFailed  = "Some LokiStack components failed"
-	messagePending = "Some LokiStack components pending on dependencies"
+	messageReady              = "All components ready"
+	messageFailed             = "Some LokiStack components failed"
+	messagePending            = "Some LokiStack components pending on dependencies"
+	messageDegradedNodeLabels = "Cluster contains no nodes matching the labels used for zone-awareness"
 )
 
 var (
@@ -37,6 +39,11 @@ var (
 		Message: messageReady,
 		Reason:  string(lokiv1.ReasonReadyComponents),
 	}
+	conditionDegradedNodeLabels = metav1.Condition{
+		Type:    string(lokiv1.ConditionDegraded),
+		Message: messageDegradedNodeLabels,
+		Reason:  string(lokiv1.ReasonNoZoneAwareNodes),
+	}
 )
 
 // DegradedError contains information about why the managed LokiStack has an invalid configuration.
@@ -61,7 +68,7 @@ func SetDegradedCondition(ctx context.Context, k k8s.Client, req ctrl.Request, m
 	return updateCondition(ctx, k, req, degraded)
 }
 
-func generateCondition(cs *lokiv1.LokiStackComponentStatus) metav1.Condition {
+func generateCondition(ctx context.Context, cs *lokiv1.LokiStackComponentStatus, k client.Client, req ctrl.Request, stack *lokiv1.LokiStack) (metav1.Condition, error) {
 	// Check for failed pods first
 	failed := len(cs.Compactor[corev1.PodFailed]) +
 		len(cs.Distributor[corev1.PodFailed]) +
@@ -73,7 +80,7 @@ func generateCondition(cs *lokiv1.LokiStackComponentStatus) metav1.Condition {
 		len(cs.Ruler[corev1.PodFailed])
 
 	if failed != 0 {
-		return conditionFailed
+		return conditionFailed, nil
 	}
 
 	// Check for pending pods
@@ -87,10 +94,37 @@ func generateCondition(cs *lokiv1.LokiStackComponentStatus) metav1.Condition {
 		len(cs.Ruler[corev1.PodPending])
 
 	if pending != 0 {
-		return conditionPending
+		if stack.Spec.Replication != nil && len(stack.Spec.Replication.Zones) > 0 {
+			// When there are pending pods and zone-awareness is enabled check if there are any nodes
+			// that can satisfy the constraints and emit a condition if not.
+			nodesOk, err := checkForZoneawareNodes(ctx, k, stack.Spec.Replication.Zones)
+			if err != nil {
+				return metav1.Condition{}, err
+			}
+
+			if !nodesOk {
+				return conditionDegradedNodeLabels, nil
+			}
+		}
+
+		return conditionPending, nil
+	}
+
+	return conditionReady, nil
+}
+
+func checkForZoneawareNodes(ctx context.Context, k client.Client, zones []lokiv1.ZoneSpec) (bool, error) {
+	nodeLabels := client.HasLabels{}
+	for _, z := range zones {
+		nodeLabels = append(nodeLabels, z.TopologyKey)
+	}
+
+	nodeList := &corev1.NodeList{}
+	if err := k.List(ctx, nodeList, nodeLabels); err != nil {
+		return false, err
 	}
 
-	return conditionReady
+	return len(nodeList.Items) > 0, nil
 }
 
 func updateCondition(ctx context.Context, k k8s.Client, req ctrl.Request, condition metav1.Condition) error {
diff --git a/operator/internal/status/lokistack_test.go b/operator/internal/status/lokistack_test.go
index 7e60cc8afd..64726795e5 100644
--- a/operator/internal/status/lokistack_test.go
+++ b/operator/internal/status/lokistack_test.go
@@ -2,6 +2,7 @@ package status
 
 import (
 	"context"
+	"errors"
 	"testing"
 
 	lokiv1 "github.com/grafana/loki/operator/apis/loki/v1"
@@ -152,7 +153,23 @@ func TestSetDegradedCondition_WhenNoneExisting_AppendDegradedCondition(t *testin
 	require.NotZero(t, sw.UpdateCallCount())
 }
 
-func TestGenerateConditions(t *testing.T) {
+func TestGenerateCondition(t *testing.T) {
+	k := &k8sfakes.FakeClient{}
+	r := ctrl.Request{
+		NamespacedName: types.NamespacedName{
+			Name:      "test-lokistack",
+			Namespace: "some-ns",
+		},
+	}
+	lokiStack := lokiv1.LokiStack{
+		TypeMeta: metav1.TypeMeta{
+			Kind: "LokiStack",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-lokistack",
+			Namespace: "test-ns",
+		},
+	}
 	tt := []struct {
 		desc            string
 		componentStatus *lokiv1.LokiStackComponentStatus
@@ -192,7 +209,95 @@ t.Run(tc.desc, func(t *testing.T) {
 			t.Parallel()
 
-			condition := generateCondition(tc.componentStatus)
+			condition, err := generateCondition(context.TODO(), tc.componentStatus, k, r, &lokiStack)
+			require.Nil(t, err)
+			require.Equal(t, tc.wantCondition, condition)
+		})
+	}
+}
+
+func TestGenerateCondition_ZoneAwareLokiStack(t *testing.T) {
+	testError := errors.New("test-error")
+	tt := []struct {
+		desc          string
+		nodes         []corev1.Node
+		wantCondition metav1.Condition
+		wantErr       error
+	}{
+		{
+			desc: "nodes available",
+			nodes: []corev1.Node{
+				{},
+			},
+			wantCondition: conditionPending,
+		},
+		{
+			desc:          "no nodes available",
+			nodes:         []corev1.Node{},
+			wantCondition: conditionDegradedNodeLabels,
+		},
+		{
+			desc:    "api error",
+			nodes:   []corev1.Node{},
+			wantErr: testError,
+		},
+	}
+
+	for _, tc := range tt {
+		tc := tc
+		t.Run(tc.desc, func(t *testing.T) {
+			t.Parallel()
+
+			r := ctrl.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      "test-lokistack",
+					Namespace: "some-ns",
+				},
+			}
+			componentStatus := &lokiv1.LokiStackComponentStatus{
+				Ingester: map[corev1.PodPhase][]string{
+					corev1.PodPending: {
+						"pod-0",
+					},
+				},
+			}
+			lokiStack := lokiv1.LokiStack{
+				Spec: lokiv1.LokiStackSpec{
+					Replication: &lokiv1.ReplicationSpec{
+						Zones: []lokiv1.ZoneSpec{
+							{
+								TopologyKey: "topology-key",
+							},
+						},
+					},
+				},
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-lokistack",
+					Namespace: "test-ns",
+				},
+				TypeMeta: metav1.TypeMeta{
+					Kind: "LokiStack",
+				},
+			}
+
+			k, _ := setupFakesNoError(t, &lokiStack)
+			k.ListStub = func(_ context.Context, ol client.ObjectList, options ...client.ListOption) error {
+				for _, o := range options {
+					if labels, ok := o.(client.HasLabels); ok {
+						require.Len(t, labels, 1)
+						require.Equal(t, "topology-key", labels[0])
+					}
+				}
+
+				k.SetClientObjectList(ol, &corev1.NodeList{
+					Items: tc.nodes,
+				})
+				return tc.wantErr
+			}
+
+			condition, err := generateCondition(context.TODO(), componentStatus, k, r, &lokiStack)
+
+			require.Equal(t, tc.wantErr, err)
 			require.Equal(t, tc.wantCondition, condition)
 		})
 	}
 }
diff --git a/operator/internal/status/status.go b/operator/internal/status/status.go
index ca4e7c1bf3..2382254bf5 100644
--- a/operator/internal/status/status.go
+++ b/operator/internal/status/status.go
@@ -31,7 +31,10 @@ func Refresh(ctx context.Context, k k8s.Client, req ctrl.Request, now time.Time)
 		return err
 	}
 
-	condition := generateCondition(cs)
+	condition, err := generateCondition(ctx, cs, k, req, &stack)
+	if err != nil {
+		return err
+	}
 	condition.LastTransitionTime = metav1.NewTime(now)
 	condition.Status = metav1.ConditionTrue
 
diff --git a/operator/internal/status/status_test.go b/operator/internal/status/status_test.go
index 81ecc15345..6befb13df8 100644
--- a/operator/internal/status/status_test.go
+++ b/operator/internal/status/status_test.go
@@ -12,6 +12,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 func TestRefreshSuccess(t *testing.T) {
@@ -81,3 +82,67 @@ func TestRefreshSuccess(t *testing.T) {
 
 	require.Equal(t, wantStatus, updatedStack.Status)
 }
+
+func TestRefreshSuccess_ZoneAwarePendingPod(t *testing.T) {
+	now := time.Now()
+	req := ctrl.Request{
+		NamespacedName: types.NamespacedName{
+			Name:      "test-stack",
+			Namespace: "test-ns",
+		},
+	}
+	stack := lokiv1.LokiStack{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-stack",
+			Namespace: "test-ns",
+		},
+		Spec: lokiv1.LokiStackSpec{
+			Replication: &lokiv1.ReplicationSpec{
+				Zones: []lokiv1.ZoneSpec{
+					{
+						TopologyKey: corev1.LabelTopologyZone,
+					},
+				},
+			},
+		},
+	}
+	testPod := corev1.Pod{
+		Status: corev1.PodStatus{
+			Phase: corev1.PodPending,
+		},
+	}
+
+	k, sw := setupFakesNoError(t, &stack)
+	k.ListStub = func(ctx context.Context, ol client.ObjectList, _ ...client.ListOption) error {
+		switch ol.(type) {
+		case *corev1.PodList:
+			k.SetClientObjectList(ol, &corev1.PodList{
+				Items: []corev1.Pod{
+					testPod,
+				},
+			})
+		case *corev1.NodeList:
+			k.SetClientObjectList(ol, &corev1.NodeList{
+				Items: []corev1.Node{},
+			})
+		}
+		return nil
+	}
+
+	err := Refresh(context.Background(), k, req, now)
+
+	require.NoError(t, err)
+	require.Equal(t, 1, k.GetCallCount())
+	require.Equal(t, 9, k.ListCallCount())
+	require.Equal(t, 1, sw.UpdateCallCount())
+	_, updated, _ := sw.UpdateArgsForCall(0)
+	updatedStack, ok := updated.(*lokiv1.LokiStack)
+	if !ok {
+		t.Fatalf("not a LokiStack: %T", updatedStack)
+	}
+
+	require.Len(t, updatedStack.Status.Conditions, 1)
+	condition := updatedStack.Status.Conditions[0]
+	require.Equal(t, conditionDegradedNodeLabels.Reason, condition.Reason)
+	require.Equal(t, conditionDegradedNodeLabels.Type, condition.Type)
+}
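For readers who consume the LokiStack status rather than the operator internals, the sketch below shows how the new `Degraded` condition introduced above could be detected on a `LokiStack` object. It is not part of the PR; the helper name `hasNoZoneAwareNodes` is made up for illustration, and it assumes the object was fetched from the API server with its `Status.Conditions` already populated by the operator.

```go
// Illustrative sketch, not part of the change set above.
package main

import (
	"fmt"

	lokiv1 "github.com/grafana/loki/operator/apis/loki/v1"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// hasNoZoneAwareNodes reports whether the LokiStack carries the Degraded
// condition with the ReasonNoZoneAwareNodes reason added by this change,
// i.e. zone-awareness is enabled but no node has the required topology labels.
func hasNoZoneAwareNodes(stack *lokiv1.LokiStack) bool {
	cond := apimeta.FindStatusCondition(stack.Status.Conditions, string(lokiv1.ConditionDegraded))
	return cond != nil &&
		cond.Status == metav1.ConditionTrue &&
		cond.Reason == string(lokiv1.ReasonNoZoneAwareNodes)
}

func main() {
	// Hypothetical usage with a zero-value object; in practice the LokiStack
	// would come from a client.Get call against the cluster.
	fmt.Println(hasNoZoneAwareNodes(&lokiv1.LokiStack{}))
}
```

Per `generateCondition` above, the operator only emits this condition when pods are pending and `spec.replication.zones` is configured, so a true result points at missing node topology labels rather than a generic scheduling delay.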