From 78b8801ea4fcd7709f90c0ecc092f0f035b8165d Mon Sep 17 00:00:00 2001 From: whitewindmills Date: Mon, 19 Aug 2024 18:04:53 +0800 Subject: [PATCH 1/2] add new cluster condition: CompleteAPIEnablements Signed-off-by: whitewindmills --- pkg/apis/cluster/types.go | 3 +++ pkg/apis/cluster/v1alpha1/types.go | 3 +++ .../status/cluster_status_controller.go | 25 ++++++++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pkg/apis/cluster/types.go b/pkg/apis/cluster/types.go index 05d783cb5283..f7f8aea3df97 100644 --- a/pkg/apis/cluster/types.go +++ b/pkg/apis/cluster/types.go @@ -266,6 +266,9 @@ type LocalSecretReference struct { const ( // ClusterConditionReady means the cluster is healthy and ready to accept workloads. ClusterConditionReady = "Ready" + + // ClusterConditionCompleteAPIEnablements indicates whether the cluster's API enablements(.status.apiEnablements) is complete. + ClusterConditionCompleteAPIEnablements = "CompleteAPIEnablements" ) // ClusterStatus contains information about the current status of a diff --git a/pkg/apis/cluster/v1alpha1/types.go b/pkg/apis/cluster/v1alpha1/types.go index 29cfc61da0fc..fe56dca7da94 100644 --- a/pkg/apis/cluster/v1alpha1/types.go +++ b/pkg/apis/cluster/v1alpha1/types.go @@ -278,6 +278,9 @@ type LocalSecretReference struct { const ( // ClusterConditionReady means the cluster is healthy and ready to accept workloads. ClusterConditionReady = "Ready" + + // ClusterConditionCompleteAPIEnablements indicates whether the cluster's API enablements(.status.apiEnablements) is complete. 
+ ClusterConditionCompleteAPIEnablements = "CompleteAPIEnablements" ) // ClusterStatus contains information about the current status of a diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go index 45a4b98d686e..5efd583c4f95 100644 --- a/pkg/controllers/status/cluster_status_controller.go +++ b/pkg/controllers/status/cluster_status_controller.go @@ -67,6 +67,10 @@ const ( clusterNotReachableReason = "ClusterNotReachable" clusterNotReachableMsg = "cluster is not reachable" statusCollectionFailed = "StatusCollectionFailed" + + apiEnablementsComplete = "Complete" + apiEnablementPartialAPIEnablements = "Partial" + apiEnablementEmptyAPIEnablements = "Empty" ) var ( @@ -214,29 +218,42 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu // can be safely removed from current controller. c.initializeGenericInformerManagerForCluster(clusterClient) - err = c.setCurrentClusterStatus(clusterClient, cluster, &currentClusterStatus) + var conditions []metav1.Condition + conditions, err = c.setCurrentClusterStatus(clusterClient, cluster, &currentClusterStatus) if err != nil { return err } + conditions = append(conditions, *readyCondition) + return c.updateStatusIfNeeded(cluster, currentClusterStatus, conditions...) } return c.updateStatusIfNeeded(cluster, currentClusterStatus, *readyCondition) } -func (c *ClusterStatusController) setCurrentClusterStatus(clusterClient *util.ClusterClient, cluster *clusterv1alpha1.Cluster, currentClusterStatus *clusterv1alpha1.ClusterStatus) error { +func (c *ClusterStatusController) setCurrentClusterStatus(clusterClient *util.ClusterClient, cluster *clusterv1alpha1.Cluster, currentClusterStatus *clusterv1alpha1.ClusterStatus) ([]metav1.Condition, error) { + var conditions []metav1.Condition clusterVersion, err := getKubernetesVersion(clusterClient) if err != nil { klog.Errorf("Failed to get Kubernetes version for Cluster %s. 
Error: %v.", cluster.GetName(), err) } currentClusterStatus.KubernetesVersion = clusterVersion + var apiEnablementCondition metav1.Condition // get the list of APIs installed in the member cluster apiEnables, err := getAPIEnablements(clusterClient) if len(apiEnables) == 0 { + apiEnablementCondition = util.NewCondition(clusterv1alpha1.ClusterConditionCompleteAPIEnablements, + apiEnablementEmptyAPIEnablements, "collected empty APIEnablements from the cluster", metav1.ConditionFalse) klog.Errorf("Failed to get any APIs installed in Cluster %s. Error: %v.", cluster.GetName(), err) } else if err != nil { + apiEnablementCondition = util.NewCondition(clusterv1alpha1.ClusterConditionCompleteAPIEnablements, + apiEnablementPartialAPIEnablements, fmt.Sprintf("might collect partial APIEnablements(%d) from the cluster", len(apiEnables)), metav1.ConditionFalse) klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err) + } else { + apiEnablementCondition = util.NewCondition(clusterv1alpha1.ClusterConditionCompleteAPIEnablements, + apiEnablementsComplete, "collected complete APIEnablements from the cluster", metav1.ConditionTrue) } + conditions = append(conditions, apiEnablementCondition) currentClusterStatus.APIEnablements = apiEnables if c.EnableClusterResourceModeling { @@ -246,7 +263,7 @@ func (c *ClusterStatusController) setCurrentClusterStatus(clusterClient *util.Cl klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err) // in large-scale clusters, the timeout may occur. 
// if clusterInformerManager fails to be built, should be returned, otherwise, it may cause a nil pointer - return err + return nil, err } nodes, err := listNodes(clusterInformerManager) if err != nil { @@ -264,7 +281,7 @@ func (c *ClusterStatusController) setCurrentClusterStatus(clusterClient *util.Cl currentClusterStatus.ResourceSummary.AllocatableModelings = getAllocatableModelings(cluster, nodes, pods) } } - return nil + return conditions, nil } func setStatusCollectionFailedCondition(c client.Client, cluster *clusterv1alpha1.Cluster, message string) error { From 6ab6fb6e675b7e81e57959605b4fef841a5986b2 Mon Sep 17 00:00:00 2001 From: huangyanfeng Date: Thu, 18 Jul 2024 10:16:13 +0800 Subject: [PATCH 2/2] Skip cluster removal if already scheduled and API enablements are incomplete to prevent accidental removal. Signed-off-by: huangyanfeng --- .../plugins/apienablement/api_enablement.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pkg/scheduler/framework/plugins/apienablement/api_enablement.go b/pkg/scheduler/framework/plugins/apienablement/api_enablement.go index 15d32ab35951..791180f639e4 100644 --- a/pkg/scheduler/framework/plugins/apienablement/api_enablement.go +++ b/pkg/scheduler/framework/plugins/apienablement/api_enablement.go @@ -19,6 +19,7 @@ package apienablement import ( "context" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/klog/v2" clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1" @@ -54,10 +55,19 @@ func (p *APIEnablement) Filter( _ *workv1alpha2.ResourceBindingStatus, cluster *clusterv1alpha1.Cluster, ) *framework.Result { - if !helper.IsAPIEnabled(cluster.Status.APIEnablements, bindingSpec.Resource.APIVersion, bindingSpec.Resource.Kind) { - klog.V(2).Infof("Cluster(%s) not fit as missing API(%s, kind=%s)", cluster.Name, bindingSpec.Resource.APIVersion, bindingSpec.Resource.Kind) - return framework.NewResult(framework.Unschedulable, "cluster(s) did not have the API resource") + if 
helper.IsAPIEnabled(cluster.Status.APIEnablements, bindingSpec.Resource.APIVersion, bindingSpec.Resource.Kind) { + return framework.NewResult(framework.Success) } - return framework.NewResult(framework.Success) + // Let the cluster pass if it is already on the list of schedule result and the cluster's + // API enablements is incomplete, to avoid the issue that cluster be accidentally removed + // due to untrusted API enablements. + if bindingSpec.TargetContains(cluster.Name) && + !meta.IsStatusConditionTrue(cluster.Status.Conditions, clusterv1alpha1.ClusterConditionCompleteAPIEnablements) { + return framework.NewResult(framework.Success) + } + + klog.V(2).Infof("Cluster(%s) not fit as missing API(%s, kind=%s)", cluster.Name, bindingSpec.Resource.APIVersion, bindingSpec.Resource.Kind) + + return framework.NewResult(framework.Unschedulable, "cluster(s) did not have the API resource") }