Skip to content

Commit 9937f8f

Browse files
authored
Merge pull request #7929 from elmiko/issue-7928-decrease-size-fix
make DecreaseTargetSize more accurate for clusterapi
2 parents f04fd5b + 003e6cd commit 9937f8f

File tree

4 files changed

+283
-37
lines changed

4 files changed

+283
-37
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go

+23
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ const (
5555
resourceNameMachineSet = "machinesets"
5656
resourceNameMachineDeployment = "machinedeployments"
5757
resourceNameMachinePool = "machinepools"
58+
deletingMachinePrefix = "deleting-machine-"
5859
failedMachinePrefix = "failed-machine-"
5960
pendingMachinePrefix = "pending-machine-"
6061
machineTemplateKind = "MachineTemplate"
@@ -314,6 +315,9 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
314315
return u.DeepCopy(), nil
315316
}
316317

318+
if isDeletingMachineProviderID(providerID) {
319+
return c.findMachine(machineKeyFromDeletingMachineProviderID(providerID))
320+
}
317321
if isFailedMachineProviderID(providerID) {
318322
return c.findMachine(machineKeyFromFailedProviderID(providerID))
319323
}
@@ -339,6 +343,19 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
339343
return c.findMachine(path.Join(ns, machineID))
340344
}
341345

346+
func createDeletingMachineNormalizedProviderID(namespace, name string) string {
347+
return fmt.Sprintf("%s%s_%s", deletingMachinePrefix, namespace, name)
348+
}
349+
350+
func isDeletingMachineProviderID(providerID normalizedProviderID) bool {
351+
return strings.HasPrefix(string(providerID), deletingMachinePrefix)
352+
}
353+
354+
func machineKeyFromDeletingMachineProviderID(providerID normalizedProviderID) string {
355+
namespaceName := strings.TrimPrefix(string(providerID), deletingMachinePrefix)
356+
return strings.Replace(namespaceName, "_", "/", 1)
357+
}
358+
342359
func isPendingMachineProviderID(providerID normalizedProviderID) bool {
343360
return strings.HasPrefix(string(providerID), pendingMachinePrefix)
344361
}
@@ -610,6 +627,12 @@ func (c *machineController) findScalableResourceProviderIDs(scalableResource *un
610627

611628
if found {
612629
if providerID != "" {
630+
// If the machine is deleting, prepend the deletion guard on the provider id
631+
// so that it can be properly filtered when counting the number of nodes and instances.
632+
if !machine.GetDeletionTimestamp().IsZero() {
633+
klog.V(4).Infof("Machine %q has a non-zero deletion timestamp", machine.GetName())
634+
providerID = createDeletingMachineNormalizedProviderID(machine.GetNamespace(), machine.GetName())
635+
}
613636
providerIDs = append(providerIDs, providerID)
614637
continue
615638
}

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go

+113
Original file line numberDiff line numberDiff line change
@@ -2131,3 +2131,116 @@ func Test_machineController_nodeGroups(t *testing.T) {
21312131
})
21322132
}
21332133
}
2134+
2135+
func Test_isDeletingMachineProviderID(t *testing.T) {
2136+
type testCase struct {
2137+
description string
2138+
providerID string
2139+
expectedReturn bool
2140+
}
2141+
2142+
testCases := []testCase{
2143+
{
2144+
description: "proper provider ID without deletion prefix should return false",
2145+
providerID: "fake-provider://a.provider.id-0001",
2146+
expectedReturn: false,
2147+
},
2148+
{
2149+
description: "provider ID with deletion prefix should return true",
2150+
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
2151+
expectedReturn: true,
2152+
},
2153+
{
2154+
description: "empty provider ID should return false",
2155+
providerID: "",
2156+
expectedReturn: false,
2157+
},
2158+
{
2159+
description: "provider ID created with createDeletingMachineNormalizedProviderID returns true",
2160+
providerID: createDeletingMachineNormalizedProviderID("cluster-api", "id-0001"),
2161+
expectedReturn: true,
2162+
},
2163+
}
2164+
2165+
for _, tc := range testCases {
2166+
t.Run(tc.description, func(t *testing.T) {
2167+
observed := isDeletingMachineProviderID(normalizedProviderID(tc.providerID))
2168+
if observed != tc.expectedReturn {
2169+
t.Fatalf("unexpected return for provider ID %q, expected %t, observed %t", tc.providerID, tc.expectedReturn, observed)
2170+
}
2171+
})
2172+
}
2173+
2174+
}
2175+
2176+
func Test_machineKeyFromDeletingMachineProviderID(t *testing.T) {
2177+
type testCase struct {
2178+
description string
2179+
providerID string
2180+
expectedReturn string
2181+
}
2182+
2183+
testCases := []testCase{
2184+
{
2185+
description: "real looking provider ID with no deletion prefix returns provider id",
2186+
providerID: "fake-provider://a.provider.id-0001",
2187+
expectedReturn: "fake-provider://a.provider.id-0001",
2188+
},
2189+
{
2190+
description: "namespace_name provider ID with no deletion prefix returns proper provider ID",
2191+
providerID: "cluster-api_id-0001",
2192+
expectedReturn: "cluster-api/id-0001",
2193+
},
2194+
{
2195+
description: "namespace_name provider ID with deletion prefix returns proper provider ID",
2196+
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
2197+
expectedReturn: "cluster-api/id-0001",
2198+
},
2199+
{
2200+
description: "namespace_name provider ID with deletion prefix and two underscores returns proper provider ID",
2201+
providerID: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id_0001"),
2202+
expectedReturn: "cluster-api/id_0001",
2203+
},
2204+
{
2205+
description: "provider ID created with createDeletingMachineNormalizedProviderID returns proper provider ID",
2206+
providerID: createDeletingMachineNormalizedProviderID("cluster-api", "id-0001"),
2207+
expectedReturn: "cluster-api/id-0001",
2208+
},
2209+
}
2210+
2211+
for _, tc := range testCases {
2212+
t.Run(tc.description, func(t *testing.T) {
2213+
observed := machineKeyFromDeletingMachineProviderID(normalizedProviderID(tc.providerID))
2214+
if observed != tc.expectedReturn {
2215+
t.Fatalf("unexpected return for provider ID %q, expected %q, observed %q", tc.providerID, tc.expectedReturn, observed)
2216+
}
2217+
})
2218+
}
2219+
}
2220+
2221+
func Test_createDeletingMachineNormalizedProviderID(t *testing.T) {
2222+
type testCase struct {
2223+
description string
2224+
namespace string
2225+
name string
2226+
expectedReturn string
2227+
}
2228+
2229+
testCases := []testCase{
2230+
{
2231+
description: "namespace and name return proper normalized ID",
2232+
namespace: "cluster-api",
2233+
name: "id-0001",
2234+
expectedReturn: fmt.Sprintf("%s%s_%s", deletingMachinePrefix, "cluster-api", "id-0001"),
2235+
},
2236+
}
2237+
2238+
for _, tc := range testCases {
2239+
t.Run(tc.description, func(t *testing.T) {
2240+
observed := createDeletingMachineNormalizedProviderID(tc.namespace, tc.name)
2241+
if observed != tc.expectedReturn {
2242+
t.Fatalf("unexpected return for (namespace %q, name %q), expected %q, observed %q", tc.namespace, tc.name, tc.expectedReturn, observed)
2243+
}
2244+
})
2245+
}
2246+
}

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

+20-3
Original file line numberDiff line numberDiff line change
@@ -203,11 +203,28 @@ func (ng *nodegroup) DecreaseTargetSize(delta int) error {
203203
return err
204204
}
205205

206-
if size+delta < len(nodes) {
207-
return fmt.Errorf("attempt to delete existing nodes targetSize:%d delta:%d existingNodes: %d",
208-
size, delta, len(nodes))
206+
// we want to filter out machines that are not nodes (eg failed or pending)
207+
// so that the node group size can be set properly. this affects situations
208+
// where an instance is created, but cannot become a node due to cloud provider
209+
// issues such as quota limitations, and thus the autoscaler needs to properly
210+
// set the size of the node group. without this adjustment, the core autoscaler
211+
// will become confused about the state of nodes and instances in the clusterapi
212+
// provider.
213+
actualNodes := 0
214+
for _, node := range nodes {
215+
if !isPendingMachineProviderID(normalizedProviderID(node.Id)) &&
216+
!isFailedMachineProviderID(normalizedProviderID(node.Id)) &&
217+
!isDeletingMachineProviderID(normalizedProviderID(node.Id)) {
218+
actualNodes += 1
219+
}
220+
}
221+
222+
if size+delta < actualNodes {
223+
return fmt.Errorf("node group %s: attempt to delete existing nodes currentReplicas:%d delta:%d existingNodes: %d",
224+
ng.scalableResource.Name(), size, delta, actualNodes)
209225
}
210226

227+
klog.V(4).Infof("%s: DecreaseTargetSize: scaling down: currentReplicas: %d, delta: %d, existingNodes: %d", ng.scalableResource.Name(), size, delta, len(nodes))
211228
return ng.scalableResource.SetSize(size + delta)
212229
}
213230

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

+127-34
Original file line numberDiff line numberDiff line change
@@ -406,18 +406,73 @@ func TestNodeGroupIncreaseSize(t *testing.T) {
406406

407407
func TestNodeGroupDecreaseTargetSize(t *testing.T) {
408408
type testCase struct {
409-
description string
410-
delta int
411-
initial int32
412-
targetSizeIncrement int32
413-
expected int32
414-
expectedError bool
409+
description string
410+
delta int
411+
initial int32
412+
targetSizeIncrement int32
413+
expected int32
414+
expectedError bool
415+
includeDeletingMachine bool
416+
includeFailedMachine bool
417+
includePendingMachine bool
415418
}
416419

417420
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {
418421
controller, stop := mustCreateTestController(t, testConfig)
419422
defer stop()
420423

424+
// machines in deletion should not be counted towards the active nodes when calculating a decrease in size.
425+
if tc.includeDeletingMachine {
426+
if tc.initial < 1 {
427+
t.Fatal("test cannot pass, deleted machine requires at least 1 machine in machineset")
428+
}
429+
430+
// Simulate a machine in deleting
431+
machine := testConfig.machines[0].DeepCopy()
432+
timestamp := metav1.Now()
433+
machine.SetDeletionTimestamp(&timestamp)
434+
435+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
436+
t.Fatalf("unexpected error updating machine, got %v", err)
437+
}
438+
}
439+
440+
// machines that have failed should not be counted towards the active nodes when calculating a decrease in size.
441+
if tc.includeFailedMachine {
442+
// because we want to allow for tests that have deleted machines and failed machines, we use the second machine in the test data.
443+
if tc.initial < 2 {
444+
t.Fatal("test cannot pass, failed machine requires at least 2 machine in machineset")
445+
}
446+
447+
// Simulate a failed machine
448+
machine := testConfig.machines[1].DeepCopy()
449+
450+
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
451+
unstructured.SetNestedField(machine.Object, "FailureMessage", "status", "failureMessage")
452+
453+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
454+
t.Fatalf("unexpected error updating machine, got %v", err)
455+
}
456+
}
457+
458+
// machines that are in pending state should not be counted towards the active nodes when calculating a decrease in size.
459+
if tc.includePendingMachine {
460+
// because we want to allow for tests that have deleted, failed machines, and pending machine, we use the third machine in the test data.
461+
if tc.initial < 3 {
462+
t.Fatal("test cannot pass, pending machine requires at least 3 machine in machineset")
463+
}
464+
465+
// Simulate a pending machine
466+
machine := testConfig.machines[2].DeepCopy()
467+
468+
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
469+
unstructured.RemoveNestedField(machine.Object, "status", "nodeRef")
470+
471+
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
472+
t.Fatalf("unexpected error updating machine, got %v", err)
473+
}
474+
}
475+
421476
nodegroups, err := controller.nodeGroups()
422477
if err != nil {
423478
t.Fatalf("unexpected error: %v", err)
@@ -522,45 +577,83 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
522577
}
523578
}
524579

525-
annotations := map[string]string{
526-
nodeGroupMinSizeAnnotationKey: "1",
527-
nodeGroupMaxSizeAnnotationKey: "10",
528-
}
529-
530-
t.Run("MachineSet", func(t *testing.T) {
531-
tc := testCase{
580+
testCases := []testCase{
581+
{
532582
description: "Same number of existing instances and node group target size should error",
533583
initial: 3,
534584
targetSizeIncrement: 0,
535585
expected: 3,
536586
delta: -1,
537587
expectedError: true,
538-
}
539-
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
540-
})
541-
542-
t.Run("MachineSet", func(t *testing.T) {
543-
tc := testCase{
588+
},
589+
{
544590
description: "A node group with target size 4 but only 3 existing instances should decrease by 1",
545591
initial: 3,
546592
targetSizeIncrement: 1,
547593
expected: 3,
548594
delta: -1,
549-
}
550-
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
551-
})
595+
},
596+
{
597+
description: "A node group with 4 replicas with one machine in deleting state should decrease by 1",
598+
initial: 4,
599+
targetSizeIncrement: 0,
600+
expected: 3,
601+
delta: -1,
602+
includeDeletingMachine: true,
603+
},
604+
{
605+
description: "A node group with 4 replicas with one failed machine should decrease by 1",
606+
initial: 4,
607+
targetSizeIncrement: 0,
608+
expected: 3,
609+
delta: -1,
610+
includeFailedMachine: true,
611+
},
612+
{
613+
description: "A node group with 4 replicas with one pending machine should decrease by 1",
614+
initial: 4,
615+
targetSizeIncrement: 0,
616+
expected: 3,
617+
delta: -1,
618+
includePendingMachine: true,
619+
},
620+
{
621+
description: "A node group with 5 replicas with one pending and one failed machine should decrease by 2",
622+
initial: 5,
623+
targetSizeIncrement: 0,
624+
expected: 3,
625+
delta: -2,
626+
includeFailedMachine: true,
627+
includePendingMachine: true,
628+
},
629+
{
630+
description: "A node group with 5 replicas with one pending, one failed, and one deleting machine should decrease by 3",
631+
initial: 5,
632+
targetSizeIncrement: 0,
633+
expected: 2,
634+
delta: -3,
635+
includeFailedMachine: true,
636+
includePendingMachine: true,
637+
includeDeletingMachine: true,
638+
},
639+
}
552640

553-
t.Run("MachineDeployment", func(t *testing.T) {
554-
tc := testCase{
555-
description: "Same number of existing instances and node group target size should error",
556-
initial: 3,
557-
targetSizeIncrement: 0,
558-
expected: 3,
559-
delta: -1,
560-
expectedError: true,
561-
}
562-
test(t, &tc, createMachineDeploymentTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
563-
})
641+
annotations := map[string]string{
642+
nodeGroupMinSizeAnnotationKey: "1",
643+
nodeGroupMaxSizeAnnotationKey: "10",
644+
}
645+
646+
for _, tc := range testCases {
647+
t.Run(tc.description, func(t *testing.T) {
648+
test(t, &tc, createMachineSetTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
649+
})
650+
}
651+
652+
for _, tc := range testCases {
653+
t.Run(tc.description, func(t *testing.T) {
654+
test(t, &tc, createMachineDeploymentTestConfig(RandomString(6), RandomString(6), RandomString(6), int(tc.initial), annotations, nil))
655+
})
656+
}
564657
}
565658

566659
func TestNodeGroupDecreaseSizeErrors(t *testing.T) {
@@ -580,7 +673,7 @@ func TestNodeGroupDecreaseSizeErrors(t *testing.T) {
580673
description: "errors because initial+delta < len(nodes)",
581674
delta: -1,
582675
initial: 3,
583-
errorMsg: "attempt to delete existing nodes targetSize:3 delta:-1 existingNodes: 3",
676+
errorMsg: "attempt to delete existing nodes currentReplicas:3 delta:-1 existingNodes: 3",
584677
}}
585678

586679
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {

0 commit comments

Comments
 (0)