Skip to content

Commit f5ff9ef

Browse files
committed
refactor: more instance-generic async provisioning
1 parent dd42341 commit f5ff9ef

File tree

3 files changed

+125
-84
lines changed

3 files changed

+125
-84
lines changed

pkg/cloudprovider/cloudprovider.go

Lines changed: 74 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -96,30 +96,6 @@ func New(
9696
}
9797
}
9898

99-
// Create a node given the constraints.
100-
func (c *CloudProvider) handleVMInstanceCreation(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) error {
101-
if c.isStandaloneNodeClaim(nodeClaim) {
102-
// processStandaloneNodeClaimDeletion:
103-
// Standalone NodeClaims aren’t re-queued for reconciliation in the provision_trigger controller,
104-
// so we delete them synchronously. After marking Launched=true,
105-
// their status can’t be reverted to false once the delete completes due to how core caches nodeclaims in
106-
// the launch controller. This ensures we retry continuously until we hit the registration TTL
107-
err := vmPromise.Wait()
108-
if err != nil {
109-
return c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, false)
110-
}
111-
} else {
112-
// For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
113-
// Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
114-
// crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
115-
// no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
116-
// be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
117-
// only held in memory.
118-
go c.waitOnPromise(ctx, vmPromise, nodeClaim)
119-
}
120-
return nil
121-
}
122-
12399
func (c *CloudProvider) validateNodeClass(nodeClass *v1beta1.AKSNodeClass) error {
124100
nodeClassReady := nodeClass.StatusConditions().Get(status.ConditionReady)
125101
if nodeClassReady.IsFalse() {
@@ -169,12 +145,17 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
169145
if len(instanceTypes) == 0 {
170146
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
171147
}
148+
149+
return c.createVMInstance(ctx, nodeClass, nodeClaim, instanceTypes)
150+
}
151+
152+
func (c *CloudProvider) createVMInstance(ctx context.Context, nodeClass *v1beta1.AKSNodeClass, nodeClaim *karpv1.NodeClaim, instanceTypes []*cloudprovider.InstanceType) (*karpv1.NodeClaim, error) {
172153
vmPromise, err := c.vmInstanceProvider.BeginCreate(ctx, nodeClass, nodeClaim, instanceTypes)
173154
if err != nil {
174155
return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
175156
}
176157

177-
if err = c.handleVMInstanceCreation(ctx, vmPromise, nodeClaim); err != nil {
158+
if err := c.handleInstancePromise(ctx, vmPromise, nodeClaim); err != nil {
178159
return nil, err
179160
}
180161

@@ -183,37 +164,84 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
183164
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
184165
return i.Name == string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))
185166
})
186-
187-
nc, err := c.vmInstanceToNodeClaim(ctx, vm, instanceType)
167+
newNodeClaim, err := c.vmInstanceToNodeClaim(ctx, vm, instanceType)
188168
if err != nil {
189169
return nil, err
190170
}
191-
if err := setAdditionalAnnotationsForNewNodeClaim(ctx, nc, nodeClass); err != nil {
171+
if err := setAdditionalAnnotationsForNewNodeClaim(ctx, newNodeClaim, nodeClass); err != nil {
192172
return nil, err
193173
}
194-
return nc, nil
174+
return newNodeClaim, nil
195175
}
196176

197-
func (c *CloudProvider) waitOnPromise(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) {
198-
defer func() {
199-
if r := recover(); r != nil {
200-
err := fmt.Errorf("%v", r)
201-
log.FromContext(ctx).Error(err, "panic during waitOnPromise")
177+
// handleInstancePromise handles the instance promise, primarily deciding on sync/async provisioning.
178+
func (c *CloudProvider) handleInstancePromise(ctx context.Context, instancePromise instance.InstancePromise, nodeClaim *karpv1.NodeClaim) error {
179+
if isNodeClaimStandalone(nodeClaim) {
180+
// Standalone NodeClaims aren't re-queued for reconciliation in the provision_trigger controller,
181+
// so we delete them synchronously. After marking Launched=true,
182+
// their status can't be reverted to false once the delete completes due to how core caches nodeclaims in
183+
// the launch controller. This ensures we retry continuously until we hit the registration TTL
184+
// return c.waitOnInstancePromise(ctx, instancePromise, nodeClaim, true)
185+
err := instancePromise.Wait()
186+
if err != nil {
187+
c.handleInstancePromiseWaitError(ctx, instancePromise, nodeClaim, err)
188+
return cloudprovider.NewCreateError(fmt.Errorf("creating standalone instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
202189
}
203-
}()
190+
}
191+
// For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
192+
// Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
193+
// crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
194+
// no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
195+
// be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
196+
// only held in memory.
197+
// go c.waitOnInstancePromise(ctx, instancePromise, nodeClaim, false)
198+
go func() {
199+
defer func() {
200+
if r := recover(); r != nil {
201+
err := fmt.Errorf("%v", r)
202+
log.FromContext(ctx).Error(err, "panic during waiting on instance promise")
203+
}
204+
}()
204205

205-
err := vmPromise.Wait()
206+
err := instancePromise.Wait()
206207

207-
// Wait until the claim is Launched, to avoid racing with creation.
208-
// This isn't strictly required, but without this, failure test scenarios are harder
209-
// to write because the nodeClaim gets deleted by error handling below before
210-
// the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
211-
// it would actually happen in production).
212-
c.waitUntilLaunched(ctx, nodeClaim)
208+
// Wait until the claim is Launched, to avoid racing with creation.
209+
// This isn't strictly required, but without this, failure test scenarios are harder
210+
// to write because the nodeClaim gets deleted by error handling below before
211+
// the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
212+
// it would actually happen in production).
213+
c.waitUntilLaunched(ctx, nodeClaim)
213214

214-
if err != nil {
215-
_ = c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, true)
216-
return
215+
if err != nil {
216+
c.handleInstancePromiseWaitError(ctx, instancePromise, nodeClaim, err)
217+
218+
// For async provisioning, also delete the NodeClaim
219+
if deleteErr := c.kubeClient.Delete(ctx, nodeClaim); deleteErr != nil {
220+
deleteErr = client.IgnoreNotFound(deleteErr)
221+
if deleteErr != nil {
222+
log.FromContext(ctx).Error(deleteErr, "failed to delete nodeclaim, will wait for liveness TTL", "NodeClaim", nodeClaim.Name)
223+
}
224+
}
225+
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
226+
metrics.ReasonLabel: "async_provisioning",
227+
metrics.NodePoolLabel: nodeClaim.Labels[karpv1.NodePoolLabelKey],
228+
metrics.CapacityTypeLabel: nodeClaim.Labels[karpv1.CapacityTypeLabelKey],
229+
})
230+
}
231+
}()
232+
return nil
233+
}
234+
235+
func (c *CloudProvider) handleInstancePromiseWaitError(ctx context.Context, instancePromise instance.InstancePromise, nodeClaim *karpv1.NodeClaim, waitErr error) {
236+
c.recorder.Publish(cloudproviderevents.NodeClaimFailedToRegister(nodeClaim, waitErr))
237+
log.FromContext(ctx).Error(waitErr, "failed launching nodeclaim")
238+
239+
cleanUpError := instancePromise.Cleanup(ctx)
240+
if cleanUpError != nil {
241+
// Fallback to garbage collection to clean up the instance, if it survived.
242+
if cloudprovider.IgnoreNodeClaimNotFoundError(cleanUpError) != nil {
243+
log.FromContext(ctx).Error(cleanUpError, "failed to delete instance", "instanceName", instancePromise.GetInstanceName())
244+
}
217245
}
218246
}
219247

@@ -240,41 +268,6 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
240268
}
241269
}
242270

243-
// handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
244-
func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err error, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim, removeNodeClaim bool) error {
245-
c.recorder.Publish(cloudproviderevents.NodeClaimFailedToRegister(nodeClaim, err))
246-
log.FromContext(ctx).Error(err, "failed launching nodeclaim")
247-
248-
// Clean up the VM
249-
// TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
250-
vmName := lo.FromPtr(vmPromise.VM.Name)
251-
deleteErr := c.vmInstanceProvider.Delete(ctx, vmName)
252-
if cloudprovider.IgnoreNodeClaimNotFoundError(deleteErr) != nil {
253-
log.FromContext(ctx).Error(deleteErr, "failed to delete VM", "vmName", vmName)
254-
}
255-
256-
// For async provisioning, also delete the NodeClaim
257-
if removeNodeClaim {
258-
if deleteErr := c.kubeClient.Delete(ctx, nodeClaim); deleteErr != nil {
259-
deleteErr = client.IgnoreNotFound(deleteErr)
260-
if deleteErr != nil {
261-
log.FromContext(ctx).Error(deleteErr, "failed to delete nodeclaim, will wait for liveness TTL", "NodeClaim", nodeClaim.Name)
262-
}
263-
}
264-
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
265-
metrics.ReasonLabel: "async_provisioning",
266-
metrics.NodePoolLabel: nodeClaim.Labels[karpv1.NodePoolLabelKey],
267-
metrics.CapacityTypeLabel: nodeClaim.Labels[karpv1.CapacityTypeLabelKey],
268-
})
269-
}
270-
271-
// For standalone node claims, return a CreateError
272-
if !removeNodeClaim {
273-
return cloudprovider.NewCreateError(fmt.Errorf("creating instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
274-
}
275-
return nil
276-
}
277-
278271
func (c *CloudProvider) List(ctx context.Context) ([]*karpv1.NodeClaim, error) {
279272
vmInstances, err := c.vmInstanceProvider.List(ctx)
280273
if err != nil {
@@ -508,7 +501,7 @@ func GetNodeClaimNameFromVMName(vmName string) string {
508501

509502
const truncateAt = 1200
510503

511-
func (c *CloudProvider) isStandaloneNodeClaim(nodeClaim *karpv1.NodeClaim) bool {
504+
func isNodeClaimStandalone(nodeClaim *karpv1.NodeClaim) bool {
512505
// NodeClaims without the nodepool label are considered standalone
513506
_, hasNodePoolLabel := nodeClaim.Labels[karpv1.NodePoolLabelKey]
514507
return !hasNodePoolLabel
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
Portions Copyright (c) Microsoft Corporation.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package instance
18+
19+
import (
20+
"context"
21+
)
22+
23+
// InstancePromise represents an in-flight instance provisioning operation.
// It is intended for lifecycle handling at the higher abstraction layers.
type InstancePromise interface {
	// Cleanup removes the instance from the cloud provider.
	Cleanup(ctx context.Context) error
	// Wait blocks until the instance is ready.
	Wait() error
	// GetInstanceName returns the name of the instance. Recommended to be used
	// for logging only, due to the generic nature of this interface.
	GetInstanceName() string
}

pkg/providers/instance/vminstance.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,24 @@ func GetManagedExtensionNames(provisionMode string) []string {
9696
type Resource = map[string]interface{}
9797

9898
type VirtualMachinePromise struct {
99-
VM *armcompute.VirtualMachine
100-
Wait func() error
99+
VM *armcompute.VirtualMachine
100+
WaitFunc func() error
101+
102+
providerRef VMProvider
103+
}
104+
105+
func (p *VirtualMachinePromise) Cleanup(ctx context.Context) error {
106+
// This won't clean up leaked NICs if the VM doesn't exist... intentional?
107+
// From Delete(): "Leftover network interfaces (if any) will be cleaned by by GC controller"
108+
// Still, we could try to DeleteNic()?
109+
return p.providerRef.Delete(ctx, lo.FromPtr(p.VM.Name))
110+
}
111+
112+
func (p *VirtualMachinePromise) Wait() error {
113+
return p.WaitFunc()
114+
}
115+
func (p *VirtualMachinePromise) GetInstanceName() string {
116+
return lo.FromPtr(p.VM.Name)
101117
}
102118

103119
type VMProvider interface {
@@ -723,7 +739,8 @@ func (p *DefaultVMProvider) beginLaunchInstance(
723739
result.VM.Properties.TimeCreated = lo.ToPtr(time.Now())
724740

725741
return &VirtualMachinePromise{
726-
Wait: func() error {
742+
providerRef: p,
743+
WaitFunc: func() error {
727744
if result.Poller == nil {
728745
// Poller is nil means the VM existed already and we're done.
729746
// TODO: if the VM doesn't have extensions this will still happen and we will have to

0 commit comments

Comments
 (0)