Skip to content

Commit ce93679

Browse files
authored
refactor: more instance-generic async provisioning (#1165)
1 parent 1f327c6 commit ce93679

File tree

3 files changed

+123
-84
lines changed

3 files changed

+123
-84
lines changed

pkg/cloudprovider/cloudprovider.go

Lines changed: 72 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -96,30 +96,6 @@ func New(
9696
}
9797
}
9898

99-
// Create a node given the constraints.
100-
func (c *CloudProvider) handleVMInstanceCreation(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) error {
101-
if c.isStandaloneNodeClaim(nodeClaim) {
102-
// processStandaloneNodeClaimDeletion:
103-
// Standalone NodeClaims aren’t re-queued for reconciliation in the provision_trigger controller,
104-
// so we delete them synchronously. After marking Launched=true,
105-
// their status can’t be reverted to false once the delete completes due to how core caches nodeclaims in
106-
// the launch controller. This ensures we retry continuously until we hit the registration TTL
107-
err := vmPromise.Wait()
108-
if err != nil {
109-
return c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, false)
110-
}
111-
} else {
112-
// For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
113-
// Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
114-
// crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
115-
// no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
116-
// be garbage collected after the TTL, but the cause of the node's issue will be lost, as the LRO URL was
117-
// only held in memory.
118-
go c.waitOnPromise(ctx, vmPromise, nodeClaim)
119-
}
120-
return nil
121-
}
122-
12399
func (c *CloudProvider) validateNodeClass(nodeClass *v1beta1.AKSNodeClass) error {
124100
nodeClassReady := nodeClass.StatusConditions().Get(status.ConditionReady)
125101
if nodeClassReady.IsFalse() {
@@ -169,12 +145,17 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
169145
if len(instanceTypes) == 0 {
170146
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
171147
}
148+
149+
return c.createVMInstance(ctx, nodeClass, nodeClaim, instanceTypes)
150+
}
151+
152+
func (c *CloudProvider) createVMInstance(ctx context.Context, nodeClass *v1beta1.AKSNodeClass, nodeClaim *karpv1.NodeClaim, instanceTypes []*cloudprovider.InstanceType) (*karpv1.NodeClaim, error) {
172153
vmPromise, err := c.vmInstanceProvider.BeginCreate(ctx, nodeClass, nodeClaim, instanceTypes)
173154
if err != nil {
174155
return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
175156
}
176157

177-
if err = c.handleVMInstanceCreation(ctx, vmPromise, nodeClaim); err != nil {
158+
if err := c.handleInstancePromise(ctx, vmPromise, nodeClaim); err != nil {
178159
return nil, err
179160
}
180161

@@ -183,37 +164,82 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
183164
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
184165
return i.Name == string(lo.FromPtr(vm.Properties.HardwareProfile.VMSize))
185166
})
186-
187-
nc, err := c.vmInstanceToNodeClaim(ctx, vm, instanceType)
167+
newNodeClaim, err := c.vmInstanceToNodeClaim(ctx, vm, instanceType)
188168
if err != nil {
189169
return nil, err
190170
}
191-
if err := setAdditionalAnnotationsForNewNodeClaim(ctx, nc, nodeClass); err != nil {
171+
if err := setAdditionalAnnotationsForNewNodeClaim(ctx, newNodeClaim, nodeClass); err != nil {
192172
return nil, err
193173
}
194-
return nc, nil
174+
return newNodeClaim, nil
195175
}
196176

197-
func (c *CloudProvider) waitOnPromise(ctx context.Context, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim) {
198-
defer func() {
199-
if r := recover(); r != nil {
200-
err := fmt.Errorf("%v", r)
201-
log.FromContext(ctx).Error(err, "panic during waitOnPromise")
177+
// handleInstancePromise handles the instance promise, primarily deciding on sync/async provisioning.
178+
func (c *CloudProvider) handleInstancePromise(ctx context.Context, instancePromise instance.Promise, nodeClaim *karpv1.NodeClaim) error {
179+
if isNodeClaimStandalone(nodeClaim) {
180+
// Standalone NodeClaims aren't re-queued for reconciliation in the provision_trigger controller,
181+
// so we delete them synchronously. After marking Launched=true,
182+
// their status can't be reverted to false once the delete completes due to how core caches nodeclaims in
183+
// the launch controller. This ensures we retry continuously until we hit the registration TTL
184+
err := instancePromise.Wait()
185+
if err != nil {
186+
c.handleInstancePromiseWaitError(ctx, instancePromise, nodeClaim, err)
187+
return cloudprovider.NewCreateError(fmt.Errorf("creating standalone instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
202188
}
203-
}()
189+
}
190+
// For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
191+
// Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
192+
// crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
193+
// no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
194+
// be garbage collected after the TTL, but the cause of the node's issue will be lost, as the LRO URL was
195+
// only held in memory.
196+
go func() {
197+
defer func() {
198+
if r := recover(); r != nil {
199+
err := fmt.Errorf("%v", r)
200+
log.FromContext(ctx).Error(err, "panic during waiting on instance promise")
201+
}
202+
}()
204203

205-
err := vmPromise.Wait()
204+
err := instancePromise.Wait()
206205

207-
// Wait until the claim is Launched, to avoid racing with creation.
208-
// This isn't strictly required, but without this, failure test scenarios are harder
209-
// to write because the nodeClaim gets deleted by error handling below before
210-
// the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
211-
// it would actually happen in production).
212-
c.waitUntilLaunched(ctx, nodeClaim)
206+
// Wait until the claim is Launched, to avoid racing with creation.
207+
// This isn't strictly required, but without this, failure test scenarios are harder
208+
// to write because the nodeClaim gets deleted by error handling below before
209+
// the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
210+
// it would actually happen in production).
211+
c.waitUntilLaunched(ctx, nodeClaim)
213212

214-
if err != nil {
215-
_ = c.handleNodeClaimCreationError(ctx, err, vmPromise, nodeClaim, true)
216-
return
213+
if err != nil {
214+
c.handleInstancePromiseWaitError(ctx, instancePromise, nodeClaim, err)
215+
216+
// For async provisioning, also delete the NodeClaim
217+
if deleteErr := c.kubeClient.Delete(ctx, nodeClaim); deleteErr != nil {
218+
deleteErr = client.IgnoreNotFound(deleteErr)
219+
if deleteErr != nil {
220+
log.FromContext(ctx).Error(deleteErr, "failed to delete nodeclaim, will wait for liveness TTL", "NodeClaim", nodeClaim.Name)
221+
}
222+
}
223+
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
224+
metrics.ReasonLabel: "async_provisioning",
225+
metrics.NodePoolLabel: nodeClaim.Labels[karpv1.NodePoolLabelKey],
226+
metrics.CapacityTypeLabel: nodeClaim.Labels[karpv1.CapacityTypeLabelKey],
227+
})
228+
}
229+
}()
230+
return nil
231+
}
232+
233+
func (c *CloudProvider) handleInstancePromiseWaitError(ctx context.Context, instancePromise instance.Promise, nodeClaim *karpv1.NodeClaim, waitErr error) {
234+
c.recorder.Publish(cloudproviderevents.NodeClaimFailedToRegister(nodeClaim, waitErr))
235+
log.FromContext(ctx).Error(waitErr, "failed launching nodeclaim")
236+
237+
cleanUpError := instancePromise.Cleanup(ctx)
238+
if cleanUpError != nil {
239+
// Fallback to garbage collection to clean up the instance, if it survived.
240+
if cloudprovider.IgnoreNodeClaimNotFoundError(cleanUpError) != nil {
241+
log.FromContext(ctx).Error(cleanUpError, "failed to delete instance", "instanceName", instancePromise.GetInstanceName())
242+
}
217243
}
218244
}
219245

@@ -240,41 +266,6 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
240266
}
241267
}
242268

243-
// handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
244-
func (c *CloudProvider) handleNodeClaimCreationError(ctx context.Context, err error, vmPromise *instance.VirtualMachinePromise, nodeClaim *karpv1.NodeClaim, removeNodeClaim bool) error {
245-
c.recorder.Publish(cloudproviderevents.NodeClaimFailedToRegister(nodeClaim, err))
246-
log.FromContext(ctx).Error(err, "failed launching nodeclaim")
247-
248-
// Clean up the VM
249-
// TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
250-
vmName := lo.FromPtr(vmPromise.VM.Name)
251-
deleteErr := c.vmInstanceProvider.Delete(ctx, vmName)
252-
if cloudprovider.IgnoreNodeClaimNotFoundError(deleteErr) != nil {
253-
log.FromContext(ctx).Error(deleteErr, "failed to delete VM", "vmName", vmName)
254-
}
255-
256-
// For async provisioning, also delete the NodeClaim
257-
if removeNodeClaim {
258-
if deleteErr := c.kubeClient.Delete(ctx, nodeClaim); deleteErr != nil {
259-
deleteErr = client.IgnoreNotFound(deleteErr)
260-
if deleteErr != nil {
261-
log.FromContext(ctx).Error(deleteErr, "failed to delete nodeclaim, will wait for liveness TTL", "NodeClaim", nodeClaim.Name)
262-
}
263-
}
264-
metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{
265-
metrics.ReasonLabel: "async_provisioning",
266-
metrics.NodePoolLabel: nodeClaim.Labels[karpv1.NodePoolLabelKey],
267-
metrics.CapacityTypeLabel: nodeClaim.Labels[karpv1.CapacityTypeLabelKey],
268-
})
269-
}
270-
271-
// For standalone node claims, return a CreateError
272-
if !removeNodeClaim {
273-
return cloudprovider.NewCreateError(fmt.Errorf("creating instance failed, %w", err), CreateInstanceFailedReason, truncateMessage(err.Error()))
274-
}
275-
return nil
276-
}
277-
278269
func (c *CloudProvider) List(ctx context.Context) ([]*karpv1.NodeClaim, error) {
279270
vmInstances, err := c.vmInstanceProvider.List(ctx)
280271
if err != nil {
@@ -508,7 +499,7 @@ func GetNodeClaimNameFromVMName(vmName string) string {
508499

509500
const truncateAt = 1200
510501

511-
func (c *CloudProvider) isStandaloneNodeClaim(nodeClaim *karpv1.NodeClaim) bool {
502+
func isNodeClaimStandalone(nodeClaim *karpv1.NodeClaim) bool {
512503
// NodeClaims without the nodepool label are considered standalone
513504
_, hasNodePoolLabel := nodeClaim.Labels[karpv1.NodePoolLabelKey]
514505
return !hasNodePoolLabel
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
Portions Copyright (c) Microsoft Corporation.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package instance
18+
19+
import (
20+
"context"
21+
)
22+
23+
// Intended for lifecycle handling on the higher abstractions.
24+
type Promise interface {
25+
// Cleanup removes the instance from the cloud provider.
26+
Cleanup(ctx context.Context) error
27+
// Wait blocks until the instance is ready.
28+
Wait() error
29+
// GetInstanceName returns the name of the instance. Recommended to be used for logging only due to generic nature.
30+
GetInstanceName() string
31+
}

pkg/providers/instance/vminstance.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,24 @@ func GetManagedExtensionNames(provisionMode string) []string {
9696
type Resource = map[string]interface{}
9797

9898
type VirtualMachinePromise struct {
99-
VM *armcompute.VirtualMachine
100-
Wait func() error
99+
VM *armcompute.VirtualMachine
100+
WaitFunc func() error
101+
102+
providerRef VMProvider
103+
}
104+
105+
func (p *VirtualMachinePromise) Cleanup(ctx context.Context) error {
106+
// This won't clean up leaked NICs if the VM doesn't exist... intentional?
107+
// From Delete(): "Leftover network interfaces (if any) will be cleaned up by GC controller"
108+
// Still, we could try to DeleteNic()?
109+
return p.providerRef.Delete(ctx, lo.FromPtr(p.VM.Name))
110+
}
111+
112+
func (p *VirtualMachinePromise) Wait() error {
113+
return p.WaitFunc()
114+
}
115+
func (p *VirtualMachinePromise) GetInstanceName() string {
116+
return lo.FromPtr(p.VM.Name)
101117
}
102118

103119
type VMProvider interface {
@@ -750,7 +766,8 @@ func (p *DefaultVMProvider) beginLaunchInstance(
750766
result.VM.Properties.TimeCreated = lo.ToPtr(time.Now())
751767

752768
return &VirtualMachinePromise{
753-
Wait: func() error {
769+
providerRef: p,
770+
WaitFunc: func() error {
754771
if result.Poller == nil {
755772
// Poller is nil means the VM existed already and we're done.
756773
// TODO: if the VM doesn't have extensions this will still happen and we will have to

0 commit comments

Comments
 (0)