@@ -96,30 +96,6 @@ func New(
9696 }
9797}
9898
99- // Create a node given the constraints.
100- func (c * CloudProvider ) handleVMInstanceCreation (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) error {
101- if c .isStandaloneNodeClaim (nodeClaim ) {
102- // processStandaloneNodeClaimDeletion:
103- // Standalone NodeClaims aren’t re-queued for reconciliation in the provision_trigger controller,
104- // so we delete them synchronously. After marking Launched=true,
105- // their status can’t be reverted to false once the delete completes due to how core caches nodeclaims in
106- // the lanch controller. This ensures we retry continuously until we hit the registration TTL
107- err := vmPromise .Wait ()
108- if err != nil {
109- return c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , false )
110- }
111- } else {
112- // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
113- // Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
114- // crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
115- // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
116- // be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
117- // only held in memory.
118- go c .waitOnPromise (ctx , vmPromise , nodeClaim )
119- }
120- return nil
121- }
122-
12399func (c * CloudProvider ) validateNodeClass (nodeClass * v1beta1.AKSNodeClass ) error {
124100 nodeClassReady := nodeClass .StatusConditions ().Get (status .ConditionReady )
125101 if nodeClassReady .IsFalse () {
@@ -169,12 +145,17 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
169145 if len (instanceTypes ) == 0 {
170146 return nil , cloudprovider .NewInsufficientCapacityError (fmt .Errorf ("all requested instance types were unavailable during launch" ))
171147 }
148+
149+ return c .createVMInstance (ctx , nodeClass , nodeClaim , instanceTypes )
150+ }
151+
152+ func (c * CloudProvider ) createVMInstance (ctx context.Context , nodeClass * v1beta1.AKSNodeClass , nodeClaim * karpv1.NodeClaim , instanceTypes []* cloudprovider.InstanceType ) (* karpv1.NodeClaim , error ) {
172153 vmPromise , err := c .vmInstanceProvider .BeginCreate (ctx , nodeClass , nodeClaim , instanceTypes )
173154 if err != nil {
174155 return nil , cloudprovider .NewCreateError (fmt .Errorf ("creating instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
175156 }
176157
177- if err = c .handleVMInstanceCreation (ctx , vmPromise , nodeClaim ); err != nil {
 158+ if err := c .handleInstancePromise (ctx , vmPromise , nodeClaim ); err != nil {
178159 return nil , err
179160 }
180161
@@ -183,37 +164,84 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
183164 instanceType , _ := lo .Find (instanceTypes , func (i * cloudprovider.InstanceType ) bool {
184165 return i .Name == string (lo .FromPtr (vm .Properties .HardwareProfile .VMSize ))
185166 })
186-
187- nc , err := c .vmInstanceToNodeClaim (ctx , vm , instanceType )
167+ newNodeClaim , err := c .vmInstanceToNodeClaim (ctx , vm , instanceType )
188168 if err != nil {
189169 return nil , err
190170 }
191- if err := setAdditionalAnnotationsForNewNodeClaim (ctx , nc , nodeClass ); err != nil {
171+ if err := setAdditionalAnnotationsForNewNodeClaim (ctx , newNodeClaim , nodeClass ); err != nil {
192172 return nil , err
193173 }
194- return nc , nil
174+ return newNodeClaim , nil
195175}
196176
197- func (c * CloudProvider ) waitOnPromise (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) {
198- defer func () {
199- if r := recover (); r != nil {
200- err := fmt .Errorf ("%v" , r )
201- log .FromContext (ctx ).Error (err , "panic during waitOnPromise" )
177+ // handleInstancePromise handles the instance promise, primarily deciding on sync/async provisioning.
178+ func (c * CloudProvider ) handleInstancePromise (ctx context.Context , instancePromise instance.InstancePromise , nodeClaim * karpv1.NodeClaim ) error {
179+ if isNodeClaimStandalone (nodeClaim ) {
180+ // Standalone NodeClaims aren't re-queued for reconciliation in the provision_trigger controller,
181+ // so we delete them synchronously. After marking Launched=true,
182+ // their status can't be reverted to false once the delete completes due to how core caches nodeclaims in
183+ // the launch controller. This ensures we retry continuously until we hit the registration TTL
184+ // return c.waitOnInstancePromise(ctx, instancePromise, nodeClaim, true)
185+ err := instancePromise .Wait ()
186+ if err != nil {
187+ c .handleInstancePromiseWaitError (ctx , instancePromise , nodeClaim , err )
188+ return cloudprovider .NewCreateError (fmt .Errorf ("creating standalone instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
202189 }
203- }()
190+ }
191+ // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
192+ // Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
193+ // crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
194+ // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
195+ // be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
196+ // only held in memory.
197+ // go c.waitOnInstancePromise(ctx, instancePromise, nodeClaim, false)
198+ go func () {
199+ defer func () {
200+ if r := recover (); r != nil {
201+ err := fmt .Errorf ("%v" , r )
202+ log .FromContext (ctx ).Error (err , "panic during waiting on instance promise" )
203+ }
204+ }()
204205
205- err := vmPromise .Wait ()
206+ err := instancePromise .Wait ()
206207
207- // Wait until the claim is Launched, to avoid racing with creation.
208- // This isn't strictly required, but without this, failure test scenarios are harder
209- // to write because the nodeClaim gets deleted by error handling below before
210- // the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
211- // it would actually happen in production).
212- c .waitUntilLaunched (ctx , nodeClaim )
208+ // Wait until the claim is Launched, to avoid racing with creation.
209+ // This isn't strictly required, but without this, failure test scenarios are harder
210+ // to write because the nodeClaim gets deleted by error handling below before
211+ // the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
212+ // it would actually happen in production).
213+ c .waitUntilLaunched (ctx , nodeClaim )
213214
214- if err != nil {
215- _ = c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , true )
216- return
215+ if err != nil {
216+ c .handleInstancePromiseWaitError (ctx , instancePromise , nodeClaim , err )
217+
218+ // For async provisioning, also delete the NodeClaim
219+ if deleteErr := c .kubeClient .Delete (ctx , nodeClaim ); deleteErr != nil {
220+ deleteErr = client .IgnoreNotFound (deleteErr )
221+ if deleteErr != nil {
222+ log .FromContext (ctx ).Error (deleteErr , "failed to delete nodeclaim, will wait for liveness TTL" , "NodeClaim" , nodeClaim .Name )
223+ }
224+ }
225+ metrics .NodeClaimsDisruptedTotal .Inc (map [string ]string {
226+ metrics .ReasonLabel : "async_provisioning" ,
227+ metrics .NodePoolLabel : nodeClaim .Labels [karpv1 .NodePoolLabelKey ],
228+ metrics .CapacityTypeLabel : nodeClaim .Labels [karpv1 .CapacityTypeLabelKey ],
229+ })
230+ }
231+ }()
232+ return nil
233+ }
234+
235+ func (c * CloudProvider ) handleInstancePromiseWaitError (ctx context.Context , instancePromise instance.InstancePromise , nodeClaim * karpv1.NodeClaim , waitErr error ) {
236+ c .recorder .Publish (cloudproviderevents .NodeClaimFailedToRegister (nodeClaim , waitErr ))
237+ log .FromContext (ctx ).Error (waitErr , "failed launching nodeclaim" )
238+
239+ cleanUpError := instancePromise .Cleanup (ctx )
240+ if cleanUpError != nil {
241+ // Fallback to garbage collection to clean up the instance, if it survived.
242+ if cloudprovider .IgnoreNodeClaimNotFoundError (cleanUpError ) != nil {
243+ log .FromContext (ctx ).Error (cleanUpError , "failed to delete instance" , "instanceName" , instancePromise .GetInstanceName ())
244+ }
217245 }
218246}
219247
@@ -240,41 +268,6 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
240268 }
241269}
242270
243- // handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
244- func (c * CloudProvider ) handleNodeClaimCreationError (ctx context.Context , err error , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim , removeNodeClaim bool ) error {
245- c .recorder .Publish (cloudproviderevents .NodeClaimFailedToRegister (nodeClaim , err ))
246- log .FromContext (ctx ).Error (err , "failed launching nodeclaim" )
247-
248- // Clean up the VM
249- // TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
250- vmName := lo .FromPtr (vmPromise .VM .Name )
251- deleteErr := c .vmInstanceProvider .Delete (ctx , vmName )
252- if cloudprovider .IgnoreNodeClaimNotFoundError (deleteErr ) != nil {
253- log .FromContext (ctx ).Error (deleteErr , "failed to delete VM" , "vmName" , vmName )
254- }
255-
256- // For async provisioning, also delete the NodeClaim
257- if removeNodeClaim {
258- if deleteErr := c .kubeClient .Delete (ctx , nodeClaim ); deleteErr != nil {
259- deleteErr = client .IgnoreNotFound (deleteErr )
260- if deleteErr != nil {
261- log .FromContext (ctx ).Error (deleteErr , "failed to delete nodeclaim, will wait for liveness TTL" , "NodeClaim" , nodeClaim .Name )
262- }
263- }
264- metrics .NodeClaimsDisruptedTotal .Inc (map [string ]string {
265- metrics .ReasonLabel : "async_provisioning" ,
266- metrics .NodePoolLabel : nodeClaim .Labels [karpv1 .NodePoolLabelKey ],
267- metrics .CapacityTypeLabel : nodeClaim .Labels [karpv1 .CapacityTypeLabelKey ],
268- })
269- }
270-
271- // For standalone node claims, return a CreateError
272- if ! removeNodeClaim {
273- return cloudprovider .NewCreateError (fmt .Errorf ("creating instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
274- }
275- return nil
276- }
277-
278271func (c * CloudProvider ) List (ctx context.Context ) ([]* karpv1.NodeClaim , error ) {
279272 vmInstances , err := c .vmInstanceProvider .List (ctx )
280273 if err != nil {
@@ -508,7 +501,7 @@ func GetNodeClaimNameFromVMName(vmName string) string {
508501
509502const truncateAt = 1200
510503
511- func ( c * CloudProvider ) isStandaloneNodeClaim (nodeClaim * karpv1.NodeClaim ) bool {
504+ func isNodeClaimStandalone (nodeClaim * karpv1.NodeClaim ) bool {
512505 // NodeClaims without the nodepool label are considered standalone
513506 _ , hasNodePoolLabel := nodeClaim .Labels [karpv1 .NodePoolLabelKey ]
514507 return ! hasNodePoolLabel