@@ -96,30 +96,6 @@ func New(
9696 }
9797}
9898
99- // Create a node given the constraints.
100- func (c * CloudProvider ) handleVMInstanceCreation (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) error {
101- if c .isStandaloneNodeClaim (nodeClaim ) {
102- // processStandaloneNodeClaimDeletion:
103- // Standalone NodeClaims aren’t re-queued for reconciliation in the provision_trigger controller,
104- // so we delete them synchronously. After marking Launched=true,
105- // their status can’t be reverted to false once the delete completes due to how core caches nodeclaims in
106- // the lanch controller. This ensures we retry continuously until we hit the registration TTL
107- err := vmPromise .Wait ()
108- if err != nil {
109- return c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , false )
110- }
111- } else {
112- // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
113- // Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
114- // crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
115- // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
116- // be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
117- // only held in memory.
118- go c .waitOnPromise (ctx , vmPromise , nodeClaim )
119- }
120- return nil
121- }
122-
12399func (c * CloudProvider ) validateNodeClass (nodeClass * v1beta1.AKSNodeClass ) error {
124100 nodeClassReady := nodeClass .StatusConditions ().Get (status .ConditionReady )
125101 if nodeClassReady .IsFalse () {
@@ -169,12 +145,17 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
169145 if len (instanceTypes ) == 0 {
170146 return nil , cloudprovider .NewInsufficientCapacityError (fmt .Errorf ("all requested instance types were unavailable during launch" ))
171147 }
148+
149+ return c .createVMInstance (ctx , nodeClass , nodeClaim , instanceTypes )
150+ }
151+
152+ func (c * CloudProvider ) createVMInstance (ctx context.Context , nodeClass * v1beta1.AKSNodeClass , nodeClaim * karpv1.NodeClaim , instanceTypes []* cloudprovider.InstanceType ) (* karpv1.NodeClaim , error ) {
172153 vmPromise , err := c .vmInstanceProvider .BeginCreate (ctx , nodeClass , nodeClaim , instanceTypes )
173154 if err != nil {
174155 return nil , cloudprovider .NewCreateError (fmt .Errorf ("creating instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
175156 }
176157
177- if err = c .handleVMInstanceCreation (ctx , vmPromise , nodeClaim ); err != nil {
158+ if err : = c .handleInstancePromise (ctx , vmPromise , nodeClaim ); err != nil {
178159 return nil , err
179160 }
180161
@@ -183,37 +164,82 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
183164 instanceType , _ := lo .Find (instanceTypes , func (i * cloudprovider.InstanceType ) bool {
184165 return i .Name == string (lo .FromPtr (vm .Properties .HardwareProfile .VMSize ))
185166 })
186-
187- nc , err := c .vmInstanceToNodeClaim (ctx , vm , instanceType )
167+ newNodeClaim , err := c .vmInstanceToNodeClaim (ctx , vm , instanceType )
188168 if err != nil {
189169 return nil , err
190170 }
191- if err := setAdditionalAnnotationsForNewNodeClaim (ctx , nc , nodeClass ); err != nil {
171+ if err := setAdditionalAnnotationsForNewNodeClaim (ctx , newNodeClaim , nodeClass ); err != nil {
192172 return nil , err
193173 }
194- return nc , nil
174+ return newNodeClaim , nil
195175}
196176
197- func (c * CloudProvider ) waitOnPromise (ctx context.Context , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim ) {
198- defer func () {
199- if r := recover (); r != nil {
200- err := fmt .Errorf ("%v" , r )
201- log .FromContext (ctx ).Error (err , "panic during waitOnPromise" )
177+ // handleInstancePromise handles the instance promise, primarily deciding on sync/async provisioning.
178+ func (c * CloudProvider ) handleInstancePromise (ctx context.Context , instancePromise instance.Promise , nodeClaim * karpv1.NodeClaim ) error {
179+ if isNodeClaimStandalone (nodeClaim ) {
180+ // Standalone NodeClaims aren't re-queued for reconciliation in the provision_trigger controller,
181+ // so we delete them synchronously. After marking Launched=true,
182+ // their status can't be reverted to false once the delete completes due to how core caches nodeclaims in
183+ // the launch controller. This ensures we retry continuously until we hit the registration TTL
184+ err := instancePromise .Wait ()
185+ if err != nil {
186+ c .handleInstancePromiseWaitError (ctx , instancePromise , nodeClaim , err )
187+ return cloudprovider .NewCreateError (fmt .Errorf ("creating standalone instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
202188 }
203- }()
189+ }
190+ // For NodePool-managed nodeclaims, launch a single goroutine to poll the returned promise.
191+ // Note that we could store the LRO details on the NodeClaim, but we don't bother today because Karpenter
192+ // crashes should be rare, and even in the case of a crash, as long as the node comes up successfully there's
193+ // no issue. If the node doesn't come up successfully in that case, the node and the linked claim will
194+ // be garbage collected after the TTL, but the cause of the nodes issue will be lost, as the LRO URL was
195+ // only held in memory.
196+ go func () {
197+ defer func () {
198+ if r := recover (); r != nil {
199+ err := fmt .Errorf ("%v" , r )
200+ log .FromContext (ctx ).Error (err , "panic during waiting on instance promise" )
201+ }
202+ }()
204203
205- err := vmPromise .Wait ()
204+ err := instancePromise .Wait ()
206205
207- // Wait until the claim is Launched, to avoid racing with creation.
208- // This isn't strictly required, but without this, failure test scenarios are harder
209- // to write because the nodeClaim gets deleted by error handling below before
210- // the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
211- // it would actually happen in production).
212- c .waitUntilLaunched (ctx , nodeClaim )
206+ // Wait until the claim is Launched, to avoid racing with creation.
207+ // This isn't strictly required, but without this, failure test scenarios are harder
208+ // to write because the nodeClaim gets deleted by error handling below before
209+ // the EnsureApplied call finishes, so EnsureApplied creates it again (which is wrong/isn't how
210+ // it would actually happen in production).
211+ c .waitUntilLaunched (ctx , nodeClaim )
213212
214- if err != nil {
215- _ = c .handleNodeClaimCreationError (ctx , err , vmPromise , nodeClaim , true )
216- return
213+ if err != nil {
214+ c .handleInstancePromiseWaitError (ctx , instancePromise , nodeClaim , err )
215+
216+ // For async provisioning, also delete the NodeClaim
217+ if deleteErr := c .kubeClient .Delete (ctx , nodeClaim ); deleteErr != nil {
218+ deleteErr = client .IgnoreNotFound (deleteErr )
219+ if deleteErr != nil {
220+ log .FromContext (ctx ).Error (deleteErr , "failed to delete nodeclaim, will wait for liveness TTL" , "NodeClaim" , nodeClaim .Name )
221+ }
222+ }
223+ metrics .NodeClaimsDisruptedTotal .Inc (map [string ]string {
224+ metrics .ReasonLabel : "async_provisioning" ,
225+ metrics .NodePoolLabel : nodeClaim .Labels [karpv1 .NodePoolLabelKey ],
226+ metrics .CapacityTypeLabel : nodeClaim .Labels [karpv1 .CapacityTypeLabelKey ],
227+ })
228+ }
229+ }()
230+ return nil
231+ }
232+
233+ func (c * CloudProvider ) handleInstancePromiseWaitError (ctx context.Context , instancePromise instance.Promise , nodeClaim * karpv1.NodeClaim , waitErr error ) {
234+ c .recorder .Publish (cloudproviderevents .NodeClaimFailedToRegister (nodeClaim , waitErr ))
235+ log .FromContext (ctx ).Error (waitErr , "failed launching nodeclaim" )
236+
237+ cleanUpError := instancePromise .Cleanup (ctx )
238+ if cleanUpError != nil {
239+ // Fallback to garbage collection to clean up the instance, if it survived.
240+ if cloudprovider .IgnoreNodeClaimNotFoundError (cleanUpError ) != nil {
241+ log .FromContext (ctx ).Error (cleanUpError , "failed to delete instance" , "instanceName" , instancePromise .GetInstanceName ())
242+ }
217243 }
218244}
219245
@@ -240,41 +266,6 @@ func (c *CloudProvider) waitUntilLaunched(ctx context.Context, nodeClaim *karpv1
240266 }
241267}
242268
243- // handleNodeClaimCreationError handles common error processing for both standalone and async node claim creation failures
244- func (c * CloudProvider ) handleNodeClaimCreationError (ctx context.Context , err error , vmPromise * instance.VirtualMachinePromise , nodeClaim * karpv1.NodeClaim , removeNodeClaim bool ) error {
245- c .recorder .Publish (cloudproviderevents .NodeClaimFailedToRegister (nodeClaim , err ))
246- log .FromContext (ctx ).Error (err , "failed launching nodeclaim" )
247-
248- // Clean up the VM
249- // TODO: This won't clean up leaked NICs if the VM doesn't exist... intentional?
250- vmName := lo .FromPtr (vmPromise .VM .Name )
251- deleteErr := c .vmInstanceProvider .Delete (ctx , vmName )
252- if cloudprovider .IgnoreNodeClaimNotFoundError (deleteErr ) != nil {
253- log .FromContext (ctx ).Error (deleteErr , "failed to delete VM" , "vmName" , vmName )
254- }
255-
256- // For async provisioning, also delete the NodeClaim
257- if removeNodeClaim {
258- if deleteErr := c .kubeClient .Delete (ctx , nodeClaim ); deleteErr != nil {
259- deleteErr = client .IgnoreNotFound (deleteErr )
260- if deleteErr != nil {
261- log .FromContext (ctx ).Error (deleteErr , "failed to delete nodeclaim, will wait for liveness TTL" , "NodeClaim" , nodeClaim .Name )
262- }
263- }
264- metrics .NodeClaimsDisruptedTotal .Inc (map [string ]string {
265- metrics .ReasonLabel : "async_provisioning" ,
266- metrics .NodePoolLabel : nodeClaim .Labels [karpv1 .NodePoolLabelKey ],
267- metrics .CapacityTypeLabel : nodeClaim .Labels [karpv1 .CapacityTypeLabelKey ],
268- })
269- }
270-
271- // For standalone node claims, return a CreateError
272- if ! removeNodeClaim {
273- return cloudprovider .NewCreateError (fmt .Errorf ("creating instance failed, %w" , err ), CreateInstanceFailedReason , truncateMessage (err .Error ()))
274- }
275- return nil
276- }
277-
278269func (c * CloudProvider ) List (ctx context.Context ) ([]* karpv1.NodeClaim , error ) {
279270 vmInstances , err := c .vmInstanceProvider .List (ctx )
280271 if err != nil {
@@ -508,7 +499,7 @@ func GetNodeClaimNameFromVMName(vmName string) string {
508499
509500const truncateAt = 1200
510501
511- func ( c * CloudProvider ) isStandaloneNodeClaim (nodeClaim * karpv1.NodeClaim ) bool {
502+ func isNodeClaimStandalone (nodeClaim * karpv1.NodeClaim ) bool {
512503 // NodeClaims without the nodepool label are considered standalone
513504 _ , hasNodePoolLabel := nodeClaim .Labels [karpv1 .NodePoolLabelKey ]
514505 return ! hasNodePoolLabel
0 commit comments