Skip to content

Commit 9dda0ef

Browse files
authored
feat: AKS machine API interface and clients (#1183)
1 parent 4f5a700 commit 9dda0ef

File tree

9 files changed

+395
-1
lines changed

9 files changed

+395
-1
lines changed

pkg/operator/options/options.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ type Options struct {
9292
AdditionalTags map[string]string `json:"additionalTags,omitempty"`
9393
EnableAzureSDKLogging bool `json:"enableAzureSDKLogging,omitempty"` // Controls whether Azure SDK middleware logging is enabled
9494
DiskEncryptionSetID string `json:"diskEncryptionSetId,omitempty"`
95+
96+
// If set to true, existing AKS machines created with PROVISION_MODE=aksmachineapi will be managed even with other provision modes. This option does not have any effect if PROVISION_MODE=aksmachineapi, as it will behave as if this option is set to true.
97+
ManageExistingAKSMachines bool `json:"manageExistingAKSMachines,omitempty"`
9598
}
9699

97100
func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
@@ -117,6 +120,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
117120
fs.StringVar(&o.SIGAccessTokenServerURL, "sig-access-token-server-url", env.WithDefaultString("SIG_ACCESS_TOKEN_SERVER_URL", ""), "The URL for the SIG access token server. Only used for AKS managed karpenter. UseSIG must be set tot true for this to take effect.")
118121
fs.StringVar(&o.SIGSubscriptionID, "sig-subscription-id", env.WithDefaultString("SIG_SUBSCRIPTION_ID", ""), "The subscription ID of the shared image gallery.")
119122
fs.StringVar(&o.DiskEncryptionSetID, "node-osdisk-diskencryptionset-id", env.WithDefaultString("NODE_OSDISK_DISKENCRYPTIONSET_ID", ""), "The ARM resource ID of the disk encryption set to use for customer-managed key (BYOK) encryption.")
123+
fs.BoolVar(&o.ManageExistingAKSMachines, "manage-existing-aks-machines", env.WithDefaultBool("MANAGE_EXISTING_AKS_MACHINES", false), "If set to true, existing AKS machines created with PROVISION_MODE=aksmachineapi will be managed even with other provision modes. This option does not have any effect if PROVISION_MODE=aksmachineapi, as it will behave as if this option is set to true.")
120124

121125
additionalTagsFlag := k8sflag.NewMapStringString(&o.AdditionalTags)
122126
if err := additionalTagsFlag.Set(env.WithDefaultString("ADDITIONAL_TAGS", "")); err != nil {

pkg/operator/options/suite_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ var _ = Describe("Options", func() {
7070
"LINUX_ADMIN_USERNAME",
7171
"ADDITIONAL_TAGS",
7272
"ENABLE_AZURE_SDK_LOGGING",
73+
"MANAGE_EXISTING_AKS_MACHINES",
7374
}
7475

7576
var fs *coreoptions.FlagSet
@@ -123,6 +124,7 @@ var _ = Describe("Options", func() {
123124
os.Setenv("KUBELET_IDENTITY_CLIENT_ID", "2345678-1234-1234-1234-123456789012")
124125
os.Setenv("LINUX_ADMIN_USERNAME", "customadminusername")
125126
os.Setenv("ADDITIONAL_TAGS", "test-tag=test-value")
127+
os.Setenv("MANAGE_EXISTING_AKS_MACHINES", "true")
126128
fs = &coreoptions.FlagSet{
127129
FlagSet: flag.NewFlagSet("karpenter", flag.ContinueOnError),
128130
}
@@ -152,6 +154,7 @@ var _ = Describe("Options", func() {
152154
KubeletIdentityClientID: lo.ToPtr("2345678-1234-1234-1234-123456789012"),
153155
AdditionalTags: map[string]string{"test-tag": "test-value"},
154156
ClusterDNSServiceIP: lo.ToPtr("10.244.0.1"),
157+
ManageExistingAKSMachines: lo.ToPtr(true),
155158
})
156159
Expect(opts).To(BeComparableTo(expectedOpts, cmpopts.IgnoreUnexported(options.Options{})))
157160
})
@@ -491,6 +494,35 @@ var _ = Describe("Options", func() {
491494
)
492495
Expect(err).To(MatchError(ContainSubstring("validating options, additional-tags key \"<key1>\" contains invalid characters.")))
493496
})
497+
498+
It("should default manage-existing-aks-machines to false", func() {
499+
err := opts.Parse(
500+
fs,
501+
"--cluster-name", "my-name",
502+
"--cluster-endpoint", "https://karpenter-000000000000.hcp.westus2.staging.azmk8s.io",
503+
"--kubelet-bootstrap-token", "flag-bootstrap-token",
504+
"--ssh-public-key", "flag-ssh-public-key",
505+
"--vnet-subnet-id", "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpentervnet/subnets/karpentersub",
506+
"--node-resource-group", "my-node-rg",
507+
)
508+
Expect(err).ToNot(HaveOccurred())
509+
Expect(opts.ManageExistingAKSMachines).To(BeFalse())
510+
})
511+
512+
It("should succeed with manage-existing-aks-machines set", func() {
513+
err := opts.Parse(
514+
fs,
515+
"--cluster-name", "my-name",
516+
"--cluster-endpoint", "https://karpenter-000000000000.hcp.westus2.staging.azmk8s.io",
517+
"--kubelet-bootstrap-token", "flag-bootstrap-token",
518+
"--ssh-public-key", "flag-ssh-public-key",
519+
"--vnet-subnet-id", "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpentervnet/subnets/karpentersub",
520+
"--node-resource-group", "my-node-rg",
521+
"--manage-existing-aks-machines",
522+
)
523+
Expect(err).ToNot(HaveOccurred())
524+
Expect(opts.ManageExistingAKSMachines).To(BeTrue())
525+
})
494526
})
495527

496528
Context("Admin Username Validation", func() {
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
Portions Copyright (c) Microsoft Corporation.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package instance
18+
19+
import (
20+
"net/http"
21+
"strings"
22+
23+
sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
24+
)
25+
26+
func IsAKSMachineOrMachinesPoolNotFound(err error) bool {
27+
if err == nil {
28+
return false
29+
}
30+
azErr := sdkerrors.IsResponseError(err)
31+
if azErr != nil && (azErr.StatusCode == http.StatusNotFound || // Covers AKS machines pool not found on PUT machine, GET machine, GET (list) machines, POST agent pool (DELETE machines), and AKS machine not found on GET machine
32+
(azErr.StatusCode == http.StatusBadRequest && azErr.ErrorCode == "InvalidParameter" && strings.Contains(azErr.Error(), "Cannot find any valid machines"))) { // Covers AKS machine not found on POST agent pool (DELETE machines)
33+
return true
34+
}
35+
return false
36+
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
Portions Copyright (c) Microsoft Corporation.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package instance
18+
19+
import (
20+
"fmt"
21+
"io"
22+
"net/http"
23+
"strings"
24+
25+
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
26+
27+
. "github.com/onsi/ginkgo/v2"
28+
. "github.com/onsi/gomega"
29+
)
30+
31+
// createAzureResponseError creates a proper Azure SDK error with the given error code and message
32+
func createAzureResponseError(errorCode, errorMessage string, statusCode int) error {
33+
errorBody := fmt.Sprintf(`{"error": {"code": "%s", "message": "%s"}}`, errorCode, errorMessage)
34+
return &azcore.ResponseError{
35+
ErrorCode: errorCode,
36+
StatusCode: statusCode,
37+
RawResponse: &http.Response{
38+
Body: io.NopCloser(strings.NewReader(errorBody)),
39+
},
40+
}
41+
}
42+
43+
var _ = Describe("AKSMachineInstanceUtils Helper Functions", func() {
44+
45+
Context("IsAKSMachineOrMachinesPoolNotFound", func() {
46+
It("should return false for nil error", func() {
47+
result := IsAKSMachineOrMachinesPoolNotFound(nil)
48+
Expect(result).To(BeFalse())
49+
})
50+
51+
It("should return true for HTTP 404 status code", func() {
52+
azureError := &azcore.ResponseError{
53+
ErrorCode: "lol",
54+
StatusCode: 404,
55+
RawResponse: nil,
56+
}
57+
58+
result := IsAKSMachineOrMachinesPoolNotFound(azureError)
59+
Expect(result).To(BeTrue())
60+
})
61+
62+
It("should return true for InvalidParameter error with 'Cannot find any valid machines' message", func() {
63+
// Create the exact error message from your example
64+
errorMessage := "Cannot find any valid machines to delete. Please check your input machine names. The valid machines to delete in agent pool 'testmpool' are: testmachine."
65+
azureError := createAzureResponseError("InvalidParameter", errorMessage, 400)
66+
67+
result := IsAKSMachineOrMachinesPoolNotFound(azureError)
68+
Expect(result).To(BeTrue())
69+
})
70+
71+
It("should return false for HTTP 400 with InvalidParameter but different message", func() {
72+
// Create an InvalidParameter error with a different message that shouldn't match
73+
differentMessage := "InvalidParameter: Some other validation error"
74+
azureError := createAzureResponseError("InvalidParameter", differentMessage, 400)
75+
76+
result := IsAKSMachineOrMachinesPoolNotFound(azureError)
77+
Expect(result).To(BeFalse())
78+
})
79+
80+
It("should return false for other HTTP status codes", func() {
81+
azureError := &azcore.ResponseError{
82+
ErrorCode: "Unauthorized",
83+
StatusCode: 401,
84+
RawResponse: nil,
85+
}
86+
87+
result := IsAKSMachineOrMachinesPoolNotFound(azureError)
88+
Expect(result).To(BeFalse())
89+
90+
azureError = &azcore.ResponseError{
91+
ErrorCode: "InternalOperationError",
92+
StatusCode: 500,
93+
RawResponse: nil,
94+
}
95+
96+
result = IsAKSMachineOrMachinesPoolNotFound(azureError)
97+
Expect(result).To(BeFalse())
98+
})
99+
100+
It("should return false for non-Azure SDK errors", func() {
101+
result := IsAKSMachineOrMachinesPoolNotFound(fmt.Errorf("some generic error"))
102+
Expect(result).To(BeFalse())
103+
})
104+
})
105+
})

pkg/providers/instance/azure_client.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
2525
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
2626
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
27+
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
2728
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork"
2829
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resourcegraph/armresourcegraph"
2930
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions"
@@ -41,6 +42,17 @@ import (
4142
armopts "github.com/Azure/karpenter-provider-azure/pkg/utils/opts"
4243
)
4344

45+
type AKSMachinesAPI interface {
46+
BeginCreateOrUpdate(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachineName string, parameters armcontainerservice.Machine, options *armcontainerservice.MachinesClientBeginCreateOrUpdateOptions) (*runtime.Poller[armcontainerservice.MachinesClientCreateOrUpdateResponse], error)
47+
Get(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachineName string, options *armcontainerservice.MachinesClientGetOptions) (armcontainerservice.MachinesClientGetResponse, error)
48+
NewListPager(resourceGroupName string, resourceName string, agentPoolName string, options *armcontainerservice.MachinesClientListOptions) *runtime.Pager[armcontainerservice.MachinesClientListResponse]
49+
}
50+
51+
type AKSAgentPoolsAPI interface {
52+
Get(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, options *armcontainerservice.AgentPoolsClientGetOptions) (armcontainerservice.AgentPoolsClientGetResponse, error)
53+
BeginDeleteMachines(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachines armcontainerservice.AgentPoolDeleteMachinesParameter, options *armcontainerservice.AgentPoolsClientBeginDeleteMachinesOptions) (*runtime.Poller[armcontainerservice.AgentPoolsClientDeleteMachinesResponse], error)
54+
}
55+
4456
type VirtualMachinesAPI interface {
4557
BeginCreateOrUpdate(ctx context.Context, resourceGroupName string, vmName string, parameters armcompute.VirtualMachine, options *armcompute.VirtualMachinesClientBeginCreateOrUpdateOptions) (*runtime.Poller[armcompute.VirtualMachinesClientCreateOrUpdateResponse], error)
4658
Get(ctx context.Context, resourceGroupName string, vmName string, options *armcompute.VirtualMachinesClientGetOptions) (armcompute.VirtualMachinesClientGetResponse, error)
@@ -72,6 +84,8 @@ type SubnetsAPI interface {
7284
type AZClient struct {
7385
azureResourceGraphClient AzureResourceGraphAPI
7486
virtualMachinesClient VirtualMachinesAPI
87+
aksMachinesClient AKSMachinesAPI
88+
agentPoolsClient AKSAgentPoolsAPI
7589
virtualMachinesExtensionClient VirtualMachineExtensionsAPI
7690
networkInterfacesClient NetworkInterfacesAPI
7791
subnetsClient SubnetsAPI
@@ -93,6 +107,8 @@ func (c *AZClient) SubnetsClient() SubnetsAPI {
93107
func NewAZClientFromAPI(
94108
virtualMachinesClient VirtualMachinesAPI,
95109
azureResourceGraphClient AzureResourceGraphAPI,
110+
aksMachinesClient AKSMachinesAPI,
111+
agentPoolsClient AKSAgentPoolsAPI,
96112
virtualMachinesExtensionClient VirtualMachineExtensionsAPI,
97113
interfacesClient NetworkInterfacesAPI,
98114
subnetsClient SubnetsAPI,
@@ -107,6 +123,8 @@ func NewAZClientFromAPI(
107123
return &AZClient{
108124
virtualMachinesClient: virtualMachinesClient,
109125
azureResourceGraphClient: azureResourceGraphClient,
126+
aksMachinesClient: aksMachinesClient,
127+
agentPoolsClient: agentPoolsClient,
110128
virtualMachinesExtensionClient: virtualMachinesExtensionClient,
111129
networkInterfacesClient: interfacesClient,
112130
subnetsClient: subnetsClient,
@@ -184,7 +202,12 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *auth.Environment, c
184202
// TODO Move this over to track 2 when skewer is migrated
185203
skuClient := skuclient.NewSkuClient(cfg.SubscriptionID, cred, env.Cloud)
186204

187-
var nodeBootstrappingClient imagefamilytypes.NodeBootstrappingAPI = nil
205+
// These clients are used for Azure instance management.
206+
var nodeBootstrappingClient imagefamilytypes.NodeBootstrappingAPI
207+
var aksMachinesClient AKSMachinesAPI
208+
var agentPoolsClient AKSAgentPoolsAPI
209+
210+
// Only create the bootstrapping client if we need to use it.
188211
if o.ProvisionMode == consts.ProvisionModeBootstrappingClient {
189212
nodeBootstrappingClient, err = imagefamily.NewNodeBootstrappingClient(
190213
ctx,
@@ -200,9 +223,38 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *auth.Environment, c
200223
}
201224
}
202225

226+
// Only create AKS machine clients if we need to use them.
227+
// Otherwise, use the no-op dry clients, which will act like there are no AKS machines present.
228+
if o.ManageExistingAKSMachines {
229+
aksMachinesClient, err = armcontainerservice.NewMachinesClient(cfg.SubscriptionID, cred, opts)
230+
if err != nil {
231+
return nil, err
232+
}
233+
agentPoolsClient, err = armcontainerservice.NewAgentPoolsClient(cfg.SubscriptionID, cred, opts)
234+
if err != nil {
235+
return nil, err
236+
}
237+
} else {
238+
aksMachinesClient = NewNoAKSMachinesClient()
239+
agentPoolsClient = NewNoAKSAgentPoolsClient()
240+
241+
// Try create true clients. This is just for diagnostic purposes and serves no real functionality.
242+
// This portion of code can be removed once we are confident that this works reliably.
243+
_, err = armcontainerservice.NewMachinesClient(cfg.SubscriptionID, cred, opts)
244+
if err != nil {
245+
log.FromContext(ctx).Info("failed to create true AKS machines client, but tolerated due to currently on no-client", "error", err)
246+
}
247+
_, err = armcontainerservice.NewAgentPoolsClient(cfg.SubscriptionID, cred, opts)
248+
if err != nil {
249+
log.FromContext(ctx).Info("failed to create true AKS agent pools client, but tolerated due to currently on no-client", "error", err)
250+
}
251+
}
252+
203253
return NewAZClientFromAPI(
204254
virtualMachinesClient,
205255
azureResourceGraphClient,
256+
aksMachinesClient,
257+
agentPoolsClient,
206258
extensionsClient,
207259
interfacesClient,
208260
subnetsClient,

0 commit comments

Comments
 (0)