Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/operator/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ type Options struct {
AdditionalTags map[string]string `json:"additionalTags,omitempty"`
EnableAzureSDKLogging bool `json:"enableAzureSDKLogging,omitempty"` // Controls whether Azure SDK middleware logging is enabled
DiskEncryptionSetID string `json:"diskEncryptionSetId,omitempty"`

// If set to true, existing AKS machines created with PROVISION_MODE=aksmachineapi will be managed even with other provision modes. This option does not have any effect if PROVISION_MODE=aksmachineapi, as it will behave as if this option is set to true.
ManageExistingAKSMachines bool `json:"manageExistingAKSMachines,omitempty"`
}

func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
Expand All @@ -117,6 +120,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
fs.StringVar(&o.SIGAccessTokenServerURL, "sig-access-token-server-url", env.WithDefaultString("SIG_ACCESS_TOKEN_SERVER_URL", ""), "The URL for the SIG access token server. Only used for AKS managed karpenter. UseSIG must be set tot true for this to take effect.")
fs.StringVar(&o.SIGSubscriptionID, "sig-subscription-id", env.WithDefaultString("SIG_SUBSCRIPTION_ID", ""), "The subscription ID of the shared image gallery.")
fs.StringVar(&o.DiskEncryptionSetID, "node-osdisk-diskencryptionset-id", env.WithDefaultString("NODE_OSDISK_DISKENCRYPTIONSET_ID", ""), "The ARM resource ID of the disk encryption set to use for customer-managed key (BYOK) encryption.")
fs.BoolVar(&o.ManageExistingAKSMachines, "manage-existing-aks-machines", env.WithDefaultBool("MANAGE_EXISTING_AKS_MACHINES", false), "If set to true, existing AKS machines created with PROVISION_MODE=aksmachineapi will be managed even with other provision modes. This option does not have any effect if PROVISION_MODE=aksmachineapi, as it will behave as if this option is set to true.")

additionalTagsFlag := k8sflag.NewMapStringString(&o.AdditionalTags)
if err := additionalTagsFlag.Set(env.WithDefaultString("ADDITIONAL_TAGS", "")); err != nil {
Expand Down
32 changes: 32 additions & 0 deletions pkg/operator/options/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ var _ = Describe("Options", func() {
"LINUX_ADMIN_USERNAME",
"ADDITIONAL_TAGS",
"ENABLE_AZURE_SDK_LOGGING",
"MANAGE_EXISTING_AKS_MACHINES",
}

var fs *coreoptions.FlagSet
Expand Down Expand Up @@ -123,6 +124,7 @@ var _ = Describe("Options", func() {
os.Setenv("KUBELET_IDENTITY_CLIENT_ID", "2345678-1234-1234-1234-123456789012")
os.Setenv("LINUX_ADMIN_USERNAME", "customadminusername")
os.Setenv("ADDITIONAL_TAGS", "test-tag=test-value")
os.Setenv("MANAGE_EXISTING_AKS_MACHINES", "true")
fs = &coreoptions.FlagSet{
FlagSet: flag.NewFlagSet("karpenter", flag.ContinueOnError),
}
Expand Down Expand Up @@ -152,6 +154,7 @@ var _ = Describe("Options", func() {
KubeletIdentityClientID: lo.ToPtr("2345678-1234-1234-1234-123456789012"),
AdditionalTags: map[string]string{"test-tag": "test-value"},
ClusterDNSServiceIP: lo.ToPtr("10.244.0.1"),
ManageExistingAKSMachines: lo.ToPtr(true),
})
Expect(opts).To(BeComparableTo(expectedOpts, cmpopts.IgnoreUnexported(options.Options{})))
})
Expand Down Expand Up @@ -491,6 +494,35 @@ var _ = Describe("Options", func() {
)
Expect(err).To(MatchError(ContainSubstring("validating options, additional-tags key \"<key1>\" contains invalid characters.")))
})

It("should default manage-existing-aks-machines to false", func() {
err := opts.Parse(
fs,
"--cluster-name", "my-name",
"--cluster-endpoint", "https://karpenter-000000000000.hcp.westus2.staging.azmk8s.io",
"--kubelet-bootstrap-token", "flag-bootstrap-token",
"--ssh-public-key", "flag-ssh-public-key",
"--vnet-subnet-id", "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpentervnet/subnets/karpentersub",
"--node-resource-group", "my-node-rg",
)
Expect(err).ToNot(HaveOccurred())
Expect(opts.ManageExistingAKSMachines).To(BeFalse())
})

It("should succeed with manage-existing-aks-machines set", func() {
err := opts.Parse(
fs,
"--cluster-name", "my-name",
"--cluster-endpoint", "https://karpenter-000000000000.hcp.westus2.staging.azmk8s.io",
"--kubelet-bootstrap-token", "flag-bootstrap-token",
"--ssh-public-key", "flag-ssh-public-key",
"--vnet-subnet-id", "/subscriptions/12345678-1234-1234-1234-123456789012/resourceGroups/sillygeese/providers/Microsoft.Network/virtualNetworks/karpentervnet/subnets/karpentersub",
"--node-resource-group", "my-node-rg",
"--manage-existing-aks-machines",
)
Expect(err).ToNot(HaveOccurred())
Expect(opts.ManageExistingAKSMachines).To(BeTrue())
})
})

Context("Admin Username Validation", func() {
Expand Down
36 changes: 36 additions & 0 deletions pkg/providers/instance/aksmachineinstanceutils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Portions Copyright (c) Microsoft Corporation.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package instance

import (
"net/http"
"strings"

sdkerrors "github.com/Azure/azure-sdk-for-go-extensions/pkg/errors"
)

func IsAKSMachineOrMachinesPoolNotFound(err error) bool {
if err == nil {
return false
}
azErr := sdkerrors.IsResponseError(err)
if azErr != nil && (azErr.StatusCode == http.StatusNotFound || // Covers AKS machines pool not found on PUT machine, GET machine, GET (list) machines, POST agent pool (DELETE machines), and AKS machine not found on GET machine
(azErr.StatusCode == http.StatusBadRequest && azErr.ErrorCode == "InvalidParameter" && strings.Contains(azErr.Error(), "Cannot find any valid machines"))) { // Covers AKS machine not found on POST agent pool (DELETE machines)
return true
}
return false
}
105 changes: 105 additions & 0 deletions pkg/providers/instance/aksmachineinstanceutils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
Portions Copyright (c) Microsoft Corporation.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package instance

import (
"fmt"
"io"
"net/http"
"strings"

"github.com/Azure/azure-sdk-for-go/sdk/azcore"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// createAzureResponseError creates a proper Azure SDK error with the given error code and message
func createAzureResponseError(errorCode, errorMessage string, statusCode int) error {
errorBody := fmt.Sprintf(`{"error": {"code": "%s", "message": "%s"}}`, errorCode, errorMessage)
return &azcore.ResponseError{
ErrorCode: errorCode,
StatusCode: statusCode,
RawResponse: &http.Response{
Body: io.NopCloser(strings.NewReader(errorBody)),
},
}
}

var _ = Describe("AKSMachineInstanceUtils Helper Functions", func() {

Context("IsAKSMachineOrMachinesPoolNotFound", func() {
It("should return false for nil error", func() {
result := IsAKSMachineOrMachinesPoolNotFound(nil)
Expect(result).To(BeFalse())
})

It("should return true for HTTP 404 status code", func() {
azureError := &azcore.ResponseError{
ErrorCode: "lol",
StatusCode: 404,
RawResponse: nil,
}

result := IsAKSMachineOrMachinesPoolNotFound(azureError)
Expect(result).To(BeTrue())
})

It("should return true for InvalidParameter error with 'Cannot find any valid machines' message", func() {
// Create the exact error message from your example
errorMessage := "Cannot find any valid machines to delete. Please check your input machine names. The valid machines to delete in agent pool 'testmpool' are: testmachine."
azureError := createAzureResponseError("InvalidParameter", errorMessage, 400)

result := IsAKSMachineOrMachinesPoolNotFound(azureError)
Expect(result).To(BeTrue())
})

It("should return false for HTTP 400 with InvalidParameter but different message", func() {
// Create an InvalidParameter error with a different message that shouldn't match
differentMessage := "InvalidParameter: Some other validation error"
azureError := createAzureResponseError("InvalidParameter", differentMessage, 400)

result := IsAKSMachineOrMachinesPoolNotFound(azureError)
Expect(result).To(BeFalse())
})

It("should return false for other HTTP status codes", func() {
azureError := &azcore.ResponseError{
ErrorCode: "Unauthorized",
StatusCode: 401,
RawResponse: nil,
}

result := IsAKSMachineOrMachinesPoolNotFound(azureError)
Expect(result).To(BeFalse())

azureError = &azcore.ResponseError{
ErrorCode: "InternalOperationError",
StatusCode: 500,
RawResponse: nil,
}

result = IsAKSMachineOrMachinesPoolNotFound(azureError)
Expect(result).To(BeFalse())
})

It("should return false for non-Azure SDK errors", func() {
result := IsAKSMachineOrMachinesPoolNotFound(fmt.Errorf("some generic error"))
Expect(result).To(BeFalse())
})
})
})
54 changes: 53 additions & 1 deletion pkg/providers/instance/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v8"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resourcegraph/armresourcegraph"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armsubscriptions"
Expand All @@ -41,6 +42,17 @@ import (
armopts "github.com/Azure/karpenter-provider-azure/pkg/utils/opts"
)

type AKSMachinesAPI interface {
BeginCreateOrUpdate(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachineName string, parameters armcontainerservice.Machine, options *armcontainerservice.MachinesClientBeginCreateOrUpdateOptions) (*runtime.Poller[armcontainerservice.MachinesClientCreateOrUpdateResponse], error)
Get(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachineName string, options *armcontainerservice.MachinesClientGetOptions) (armcontainerservice.MachinesClientGetResponse, error)
NewListPager(resourceGroupName string, resourceName string, agentPoolName string, options *armcontainerservice.MachinesClientListOptions) *runtime.Pager[armcontainerservice.MachinesClientListResponse]
}

type AKSAgentPoolsAPI interface {
Get(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, options *armcontainerservice.AgentPoolsClientGetOptions) (armcontainerservice.AgentPoolsClientGetResponse, error)
BeginDeleteMachines(ctx context.Context, resourceGroupName string, resourceName string, agentPoolName string, aksMachines armcontainerservice.AgentPoolDeleteMachinesParameter, options *armcontainerservice.AgentPoolsClientBeginDeleteMachinesOptions) (*runtime.Poller[armcontainerservice.AgentPoolsClientDeleteMachinesResponse], error)
}

type VirtualMachinesAPI interface {
BeginCreateOrUpdate(ctx context.Context, resourceGroupName string, vmName string, parameters armcompute.VirtualMachine, options *armcompute.VirtualMachinesClientBeginCreateOrUpdateOptions) (*runtime.Poller[armcompute.VirtualMachinesClientCreateOrUpdateResponse], error)
Get(ctx context.Context, resourceGroupName string, vmName string, options *armcompute.VirtualMachinesClientGetOptions) (armcompute.VirtualMachinesClientGetResponse, error)
Expand Down Expand Up @@ -72,6 +84,8 @@ type SubnetsAPI interface {
type AZClient struct {
azureResourceGraphClient AzureResourceGraphAPI
virtualMachinesClient VirtualMachinesAPI
aksMachinesClient AKSMachinesAPI
agentPoolsClient AKSAgentPoolsAPI
virtualMachinesExtensionClient VirtualMachineExtensionsAPI
networkInterfacesClient NetworkInterfacesAPI
subnetsClient SubnetsAPI
Expand All @@ -93,6 +107,8 @@ func (c *AZClient) SubnetsClient() SubnetsAPI {
func NewAZClientFromAPI(
virtualMachinesClient VirtualMachinesAPI,
azureResourceGraphClient AzureResourceGraphAPI,
aksMachinesClient AKSMachinesAPI,
agentPoolsClient AKSAgentPoolsAPI,
virtualMachinesExtensionClient VirtualMachineExtensionsAPI,
interfacesClient NetworkInterfacesAPI,
subnetsClient SubnetsAPI,
Expand All @@ -107,6 +123,8 @@ func NewAZClientFromAPI(
return &AZClient{
virtualMachinesClient: virtualMachinesClient,
azureResourceGraphClient: azureResourceGraphClient,
aksMachinesClient: aksMachinesClient,
agentPoolsClient: agentPoolsClient,
virtualMachinesExtensionClient: virtualMachinesExtensionClient,
networkInterfacesClient: interfacesClient,
subnetsClient: subnetsClient,
Expand Down Expand Up @@ -184,7 +202,12 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *auth.Environment, c
// TODO Move this over to track 2 when skewer is migrated
skuClient := skuclient.NewSkuClient(cfg.SubscriptionID, cred, env.Cloud)

var nodeBootstrappingClient imagefamilytypes.NodeBootstrappingAPI = nil
// These clients are used for Azure instance management.
var nodeBootstrappingClient imagefamilytypes.NodeBootstrappingAPI
var aksMachinesClient AKSMachinesAPI
var agentPoolsClient AKSAgentPoolsAPI

// Only create the bootstrapping client if we need to use it.
if o.ProvisionMode == consts.ProvisionModeBootstrappingClient {
nodeBootstrappingClient, err = imagefamily.NewNodeBootstrappingClient(
ctx,
Expand All @@ -200,9 +223,38 @@ func NewAZClient(ctx context.Context, cfg *auth.Config, env *auth.Environment, c
}
}

// Only create AKS machine clients if we need to use them.
// Otherwise, use the no-op dry clients, which will act like there are no AKS machines present.
if o.ManageExistingAKSMachines {
aksMachinesClient, err = armcontainerservice.NewMachinesClient(cfg.SubscriptionID, cred, opts)
if err != nil {
return nil, err
}
agentPoolsClient, err = armcontainerservice.NewAgentPoolsClient(cfg.SubscriptionID, cred, opts)
if err != nil {
return nil, err
}
} else {
aksMachinesClient = NewNoAKSMachinesClient()
agentPoolsClient = NewNoAKSAgentPoolsClient()

// Try create true clients. This is just for diagnostic purposes and serves no real functionality.
// This portion of code can be removed once we are confident that this works reliably.
_, err = armcontainerservice.NewMachinesClient(cfg.SubscriptionID, cred, opts)
if err != nil {
log.FromContext(ctx).Info("failed to create true AKS machines client, but tolerated due to currently on no-client", "error", err)
}
_, err = armcontainerservice.NewAgentPoolsClient(cfg.SubscriptionID, cred, opts)
if err != nil {
log.FromContext(ctx).Info("failed to create true AKS agent pools client, but tolerated due to currently on no-client", "error", err)
}
}

return NewAZClientFromAPI(
virtualMachinesClient,
azureResourceGraphClient,
aksMachinesClient,
agentPoolsClient,
extensionsClient,
interfacesClient,
subnetsClient,
Expand Down
Loading
Loading