diff --git a/aqua.yaml b/aqua.yaml index 0ef3586b3..e67f0605f 100644 --- a/aqua.yaml +++ b/aqua.yaml @@ -32,3 +32,4 @@ packages: - name: evilmartians/lefthook@v2.0.8 - name: bridgecrewio/checkov@3.2.495 - name: kubernetes-sigs/krew@v0.4.5 + - name: Azure/kubelogin@v0.2.13 diff --git a/contexts/_template/blueprint.yaml b/contexts/_template/blueprint.yaml index 6c6089c26..939e011b8 100644 --- a/contexts/_template/blueprint.yaml +++ b/contexts/_template/blueprint.yaml @@ -3,11 +3,6 @@ apiVersion: blueprints.windsorcli.dev/v1alpha1 metadata: name: template description: Base blueprint template for core services -repository: - url: "" - ref: - branch: main - secretName: flux-system sources: [] terraform: [] kustomize: [] diff --git a/contexts/_template/features/provider-azure.yaml b/contexts/_template/features/provider-azure.yaml index baceb1165..f41fad60c 100644 --- a/contexts/_template/features/provider-azure.yaml +++ b/contexts/_template/features/provider-azure.yaml @@ -35,5 +35,23 @@ kustomize: path: csi dependsOn: - policy-resources + components: + - azure-disk cleanup: - pvcs + +# Telemetry - exclude metrics-server as AKS provides it natively +- name: telemetry-resources + path: telemetry/resources + strategy: replace + dependsOn: + - telemetry-base + - csi + components: + - prometheus + - prometheus/flux + - fluentbit + - fluentbit/containerd + - fluentbit/fluentd + - fluentbit/kubernetes + - fluentbit/systemd diff --git a/kustomize/csi/azure-disk/kustomization.yaml b/kustomize/csi/azure-disk/kustomization.yaml new file mode 100644 index 000000000..0237a1186 --- /dev/null +++ b/kustomize/csi/azure-disk/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component +resources: + - storageclass.yaml diff --git a/kustomize/csi/azure-disk/storageclass.yaml b/kustomize/csi/azure-disk/storageclass.yaml new file mode 100644 index 000000000..fb1cd8e9b --- /dev/null +++ b/kustomize/csi/azure-disk/storageclass.yaml @@ -0,0 +1,14 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: single + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: disk.csi.azure.com +volumeBindingMode: WaitForFirstConsumer +allowVolumeExpansion: true +parameters: + skuName: Premium_LRS + cachingMode: ReadWrite + fsType: ext4 +reclaimPolicy: Delete diff --git a/terraform/cluster/azure-aks/.terraform.lock.hcl b/terraform/cluster/azure-aks/.terraform.lock.hcl index ae51b1537..84e7aa022 100644 --- a/terraform/cluster/azure-aks/.terraform.lock.hcl +++ b/terraform/cluster/azure-aks/.terraform.lock.hcl @@ -32,21 +32,21 @@ provider "registry.terraform.io/hashicorp/azurerm" { } provider "registry.terraform.io/hashicorp/local" { - version = "2.5.3" + version = "2.6.1" hashes = [ - "h1:MCzg+hs1/ZQ32u56VzJMWP9ONRQPAAqAjuHuzbyshvI=", - "zh:284d4b5b572eacd456e605e94372f740f6de27b71b4e1fd49b63745d8ecd4927", - "zh:40d9dfc9c549e406b5aab73c023aa485633c1b6b730c933d7bcc2fa67fd1ae6e", - "zh:6243509bb208656eb9dc17d3c525c89acdd27f08def427a0dce22d5db90a4c8b", + "h1:DbiR/D2CPigzCGweYIyJH0N0x04oyI5xiZ9wSW/s3kQ=", + "zh:10050d08f416de42a857e4b6f76809aae63ea4ec6f5c852a126a915dede814b4", + "zh:2df2a3ebe9830d4759c59b51702e209fe053f47453cb4688f43c063bac8746b7", + "zh:2e759568bcc38c86ca0e43701d34cf29945736fdc8e429c5b287ddc2703c7b18", + "zh:6a62a34e48500ab4aea778e355e162ebde03260b7a9eb9edc7e534c84fbca4c6", + "zh:74373728ba32a1d5450a3a88ac45624579e32755b086cd4e51e88d9aca240ef6", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:885d85869f927853b6fe330e235cd03c337ac3b933b0d9ae827ec32fa1fdcdbf", - "zh:bab66af51039bdfcccf85b25fe562cbba2f54f6b3812202f4873ade834ec201d", - "zh:c505ff1bf9442a889ac7dca3ac05a8ee6f852e0118dd9a61796a2f6ff4837f09", - "zh:d36c0b5770841ddb6eaf0499ba3de48e5d4fc99f4829b6ab66b0fab59b1aaf4f", - "zh:ddb6a407c7f3ec63efb4dad5f948b54f7f4434ee1a2607a49680d494b1776fe1", - "zh:e0dafdd4500bec23d3ff221e3a9b60621c5273e5df867bc59ef6b7e41f5c91f6", - "zh:ece8742fd2882a8fc9d6efd20e2590010d43db386b920b2a9c220cfecc18de47", - "zh:f4c6b3eb8f39105004cf720e202f04f57e3578441cfb76ca27611139bc116a82", + "zh:8dddae588971a996f622e7589cd8b9da7834c744ac12bfb59c97fa77ded95255", + "zh:946f82f66353bb97aefa8d95c4ca86db227f9b7c50b82415289ac47e4e74d08d", + "zh:e9a5c09e6f35e510acf15b666fd0b34a30164cecdcd81ce7cda0f4b2dade8d91", + "zh:eafe5b873ef42b32feb2f969c38ff8652507e695620cbaf03b9db714bee52249", + "zh:ec146289fa27650c9d433bb5c7847379180c0b7a323b1b94e6e7ad5d2a7dbe71", + "zh:fc882c35ce05631d76c0973b35adde26980778fc81d9da81a2fade2b9d73423b", ] } diff --git a/terraform/cluster/azure-aks/main.tf b/terraform/cluster/azure-aks/main.tf index cc729630a..09230b4c6 100644 --- a/terraform/cluster/azure-aks/main.tf +++ b/terraform/cluster/azure-aks/main.tf @@ -34,6 +34,8 @@ provider "azurerm" { data "azurerm_client_config" "current" {} +data "azurerm_subscription" "current" {} + data "azurerm_virtual_network" "vnet" { name = "${var.vnet_module_name}-${var.context_id}" resource_group_name = "${var.vnet_module_name}-${var.context_id}" @@ -55,9 +57,14 @@ data "azurerm_subnet" "private" { #----------------------------------------------------------------------------------------------------------------------- locals { - kubeconfig_path = "${var.context_path}/.kube/config" - rg_name = var.resource_group_name == null ? "${var.name}-${var.context_id}" : var.resource_group_name - cluster_name = var.cluster_name == null ? "${var.name}-${var.context_id}" : var.cluster_name + kubeconfig_path = "${var.context_path}/.kube/config" + rg_name = var.resource_group_name == null ? "${var.name}-${var.context_id}" : var.resource_group_name + cluster_name = var.cluster_name == null ? "${var.name}-${var.context_id}" : var.cluster_name + node_resource_group_name = split("/", azurerm_kubernetes_cluster.main.node_resource_group_id)[4] + node_pool_names = concat( + [var.default_node_pool.name], + var.autoscaled_node_pool.enabled ? [var.autoscaled_node_pool.name] : [] + ) tags = merge({ WindsorContextID = var.context_id }, var.tags) @@ -92,7 +99,7 @@ resource "azurerm_key_vault" "key_vault" { resource_group_name = azurerm_resource_group.aks.name tenant_id = data.azurerm_client_config.current.tenant_id sku_name = "premium" - enabled_for_disk_encryption = true + enabled_for_disk_encryption = var.disk_encryption_enabled purge_protection_enabled = true soft_delete_retention_days = var.soft_delete_retention_days # checkov:skip=CKV_AZURE_189: We are using a public cluster for testing @@ -133,10 +140,12 @@ resource "azurerm_key_vault_access_policy" "key_vault_access_policy" { } resource "azurerm_key_vault_access_policy" "key_vault_access_policy_disk" { + count = var.disk_encryption_enabled ? 1 : 0 + key_vault_id = azurerm_key_vault.key_vault.id tenant_id = data.azurerm_client_config.current.tenant_id - object_id = azurerm_disk_encryption_set.main.identity.0.principal_id + object_id = azurerm_disk_encryption_set.main[0].identity.0.principal_id key_permissions = [ "Get", @@ -153,9 +162,17 @@ resource "azurerm_key_vault_access_policy" "key_vault_access_policy_disk" { ] } +# Moved block to handle transition from single instance to count-based resource +moved { + from = azurerm_key_vault_access_policy.key_vault_access_policy_disk + to = azurerm_key_vault_access_policy.key_vault_access_policy_disk[0] +} + resource "time_static" "expiry" {} resource "azurerm_key_vault_key" "key_vault_key" { + count = var.disk_encryption_enabled ? 1 : 0 + name = "${var.name}-${var.context_id}-${random_string.key.result}" key_vault_id = azurerm_key_vault.key_vault.id key_type = "RSA-HSM" @@ -185,17 +202,31 @@ resource "azurerm_key_vault_key" "key_vault_key" { ] } +# Moved block to handle transition from single instance to count-based resource +moved { + from = azurerm_key_vault_key.key_vault_key + to = azurerm_key_vault_key.key_vault_key[0] +} + resource "azurerm_disk_encryption_set" "main" { + count = var.disk_encryption_enabled ? 1 : 0 + name = "${var.name}-${var.context_id}-${random_string.key.result}" resource_group_name = azurerm_resource_group.aks.name location = azurerm_resource_group.aks.location - key_vault_key_id = azurerm_key_vault_key.key_vault_key.id + key_vault_key_id = azurerm_key_vault_key.key_vault_key[0].id identity { type = "SystemAssigned" } } +# Moved block to handle transition from single instance to count-based resource +moved { + from = azurerm_disk_encryption_set.main + to = azurerm_disk_encryption_set.main[0] +} + #----------------------------------------------------------------------------------------------------------------------- # Log Analytics Workspace #----------------------------------------------------------------------------------------------------------------------- @@ -211,6 +242,86 @@ resource "azurerm_log_analytics_workspace" "aks_logs" { }, local.tags) } +resource "azurerm_monitor_diagnostic_setting" "aks_cluster" { + name = "${var.name}-${var.context_id}-aks-diag" + target_resource_id = azurerm_kubernetes_cluster.main.id + log_analytics_workspace_id = azurerm_log_analytics_workspace.aks_logs.id + + dynamic "enabled_log" { + for_each = var.diagnostic_log_categories + content { + category = enabled_log.value + + dynamic "retention_policy" { + for_each = var.diagnostic_log_retention_days != null ? [1] : [] + content { + enabled = true + days = var.diagnostic_log_retention_days + } + } + } + } + + enabled_metric { + category = "AllMetrics" + } +} + +#----------------------------------------------------------------------------------------------------------------------- +# Data Collection Rule (DCR) +#----------------------------------------------------------------------------------------------------------------------- + +resource "azurerm_monitor_data_collection_rule" "container_insights" { + count = var.container_insights_enabled ? 1 : 0 + name = "${var.name}-${var.context_id}-dcr" + resource_group_name = azurerm_resource_group.aks.name + location = azurerm_resource_group.aks.location + + destinations { + log_analytics { + workspace_resource_id = azurerm_log_analytics_workspace.aks_logs.id + name = "ciworkspace" + } + } + + data_flow { + streams = ["Microsoft-ContainerLogV2", "Microsoft-KubeEvents", "Microsoft-KubePodInventory"] + destinations = ["ciworkspace"] + } + + data_sources { + extension { + streams = ["Microsoft-ContainerLogV2", "Microsoft-KubeEvents", "Microsoft-KubePodInventory"] + extension_name = "ContainerInsights" + extension_json = jsonencode({ + dataCollectionSettings = { + interval = "1m", + namespaceFilteringMode = "Off", + enableContainerLogV2 = true + } + }) + name = "ContainerInsightsExtension" + } + } + + description = "DCR for Azure Monitor Container Insights" + tags = local.tags +} + +resource "azurerm_monitor_data_collection_rule_association" "aks_dcr" { + count = var.container_insights_enabled ? 1 : 0 + name = "${var.name}-${var.context_id}-dcr-assoc" + target_resource_id = azurerm_kubernetes_cluster.main.id + data_collection_rule_id = azurerm_monitor_data_collection_rule.container_insights[0].id + description = "Association of DCR to AKS Cluster" +} + +#----------------------------------------------------------------------------------------------------------------------- +# AKS uses system-assigned managed identity by default (Microsoft best practice) +# AKS automatically creates and manages identities for both control plane and kubelet +# Reference: https://learn.microsoft.com/azure/aks/managed-identity-overview +#----------------------------------------------------------------------------------------------------------------------- + #----------------------------------------------------------------------------------------------------------------------- # AKS Cluster #----------------------------------------------------------------------------------------------------------------------- @@ -225,17 +336,25 @@ resource "azurerm_kubernetes_cluster" "main" { role_based_access_control_enabled = var.role_based_access_control_enabled automatic_upgrade_channel = var.automatic_upgrade_channel sku_tier = var.sku_tier - # checkov:skip=CKV_AZURE_6: This feature is in preview, we are using a public cluster for testing - # api_server_authorized_ip_ranges = [0.0.0.0/0] + + # checkov:skip=CKV_AZURE_6: We allow user to restrict IPs or default to open (null) + api_server_access_profile { + authorized_ip_ranges = var.authorized_ip_ranges + } + # checkov:skip=CKV_AZURE_115: We are using a public cluster for testing - # private clusters are encouraged for production private_cluster_enabled = var.private_cluster_enabled - disk_encryption_set_id = azurerm_disk_encryption_set.main.id + disk_encryption_set_id = var.disk_encryption_enabled ? azurerm_disk_encryption_set.main[0].id : null # checkov:skip=CKV_AZURE_116: This replaces the addon_profile azure_policy_enabled = var.azure_policy_enabled # checkov:skip=CKV_AZURE_141: We are setting this to false to avoid the creation of an AD local_account_disabled = var.local_account_disabled + azure_active_directory_role_based_access_control { + azure_rbac_enabled = true + admin_group_object_ids = var.admin_object_ids + } + key_vault_secrets_provider { secret_rotation_enabled = true } @@ -247,6 +366,7 @@ resource "azurerm_kubernetes_cluster" "main" { vnet_subnet_id = coalesce(var.vnet_subnet_id, try(data.azurerm_subnet.private[0].id, null)) orchestrator_version = var.kubernetes_version only_critical_addons_enabled = var.default_node_pool.only_critical_addons_enabled + zones = var.default_node_pool.availability_zones # checkov:skip=CKV_AZURE_226: we are using the managed disk type to reduce costs os_disk_type = var.default_node_pool.os_disk_type @@ -255,6 +375,15 @@ resource "azurerm_kubernetes_cluster" "main" { # checkov:skip=CKV_AZURE_168: This is set in the variable by default to 50 max_pods = var.default_node_pool.max_pods temporary_name_for_rotation = "rotate" + + dynamic "upgrade_settings" { + for_each = var.default_node_pool.upgrade_settings != null ? [var.default_node_pool.upgrade_settings] : [] + content { + drain_timeout_in_minutes = upgrade_settings.value.drain_timeout_in_minutes + max_surge = upgrade_settings.value.max_surge + node_soak_duration_in_minutes = upgrade_settings.value.node_soak_duration_in_minutes + } + } } auto_scaler_profile { @@ -274,29 +403,27 @@ resource "azurerm_kubernetes_cluster" "main" { vertical_pod_autoscaler_enabled = var.workload_autoscaler_profile.vertical_pod_autoscaler_enabled } - network_profile { - network_plugin = "azure" - network_policy = "cilium" - service_cidr = var.service_cidr - dns_service_ip = var.dns_service_ip - } + oidc_issuer_enabled = var.oidc_issuer_enabled + workload_identity_enabled = var.workload_identity_enabled - oms_agent { - log_analytics_workspace_id = azurerm_log_analytics_workspace.aks_logs.id - } + image_cleaner_enabled = var.image_cleaner_enabled + image_cleaner_interval_hours = var.image_cleaner_interval_hours - identity { - type = length(var.user_assigned_identity_ids) > 0 ? "UserAssigned" : "SystemAssigned" - identity_ids = var.user_assigned_identity_ids + network_profile { + network_plugin = "azure" + network_plugin_mode = "overlay" + network_policy = "cilium" + network_data_plane = "cilium" + outbound_type = var.outbound_type + service_cidr = var.service_cidr + dns_service_ip = var.dns_service_ip } - dynamic "kubelet_identity" { - for_each = var.kubelet_user_assigned_identity_id != null ? [1] : [] - content { - client_id = var.kubelet_client_id - object_id = var.kubelet_object_id - user_assigned_identity_id = var.kubelet_user_assigned_identity_id - } + # Use system-assigned managed identity (Microsoft default and best practice) + # AKS automatically creates Contributor role on node RG for control plane + # AKS automatically creates Virtual Machine Contributor role on node RG for kubelet + identity { + type = "SystemAssigned" } tags = merge({ @@ -313,6 +440,7 @@ resource "azurerm_kubernetes_cluster_node_pool" "autoscaled" { auto_scaling_enabled = true min_count = var.autoscaled_node_pool.min_count max_count = var.autoscaled_node_pool.max_count + zones = var.autoscaled_node_pool.availability_zones vnet_subnet_id = coalesce( var.vnet_subnet_id, try(data.azurerm_subnet.private[length(local.private_subnets) - 1].id, null) @@ -325,12 +453,118 @@ resource "azurerm_kubernetes_cluster_node_pool" "autoscaled" { host_encryption_enabled = var.autoscaled_node_pool.host_encryption_enabled temporary_name_for_rotation = "rotate" + dynamic "upgrade_settings" { + for_each = var.autoscaled_node_pool.upgrade_settings != null ? [var.autoscaled_node_pool.upgrade_settings] : [] + content { + drain_timeout_in_minutes = upgrade_settings.value.drain_timeout_in_minutes + max_surge = upgrade_settings.value.max_surge + node_soak_duration_in_minutes = upgrade_settings.value.node_soak_duration_in_minutes + } + } + tags = merge({ Name = var.autoscaled_node_pool.name }, local.tags) } +# Assign Network Contributor role on subnet to control plane identity (required for custom VNet). +# Azure CLI auto-assigns this, but Terraform requires manual assignment. +# This is needed for the control plane to manage load balancers and network resources in the custom VNet. +# Reference: https://learn.microsoft.com/azure/aks/configure-kubenet +resource "azurerm_role_assignment" "subnet_network_contributor_cp" { + scope = coalesce(var.vnet_subnet_id, try(data.azurerm_subnet.private[0].id, null)) + role_definition_name = "Network Contributor" + principal_id = azurerm_kubernetes_cluster.main.identity[0].principal_id +} + +# AKS automatically creates Virtual Machine Contributor role assignment on node resource group for the kubelet identity. +# However, disk attachment operations require additional permissions beyond Virtual Machine Contributor. +# Create a custom role with minimal permissions for VMSS disk operations. +resource "azurerm_role_definition" "aks_kubelet_vmss_disk_manager" { + name = "AKS Kubelet VMSS Disk Manager - ${var.context_id}" + scope = azurerm_kubernetes_cluster.main.node_resource_group_id + description = "Minimal permissions for AKS kubelet identity to manage VMSS disk attachments" + + permissions { + actions = concat( + [ + # VMSS virtual machine operations for disk attachment (REQUIRED) + "Microsoft.Compute/virtualMachineScaleSets/virtualMachines/read", + "Microsoft.Compute/virtualMachineScaleSets/virtualMachines/write", + # Core disk operations (REQUIRED for basic disk attachment) + "Microsoft.Compute/disks/read", + "Microsoft.Compute/disks/write", + "Microsoft.Compute/disks/delete", + "Microsoft.Compute/disks/beginGetAccess/action", + "Microsoft.Compute/disks/endGetAccess/action", + # Location/operation queries (may be needed for operation status checks) + "Microsoft.Compute/locations/DiskOperations/read", + "Microsoft.Compute/locations/vmSizes/read", + "Microsoft.Compute/locations/operations/read" + ], + var.enable_volume_snapshots ? [ + # Snapshot operations (only included if volume snapshots are enabled) + "Microsoft.Compute/snapshots/read", + "Microsoft.Compute/snapshots/write", + "Microsoft.Compute/snapshots/delete" + ] : [] + ) + not_actions = [] + } + + assignable_scopes = [ + azurerm_kubernetes_cluster.main.node_resource_group_id + ] +} + +resource "azurerm_role_assignment" "kubelet_vmss_disk_manager" { + scope = azurerm_kubernetes_cluster.main.node_resource_group_id + role_definition_id = azurerm_role_definition.aks_kubelet_vmss_disk_manager.role_definition_resource_id + principal_id = azurerm_kubernetes_cluster.main.kubelet_identity[0].object_id +} + +# Assign Reader role on the disk encryption set to the control plane identity. +# Required when using Customer-Managed Keys (CMK) for disk encryption. +resource "azurerm_role_assignment" "cp_disk_encryption_set_reader" { + count = var.disk_encryption_enabled ? 1 : 0 + + scope = azurerm_disk_encryption_set.main[0].id + role_definition_name = "Reader" + principal_id = azurerm_kubernetes_cluster.main.identity[0].principal_id +} + +# Assign Reader role on the disk encryption set to the kubelet identity. +# Required when using Customer-Managed Keys (CMK) for disk encryption. +# Uses system-assigned kubelet identity that AKS creates automatically. +resource "azurerm_role_assignment" "node_pool_disk_encryption_set_reader" { + count = var.disk_encryption_enabled ? 1 : 0 + + scope = azurerm_disk_encryption_set.main[0].id + role_definition_name = "Reader" + principal_id = azurerm_kubernetes_cluster.main.kubelet_identity[0].object_id +} + +# Moved block to handle transition from single instance to count-based resource +moved { + from = azurerm_role_assignment.node_pool_disk_encryption_set_reader + to = azurerm_role_assignment.node_pool_disk_encryption_set_reader[0] +} + resource "local_file" "kube_config" { content = azurerm_kubernetes_cluster.main.kube_config_raw filename = local.kubeconfig_path } + +# Automatically assign "Azure Kubernetes Service RBAC Cluster Admin" to the +# identity running Terraform (the deployer) and any additional admins provided. +# This ensures immediate access when local_account_disabled is set to true. +resource "azurerm_role_assignment" "aks_rbac_admin" { + for_each = toset(concat( + [data.azurerm_client_config.current.object_id], + var.admin_object_ids + )) + + scope = azurerm_kubernetes_cluster.main.id + role_definition_name = "Azure Kubernetes Service RBAC Cluster Admin" + principal_id = each.value +} diff --git a/terraform/cluster/azure-aks/variables.tf b/terraform/cluster/azure-aks/variables.tf index 44e0a5947..81ef960d7 100644 --- a/terraform/cluster/azure-aks/variables.tf +++ b/terraform/cluster/azure-aks/variables.tf @@ -2,6 +2,12 @@ # Variables #----------------------------------------------------------------------------------------------------------------------- +variable "admin_object_ids" { + type = list(string) + description = "List of Azure AD Object IDs (User or Group) to assign 'Azure Kubernetes Service RBAC Cluster Admin' role. Required when local_account_disabled is true to ensure access." + default = [] +} + variable "context_path" { type = string description = "The path to the context folder, where kubeconfig is stored" @@ -50,6 +56,12 @@ variable "region" { default = "eastus" } +variable "disk_encryption_enabled" { + description = "Enable Customer-Managed Keys (CMK) disk encryption for the AKS cluster. When enabled, creates a Key Vault, encryption key, and Disk Encryption Set." + type = bool + default = true +} + variable "kubernetes_version" { description = "Version of Kubernetes to use" type = string @@ -73,6 +85,12 @@ variable "default_node_pool" { max_count = number node_count = number only_critical_addons_enabled = bool + availability_zones = optional(list(string)) + upgrade_settings = optional(object({ + drain_timeout_in_minutes = optional(number, 0) + max_surge = optional(string, "10%") + node_soak_duration_in_minutes = optional(number, 0) + })) }) default = { name = "system" @@ -84,6 +102,11 @@ variable "default_node_pool" { max_count = 3 node_count = 1 only_critical_addons_enabled = true + upgrade_settings = { + drain_timeout_in_minutes = 0 + max_surge = "10%" + node_soak_duration_in_minutes = 0 + } } } @@ -99,6 +122,12 @@ variable "autoscaled_node_pool" { host_encryption_enabled = bool min_count = number max_count = number + availability_zones = optional(list(string)) + upgrade_settings = optional(object({ + drain_timeout_in_minutes = optional(number, 0) + max_surge = optional(string, "10%") + node_soak_duration_in_minutes = optional(number, 0) + })) }) default = { enabled = true @@ -110,6 +139,11 @@ variable "autoscaled_node_pool" { host_encryption_enabled = true min_count = 1 max_count = 3 + upgrade_settings = { + drain_timeout_in_minutes = 0 + max_surge = "10%" + node_soak_duration_in_minutes = 0 + } } } @@ -184,7 +218,13 @@ variable "azure_policy_enabled" { variable "local_account_disabled" { type = bool description = "Whether to disable local accounts for the AKS cluster" - default = false + default = true +} + +variable "authorized_ip_ranges" { + type = set(string) + description = "Set of authorized IP ranges to allow access to the API server. If null, allows all (0.0.0.0/0)." + default = null } variable "public_network_access_enabled" { @@ -205,12 +245,6 @@ variable "expiration_date" { default = null } -variable "user_assigned_identity_ids" { - type = list(string) - description = "User assigned identity IDs for the AKS cluster. If provided, the cluster will use only user-assigned identities." - default = [] -} - variable "soft_delete_retention_days" { type = number description = "The number of days to retain the AKS cluster's key vault" @@ -241,20 +275,66 @@ variable "endpoint_private_access" { default = false } -variable "kubelet_client_id" { - description = "Client ID of the user-assigned identity to use for the kubelet. If not provided, the cluster will use the system-assigned identity." - type = string - default = null +variable "enable_volume_snapshots" { + description = "Enable volume snapshot permissions for the kubelet identity. Set to false to use minimal permissions if volume snapshots are not needed." + type = bool + default = true } -variable "kubelet_object_id" { - description = "Object ID of the user-assigned identity to use for the kubelet. If not provided, the cluster will use the system-assigned identity." +variable "outbound_type" { + description = "The outbound (egress) routing method which should be used for this Kubernetes Cluster." type = string - default = null + default = "userAssignedNATGateway" + validation { + condition = contains(["loadBalancer", "userDefinedRouting", "managedNATGateway", "userAssignedNATGateway"], var.outbound_type) + error_message = "The outbound_type must be one of: loadBalancer, userDefinedRouting, managedNATGateway, userAssignedNATGateway." + } } -variable "kubelet_user_assigned_identity_id" { - description = "Resource ID of the user-assigned identity to use for the kubelet. If not provided, the cluster will use the system-assigned identity." - type = string +variable "oidc_issuer_enabled" { + description = "Enable OIDC issuer for the AKS cluster" + type = bool + default = true +} + +variable "workload_identity_enabled" { + description = "Enable Workload Identity for the AKS cluster" + type = bool + default = true +} + +variable "image_cleaner_enabled" { + description = "Enable Image Cleaner for the AKS cluster" + type = bool + default = true +} + +variable "image_cleaner_interval_hours" { + description = "Interval in hours for Image Cleaner to run" + type = number + default = 48 +} + +variable "diagnostic_log_categories" { + type = set(string) + description = "Set of log categories to send to Log Analytics. Default excludes expensive 'kube-audit'" + default = [ + "kube-audit-admin", + "kube-controller-manager", + "cluster-autoscaler", + "guard", + "kube-scheduler" + ] +} + +variable "diagnostic_log_retention_days" { + type = number + description = "Number of days to retain diagnostic logs. If null, uses the Log Analytics Workspace default retention period." default = null } + +variable "container_insights_enabled" { + type = bool + description = "Enable Azure Monitor Container Insights for collecting container logs, Kubernetes events, and pod/node inventory. Disable for cost-sensitive dev/test environments or when using alternative monitoring solutions." + default = true +} diff --git a/terraform/network/azure-vnet/main.tf b/terraform/network/azure-vnet/main.tf index 8fee9e21a..664c7236a 100644 --- a/terraform/network/azure-vnet/main.tf +++ b/terraform/network/azure-vnet/main.tf @@ -125,7 +125,22 @@ resource "azurerm_nat_gateway_public_ip_association" "main" { public_ip_address_id = azurerm_public_ip.nat[count.index].id } -# Associate NAT Gateway with private subnet +resource "azurerm_route_table" "private" { + count = var.vnet_zones + name = "${var.name}-private-${count.index + 1}-${var.context_id}" + location = azurerm_resource_group.main.location + resource_group_name = azurerm_resource_group.main.name + tags = merge({ + Name = "${var.name}-private-${count.index + 1}-${var.context_id}" + }, local.tags) +} + +resource "azurerm_subnet_route_table_association" "private" { + count = var.vnet_zones + subnet_id = azurerm_subnet.private[count.index].id + route_table_id = azurerm_route_table.private[count.index].id +} + resource "azurerm_subnet_nat_gateway_association" "private" { count = var.enable_nat_gateway ? var.vnet_zones : 0 subnet_id = azurerm_subnet.private[count.index].id