From ddf272808e73a4b2d5f9c6a4c2020d914fdefa41 Mon Sep 17 00:00:00 2001 From: Yan Sun Date: Sat, 11 Apr 2026 11:46:13 -0700 Subject: [PATCH] Fix helm e2e test for remediation workflow ConfigMapImage field (#1329) The test "upgrade with rendering spec.remediationWorkflow" was failing because it didn't expect the ConfigMapImage field in the DeviceConfigSpec. The helm chart sets a default value for configMapImage in values.yaml, which gets rendered into the DeviceConfig CR. Updated the expected spec to include the ConfigMapImage field with the default value from the helm chart. Also added E2E_ANR_CONFIGMAP_IMAGE to the e2e test Makefile for consistency with other configurable image variables. Co-authored-by: Claude Opus 4.5 (cherry picked from commit d24f89f246581892a281bc689d9107a0917438f2) --- hack/k8s-patch/metadata-patch/values.yaml | 2 ++ helm-charts-k8s/README.md | 3 +-- helm-charts-k8s/values.yaml | 2 ++ tests/e2e/Makefile | 2 ++ tests/helm-e2e/helm_e2e_test.go | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 195901c8..ce550d24 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -300,6 +300,8 @@ deviceConfig: nodeDrainPolicy: {} # -- Enable/disable automatic workflow start on node issues autoStartWorkflow: true + # -- Container image used to create the remediation ConfigMap. This image contains the default remediation ConfigMap configmap.yaml file. + configMapImage: "" # AMD GPU operator controller related configs controllerManager: manager: diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index 09885c1b..8553d92f 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -123,8 +123,6 @@ For bugs and feature requests, please file an issue on our [GitHub Issues](https The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE). -## gpu-operator-charts - ![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: dev](https://img.shields.io/badge/AppVersion-dev-informational?style=flat-square) AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters. @@ -253,6 +251,7 @@ Kubernetes: `>= 1.29.0-0` | deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | | deviceConfig.spec.remediationWorkflow.autoStartWorkflow | bool | `true` | Enable/disable automatic workflow start on node issues | | deviceConfig.spec.remediationWorkflow.config | object | `{}` | Configuration for remediation workflow | +| deviceConfig.spec.remediationWorkflow.configMapImage | string | `""` | Container image used to create the remediation ConfigMap. This image contains the default remediation ConfigMap configmap.yaml file. | | deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable remediation workflow controller | | deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit | | deviceConfig.spec.remediationWorkflow.nodeDrainPolicy | object | `{}` | Policy for draining nodes during remediation | diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 195901c8..ce550d24 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -300,6 +300,8 @@ deviceConfig: nodeDrainPolicy: {} # -- Enable/disable automatic workflow start on node issues autoStartWorkflow: true + # -- Container image used to create the remediation ConfigMap. This image contains the default remediation ConfigMap configmap.yaml file. + configMapImage: "" # AMD GPU operator controller related configs controllerManager: manager: diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 3dea053a..0f5ff001 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -18,6 +18,7 @@ E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.4.0 E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest E2E_KUBEVIRT_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest E2E_UTILS_CONTAINER_IMAGE ?= docker.io/rocm/gpu-operator-utils:v1.4.0 +E2E_ANR_CONFIGMAP_IMAGE ?= docker.io/rocm/amd-gpu-operator-remediation-config-utils:latest E2E_NODE_DIAG_IMAGE ?= busybox:1.36 E2E_DRA_DRIVER_IMAGE ?= rocm/k8s-gpu-dra-driver:latest @@ -38,6 +39,7 @@ export E2E_AGFHC_TEST_RUNNER_IMAGE export E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE export E2E_KUBEVIRT_NODE_LABELLER_IMAGE export E2E_UTILS_CONTAINER_IMAGE +export E2E_ANR_CONFIGMAP_IMAGE export E2E_NODE_DIAG_IMAGE export E2E_DRA_DRIVER_IMAGE diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index 8aed527a..162159d3 100644 --- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -1002,6 +1002,7 @@ deviceConfig: TtlForFailedWorkflows: "36h", TesterImage: "test.io/test/remediation-workflow-tester:v1.3.0", AutoStartWorkflow: &boolTrue, + ConfigMapImage: "", }, }, verifyFunc: s.verifyRemediationWorkflow,