From fad466292c493d2fba9fd99b15928632646bdb54 Mon Sep 17 00:00:00 2001
From: Yan Sun
Date: Mon, 13 Apr 2026 16:33:14 -0700
Subject: [PATCH] [Fix] Add workflow and workflow-triggered pod collection to techsupport (#1337)

* Add workflow and workflow-triggered pod collection to techsupport

Enhance the techsupport_dump.sh script to collect workflow CRs and
workflow-triggered pods when auto node remediation feature is enabled.
This helps with debugging workflow-based node remediation issues.

Changes:
- Add WORKFLOW_RESOURCES variable for workflow CRs
- Collect workflow CRs (get, describe, yaml/json output)
- Collect workflow-triggered pods identified by workflows.argoproj.io/workflow label
- Add per-node log collection for workflow-triggered pods
- Include error resilience with || true for ephemeral workflow pods

Co-Authored-By: Claude Opus 4.5

* Make pod_logs function resilient to ephemeral pod failures

Add error handling (|| true) to kubectl logs commands in pod_logs
function to prevent script termination when collecting logs from
ephemeral/terminated workflow pods. With set -e enabled, failed log
collection would previously abort the entire techsupport run before
reaching error handlers.

Changes:
- Add '2>&1 || true' to current container logs command
- Add '2>&1 || true' to previous container logs command
- Ensures individual pod log failures don't terminate script execution
- Critical for short-lived workflow pods that may be deleted during collection

Co-Authored-By: Claude Opus 4.5

* Add workflow controller pod collection to techsupport

Collect information and logs from the workflow controller pod
(identified by label app=amd-gpu-operator-workflow-controller) in
addition to workflow CRs and workflow-triggered pods.

Changes:
- Add workflow controller pod collection in cluster-wide section
- kubectl get/describe output in both text and JSON/YAML format
- Add workflow controller pod log collection per node
- Maintains error resilience with || true for optional feature

Co-Authored-By: Claude Opus 4.5

---------

Co-authored-by: Claude Opus 4.5
(cherry picked from commit 70b0104643e7b5d2239bd7ec4851ed9706618c6c)
---
Notes:
    Deviations from the cherry-picked commit:
    - Pod manifests are dumped with one selector-based `kubectl get` per
      file, so workflow-controller-pods.${OUTPUT_FORMAT} and
      workflow-pods.${OUTPUT_FORMAT} each hold a single valid List document
      instead of a blank line followed by concatenated per-pod objects.
    - pod_logs fetches previous-container logs with --all-containers, which
      matches both the probe that guards it and the current-log collection.

 tools/techsupport_dump.sh | 66 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/tools/techsupport_dump.sh b/tools/techsupport_dump.sh
index f2240d6a..a07261a7 100755
--- a/tools/techsupport_dump.sh
+++ b/tools/techsupport_dump.sh
@@ -31,6 +31,7 @@ NFD_RESOURCES="pods daemonsets deployments configmap"
 KMM_RESOURCES="pods daemonsets deployments modules configmap"
 GPUOPER_RESOURCES="pods daemonsets deployments deviceconfig configmap"
 NPD_RESOURCES="pods daemonsets configmap"
+WORKFLOW_RESOURCES="workflows"
 
 OUTPUT_FORMAT="json"
 WIDE=""
@@ -66,8 +67,8 @@ pod_logs() {
 	for lpod in ${PODS}; do
 		pod=$(basename ${lpod})
 		log " ${NS}/${pod}"
-		${KNS} logs "${pod}" --all-containers >${TECH_SUPPORT_FILE}/${NODE}/${FEATURE}/${NS}_${pod}.txt
-		${KNS} logs -p "${pod}" --all-containers --tail 1 >/dev/null 2>&1 && ${KNS} logs -p "${pod}" >${TECH_SUPPORT_FILE}/${NODE}/${FEATURE}/${NS}_${pod}_previous.txt
+		${KNS} logs "${pod}" --all-containers >${TECH_SUPPORT_FILE}/${NODE}/${FEATURE}/${NS}_${pod}.txt 2>&1 || true
+		${KNS} logs -p "${pod}" --all-containers --tail 1 >/dev/null 2>&1 && ${KNS} logs -p "${pod}" --all-containers >${TECH_SUPPORT_FILE}/${NODE}/${FEATURE}/${NS}_${pod}_previous.txt 2>&1 || true
 	done
 	echo "${PODS}" >${TECH_SUPPORT_FILE}/${node}/${FEATURE}/pods.txt
 }
@@ -250,6 +251,51 @@ if [ -n "${NPD_NS}" ]; then
 	done
 fi
 
+# workflow resources (auto node remediation feature)
+log "workflows:"
+for resource in ${WORKFLOW_RESOURCES}; do
+	log " ${GPUOPER_NS}/${resource}"
+	mkdir -p ${TECH_SUPPORT_FILE}/workflow/
+	${KUBECTL} get -n ${GPUOPER_NS} ${resource} ${WIDE} >${TECH_SUPPORT_FILE}/workflow/${resource}.txt 2>&1 || true
+	${KUBECTL} describe -n ${GPUOPER_NS} ${resource} >>${TECH_SUPPORT_FILE}/workflow/${resource}.txt 2>&1 || true
+	${KUBECTL} get -n ${GPUOPER_NS} ${resource} -o ${OUTPUT_FORMAT} >${TECH_SUPPORT_FILE}/workflow/${resource}.${OUTPUT_FORMAT} 2>&1 || true
+done
+
+# workflow controller pod
+log "workflow controller:"
+mkdir -p ${TECH_SUPPORT_FILE}/workflow/
+WORKFLOW_CONTROLLER_PODS=$(${KUBECTL} get pods -n ${GPUOPER_NS} -l app=amd-gpu-operator-workflow-controller --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null || true)
+if [ -n "$WORKFLOW_CONTROLLER_PODS" ]; then
+	echo "" >${TECH_SUPPORT_FILE}/workflow/workflow-controller-pods.txt
+	while IFS= read -r pod_name; do
+		[ -z "$pod_name" ] && continue
+		log " ${GPUOPER_NS}/${pod_name}"
+		${KUBECTL} describe -n ${GPUOPER_NS} pod "$pod_name" >>${TECH_SUPPORT_FILE}/workflow/workflow-controller-pods.txt 2>&1 || true
+	done <<< "$WORKFLOW_CONTROLLER_PODS"
+	# one selector-based call so the structured dump is a single valid List
+	# document; appending per-pod objects would not parse as ${OUTPUT_FORMAT}
+	${KUBECTL} get -n ${GPUOPER_NS} pods -l app=amd-gpu-operator-workflow-controller -o ${OUTPUT_FORMAT} >${TECH_SUPPORT_FILE}/workflow/workflow-controller-pods.${OUTPUT_FORMAT} 2>&1 || true
+fi
+
+# workflow-triggered test runner pods (pods with workflow labels)
+log "workflow-triggered pods:"
+mkdir -p ${TECH_SUPPORT_FILE}/workflow/
+WORKFLOW_PODS=$(${KUBECTL} get pods -n ${GPUOPER_NS} -l workflows.argoproj.io/workflow --no-headers -o custom-columns=NAME:.metadata.name 2>/dev/null || true)
+if [ -n "$WORKFLOW_PODS" ]; then
+	echo "" >${TECH_SUPPORT_FILE}/workflow/workflow-pods.txt
+	POD_COUNT=$(echo "$WORKFLOW_PODS" | wc -l)
+	POD_NUM=0
+	while IFS= read -r pod_name; do
+		[ -z "$pod_name" ] && continue
+		POD_NUM=$((POD_NUM + 1))
+		log " ($POD_NUM/$POD_COUNT) ${GPUOPER_NS}/${pod_name}"
+		${KUBECTL} describe -n ${GPUOPER_NS} pod "$pod_name" >>${TECH_SUPPORT_FILE}/workflow/workflow-pods.txt 2>&1 || true
+	done <<< "$WORKFLOW_PODS"
+	# one selector-based call so the structured dump is a single valid List
+	# document; appending per-pod objects would not parse as ${OUTPUT_FORMAT}
+	${KUBECTL} get -n ${GPUOPER_NS} pods -l workflows.argoproj.io/workflow -o ${OUTPUT_FORMAT} >${TECH_SUPPORT_FILE}/workflow/workflow-pods.${OUTPUT_FORMAT} 2>&1 || true
+fi
+
 CONTROL_PLANE=$(${KUBECTL} get nodes -l node-role.kubernetes.io/control-plane | grep -w Ready | awk '{print $1}')
 # logs
 if [ "${NODES}" == "all" ]; then
@@ -424,6 +470,22 @@ for node in "${nodeList[@]}"; do
 		fi
 	fi
 
+	# workflow controller pod logs
+	WORKFLOW_CONTROLLER_PODS=$(${KNS} get pods -o name --field-selector spec.nodeName=${node} -l app=amd-gpu-operator-workflow-controller 2>/dev/null || true)
+	if [ -n "$WORKFLOW_CONTROLLER_PODS" ]; then
+		if ! pod_logs $GPUOPER_NS "workflow-controller" $node "$WORKFLOW_CONTROLLER_PODS"; then
+			log "Failed to collect logs for workflow controller on node ${node}"
+		fi
+	fi
+
+	# workflow-triggered pod logs (pods with workflow labels on this node)
+	WORKFLOW_NODE_PODS=$(${KNS} get pods -o name --field-selector spec.nodeName=${node} -l workflows.argoproj.io/workflow 2>/dev/null || true)
+	if [ -n "$WORKFLOW_NODE_PODS" ]; then
+		if ! pod_logs $GPUOPER_NS "workflow-pods" $node "$WORKFLOW_NODE_PODS"; then
+			log "Failed to collect logs for workflow-triggered pods on node ${node}"
+		fi
+	fi
+
 	# operator pod logs
 	GPUOPER_PODS=$(${KNS} get pods -o name --field-selector spec.nodeName=${node} -l "app.kubernetes.io/name=gpu-operator-charts" 2>/dev/null || true)
 	if [ -z "$GPUOPER_PODS" ]; then