From 5518c3b8c32675ef940cd667e8e3bcf3e8ebfddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:30:27 +0100 Subject: [PATCH 1/7] feat(k8s): add k8s-health-check reusable workflow Validates key cluster components after a Terraform apply to gate prod deploys on dev health. Checks Karpenter, Datadog (operator, cluster-agent, node-agent) and Lacework rollout status, plus Datadog operator reconciliation error logs. --- .github/workflows/k8s-health-check.yaml | 71 +++++++++++++++++++++++++ docs/workflows/k8s-health-check.md | 51 ++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 .github/workflows/k8s-health-check.yaml create mode 100644 docs/workflows/k8s-health-check.md diff --git a/.github/workflows/k8s-health-check.yaml b/.github/workflows/k8s-health-check.yaml new file mode 100644 index 0000000..81f1670 --- /dev/null +++ b/.github/workflows/k8s-health-check.yaml @@ -0,0 +1,71 @@ +# Kubernetes Health Check Workflow +# Validates that key cluster components are healthy after a Terraform apply. +# Checks rollout status for: Karpenter, Datadog (operator, cluster-agent, node-agent), Lacework. +name: Kubernetes Health Check +on: + workflow_call: + inputs: + environment: + description: "GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION)." + type: string + aws_account_id: + description: "The AWS account ID." + type: string + aws_region: + description: "The AWS region." + type: string + aws_role_name: + description: "The name of the role to assume with OIDC." + type: string + aws_oidc_role_arn: + description: "AWS OIDC IAM role to assume." 
+ type: string + +jobs: + health-check: + permissions: + id-token: write + contents: read + runs-on: ubuntu-latest + environment: ${{ inputs.environment }} + env: + AWS_REGION: ${{ inputs.aws_region || vars.aws_region || 'eu-central-1' }} + ROLE_TO_ASSUME: ${{ inputs.aws_oidc_role_arn || vars.aws_oidc_role_arn || format('arn:aws:iam::{0}:role/{1}', inputs.aws_account_id, inputs.aws_role_name) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 # v6 + with: + role-to-assume: ${{ env.ROLE_TO_ASSUME }} + aws-region: ${{ env.AWS_REGION }} + + - name: Configure kubectl + run: | + EKS_CLUSTER_NAME=$(aws eks list-clusters --region "$AWS_REGION" --query 'clusters[0]' --output text) + aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION" + + - name: Check Karpenter rollout + run: kubectl rollout status deployment/karpenter -n kube-system --timeout=5m + + - name: Check Datadog rollout + run: | + kubectl rollout status deployment/datadog-operator -n monitoring --timeout=5m + kubectl rollout status deployment/datadog-agent-cluster-agent -n monitoring --timeout=5m + kubectl rollout status daemonset/datadog-agent -n monitoring --timeout=5m + + - name: Check Datadog operator for reconciliation errors + run: | + ERRORS=$(kubectl logs -n monitoring deployment/datadog-operator --since=3m 2>/dev/null | grep '"level":"ERROR"' || true) + if [ -n "$ERRORS" ]; then + echo "Datadog operator reconciliation errors detected:" + echo "$ERRORS" + exit 1 + fi + + - name: Check Lacework rollout + run: | + if kubectl get namespace lacework &>/dev/null; then + kubectl rollout status daemonset/lacework-agent -n lacework --timeout=5m + kubectl rollout status deployment/lacework-agent-cluster -n lacework --timeout=5m + else + echo "Lacework not deployed, skipping" + fi diff --git a/docs/workflows/k8s-health-check.md b/docs/workflows/k8s-health-check.md new file mode 100644 index 0000000..42aef4d 
--- /dev/null +++ b/docs/workflows/k8s-health-check.md @@ -0,0 +1,51 @@ +--- +title: Kubernetes Health Check +--- + + +## Kubernetes Health Check + + +## Description + +Validates that key cluster components are healthy after a Terraform apply. +Intended to run after `tf-apply` for the platform stack and gate the prod apply on dev health. + +Checks performed: +- **Karpenter** — controller Deployment rollout (`kube-system`) +- **Datadog** — operator Deployment, cluster-agent Deployment, node-agent DaemonSet rollout (`monitoring`); operator log scan for reconciliation errors +- **Lacework** — node-agent DaemonSet and cluster Deployment rollout (`lacework`); skipped gracefully if not deployed + +## Usage + +```yaml +jobs: + apply-dev: + uses: DND-IT/github-workflows/.github/workflows/tf-apply.yaml@v3 + with: + environment: platform-dev + + health-check-dev: + needs: apply-dev + uses: DND-IT/github-workflows/.github/workflows/k8s-health-check.yaml@v3 + with: + environment: platform-dev + + apply-prod: + needs: health-check-dev + uses: DND-IT/github-workflows/.github/workflows/tf-apply.yaml@v3 + with: + environment: platform-prod +``` + + +### Inputs + +| name | description | type | required | default | +| --- | --- | --- | --- | --- | +| `environment` |
GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION).
| `string` | `false` | `""` | +| `aws_account_id` |The AWS account ID.
| `string` | `false` | `""` | +| `aws_region` |The AWS region.
| `string` | `false` | `""` | +| `aws_role_name` |The name of the role to assume with OIDC.
| `string` | `false` | `""` | +| `aws_oidc_role_arn` |AWS OIDC IAM role to assume.
| `string` | `false` | `""` | + From cd108b2c6f1d511c7d1c843c5723b30f499f4aeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:32:14 +0100 Subject: [PATCH 2/7] test(k8s): add integration test for k8s-health-check workflow --- .github/workflows/_test-k8s-health-check.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/_test-k8s-health-check.yaml diff --git a/.github/workflows/_test-k8s-health-check.yaml b/.github/workflows/_test-k8s-health-check.yaml new file mode 100644 index 0000000..1765ccb --- /dev/null +++ b/.github/workflows/_test-k8s-health-check.yaml @@ -0,0 +1,13 @@ +on: + pull_request: + branches: + - main + paths: + - '.github/workflows/_test-k8s-health-check.yaml' + - '.github/workflows/k8s-health-check.yaml' + +jobs: + test_k8s_health_check: + uses: ./.github/workflows/k8s-health-check.yaml + with: + environment: sandbox From ce630ad63872ad79489c3f5ad3a765b5c1ef5007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:37:36 +0100 Subject: [PATCH 3/7] docs: regenerate workflow documentation --- docs/workflows/k8s-health-check.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/workflows/k8s-health-check.md b/docs/workflows/k8s-health-check.md index 42aef4d..6c0e677 100644 --- a/docs/workflows/k8s-health-check.md +++ b/docs/workflows/k8s-health-check.md @@ -43,7 +43,7 @@ jobs: | name | description | type | required | default | | --- | --- | --- | --- | --- | -| `environment` |GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION).
| `string` | `false` | `""` | +| `environment` |GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION).
| `string` | `false` | `""` | | `aws_account_id` |The AWS account ID.
| `string` | `false` | `""` | | `aws_region` |The AWS region.
| `string` | `false` | `""` | | `aws_role_name` |The name of the role to assume with OIDC.
| `string` | `false` | `""` | From 82989495884a33f17bb26f61ded32d5a50cbec65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:39:21 +0100 Subject: [PATCH 4/7] fix(renovate): rename baseBranchPatterns to baseBranches --- renovate.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/renovate.json b/renovate.json index 4479f55..8b59dd4 100644 --- a/renovate.json +++ b/renovate.json @@ -1,6 +1,6 @@ { "$schema": "https://docs.renovatebot.com/renovate-schema.json", - "baseBranchPatterns": [ + "baseBranches": [ "main" ], "extends": [ From 38c61a5c7ef257b00d054ae80374b52551490b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:42:03 +0100 Subject: [PATCH 5/7] fix(k8s-health-check): handle missing EKS cluster and use platform-dev for test - Fail with a clear error if no cluster is found in the region instead of passing 'None' to aws eks update-kubeconfig - Switch test environment from sandbox (no cluster) to platform-dev --- .github/workflows/_test-k8s-health-check.yaml | 2 +- .github/workflows/k8s-health-check.yaml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_test-k8s-health-check.yaml b/.github/workflows/_test-k8s-health-check.yaml index 1765ccb..f85e7b0 100644 --- a/.github/workflows/_test-k8s-health-check.yaml +++ b/.github/workflows/_test-k8s-health-check.yaml @@ -10,4 +10,4 @@ jobs: test_k8s_health_check: uses: ./.github/workflows/k8s-health-check.yaml with: - environment: sandbox + environment: platform-dev diff --git a/.github/workflows/k8s-health-check.yaml b/.github/workflows/k8s-health-check.yaml index 81f1670..bf8d966 100644 --- a/.github/workflows/k8s-health-check.yaml +++ b/.github/workflows/k8s-health-check.yaml @@ -41,6 +41,10 @@ jobs: - name: Configure kubectl run: | EKS_CLUSTER_NAME=$(aws eks list-clusters --region 
"$AWS_REGION" --query 'clusters[0]' --output text) + if [ "$EKS_CLUSTER_NAME" = "None" ] || [ -z "$EKS_CLUSTER_NAME" ]; then + echo "No EKS cluster found in region $AWS_REGION for account $(aws sts get-caller-identity --query Account --output text)" + exit 1 + fi aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION" - name: Check Karpenter rollout From 82de587ec04831c07c281f81000ba00940991c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:45:22 +0100 Subject: [PATCH 6/7] test(k8s): remove integration test for k8s-health-check workflow The workflow requires access to a specific EKS cluster in the platform-dev AWS account. This environment is not available in this repo and would require additional OIDC trust configuration. The workflow is effectively tested via the disco-infra-terraform pipeline which already has the correct access. --- .github/workflows/_test-k8s-health-check.yaml | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 .github/workflows/_test-k8s-health-check.yaml diff --git a/.github/workflows/_test-k8s-health-check.yaml b/.github/workflows/_test-k8s-health-check.yaml deleted file mode 100644 index f85e7b0..0000000 --- a/.github/workflows/_test-k8s-health-check.yaml +++ /dev/null @@ -1,13 +0,0 @@ -on: - pull_request: - branches: - - main - paths: - - '.github/workflows/_test-k8s-health-check.yaml' - - '.github/workflows/k8s-health-check.yaml' - -jobs: - test_k8s_health_check: - uses: ./.github/workflows/k8s-health-check.yaml - with: - environment: platform-dev From 21dec5fae2822a8032de75fb29edd7815a88ce2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20J=C3=BCrges?= <36044481+chrisamti@users.noreply.github.com> Date: Thu, 19 Mar 2026 14:33:13 +0100 Subject: [PATCH 7/7] feat(k8s-health-check): check DatadogAgent CRD status conditions Adds a direct check on the DatadogAgent custom resource status conditions, catching 
Error/Degraded states (e.g. immutable field errors) at the source rather than
relying solely on operator log parsing.
---
 .github/workflows/k8s-health-check.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/.github/workflows/k8s-health-check.yaml b/.github/workflows/k8s-health-check.yaml
index bf8d966..d65f987 100644
--- a/.github/workflows/k8s-health-check.yaml
+++ b/.github/workflows/k8s-health-check.yaml
@@ -56,6 +56,21 @@ jobs:
           kubectl rollout status deployment/datadog-agent-cluster-agent -n monitoring --timeout=5m
           kubectl rollout status daemonset/datadog-agent -n monitoring --timeout=5m
 
+      - name: Check DatadogAgent CRD status
+        run: |
+          # Read the resource on its own: the step shell has no pipefail, so a failed
+          # kubectl piped into jq would look like "no error conditions" and pass.
+          if ! DDA_JSON=$(kubectl get datadogagent datadog-agent -n monitoring -o json); then
+            echo "Unable to read DatadogAgent datadog-agent in namespace monitoring"
+            exit 1
+          fi
+          # Keep only conditions that are currently True and whose type names an Error/Degraded state.
+          CONDITIONS=$(jq -r '.status.conditions[]? | select(.status == "True" and (.type | test("Error|Degraded"))) | "\(.type): \(.message)"' <<<"$DDA_JSON")
+          if [ -n "$CONDITIONS" ]; then
+            echo "DatadogAgent has error conditions:"
+            echo "$CONDITIONS"
+            exit 1
+          fi
+
       - name: Check Datadog operator for reconciliation errors
         run: |
           ERRORS=$(kubectl logs -n monitoring deployment/datadog-operator --since=3m 2>/dev/null | grep '"level":"ERROR"' || true)