diff --git a/.github/workflows/k8s-health-check.yaml b/.github/workflows/k8s-health-check.yaml
new file mode 100644
index 0000000..d65f987
--- /dev/null
+++ b/.github/workflows/k8s-health-check.yaml
@@ -0,0 +1,104 @@
+# Kubernetes Health Check Workflow
+# Validates that key cluster components are healthy after a Terraform apply.
+# Checks rollout status for: Karpenter, Datadog (operator, cluster-agent, node-agent), Lacework.
+# Also checks DatadogAgent status conditions and recent Datadog operator logs for errors.
+name: Kubernetes Health Check
+on:
+  workflow_call:
+    inputs:
+      environment:
+        description: "GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION)."
+        type: string
+      aws_account_id:
+        description: "The AWS account ID."
+        type: string
+      aws_region:
+        description: "The AWS region."
+        type: string
+      aws_role_name:
+        description: "The name of the role to assume with OIDC."
+        type: string
+      aws_oidc_role_arn:
+        description: "AWS OIDC IAM role to assume."
+        type: string
+
+jobs:
+  health-check:
+    permissions:
+      id-token: write
+      contents: read
+    runs-on: ubuntu-latest
+    environment: ${{ inputs.environment }}
+    env:
+      AWS_REGION: ${{ inputs.aws_region || vars.aws_region || 'eu-central-1' }}
+      ROLE_TO_ASSUME: ${{ inputs.aws_oidc_role_arn || vars.aws_oidc_role_arn || format('arn:aws:iam::{0}:role/{1}', inputs.aws_account_id, inputs.aws_role_name) }}
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 # v6
+        with:
+          role-to-assume: ${{ env.ROLE_TO_ASSUME }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Configure kubectl
+        run: |
+          EKS_CLUSTER_NAME=$(aws eks list-clusters --region "$AWS_REGION" --query 'clusters[0]' --output text)
+          if [ "$EKS_CLUSTER_NAME" = "None" ] || [ -z "$EKS_CLUSTER_NAME" ]; then
+            echo "No EKS cluster found in region $AWS_REGION for account $(aws sts get-caller-identity --query Account --output text)"
+            exit 1
+          fi
+          # Only the first listed cluster is checked; the count lookup is advisory and never fails the step.
+          CLUSTER_COUNT=$(aws eks list-clusters --region "$AWS_REGION" --query 'length(clusters)' --output text 2>/dev/null || echo 0)
+          if [ "$CLUSTER_COUNT" -gt 1 ]; then
+            echo "::warning::Found $CLUSTER_COUNT EKS clusters in $AWS_REGION; health checks target the first one listed ($EKS_CLUSTER_NAME)."
+          fi
+          aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION"
+
+      - name: Check Karpenter rollout
+        run: kubectl rollout status deployment/karpenter -n kube-system --timeout=5m
+
+      - name: Check Datadog rollout
+        run: |
+          kubectl rollout status deployment/datadog-operator -n monitoring --timeout=5m
+          kubectl rollout status deployment/datadog-agent-cluster-agent -n monitoring --timeout=5m
+          kubectl rollout status daemonset/datadog-agent -n monitoring --timeout=5m
+
+      - name: Check DatadogAgent CRD status
+        run: |
+          # Fetch the resource on its own so a failed lookup is surfaced as a warning
+          # instead of being read as "no error conditions" by the jq filter below.
+          if ! AGENT_JSON=$(kubectl get datadogagent datadog-agent -n monitoring -o json); then
+            echo "::warning::Could not read DatadogAgent monitoring/datadog-agent; condition check skipped."
+            exit 0
+          fi
+          CONDITIONS=$(printf '%s' "$AGENT_JSON" \
+            | jq -r '.status.conditions[]? | select(.status == "True" and (.type | test("Error|Degraded"))) | "\(.type): \(.message)"')
+          if [ -n "$CONDITIONS" ]; then
+            echo "DatadogAgent has error conditions:"
+            echo "$CONDITIONS"
+            exit 1
+          fi
+
+      - name: Check Datadog operator for reconciliation errors
+        run: |
+          # Read the logs before filtering so a failed log fetch is surfaced as a
+          # warning instead of looking like an error-free log.
+          if ! OPERATOR_LOGS=$(kubectl logs -n monitoring deployment/datadog-operator --since=3m); then
+            echo "::warning::Could not read datadog-operator logs; reconciliation error scan skipped."
+            exit 0
+          fi
+          ERRORS=$(printf '%s\n' "$OPERATOR_LOGS" | grep '"level":"ERROR"' || true)
+          if [ -n "$ERRORS" ]; then
+            echo "Datadog operator reconciliation errors detected:"
+            echo "$ERRORS"
+            exit 1
+          fi
+
+      - name: Check Lacework rollout
+        run: |
+          # Lacework is optional: rollout checks only run when the lacework namespace lookup succeeds.
+          if kubectl get namespace lacework &>/dev/null; then
+            kubectl rollout status daemonset/lacework-agent -n lacework --timeout=5m
+            kubectl rollout status deployment/lacework-agent-cluster -n lacework --timeout=5m
+          else
+            echo "Lacework not deployed, skipping"
+          fi
diff --git a/docs/workflows/k8s-health-check.md b/docs/workflows/k8s-health-check.md
new file mode 100644
index 0000000..6c0e677
--- /dev/null
+++ b/docs/workflows/k8s-health-check.md
@@ -0,0 +1,51 @@
+---
+title: Kubernetes Health Check
+---
+
+
+## Kubernetes Health Check
+
+
+## Description
+
+Validates that key cluster components are healthy after a Terraform apply.
+Intended to run after `tf-apply` for the platform stack and gate the prod apply on dev health.
+ +Checks performed: +- **Karpenter** — controller Deployment rollout (`kube-system`) +- **Datadog** — operator Deployment, cluster-agent Deployment, node-agent DaemonSet rollout (`monitoring`); `DatadogAgent` resource scan for error/degraded conditions; operator log scan for reconciliation errors +- **Lacework** — node-agent DaemonSet and cluster Deployment rollout (`lacework`); skipped gracefully if not deployed + +## Usage + +```yaml +jobs: + apply-dev: + uses: DND-IT/github-workflows/.github/workflows/tf-apply.yaml@v3 + with: + environment: platform-dev + + health-check-dev: + needs: apply-dev + uses: DND-IT/github-workflows/.github/workflows/k8s-health-check.yaml@v3 + with: + environment: platform-dev + + apply-prod: + needs: health-check-dev + uses: DND-IT/github-workflows/.github/workflows/tf-apply.yaml@v3 + with: + environment: platform-prod +``` + + +### Inputs + +| name | description | type | required | default | +| --- | --- | --- | --- | --- | +| `environment` |

GitHub environment to use for resolving variables (AWS_OIDC_ROLE_ARN, AWS_REGION).

| `string` | `false` | `""` | +| `aws_account_id` |

The AWS account ID.

| `string` | `false` | `""` | +| `aws_region` |

The AWS region.

| `string` | `false` | `""` | +| `aws_role_name` |

The name of the role to assume with OIDC.

| `string` | `false` | `""` | +| `aws_oidc_role_arn` |

AWS OIDC IAM role to assume.

| `string` | `false` | `""` | + diff --git a/renovate.json b/renovate.json index 4479f55..8b59dd4 100644 --- a/renovate.json +++ b/renovate.json @@ -1,6 +1,6 @@ { "$schema": "https://docs.renovatebot.com/renovate-schema.json", - "baseBranchPatterns": [ + "baseBranches": [ "main" ], "extends": [