diff --git a/.docker/forge-github-app-register/Dockerfile b/.docker/forge-github-app-register/Dockerfile index a17a9b8a..5ff54542 100644 --- a/.docker/forge-github-app-register/Dockerfile +++ b/.docker/forge-github-app-register/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.14-slim@sha256:0aecac02dc3d4c5dbb024b753af084cafe41f5416e02193f1ce345d671ec966e +FROM python:3.14-slim@sha256:6a27522252aef8432841f224d9baaa6e9fce07b07584154fa0b9a96603af7456 RUN useradd --create-home appuser WORKDIR /home/appuser diff --git a/.docker/pre-commit/Dockerfile b/.docker/pre-commit/Dockerfile index 88487ddf..0d0f77c5 100644 --- a/.docker/pre-commit/Dockerfile +++ b/.docker/pre-commit/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:24.04@sha256:c35e29c9450151419d9448b0fd75374fec4fff364a27f176fb458d472dfc9e54 AS build +FROM ubuntu:24.04@sha256:d1e2e92c075e5ca139d51a140fff46f84315c0fdce203eab2807c7e495eff4f9 AS build WORKDIR /opt/build @@ -12,119 +12,99 @@ RUN apt-get update && apt-get install -y \ SHELL ["/bin/bash", "-o", "pipefail", "-c"] # renovate: datasource=github-releases depName=gitleaks/gitleaks registryUrl=https://github.com/ -ARG GITLEAKS_VERSION="8.24.3" +ARG GITLEAKS_VERSION="8.30.0" ARG GITLEAKS_SRC="https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}/gitleaks_${GITLEAKS_VERSION}_linux_x64.tar.gz" ARG GITLEAKS_ARTIFACT="gitleaks.tar.gz" -ARG GITLEAKS_CHECKSUM="9991e0b2903da4c8f6122b5c3186448b927a5da4deef1fe45271c3793f4ee29c" RUN set -eux; \ wget --progress=dot:giga -O ${GITLEAKS_ARTIFACT} ${GITLEAKS_SRC}; \ - echo "${GITLEAKS_CHECKSUM} ${GITLEAKS_ARTIFACT}" | sha256sum -c -; \ tar -zxvf ${GITLEAKS_ARTIFACT} -C /usr/local/bin/; \ chmod 755 /usr/local/bin/gitleaks # renovate: datasource=github-releases depName=hadolint/hadolint registryUrl=https://github.com/ -ARG HADOLINT_VERSION="v2.12.0" -ARG HADOLINT_SRC="https://github.com/hadolint/hadolint/releases/download/v2.12.0/hadolint-Linux-x86_64" +ARG HADOLINT_VERSION="2.14.0" +ARG 
HADOLINT_SRC="https://github.com/hadolint/hadolint/releases/download/v${HADOLINT_VERSION}/hadolint-Linux-x86_64" ARG HADOLINT_ARTIFACT="hadolint" -ARG HADOLINT_CHECKSUM="56de6d5e5ec427e17b74fa48d51271c7fc0d61244bf5c90e828aab8362d55010" RUN set -eux; \ wget --progress=dot:giga -O ${HADOLINT_ARTIFACT} ${HADOLINT_SRC}; \ - echo "${HADOLINT_CHECKSUM} ${HADOLINT_ARTIFACT}" | sha256sum -c -; \ mv ${HADOLINT_ARTIFACT} /usr/local/bin; \ chmod 755 /usr/local/bin/hadolint # Download jq. -# renovate: datasource=github-releases depName=stedolan/jq registryUrl=https://github.com/ -ARG JQ_VERSION="1.7.1" -ARG JQ_SRC="https://github.com/stedolan/jq/releases/download/jq-${JQ_VERSION}/jq-linux64" +# renovate: datasource=github-releases depName=jqlang/jq registryUrl=https://github.com/ extractVersion=^jq-(?<version>.*)$ +ARG JQ_VERSION="1.8.1" +ARG JQ_SRC="https://github.com/jqlang/jq/releases/download/jq-${JQ_VERSION}/jq-linux64" ARG JQ_ARTIFACT="jq" -ARG JQ_CHECKSUM="5942c9b0934e510ee61eb3e30273f1b3fe2590df93933a93d7c58b81d19c8ff5" RUN set -eux; \ wget --progress=dot:giga -O ${JQ_ARTIFACT} ${JQ_SRC}; \ - echo "${JQ_CHECKSUM} ${JQ_ARTIFACT}" | sha256sum -c -; \ mv ${JQ_ARTIFACT} /usr/local/bin/; \ chmod 755 /usr/local/bin/jq # renovate: datasource=github-releases depName=hashicorp/packer registryUrl=https://github.com/ -ARG PACKER_VERSION="1.12.0" +ARG PACKER_VERSION="1.15.0" ARG PACKER_SRC="https://releases.hashicorp.com/packer/${PACKER_VERSION}/packer_${PACKER_VERSION}_linux_amd64.zip" ARG PACKER_ARTIFACT="packer.zip" -ARG PACKER_CHECKSUM="e859a76659570d1e29fa55396d5d908091bacacd4567c17770e616c4b58c9ace" RUN set -eux; \ wget --progress=dot:giga -O ${PACKER_ARTIFACT} ${PACKER_SRC}; \ - echo "${PACKER_CHECKSUM} ${PACKER_ARTIFACT}" | sha256sum -c -; \ unzip -o ${PACKER_ARTIFACT} -d /usr/local/bin/; \ chmod 755 /usr/local/bin/packer; \ rm ${PACKER_ARTIFACT} # renovate: datasource=github-releases depName=koalaman/shellcheck registryUrl=https://github.com/ -ARG SHELLCHECK_VERSION="0.10.0" +ARG
SHELLCHECK_VERSION="0.11.0" ARG SHELLCHECK_SRC="https://github.com/koalaman/shellcheck/releases/download/v${SHELLCHECK_VERSION}/shellcheck-v${SHELLCHECK_VERSION}.linux.x86_64.tar.xz" ARG SHELLCHECK_ARTIFACT="shellcheck.tar.xz" -ARG SHELLCHECK_CHECKSUM="6c881ab0698e4e6ea235245f22832860544f17ba386442fe7e9d629f8cbedf87" RUN set -eux; \ wget --progress=dot:giga -O ${SHELLCHECK_ARTIFACT} ${SHELLCHECK_SRC}; \ - echo "${SHELLCHECK_CHECKSUM} ${SHELLCHECK_ARTIFACT}" | sha256sum -c -; \ tar -xJvf ${SHELLCHECK_ARTIFACT}; \ mv shellcheck-v${SHELLCHECK_VERSION}/shellcheck /usr/local/bin/shellcheck; \ chmod 755 /usr/local/bin/shellcheck # renovate: datasource=github-releases depName=mvdan/sh registryUrl=https://github.com/ -ARG SHFMT_VERSION="3.11.0" +ARG SHFMT_VERSION="3.13.0" ARG SHFMT_SRC="https://github.com/mvdan/sh/releases/download/v${SHFMT_VERSION}/shfmt_v${SHFMT_VERSION}_linux_amd64" ARG SHFMT_ARTIFACT="shfmt" -ARG SHFMT_CHECKSUM="1904ec6bac715c1d05cd7f6612eec8f67a625c3749cb327e5bfb4127d09035ff" RUN set -eux; \ wget --progress=dot:giga -O ${SHFMT_ARTIFACT} ${SHFMT_SRC}; \ - echo "${SHFMT_CHECKSUM} ${SHFMT_ARTIFACT}" | sha256sum -c -; \ mv ${SHFMT_ARTIFACT} /usr/local/bin; \ chmod 755 /usr/local/bin/shfmt # renovate: datasource=github-releases depName=terraform-docs/terraform-docs registryUrl=https://github.com/ -ARG TERRAFORM_DOCS_VERSION="0.20.0" +ARG TERRAFORM_DOCS_VERSION="0.21.0" ARG TERRAFORM_DOCS_SRC="https://github.com/terraform-docs/terraform-docs/releases/download/v${TERRAFORM_DOCS_VERSION}/terraform-docs-v${TERRAFORM_DOCS_VERSION}-linux-amd64.tar.gz" ARG TERRAFORM_DOCS_ARTIFACT="terraform-docs.tar.gz" -ARG TERRAFORM_DOCS_CHECKSUM="34ae01772412bb11474e6718ea62113e38ff5964ee570a98c69fafe3a6dff286" RUN set -eux; \ wget --progress=dot:giga -O ${TERRAFORM_DOCS_ARTIFACT} ${TERRAFORM_DOCS_SRC}; \ - echo "${TERRAFORM_DOCS_CHECKSUM} ${TERRAFORM_DOCS_ARTIFACT}" | sha256sum -c -; \ tar -zxvf ${TERRAFORM_DOCS_ARTIFACT} -C /usr/local/bin/;\ chmod 755 
/usr/local/bin/terraform-docs # renovate: datasource=github-releases depName=gruntwork-io/terragrunt registryUrl=https://github.com/ -ARG TERRAGRUNT_VERSION="0.77.22" +ARG TERRAGRUNT_VERSION="0.99.4" ARG TERRAGRUNT_SRC="https://github.com/gruntwork-io/terragrunt/releases/download/v${TERRAGRUNT_VERSION}/terragrunt_linux_amd64" ARG TERRAGRUNT_ARTIFACT="terragrunt" -ARG TERRAGRUNT_CHECKSUM="42036586250f5db53dd2460427c5df43420fa22b935998f1530181474f525386" RUN set -eux; \ wget --progress=dot:giga -O ${TERRAGRUNT_ARTIFACT} ${TERRAGRUNT_SRC}; \ - echo "${TERRAGRUNT_CHECKSUM} ${TERRAGRUNT_ARTIFACT}" | sha256sum -c -; \ mv ${TERRAGRUNT_ARTIFACT} /usr/local/bin; \ chmod 755 /usr/local/bin/terragrunt # renovate: datasource=github-releases depName=terraform-linters/tflint registryUrl=https://github.com/ -ARG TFLINT_VERSION="0.56.0" +ARG TFLINT_VERSION="0.61.0" ARG TFLINT_SRC="https://github.com/terraform-linters/tflint/releases/download/v${TFLINT_VERSION}/tflint_linux_amd64.zip" ARG TFLINT_ARTIFACT="tflint.zip" -ARG TFLINT_CHECKSUM="e0d74c557815ee51c6ecfe826ed62fd411ee6c10e1eab5532a0b0cc684c5db8a" RUN set -eux; \ wget --progress=dot:giga -O ${TFLINT_ARTIFACT} ${TFLINT_SRC}; \ - echo "${TFLINT_CHECKSUM} ${TFLINT_ARTIFACT}" | sha256sum -c -; \ unzip -o ${TFLINT_ARTIFACT} -d /usr/local/bin/; \ - chmod 755 /usr/local/bin/tflint + chmod 755 /usr/local/bin/tflint; # renovate: datasource=github-releases depName=opentofu/opentofu registryUrl=https://github.com/ -ARG TOFU_VERSION="1.9.1" +ARG TOFU_VERSION="1.11.5" ARG TOFU_SRC="https://github.com/opentofu/opentofu/releases/download/v${TOFU_VERSION}/tofu_${TOFU_VERSION}_linux_amd64.zip" ARG TOFU_ARTIFACT="tofu.zip" -ARG TOFU_CHECKSUM="19eda43eaa45bef3e21d87c58f31a6df73e8534ea30e78619a463bdfdb889cd2" RUN set -eux; \ wget --progress=dot:giga -O ${TOFU_ARTIFACT} ${TOFU_SRC}; \ - echo "${TOFU_CHECKSUM} ${TOFU_ARTIFACT}" | sha256sum -c -; \ unzip -o ${TOFU_ARTIFACT} -d /usr/local/bin/; \ chmod 755 /usr/local/bin/tofu -FROM 
ubuntu:24.04@sha256:c35e29c9450151419d9448b0fd75374fec4fff364a27f176fb458d472dfc9e54 AS final +FROM ubuntu:24.04@sha256:d1e2e92c075e5ca139d51a140fff46f84315c0fdce203eab2807c7e495eff4f9 AS final ENV DEBIAN_FRONTEND=noninteractive diff --git a/.github/workflows/build-forge-github-app-register.yml b/.github/workflows/build-forge-github-app-register.yml index 43835bf0..4621bd30 100644 --- a/.github/workflows/build-forge-github-app-register.yml +++ b/.github/workflows/build-forge-github-app-register.yml @@ -19,6 +19,7 @@ on: - reopened branches: - main + - renovatebot paths: - .docker/forge-github-app-register/** - .github/workflows/build-forge-github-app-register.yml @@ -38,15 +39,15 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 with: platforms: ${{ env.PLATFORMS }} - name: Log in to the Container registry - uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -54,12 +55,12 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5.9.0 + uses: docker/metadata-action@030e881283bb7a6894de51c315a6bfe6a94e05cf # v6.0.0 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - name: Build and push Docker image [forge-github-app-register] - uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 with: context: ./.docker/forge-github-app-register file: 
.docker/forge-github-app-register/Dockerfile diff --git a/.github/workflows/build-pre-commit.yml b/.github/workflows/build-pre-commit.yml index 2452a4d2..0b87ac6b 100644 --- a/.github/workflows/build-pre-commit.yml +++ b/.github/workflows/build-pre-commit.yml @@ -19,6 +19,7 @@ on: - reopened branches: - main + - renovatebot paths: - .docker/pre-commit/** - .github/workflows/build-pre-commit.yml @@ -38,15 +39,15 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 with: platforms: ${{ env.PLATFORMS }} - name: Log in to the Container registry - uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -54,12 +55,12 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5.9.0 + uses: docker/metadata-action@030e881283bb7a6894de51c315a6bfe6a94e05cf # v6.0.0 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - name: Build and push Docker image [pre-commit] - uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 with: context: . 
file: .docker/pre-commit/Dockerfile diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8275bd26..c96c596b 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -11,6 +11,7 @@ on: - reopened branches: - main + - renovatebot push: branches: - main @@ -28,19 +29,21 @@ jobs: if: github.event.pull_request.user.login != 'dependabot[bot]' steps: - name: Checkout Repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Cache Pre-commit - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 with: path: ~/.cache/pre-commit/ key: pre-commit|${{ github.repository }}|${{ hashFiles('.pre-commit-config.yaml') }} - - name: Cache Terraform/OpenTofu providers - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + - name: Cache OpenTofu providers + uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 with: path: /github/home/.terraform.d/plugin - key: tf-providers + key: tf-providers-${{ github.run_id }} + restore-keys: | + tf-providers - name: Set Terraform plugin cache run: echo "TF_PLUGIN_CACHE_DIR=/github/home/.terraform.d/plugin" >> $GITHUB_ENV diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4bf0310c..5031eeb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,7 +83,7 @@ repos: # Commit Message Hooks # --------------------- - repo: https://github.com/commitizen-tools/commitizen - rev: v4.10.0 + rev: v4.13.9 hooks: - id: commitizen name: Git · Validate commit message @@ -104,7 +104,7 @@ repos: exclude: (build/ansible/) - repo: https://github.com/adrienverge/yamllint - rev: v1.37.1 + rev: v1.38.0 hooks: - id: yamllint name: YAML · Linter @@ -144,7 +144,7 @@ repos: # Makefile Hooks # --------------------- - repo: 
https://github.com/mrtazz/checkmake.git - rev: 0.2.2 + rev: v0.3.2 hooks: - id: checkmake name: Makefile · Lint Makefile @@ -159,13 +159,13 @@ repos: name: Python · autopep8 - repo: https://github.com/PyCQA/isort - rev: 6.1.0 + rev: 8.0.1 hooks: - id: isort name: Python · Import sorter - repo: https://github.com/PyCQA/autoflake - rev: v2.3.1 + rev: v2.3.3 hooks: - id: autoflake name: Python · Remove unused imports @@ -185,7 +185,7 @@ repos: always_run: true - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.24.1 + rev: v0.25 hooks: - id: validate-pyproject name: Python · Validate pyproject.toml @@ -196,7 +196,7 @@ repos: # JSON Schema Hooks # --------------------- - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.35.0 + rev: 0.37.0 hooks: - id: check-github-workflows name: JSON Schema · GitHub workflows @@ -230,7 +230,7 @@ repos: always_run: true - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.104.0 + rev: v1.105.0 hooks: - id: terraform_fmt name: Terraform · Formatter @@ -266,7 +266,7 @@ repos: # Ansible Hooks # --------------------- - repo: https://github.com/ansible-community/ansible-lint.git - rev: v25.11.1 + rev: v26.3.0 hooks: - id: ansible-lint name: Ansible · Linter @@ -277,7 +277,7 @@ repos: # Markdown Hooks # --------------------- - repo: https://github.com/hukkin/mdformat - rev: 0.7.22 + rev: 1.0.0 hooks: - id: mdformat name: Markdown · Format markdown diff --git a/README.md b/README.md index 44870190..45c8c5f0 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ ______________________________________________________________________ - **Role:** Deploy and maintain ForgeMT infrastructure - **Responsibilities:** AWS account setup, tenant provisioning, platform updates -- **Tools:** Terraform/OpenTofu, AWS CLI, kubectl +- **Tools:** OpenTofu, AWS CLI, kubectl, helm - **Workflow:** Deploy control plane → Onboard tenants → Monitor platform ### 👩‍💻 **Development Team (Tenant)** @@ -122,7 +122,7 
@@ Deploy and manage the ForgeMT infrastructure: - **[Deploy Your First Tenant](./docs/configurations/deployments/forge_tenant.md)** — Minimal setup to bootstrap ForgeMT. - **[All Deployment Scenarios](./docs/configurations/deployments/index.md)** — Includes EKS, Splunk, BYO AMIs, and advanced patterns. -**Prerequisites:** AWS CLI configured, Terraform 1.5+, kubectl +**Prerequisites:** AWS CLI configured, OpenTofu 1.11+, kubectl, helm ### For Development Teams (Tenants) diff --git a/docs/configurations/deployments/new_tenant.md b/docs/configurations/deployments/new_tenant.md index 7185d086..b133da2c 100644 --- a/docs/configurations/deployments/new_tenant.md +++ b/docs/configurations/deployments/new_tenant.md @@ -67,6 +67,11 @@ gh_config: destination_region: "" # Destination AWS region for the forwarding rule destination_reader_role_arn: "" # IAM role in destination allowed to read forwarded events (leave blank if not needed) # NOTE: Leave all destination_* fields blank when enabled=false; they are ignored. + github_app: + id: # Numeric GitHub App ID from GitHub settings + client_id: # OAuth client ID shown on the GitHub App page + installation_id: # Installation ID after installing the app in your org/account + name: # Exact GitHub App name (must match the created app) tenant: iam_roles_to_assume: # List of full AWS IAM role ARNs runners may assume for workloads @@ -144,76 +149,7 @@ ______________________________________________________________________ ______________________________________________________________________ -## 3. 
Minimal Working `config.yaml` Example - -```yaml -gh_config: - ghes_url: '' - ghes_org: cisco-sbg - github_webhook_relay: - enabled: false - destination_account_id: "" - destination_event_bus_name: "" - destination_region: "" - destination_reader_role_arn: "" - -tenant: - iam_roles_to_assume: - - arn:aws:iam::123456789012:role/role_for_forge_runners - ecr_registries: - - 123456789012.dkr.ecr.us-east-1.amazonaws.com - github_logs_reader_role_arns: - - arn:aws:iam::123456789012:role/github_logs_reader - -ec2_runner_specs: - small: - ami_name: forge-gh-runner-v* - ami_owner: '123456789012' - ami_kms_key_arn: '' - max_instances: 10 - instance_types: - - t3.small - - t3.medium - pool_config: - - size: 2 - schedule_expression: "cron(*/10 8 * * ? *)" - schedule_expression_timezone: "America/Los_Angeles" - -arc_runner_specs: - dependabot: - runner_size: - max_runners: 100 - min_runners: 1 - scale_set_name: dependabot - scale_set_type: dind - container_actions_runner: 123456789012.dkr.ecr.us-east-1.amazonaws.com/actions-runner:latest - container_requests_cpu: 500m - container_requests_memory: 1Gi - container_limits_cpu: '1' - container_limits_memory: 2Gi -``` - -______________________________________________________________________ - -## 4. Deploy Secrets - -1. **Navigate to the tenant directory** matching your AWS account, region, VPC, and tenant: - -```bash -cd examples/deployments/forge-tenant/terragrunt/environments//regions//vpcs//tenants/ -``` - -2. **Deploy only the secrets** to AWS Secrets Manager: - -```bash -terragrunt apply --target aws_secretsmanager_secret_version.cicd_secrets -``` - -> **Pro tip:** Use `--target` carefully — only apply secrets here to avoid accidental resource changes in other modules. - -______________________________________________________________________ - -## 5. Create GitHub App +## 3. Create GitHub App 1. 
**Pull the registration UI container (amd64):** @@ -279,27 +215,100 @@ sec-plat-euw1-shared-sbg-cicd-forge ______________________________________________________________________ +## 4. Minimal Working `config.yaml` Example + +```yaml +gh_config: + ghes_url: '' + ghes_org: cisco-sbg + github_webhook_relay: + enabled: false + destination_account_id: "" + destination_event_bus_name: "" + destination_region: "" + destination_reader_role_arn: "" + github_app: + id: 1234567890 + client_id: abcdefghijklmnopqrstuvwx + installation_id: 9876543210 + name: forge-github-app + +tenant: + iam_roles_to_assume: + - arn:aws:iam::123456789012:role/role_for_forge_runners + ecr_registries: + - 123456789012.dkr.ecr.us-east-1.amazonaws.com + github_logs_reader_role_arns: + - arn:aws:iam::123456789012:role/github_logs_reader + +ec2_runner_specs: + small: + ami_name: forge-gh-runner-v* + ami_owner: '123456789012' + ami_kms_key_arn: '' + max_instances: 10 + instance_types: + - t3.small + - t3.medium + pool_config: + - size: 2 + schedule_expression: "cron(*/10 8 * * ? *)" + schedule_expression_timezone: "America/Los_Angeles" + +arc_runner_specs: + dependabot: + runner_size: + max_runners: 100 + min_runners: 1 + scale_set_name: dependabot + scale_set_type: dind + container_actions_runner: 123456789012.dkr.ecr.us-east-1.amazonaws.com/actions-runner:latest + container_requests_cpu: 500m + container_requests_memory: 1Gi + container_limits_cpu: '1' + container_limits_memory: 2Gi +``` + +______________________________________________________________________ + +## 5. Deploy + +1. **Navigate to your tenant directory:** + +```bash +cd examples/deployments/forge-tenant/terragrunt/environments//regions//vpcs//tenants/ +``` + +2. **Deploy everything in one go:** + +```bash +terragrunt apply +``` + +3. **Verify success:** + +- No errors in Terraform apply output. +- All expected AWS resources exist. + +______________________________________________________________________ + ## 6. 
Set GitHub App Secrets Run the `update-github-app-secrets.sh` script to inject critical GitHub App values into your secrets: ```bash -./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir client_id -./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir name -./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir id -./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir key /path/to/private-key.pem -./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir installation_id +./scripts/update-github-app-secrets.sh /full/path/to/tenant_dir /path/to/private-key.pem ``` ### Notes: - Use **absolute paths** for tenant directories and private key files to avoid path resolution issues inside the script. - Confirm the private key file has **correct permissions** (`chmod 600`) to avoid permission errors. -- The script will update AWS Secrets Manager values — verify with `terragrunt plan` or AWS Console if you want to double-check. +- The script will update AWS SSM Parameter Store values — verify with `terragrunt plan` or the AWS Console if you want to double-check. ______________________________________________________________________ -## 7. Deploy +## 7. Redeploy with secrets updated 1. **Navigate to your tenant directory:** @@ -319,6 +328,4 @@ terragrunt apply - All expected AWS resources exist. - GitHub runners appear registered and are actively picking up jobs. -______________________________________________________________________ - > For more advanced scenarios or troubleshooting, see the [full documentation](../index.md). 
diff --git a/docs/tenant-usage/index.md b/docs/tenant-usage/index.md index 857d0cb9..b0b2f39b 100644 --- a/docs/tenant-usage/index.md +++ b/docs/tenant-usage/index.md @@ -56,7 +56,7 @@ jobs: runs-on: - self-hosted - x64 - - type: standard + - type:standard ``` For Kubernetes pods, use: diff --git a/docs/tenant-usage/renovatebot/index.md b/docs/tenant-usage/renovatebot/index.md index 75f69910..78ebbec6 100644 --- a/docs/tenant-usage/renovatebot/index.md +++ b/docs/tenant-usage/renovatebot/index.md @@ -36,8 +36,8 @@ jobs: runs-on: - self-hosted - x64 - - type: large - - env: ops-prod + - type:large + - env:ops-prod steps: - name: Checkout Repository diff --git a/examples/deployments/forge-tenant/terragrunt/_global_settings/tenant.hcl b/examples/deployments/forge-tenant/terragrunt/_global_settings/tenant.hcl index 98c05b7d..9c76410e 100644 --- a/examples/deployments/forge-tenant/terragrunt/_global_settings/tenant.hcl +++ b/examples/deployments/forge-tenant/terragrunt/_global_settings/tenant.hcl @@ -26,58 +26,54 @@ locals { # ───────────────────────────────────────────────────────────────────────────── # Tenant Settings # ───────────────────────────────────────────────────────────────────────────── - runner_settings_data = read_terragrunt_config("runner_settings.hcl") - tenant = local.runner_settings_data.locals.tenant + config = read_terragrunt_config("runner_settings.hcl") # ───────────────────────────────────────────────────────────────────────────── # Tags # ───────────────────────────────────────────────────────────────────────────── tags = { - TenantName = local.tenant.name - ForgeCICDTenantName = local.tenant.name - ForgeCICDTenantVpcAlias = local.runner_settings_data.locals.vpc_alias + TenantName = local.config.locals.deployment_config.tenant.name + ForgeCICDTenantName = local.config.locals.deployment_config.tenant.name + ForgeCICDTenantVpcAlias = local.config.locals.vpc_alias + Service = "Forge Runners" } default_tags = { - ApplicationName = 
"${local.project_name}-${local.tenant.name}-${local.runner_settings_data.locals.region_alias}-${local.runner_settings_data.locals.vpc_alias}" + ApplicationName = "${local.project_name}-${local.config.locals.deployment_config.tenant.name}-${local.config.locals.region_alias}-${local.config.locals.vpc_alias}" ResourceOwner = local.team_name ProductFamilyName = local.product_name IntendedPublic = "No" LastRevalidatedBy = "Terraform" LastRevalidatedAt = "2025-05-15" } + } inputs = { - # Core Environment - env = local.env_name - aws_account_id = local.aws_account_id - aws_profile = local.default_aws_profile - aws_region = local.region + aws_profile = local.default_aws_profile + aws_region = local.region + + ec2_deployment_specs = { + lambda_subnet_ids = local.config.locals.lambda_subnet_ids + lambda_vpc_id = local.config.locals.lambda_vpc_id + subnet_ids = local.config.locals.subnet_ids + vpc_id = local.config.locals.vpc_id + runner_specs = local.config.locals.ec2_runner_specs + } + + arc_deployment_specs = { + cluster_name = local.config.locals.arc_cluster_name + migrate_cluster = local.config.locals.migrate_arc_cluster + runner_specs = local.config.locals.arc_runner_specs + } - # Networking - vpc_id = local.runner_settings_data.locals.vpc_id - subnet_ids = local.runner_settings_data.locals.subnet_ids - lambda_subnet_ids = local.runner_settings_data.locals.lambda_subnet_ids + github_webhook_relay = local.config.locals.github_webhook_relay - # Runners (EC2/ARC) - ec2_runner_specs = local.runner_settings_data.locals.ec2_runner_specs - arc_cluster_name = local.runner_settings_data.locals.arc_cluster_name - arc_runner_specs = local.runner_settings_data.locals.arc_runner_specs - migrate_arc_cluster = local.runner_settings_data.locals.migrate_arc_cluster + deployment_config = local.config.locals.deployment_config - # GitHub Settings - ghes_url = local.runner_settings_data.locals.ghes_url - ghes_org = local.runner_settings_data.locals.ghes_org - repository_selection = 
local.runner_settings_data.locals.repository_selection - runner_group_name = local.runner_settings_data.locals.runner_group_name - github_webhook_relay = local.runner_settings_data.locals.github_webhook_relay + log_level = local.config.locals.log_level + logging_retention_in_days = local.config.locals.logging_retention_in_days - # Misc - deployment_config = local.runner_settings_data.locals.deployment_config - log_level = local.runner_settings_data.locals.log_level - logging_retention_in_days = local.runner_settings_data.locals.logging_retention_in_days - tenant = local.tenant - tags = local.tags - default_tags = local.default_tags + tags = local.tags + default_tags = local.default_tags } diff --git a/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/config.yaml b/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/config.yaml index dd4b9983..0ee1f313 100644 --- a/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/config.yaml +++ b/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/config.yaml @@ -5,6 +5,11 @@ gh_config: repository_selection: selected github_webhook_relay: enabled: false + github_app: + id: 1234567890 + client_id: abcdefghijklmnopqrstuvwx + installation_id: 9876543210 + name: forge-github-app tenant: iam_roles_to_assume: - arn:aws:iam::123456789012:role/role_for_forge_runners diff --git a/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/runner_settings.hcl b/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/runner_settings.hcl index fd818f29..e896a0c2 100644 --- a/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/runner_settings.hcl +++ 
b/examples/deployments/forge-tenant/terragrunt/environments/prod/regions/eu-west-1/vpcs/sl/tenants/acme/runner_settings.hcl @@ -18,6 +18,7 @@ include "vpc" { locals { # VPC & region info from includes + lambda_vpc_id = include.vpc.locals.vpc_id lambda_subnet_ids = include.vpc.locals.lambda_subnet_ids vpc_id = include.vpc.locals.vpc_id subnet_ids = include.vpc.locals.subnet_ids @@ -30,33 +31,37 @@ locals { # Tenant tenant_name = basename(get_terragrunt_dir()) - deployment_config = { - prefix = "${local.tenant_name}-${local.region_alias}-${local.vpc_alias}" - secret_suffix = local.vpc_alias - } - log_level = "info" logging_retention_in_days = 3 # Load and parse runner specs YAML once - runner_specs_raw = yamldecode(file("config.yaml")) + config = yamldecode(file("config.yaml")) # GitHub App settings - ghes_url = local.runner_specs_raw.gh_config.ghes_url - ghes_org = local.runner_specs_raw.gh_config.ghes_org - repository_selection = local.runner_specs_raw.gh_config.repository_selection - github_webhook_relay = local.runner_specs_raw.gh_config.github_webhook_relay + github_webhook_relay = local.config.gh_config.github_webhook_relay - tenant = { - name = local.tenant_name - iam_roles_to_assume = local.runner_specs_raw.tenant.iam_roles_to_assume - ecr_registries = local.runner_specs_raw.tenant.ecr_registries - github_logs_reader_role_arns = local.runner_specs_raw.tenant.github_logs_reader_role_arns + deployment_config = { + deployment_prefix = "${local.tenant_name}-${local.region_alias}-${local.vpc_alias}" + secret_suffix = local.vpc_alias + env = local.env_name + github_app = local.config.gh_config.github_app + tenant = { + name = local.tenant_name + iam_roles_to_assume = local.config.tenant.iam_roles_to_assume + ecr_registries = local.config.tenant.ecr_registries + github_logs_reader_role_arns = local.config.tenant.github_logs_reader_role_arns + } + github = { + ghes_org = local.config.gh_config.ghes_org + ghes_url = local.config.gh_config.ghes_url + 
repository_selection = local.config.gh_config.repository_selection + runner_group_name = local.runner_group_name + } } ec2_runner_specs = { - for size, spec in local.runner_specs_raw.ec2_runner_specs : + for size, spec in local.config.ec2_runner_specs : size => { ami_filter = { name = [spec.ami_name], @@ -83,6 +88,8 @@ locals { min_run_time = 30 max_instances = spec.max_instances instance_types = spec.instance_types + placement = try(spec.placement, null) + license_specifications = try(spec.license_specifications, null) block_device_mappings = [{ delete_on_termination = true device_name = spec.volume.device_name @@ -94,14 +101,15 @@ locals { volume_size = spec.volume.size volume_type = spec.volume.type }] + vpc_id = local.vpc_id pool_config = spec.pool_config } } - arc_cluster_name = local.runner_specs_raw.arc_cluster_name - migrate_arc_cluster = local.runner_specs_raw.migrate_arc_cluster + arc_cluster_name = local.config.arc_cluster_name + migrate_arc_cluster = local.config.migrate_arc_cluster arc_runner_specs = { - for size, spec in local.runner_specs_raw.arc_runner_specs : + for size, spec in local.config.arc_runner_specs : size => { runner_size = spec.runner_size scale_set_name = spec.scale_set_name diff --git a/examples/deployments/splunk-deployment/terragrunt/_global_settings/splunk_otel_eks.hcl b/examples/deployments/splunk-deployment/terragrunt/_global_settings/splunk_otel_eks.hcl index fdd7600c..0ff7044a 100644 --- a/examples/deployments/splunk-deployment/terragrunt/_global_settings/splunk_otel_eks.hcl +++ b/examples/deployments/splunk-deployment/terragrunt/_global_settings/splunk_otel_eks.hcl @@ -58,5 +58,6 @@ inputs = { splunk_otel_collector = local.eks_settings_data.locals.splunk_otel_collector # Misc + tags = local.tags default_tags = local.default_tags } diff --git a/examples/templates/tenant/_global_settings/tenant.hcl b/examples/templates/tenant/_global_settings/tenant.hcl index 98c05b7d..eeb4e5df 100644 --- 
a/examples/templates/tenant/_global_settings/tenant.hcl +++ b/examples/templates/tenant/_global_settings/tenant.hcl @@ -26,8 +26,8 @@ locals { # ───────────────────────────────────────────────────────────────────────────── # Tenant Settings # ───────────────────────────────────────────────────────────────────────────── - runner_settings_data = read_terragrunt_config("runner_settings.hcl") - tenant = local.runner_settings_data.locals.tenant + config = read_terragrunt_config("runner_settings.hcl") + tenant = local.config.locals.deployment_config.tenant # ───────────────────────────────────────────────────────────────────────────── # Tags @@ -35,11 +35,11 @@ locals { tags = { TenantName = local.tenant.name ForgeCICDTenantName = local.tenant.name - ForgeCICDTenantVpcAlias = local.runner_settings_data.locals.vpc_alias + ForgeCICDTenantVpcAlias = local.config.locals.vpc_alias } default_tags = { - ApplicationName = "${local.project_name}-${local.tenant.name}-${local.runner_settings_data.locals.region_alias}-${local.runner_settings_data.locals.vpc_alias}" + ApplicationName = "${local.project_name}-${local.tenant.name}-${local.config.locals.region_alias}-${local.config.locals.vpc_alias}" ResourceOwner = local.team_name ProductFamilyName = local.product_name IntendedPublic = "No" @@ -50,34 +50,30 @@ locals { inputs = { # Core Environment - env = local.env_name - aws_account_id = local.aws_account_id - aws_profile = local.default_aws_profile - aws_region = local.region - - # Networking - vpc_id = local.runner_settings_data.locals.vpc_id - subnet_ids = local.runner_settings_data.locals.subnet_ids - lambda_subnet_ids = local.runner_settings_data.locals.lambda_subnet_ids + aws_profile = local.default_aws_profile + aws_region = local.region # Runners (EC2/ARC) - ec2_runner_specs = local.runner_settings_data.locals.ec2_runner_specs - arc_cluster_name = local.runner_settings_data.locals.arc_cluster_name - arc_runner_specs = local.runner_settings_data.locals.arc_runner_specs - 
migrate_arc_cluster = local.runner_settings_data.locals.migrate_arc_cluster + ec2_deployment_specs = { + lambda_subnet_ids = local.config.locals.lambda_subnet_ids + lambda_vpc_id = local.config.locals.lambda_vpc_id + subnet_ids = local.config.locals.subnet_ids + vpc_id = local.config.locals.vpc_id + runner_specs = local.config.locals.ec2_runner_specs + } + + arc_deployment_specs = { + cluster_name = local.config.locals.arc_cluster_name + migrate_cluster = local.config.locals.migrate_arc_cluster + runner_specs = local.config.locals.arc_runner_specs + } - # GitHub Settings - ghes_url = local.runner_settings_data.locals.ghes_url - ghes_org = local.runner_settings_data.locals.ghes_org - repository_selection = local.runner_settings_data.locals.repository_selection - runner_group_name = local.runner_settings_data.locals.runner_group_name - github_webhook_relay = local.runner_settings_data.locals.github_webhook_relay + # Deployment Settings + deployment_config = local.config.locals.deployment_config # Misc - deployment_config = local.runner_settings_data.locals.deployment_config - log_level = local.runner_settings_data.locals.log_level - logging_retention_in_days = local.runner_settings_data.locals.logging_retention_in_days - tenant = local.tenant + log_level = local.config.locals.log_level + logging_retention_in_days = local.config.locals.logging_retention_in_days tags = local.tags default_tags = local.default_tags } diff --git a/examples/templates/tenant/tenant/config.yaml b/examples/templates/tenant/tenant/config.yaml index 46f47b8b..72192ed1 100644 --- a/examples/templates/tenant/tenant/config.yaml +++ b/examples/templates/tenant/tenant/config.yaml @@ -9,6 +9,11 @@ gh_config: destination_event_bus_name: destination_region: destination_reader_role_arn: + github_app: + id: + client_id: + installation_id: + name: tenant: iam_roles_to_assume: - arn:aws:iam:::role/ diff --git a/examples/templates/tenant/tenant/runner_settings.hcl 
b/examples/templates/tenant/tenant/runner_settings.hcl index fd818f29..e896a0c2 100644 --- a/examples/templates/tenant/tenant/runner_settings.hcl +++ b/examples/templates/tenant/tenant/runner_settings.hcl @@ -18,6 +18,7 @@ include "vpc" { locals { # VPC & region info from includes + lambda_vpc_id = include.vpc.locals.vpc_id lambda_subnet_ids = include.vpc.locals.lambda_subnet_ids vpc_id = include.vpc.locals.vpc_id subnet_ids = include.vpc.locals.subnet_ids @@ -30,33 +31,37 @@ locals { # Tenant tenant_name = basename(get_terragrunt_dir()) - deployment_config = { - prefix = "${local.tenant_name}-${local.region_alias}-${local.vpc_alias}" - secret_suffix = local.vpc_alias - } - log_level = "info" logging_retention_in_days = 3 # Load and parse runner specs YAML once - runner_specs_raw = yamldecode(file("config.yaml")) + config = yamldecode(file("config.yaml")) # GitHub App settings - ghes_url = local.runner_specs_raw.gh_config.ghes_url - ghes_org = local.runner_specs_raw.gh_config.ghes_org - repository_selection = local.runner_specs_raw.gh_config.repository_selection - github_webhook_relay = local.runner_specs_raw.gh_config.github_webhook_relay + github_webhook_relay = local.config.gh_config.github_webhook_relay - tenant = { - name = local.tenant_name - iam_roles_to_assume = local.runner_specs_raw.tenant.iam_roles_to_assume - ecr_registries = local.runner_specs_raw.tenant.ecr_registries - github_logs_reader_role_arns = local.runner_specs_raw.tenant.github_logs_reader_role_arns + deployment_config = { + deployment_prefix = "${local.tenant_name}-${local.region_alias}-${local.vpc_alias}" + secret_suffix = local.vpc_alias + env = local.env_name + github_app = local.config.gh_config.github_app + tenant = { + name = local.tenant_name + iam_roles_to_assume = local.config.tenant.iam_roles_to_assume + ecr_registries = local.config.tenant.ecr_registries + github_logs_reader_role_arns = local.config.tenant.github_logs_reader_role_arns + } + github = { + ghes_org = 
local.config.gh_config.ghes_org + ghes_url = local.config.gh_config.ghes_url + repository_selection = local.config.gh_config.repository_selection + runner_group_name = local.runner_group_name + } } ec2_runner_specs = { - for size, spec in local.runner_specs_raw.ec2_runner_specs : + for size, spec in local.config.ec2_runner_specs : size => { ami_filter = { name = [spec.ami_name], @@ -83,6 +88,8 @@ locals { min_run_time = 30 max_instances = spec.max_instances instance_types = spec.instance_types + placement = try(spec.placement, null) + license_specifications = try(spec.license_specifications, null) block_device_mappings = [{ delete_on_termination = true device_name = spec.volume.device_name @@ -94,14 +101,15 @@ locals { volume_size = spec.volume.size volume_type = spec.volume.type }] + vpc_id = local.vpc_id pool_config = spec.pool_config } } - arc_cluster_name = local.runner_specs_raw.arc_cluster_name - migrate_arc_cluster = local.runner_specs_raw.migrate_arc_cluster + arc_cluster_name = local.config.arc_cluster_name + migrate_arc_cluster = local.config.migrate_arc_cluster arc_runner_specs = { - for size, spec in local.runner_specs_raw.arc_runner_specs : + for size, spec in local.config.arc_runner_specs : size => { runner_size = spec.runner_size scale_set_name = spec.scale_set_name diff --git a/modules/core/arc/README.md b/modules/core/arc/README.md index cac2291f..5a63ae57 100644 --- a/modules/core/arc/README.md +++ b/modules/core/arc/README.md @@ -3,20 +3,20 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | -| [null](#requirement\_null) | >= 3.2.3 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | +| [null](#requirement\_null) | >= 3.2 | ## 
Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | -| [kubernetes](#provider\_kubernetes) | 2.38.0 | +| [kubernetes](#provider\_kubernetes) | 3.0.1 | | [null](#provider\_null) | 3.2.4 | ## Modules diff --git a/modules/core/arc/karpenter.tf b/modules/core/arc/karpenter.tf index d8329151..62ef5fbe 100644 --- a/modules/core/arc/karpenter.tf +++ b/modules/core/arc/karpenter.tf @@ -4,6 +4,12 @@ locals { }) kubeconfig_path = "${path.cwd}/.kube/${var.eks_cluster_name}-${var.aws_profile}-${var.aws_region}-${var.controller_config.namespace}.kubeconfig" + + merged_tags = merge( + var.tags, + { + Name = "${var.eks_cluster_name}-karpenter-${var.controller_config.namespace}-node" + }) } data "external" "update_kubeconfig" { @@ -51,7 +57,7 @@ data "external" "karpenter_ec2nodeclass" { ) ' - \ | yq eval -o=json - \ - | jq --argjson newtags '${jsonencode(var.tags)}' --arg newname "karpenter-${var.controller_config.namespace}" ' + | jq --argjson newtags '${jsonencode(local.merged_tags)}' --arg newname "karpenter-${var.controller_config.namespace}" ' .spec.tags = (.spec.tags // {}) | .spec.tags *= $newtags | .metadata.name = $newname diff --git a/modules/core/arc/scale_set/README.md b/modules/core/arc/scale_set/README.md index 582b35eb..8ed3f074 100644 --- a/modules/core/arc/scale_set/README.md +++ b/modules/core/arc/scale_set/README.md @@ -3,18 +3,18 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [helm](#provider\_helm) | 3.1.0 | -| 
[kubernetes](#provider\_kubernetes) | 2.38.0 | +| [aws](#provider\_aws) | 6.35.1 | +| [helm](#provider\_helm) | 3.1.1 | +| [kubernetes](#provider\_kubernetes) | 3.0.1 | ## Modules @@ -28,10 +28,10 @@ No modules. | [aws_iam_role.runner_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy_attachment.runner_role_policy_attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [helm_release.gha_runner_scale_set](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | -| [kubernetes_config_map.hook_extension](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource | -| [kubernetes_config_map.hook_pre_post_job](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource | -| [kubernetes_role.k8s](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role) | resource | -| [kubernetes_role_binding.k8s](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | +| [kubernetes_config_map_v1.hook_extension](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map_v1) | resource | +| [kubernetes_config_map_v1.hook_pre_post_job](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map_v1) | resource | +| [kubernetes_role_binding_v1.k8s](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding_v1) | resource | +| [kubernetes_role_v1.k8s](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_v1) | resource | | [kubernetes_service_account_v1.runner_sa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | 
[aws_iam_policy_document.assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | diff --git a/modules/core/arc/scale_set/helm.tf b/modules/core/arc/scale_set/helm.tf index c2f1b0b5..ed68c8b5 100644 --- a/modules/core/arc/scale_set/helm.tf +++ b/modules/core/arc/scale_set/helm.tf @@ -1,4 +1,4 @@ -resource "kubernetes_config_map" "hook_extension" { +resource "kubernetes_config_map_v1" "hook_extension" { count = var.migrate_arc_cluster == false ? 1 : 0 metadata { name = "hook-extension-${var.scale_set_name}" @@ -23,7 +23,7 @@ resource "kubernetes_config_map" "hook_extension" { } } -resource "kubernetes_config_map" "hook_pre_post_job" { +resource "kubernetes_config_map_v1" "hook_pre_post_job" { count = var.migrate_arc_cluster == false ? 1 : 0 metadata { name = "hook-pre-post-job-${var.scale_set_name}" diff --git a/modules/core/arc/scale_set/k8s_mode.tf b/modules/core/arc/scale_set/k8s_mode.tf index e140dce8..daf8f010 100644 --- a/modules/core/arc/scale_set/k8s_mode.tf +++ b/modules/core/arc/scale_set/k8s_mode.tf @@ -1,4 +1,4 @@ -resource "kubernetes_role" "k8s" { +resource "kubernetes_role_v1" "k8s" { count = var.scale_set_type == "k8s" && var.migrate_arc_cluster == false ? 1 : 0 metadata { @@ -37,7 +37,7 @@ resource "kubernetes_role" "k8s" { } } -resource "kubernetes_role_binding" "k8s" { +resource "kubernetes_role_binding_v1" "k8s" { count = var.scale_set_type == "k8s" && var.migrate_arc_cluster == false ? 
1 : 0 metadata { @@ -48,7 +48,7 @@ resource "kubernetes_role_binding" "k8s" { role_ref { api_group = "rbac.authorization.k8s.io" kind = "Role" - name = kubernetes_role.k8s[0].metadata[0].name + name = kubernetes_role_v1.k8s[0].metadata[0].name } subject { diff --git a/modules/core/arc/scale_set/template_files/dind.yml.tftpl b/modules/core/arc/scale_set/template_files/dind.yml.tftpl index 5543bfbe..74c5a9bb 100644 --- a/modules/core/arc/scale_set/template_files/dind.yml.tftpl +++ b/modules/core/arc/scale_set/template_files/dind.yml.tftpl @@ -7,6 +7,9 @@ runnerGroup: "${runner_group_name}" runnerScaleSetName: "${scale_set_name}" template: + metadata: + annotations: + karpenter.sh/do-not-disrupt: "true" spec: serviceAccountName: ${service_account} automountServiceAccountToken: true @@ -252,6 +255,7 @@ template: audience: sts.amazonaws.com tolerations: + # tenant tolerations - key: "forge.local/scale_set_type" operator: "Equal" value: "dind" @@ -266,6 +270,7 @@ template: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: + # tenant affinity - key: forge.local/scale_set_type operator: In values: diff --git a/modules/core/arc/scale_set/template_files/k8s.yml.tftpl b/modules/core/arc/scale_set/template_files/k8s.yml.tftpl index dbad4be2..d2381cbb 100644 --- a/modules/core/arc/scale_set/template_files/k8s.yml.tftpl +++ b/modules/core/arc/scale_set/template_files/k8s.yml.tftpl @@ -20,6 +20,9 @@ containerMode: # with containerMode.type=kubernetes, we will populate the template.spec with following pod spec template: + metadata: + annotations: + karpenter.sh/do-not-disrupt: "true" spec: securityContext: fsGroup: 123 diff --git a/modules/core/arc/scale_set/versions.tf b/modules/core/arc/scale_set/versions.tf index 06035e89..056af10a 100644 --- a/modules/core/arc/scale_set/versions.tf +++ b/modules/core/arc/scale_set/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = 
">= 6.25" } helm = { source = "hashicorp/helm" @@ -10,10 +10,10 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/core/arc/scale_set_controller/README.md b/modules/core/arc/scale_set_controller/README.md index c8336013..a96426d8 100644 --- a/modules/core/arc/scale_set_controller/README.md +++ b/modules/core/arc/scale_set_controller/README.md @@ -3,17 +3,17 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [helm](#provider\_helm) | 3.1.0 | -| [kubernetes](#provider\_kubernetes) | 2.38.0 | +| [helm](#provider\_helm) | 3.1.1 | +| [kubernetes](#provider\_kubernetes) | 3.0.1 | ## Modules @@ -24,8 +24,8 @@ No modules. 
| Name | Type | |------|------| | [helm_release.gha_runner_scale_set_controller](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | -| [kubernetes_namespace.controller_namespace](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource | -| [kubernetes_secret.github_app](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret) | resource | +| [kubernetes_namespace_v1.controller_namespace](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_secret_v1.github_app](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | ## Inputs diff --git a/modules/core/arc/scale_set_controller/helm.tf b/modules/core/arc/scale_set_controller/helm.tf index ce6b6e4b..dca751c2 100644 --- a/modules/core/arc/scale_set_controller/helm.tf +++ b/modules/core/arc/scale_set_controller/helm.tf @@ -21,5 +21,5 @@ resource "helm_release" "gha_runner_scale_set_controller" { cleanup_on_fail = true timeout = 1200 - depends_on = [kubernetes_secret.github_app] + depends_on = [kubernetes_secret_v1.github_app] } diff --git a/modules/core/arc/scale_set_controller/secret.tf b/modules/core/arc/scale_set_controller/secret.tf index ea195ac2..fe5ba6dc 100644 --- a/modules/core/arc/scale_set_controller/secret.tf +++ b/modules/core/arc/scale_set_controller/secret.tf @@ -1,11 +1,11 @@ -resource "kubernetes_namespace" "controller_namespace" { +resource "kubernetes_namespace_v1" "controller_namespace" { count = var.migrate_arc_cluster == false ? 1 : 0 metadata { name = var.namespace } } -resource "kubernetes_secret" "github_app" { +resource "kubernetes_secret_v1" "github_app" { count = var.migrate_arc_cluster == false ? 
1 : 0 metadata { @@ -23,5 +23,5 @@ resource "kubernetes_secret" "github_app" { lifecycle { create_before_destroy = true } - depends_on = [kubernetes_namespace.controller_namespace] + depends_on = [kubernetes_namespace_v1.controller_namespace] } diff --git a/modules/core/arc/scale_set_controller/versions.tf b/modules/core/arc/scale_set_controller/versions.tf index 06035e89..056af10a 100644 --- a/modules/core/arc/scale_set_controller/versions.tf +++ b/modules/core/arc/scale_set_controller/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } helm = { source = "hashicorp/helm" @@ -10,10 +10,10 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/core/arc/versions.tf b/modules/core/arc/versions.tf index 03ec6947..bb87b70f 100644 --- a/modules/core/arc/versions.tf +++ b/modules/core/arc/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -14,14 +14,14 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } null = { source = "hashicorp/null" - version = ">= 3.2.3" + version = ">= 3.2" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/ami_policy/README.md b/modules/infra/ami_policy/README.md index fda1e5b1..4f5d5121 100644 --- a/modules/infra/ami_policy/README.md +++ b/modules/infra/ami_policy/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/ami_policy/versions.tf b/modules/infra/ami_policy/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/ami_policy/versions.tf +++ b/modules/infra/ami_policy/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/ami_sharing/README.md b/modules/infra/ami_sharing/README.md index 02413d64..654c0716 100644 --- a/modules/infra/ami_sharing/README.md +++ b/modules/infra/ami_sharing/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/ami_sharing/versions.tf b/modules/infra/ami_sharing/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/ami_sharing/versions.tf +++ b/modules/infra/ami_sharing/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu 
version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/cloud_custodian/README.md b/modules/infra/cloud_custodian/README.md index 40cdb5e1..83dc2c7f 100644 --- a/modules/infra/cloud_custodian/README.md +++ b/modules/infra/cloud_custodian/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/cloud_custodian/main.tf b/modules/infra/cloud_custodian/main.tf index 7e866b2c..4ae19106 100644 --- a/modules/infra/cloud_custodian/main.tf +++ b/modules/infra/cloud_custodian/main.tf @@ -17,6 +17,7 @@ data "aws_iam_policy_document" "cloud_custodian_policy" { "ec2:DescribeSecurityGroupReferences", "ec2:DescribeVolumes", "ec2:DeleteVolume", + "ec2:TerminateInstances", ] resources = ["*"] } diff --git a/modules/infra/cloud_custodian/versions.tf b/modules/infra/cloud_custodian/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/cloud_custodian/versions.tf +++ b/modules/infra/cloud_custodian/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/cloud_formation/README.md b/modules/infra/cloud_formation/README.md index 1807e088..da244f51 100644 --- a/modules/infra/cloud_formation/README.md +++ b/modules/infra/cloud_formation/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules @@ -24,6 +24,7 @@ No modules. | [aws_iam_role.cloudformation_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.admin_assume_execution_role_policy_attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.execution_role_policy_attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.admin_assume_execution_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.cloudformation_assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.execution_assume_admin_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | @@ -33,7 +34,6 @@ No modules. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_account\_id](#input\_aws\_account\_id) | AWS account ID associated with the infra/backend. | `string` | n/a | yes | | [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | diff --git a/modules/infra/cloud_formation/data.tf b/modules/infra/cloud_formation/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/infra/cloud_formation/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/infra/cloud_formation/main.tf b/modules/infra/cloud_formation/main.tf index bef69a4f..5b39c324 100644 --- a/modules/infra/cloud_formation/main.tf +++ b/modules/infra/cloud_formation/main.tf @@ -39,7 +39,7 @@ data "aws_iam_policy_document" "execution_assume_admin_role_policy" { actions = ["sts:AssumeRole"] principals { type = "AWS" - identifiers = ["arn:aws:iam::${var.aws_account_id}:role/AWSCloudFormationStackSetAdministrationRole"] + identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/AWSCloudFormationStackSetAdministrationRole"] } } } diff --git a/modules/infra/cloud_formation/variables.tf b/modules/infra/cloud_formation/variables.tf index 77e613ed..d022f8f6 100644 --- a/modules/infra/cloud_formation/variables.tf +++ b/modules/infra/cloud_formation/variables.tf @@ -1,8 +1,3 @@ -variable "aws_account_id" { - description = "AWS account ID associated with the infra/backend." - type = string -} - variable "aws_profile" { type = string description = "AWS profile to use." 
diff --git a/modules/infra/cloud_formation/versions.tf b/modules/infra/cloud_formation/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/cloud_formation/versions.tf +++ b/modules/infra/cloud_formation/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/ecr/README.md b/modules/infra/ecr/README.md index 8703de24..1f7dd802 100644 --- a/modules/infra/ecr/README.md +++ b/modules/infra/ecr/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/ecr/versions.tf b/modules/infra/ecr/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/ecr/versions.tf +++ b/modules/infra/ecr/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/eks/README.md b/modules/infra/eks/README.md index b4a51c0e..4a6c49ce 100644 --- a/modules/infra/eks/README.md +++ b/modules/infra/eks/README.md @@ -3,32 +3,31 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | | [helm](#requirement\_helm) | >= 3.0.0 | | [kubectl](#requirement\_kubectl) | >= 1.19.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | -| [null](#requirement\_null) | >= 3.2.3 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | +| [null](#requirement\_null) | >= 3.2 | | [time](#requirement\_time) | >= 0.13.1 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | -| [helm](#provider\_helm) | 3.1.0 | | [null](#provider\_null) | 3.2.4 | ## Modules | Name | Source | Version | |------|--------|---------| -| [ebs\_csi\_irsa\_role](#module\_ebs\_csi\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts | 6.1.0 | -| [eks](#module\_eks) | terraform-aws-modules/eks/aws | 21.1.0 | -| [karpenter](#module\_karpenter) | terraform-aws-modules/eks/aws//modules/karpenter | 21.1.0 | -| [self\_managed\_node\_group](#module\_self\_managed\_node\_group) | terraform-aws-modules/eks/aws//modules/self-managed-node-group | 21.1.0 | +| [ebs\_csi\_irsa\_role](#module\_ebs\_csi\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts | 6.4.0 | +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | 21.15.1 | +| [karpenter](#module\_karpenter) | terraform-aws-modules/eks/aws//modules/karpenter | 21.15.1 | +| [self\_managed\_node\_group](#module\_self\_managed\_node\_group) | 
terraform-aws-modules/eks/aws//modules/self-managed-node-group | 21.15.1 | ## Resources @@ -37,9 +36,9 @@ | [aws_eks_addon.aws_ebs_csi_driver](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon) | resource | | [aws_eks_addon.coredns](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon) | resource | | [aws_eks_addon.eks_pod_identity_agent](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon) | resource | -| [helm_release.karpenter](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [null_resource.apply_ec2_node_class](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.apply_node_pool](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [null_resource.karpenter](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.patch_calico_installation](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [null_resource.wait_for_cluster](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [aws_ami.eks_default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | diff --git a/modules/infra/eks/eks.tf b/modules/infra/eks/eks.tf index aa879be4..87cd87cb 100644 --- a/modules/infra/eks/eks.tf +++ b/modules/infra/eks/eks.tf @@ -1,6 +1,6 @@ module "ebs_csi_irsa_role" { source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts" - version = "6.2.3" + version = "6.4.0" name = "${var.cluster_name}-${var.aws_region}-ebs-csi" use_name_prefix = false @@ -17,7 +17,7 @@ module "ebs_csi_irsa_role" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "21.9.0" + version = "21.15.1" name = 
var.cluster_name kubernetes_version = var.cluster_version diff --git a/modules/infra/eks/karpenter.tf b/modules/infra/eks/karpenter.tf index eff8b74c..3dd8ac95 100644 --- a/modules/infra/eks/karpenter.tf +++ b/modules/infra/eks/karpenter.tf @@ -1,6 +1,6 @@ module "karpenter" { source = "terraform-aws-modules/eks/aws//modules/karpenter" - version = "21.9.0" + version = "21.15.1" namespace = "karpenter" cluster_name = var.cluster_name @@ -14,34 +14,74 @@ module "karpenter" { } } -resource "helm_release" "karpenter" { - name = "karpenter" - namespace = "karpenter" - create_namespace = true - repository = "oci://public.ecr.aws/karpenter" - chart = "karpenter" - version = "1.8.2" - wait = false - - values = [ - <<-EOT - serviceAccount: - name: ${module.karpenter.service_account} - dnsPolicy: Default - settings: - clusterName: ${module.eks.cluster_name} - clusterEndpoint: ${module.eks.cluster_endpoint} - interruptionQueue: ${module.karpenter.queue_name} - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: karpenter.sh/controller - operator: Exists - effect: NoSchedule - webhook: - enabled: false - EOT - ] +resource "null_resource" "karpenter" { + depends_on = [module.eks] + + triggers = { + chart_version = "1.8.3" + service_account = module.karpenter.service_account + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + interruption_queue = module.karpenter.queue_name + kube_context = "${var.cluster_name}-${var.aws_profile}-${var.aws_region}" + } + + # --- CREATE / UPDATE --- + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = < [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/opt_in_regions/versions.tf 
b/modules/infra/opt_in_regions/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/opt_in_regions/versions.tf +++ b/modules/infra/opt_in_regions/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/service_linked_roles/README.md b/modules/infra/service_linked_roles/README.md index b0e929aa..6da06a59 100644 --- a/modules/infra/service_linked_roles/README.md +++ b/modules/infra/service_linked_roles/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules diff --git a/modules/infra/service_linked_roles/versions.tf b/modules/infra/service_linked_roles/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/infra/service_linked_roles/versions.tf +++ b/modules/infra/service_linked_roles/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/infra/storage/README.md b/modules/infra/storage/README.md index 2add1133..bd4f06fd 100644 --- a/modules/infra/storage/README.md +++ b/modules/infra/storage/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.0 | -| [aws](#requirement\_aws) | ~> 5.80 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 5.100.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules @@ -31,12 +31,12 @@ No modules. | [aws_s3_bucket_server_side_encryption_configuration.s3_short_term_settings](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_server_side_encryption_configuration) | resource | | [aws_s3_bucket_versioning.s3_long_term](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_versioning) | resource | | [aws_s3_bucket_versioning.s3_short_term](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket_versioning) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_account\_id](#input\_aws\_account\_id) | AWS account ID (not SL AWS account ID) associated with the infra/backend. | `string` | n/a | yes | | [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. 
| `map(string)` | n/a | yes | diff --git a/modules/infra/storage/data.tf b/modules/infra/storage/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/infra/storage/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/infra/storage/main.tf b/modules/infra/storage/main.tf index acc74902..e928b240 100644 --- a/modules/infra/storage/main.tf +++ b/modules/infra/storage/main.tf @@ -2,8 +2,8 @@ # Long-term storage (i.e. release builds and SBOMs we need to retain long-term # for stability and auditing purposes). resource "aws_s3_bucket" "s3_long_term" { - bucket = "${var.aws_account_id}-long-term-storage" - tags = {} + bucket = "${data.aws_caller_identity.current.account_id}-long-term-storage" + tags = local.all_security_tags } # Ownership controls. @@ -40,12 +40,14 @@ resource "aws_s3_bucket_public_access_block" "s3_long_term" { block_public_policy = true ignore_public_acls = true restrict_public_buckets = true + + skip_destroy = true } # Short-term storage (i.e. temporary/feature-branch builds, core dumps, and # other artifacts we aren't obligated to retain long-term). resource "aws_s3_bucket" "s3_short_term" { - bucket = "${var.aws_account_id}-short-term-storage" + bucket = "${data.aws_caller_identity.current.account_id}-short-term-storage" tags = local.all_security_tags } @@ -96,4 +98,6 @@ resource "aws_s3_bucket_public_access_block" "s3_short_term" { block_public_policy = true ignore_public_acls = true restrict_public_buckets = true + + skip_destroy = true } diff --git a/modules/infra/storage/variables.tf b/modules/infra/storage/variables.tf index 3be1a121..e16c7bf7 100644 --- a/modules/infra/storage/variables.tf +++ b/modules/infra/storage/variables.tf @@ -1,8 +1,3 @@ -variable "aws_account_id" { - type = string - description = "AWS account ID (not SL AWS account ID) associated with the infra/backend." -} - variable "aws_profile" { description = "AWS profile to use." 
type = string diff --git a/modules/infra/storage/versions.tf b/modules/infra/storage/versions.tf index 750e8a83..0631ae67 100644 --- a/modules/infra/storage/versions.tf +++ b/modules/infra/storage/versions.tf @@ -1,11 +1,12 @@ -# Needed to interact with in-house/on-prem SLVM resources. terraform { + # Provider versions. required_providers { aws = { source = "hashicorp/aws" - version = "~> 6.0" + version = ">= 6.25" } } + # OpenTofu version. - required_version = ">= 1.9.0" + required_version = "~> 1.11" } diff --git a/modules/integrations/github_webhook_relay_destination/README.md b/modules/integrations/github_webhook_relay_destination/README.md index b015efd2..c4ba6421 100644 --- a/modules/integrations/github_webhook_relay_destination/README.md +++ b/modules/integrations/github_webhook_relay_destination/README.md @@ -40,15 +40,15 @@ graph TD | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | ## Modules diff --git a/modules/integrations/github_webhook_relay_destination/versions.tf b/modules/integrations/github_webhook_relay_destination/versions.tf index 288ee2ec..fae886cc 100644 --- a/modules/integrations/github_webhook_relay_destination/versions.tf +++ b/modules/integrations/github_webhook_relay_destination/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -12,5 +12,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/github_webhook_relay_destination_receivers/README.md b/modules/integrations/github_webhook_relay_destination_receivers/README.md index 53446fba..50390365 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/README.md +++ b/modules/integrations/github_webhook_relay_destination_receivers/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | ~> 6.0 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules @@ -29,7 +29,7 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [enable\_webex\_webhook\_relay](#input\_enable\_webex\_webhook\_relay) | Enable Webex webhook relay. | `bool` | n/a | yes | diff --git a/modules/integrations/github_webhook_relay_destination_receivers/variables.tf b/modules/integrations/github_webhook_relay_destination_receivers/variables.tf index c3a344ba..6e2c42ef 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/variables.tf +++ b/modules/integrations/github_webhook_relay_destination_receivers/variables.tf @@ -1,5 +1,5 @@ variable "aws_profile" { - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." 
type = string } diff --git a/modules/integrations/github_webhook_relay_destination_receivers/versions.tf b/modules/integrations/github_webhook_relay_destination_receivers/versions.tf index bdf91f45..0631ae67 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/versions.tf +++ b/modules/integrations/github_webhook_relay_destination_receivers/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 6.0" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/README.md b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/README.md index 84643ce2..81151965 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/README.md +++ b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/README.md @@ -47,22 +47,22 @@ Both `token` and `room_id` keys are required. 
The function will prepend `Bearer | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | ~> 6.0 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [time](#requirement\_time) | >= 0.13.1 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [time](#provider\_time) | 0.13.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [webex](#module\_webex) | terraform-aws-modules/lambda/aws | 8.1.2 | +| [webex](#module\_webex) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources diff --git a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda.tf b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda.tf index 6a244cf8..d2125adf 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda.tf +++ b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda.tf @@ -7,7 +7,7 @@ resource "aws_cloudwatch_log_group" "webex" { module "webex" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = "webex-webhook-relay-destination-receiver" handler = "handler.lambda_handler" diff --git a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda/handler.py b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda/handler.py index c59a9241..9c1f7b7e 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda/handler.py +++ b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/lambda/handler.py @@ -230,6 +230,9 @@ def lambda_handler(event, _context): return {'statusCode': 200, 'body': 'Alert sent'} - except Exception as exc: - 
LOG.exception('lambda_error error=%s', exc) - return {'statusCode': 500, 'body': f"Error: {exc}"} + except Exception as e: + LOG.exception( + 'Unhandled exception in webex_webhook_relay lambda. Error: %s', + str(e), + ) + raise diff --git a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/versions.tf b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/versions.tf index ae56fcdf..38608bd6 100644 --- a/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/versions.tf +++ b/modules/integrations/github_webhook_relay_destination_receivers/webex_webhook_relay/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 6.0" + version = ">= 6.25" } time = { source = "hashicorp/time" @@ -12,5 +12,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/github_webhook_relay_source/README.md b/modules/integrations/github_webhook_relay_source/README.md index a50585dc..bfb28308 100644 --- a/modules/integrations/github_webhook_relay_source/README.md +++ b/modules/integrations/github_webhook_relay_source/README.md @@ -59,20 +59,20 @@ curl -X POST "$(terraform output -raw webhook_endpoint)/webhook" \ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [validate\_signature\_lambda](#module\_validate\_signature\_lambda) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [validate\_signature\_lambda](#module\_validate\_signature\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources diff 
--git a/modules/integrations/github_webhook_relay_source/lambda.tf b/modules/integrations/github_webhook_relay_source/lambda.tf index 82cad234..1cb74bce 100644 --- a/modules/integrations/github_webhook_relay_source/lambda.tf +++ b/modules/integrations/github_webhook_relay_source/lambda.tf @@ -1,6 +1,6 @@ module "validate_signature_lambda" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = "${var.name_prefix}-validate-signature" handler = "validate_signature.lambda_handler" diff --git a/modules/integrations/github_webhook_relay_source/lambda/validate_signature.py b/modules/integrations/github_webhook_relay_source/lambda/validate_signature.py index 9c30e9eb..2baaf6a5 100644 --- a/modules/integrations/github_webhook_relay_source/lambda/validate_signature.py +++ b/modules/integrations/github_webhook_relay_source/lambda/validate_signature.py @@ -16,28 +16,25 @@ def lambda_handler(event, _): - LOG.info('Received event for processing: %s', event) - signature = event['headers'].get('X-Hub-Signature-256', '') - body = event['body'] - - if SECRET: - digest = hmac.new(SECRET, body.encode(), hashlib.sha256).hexdigest() - if not signature.endswith(digest): - LOG.warning( - 'Signature mismatch: provided %s, expected digest %s', signature, digest) - return {'statusCode': 401, 'body': 'Invalid signature'} - try: + LOG.info('Received event for processing: %s', event) + signature = event['headers'].get('X-Hub-Signature-256', '') + body = event['body'] + + if SECRET: + digest = hmac.new(SECRET, body.encode(), + hashlib.sha256).hexdigest() + if not signature.endswith(digest): + LOG.warning( + 'Signature mismatch: provided %s, expected digest %s', signature, digest) + raise ValueError('Invalid signature') + payload = json.loads(body) - except json.JSONDecodeError as e: - LOG.error('JSON decode error: %s', e) - return {'statusCode': 400, 'body': 'Invalid JSON'} - gh_event = event['headers'].get('X-GitHub-Event', 'unknown') - action = 
payload.get('action', 'none') + gh_event = event['headers'].get('X-GitHub-Event', 'unknown') + action = payload.get('action', 'none') - detail_type = f"github.{gh_event}.{action}" + detail_type = f"github.{gh_event}.{action}" - try: response = eb.put_events( Entries=[ { @@ -50,8 +47,11 @@ def lambda_handler(event, _): ) LOG.info('Event forwarded to EventBridge %s, response: %s', EVENT_BUS, response) - except Exception as e: - LOG.error('Failed to put event to EventBridge: %s', e) - return {'statusCode': 500, 'body': 'Failed to forward event'} - return {'statusCode': 200, 'body': 'Event forwarded'} + return {'statusCode': 200, 'body': 'Event forwarded'} + except Exception as e: + LOG.exception( + 'Unhandled exception in validate_signature lambda. Error: %s', + str(e), + ) + raise diff --git a/modules/integrations/github_webhook_relay_source/versions.tf b/modules/integrations/github_webhook_relay_source/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/integrations/github_webhook_relay_source/versions.tf +++ b/modules/integrations/github_webhook_relay_source/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_aws_billing/README.md b/modules/integrations/splunk_aws_billing/README.md index 37fb941c..606c96af 100644 --- a/modules/integrations/splunk_aws_billing/README.md +++ b/modules/integrations/splunk_aws_billing/README.md @@ -3,9 +3,9 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | +| [terraform](#requirement\_terraform) | ~> 1.11 | | [archive](#requirement\_archive) | >= 2.7.0 | -| [aws](#requirement\_aws) | >= 5.27 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | | [null](#requirement\_null) | >= 3.2 | | [random](#requirement\_random) | >= 3.6 | @@ -14,15 +14,15 @@ | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [cur\_per\_resource](#module\_cur\_per\_resource) | terraform-aws-modules/lambda/aws | 8.1.0 | -| [cur\_per\_resource\_process](#module\_cur\_per\_resource\_process) | terraform-aws-modules/lambda/aws | 8.1.0 | -| [cur\_per\_service](#module\_cur\_per\_service) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [cur\_per\_resource](#module\_cur\_per\_resource) | terraform-aws-modules/lambda/aws | 8.7.0 | +| [cur\_per\_resource\_process](#module\_cur\_per\_resource\_process) | terraform-aws-modules/lambda/aws | 8.7.0 | +| [cur\_per\_service](#module\_cur\_per\_service) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources @@ -54,7 +54,7 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. 
| `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | diff --git a/modules/integrations/splunk_aws_billing/billing_per_resource.tf b/modules/integrations/splunk_aws_billing/billing_per_resource.tf index 3a1bdc6f..e9782aac 100644 --- a/modules/integrations/splunk_aws_billing/billing_per_resource.tf +++ b/modules/integrations/splunk_aws_billing/billing_per_resource.tf @@ -4,7 +4,7 @@ locals { module "cur_per_resource" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = local.cur_per_resource_lambda_name description = "Processes AWS Billing CUR reports per resource and sends data to Splunk" diff --git a/modules/integrations/splunk_aws_billing/billing_per_resource_process.tf b/modules/integrations/splunk_aws_billing/billing_per_resource_process.tf index 59bb764f..66e5db2e 100644 --- a/modules/integrations/splunk_aws_billing/billing_per_resource_process.tf +++ b/modules/integrations/splunk_aws_billing/billing_per_resource_process.tf @@ -4,7 +4,7 @@ locals { module "cur_per_resource_process" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = local.cur_per_resource_process_lambda_name description = "Processes AWS billing data and sends to Splunk" diff --git a/modules/integrations/splunk_aws_billing/billing_per_service.tf b/modules/integrations/splunk_aws_billing/billing_per_service.tf index d0d7a35f..db9ef3a1 100644 --- a/modules/integrations/splunk_aws_billing/billing_per_service.tf +++ b/modules/integrations/splunk_aws_billing/billing_per_service.tf @@ -4,7 +4,7 @@ locals { module "cur_per_service" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = local.cur_per_service_lambda_name description = "Processes AWS 
Billing CUR reports per service and sends data to Splunk" diff --git a/modules/integrations/splunk_aws_billing/lambda/handler_per_service.py b/modules/integrations/splunk_aws_billing/lambda/handler_per_service.py index 941e3b39..6f563039 100644 --- a/modules/integrations/splunk_aws_billing/lambda/handler_per_service.py +++ b/modules/integrations/splunk_aws_billing/lambda/handler_per_service.py @@ -10,7 +10,7 @@ import common import pandas as pd -LOG = logging.getLOG() +LOG = logging.getLogger() level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() LOG.setLevel(getattr(logging, level_str, logging.INFO)) diff --git a/modules/integrations/splunk_aws_billing/s3.tf b/modules/integrations/splunk_aws_billing/s3.tf index 8f068dfb..00fface4 100644 --- a/modules/integrations/splunk_aws_billing/s3.tf +++ b/modules/integrations/splunk_aws_billing/s3.tf @@ -48,6 +48,8 @@ resource "aws_s3_bucket_public_access_block" "aws_billing_report" { block_public_policy = true ignore_public_acls = true restrict_public_buckets = true + + skip_destroy = true } data "aws_iam_policy_document" "cur_bucket_policy" { diff --git a/modules/integrations/splunk_aws_billing/variables.tf b/modules/integrations/splunk_aws_billing/variables.tf index 20895611..bfbd4cac 100644 --- a/modules/integrations/splunk_aws_billing/variables.tf +++ b/modules/integrations/splunk_aws_billing/variables.tf @@ -1,6 +1,6 @@ variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." 
} variable "aws_region" { diff --git a/modules/integrations/splunk_aws_billing/versions.tf b/modules/integrations/splunk_aws_billing/versions.tf index f3cb4cba..813a327a 100644 --- a/modules/integrations/splunk_aws_billing/versions.tf +++ b/modules/integrations/splunk_aws_billing/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -23,5 +23,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_conf_shared/README.md b/modules/integrations/splunk_cloud_conf_shared/README.md index 8eb3bc1f..a07b3888 100644 --- a/modules/integrations/splunk_cloud_conf_shared/README.md +++ b/modules/integrations/splunk_cloud_conf_shared/README.md @@ -3,16 +3,16 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [splunk](#requirement\_splunk) | >= 1.4.30 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [splunk](#provider\_splunk) | 1.4.32 | +| [aws](#provider\_aws) | 6.35.1 | +| [splunk](#provider\_splunk) | 1.4.34 | ## Modules @@ -34,6 +34,7 @@ No modules. 
| [splunk_configs_conf.forgecicd_cloudwatchlogs_runner_gh_runner_version](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_cloudwatchlogs_runner_pages_github_repo_name](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_cloudwatchlogs_runner_tenant_fields](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | +| [splunk_configs_conf.forgecicd_extra_lambda_ec2_tenant_fields](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_extra_lambda_tenant_fields](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_kube_container_dind](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_kube_container_init_dind_externals](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | @@ -61,8 +62,10 @@ No modules. 
| [splunk_configs_conf.forgecicd_runner_logs_logs](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_runner_logs_tenant_fields_event](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_configs_conf.forgecicd_runner_logs_tenant_fields_logs](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | +| [splunk_configs_conf.forgecicd_trust_validation](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/configs_conf) | resource | | [splunk_data_ui_views.ci_jobs](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/data_ui_views) | resource | | [splunk_data_ui_views.tenant](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/data_ui_views) | resource | +| [splunk_data_ui_views.trust_relationship_validation](https://registry.terraform.io/providers/splunk/splunk/latest/docs/resources/data_ui_views) | resource | | [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | | [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | @@ -70,10 +73,10 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e. generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Assuming single region for now. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. 
| `map(string)` | n/a | yes | -| [splunk\_conf](#input\_splunk\_conf) | n/a |
object({
splunk_cloud = string
acl = object({
app = string
owner = string
sharing = string
read = list(string)
write = list(string)
})
index = string
})
| n/a | yes | +| [splunk\_conf](#input\_splunk\_conf) | n/a |
object({
splunk_cloud = string
acl = object({
app = string
owner = string
sharing = string
read = list(string)
write = list(string)
})
index = string
tenant_names = list(string)
})
| n/a | yes | ## Outputs diff --git a/modules/integrations/splunk_cloud_conf_shared/dashboard_jobs.tf b/modules/integrations/splunk_cloud_conf_shared/dashboard_jobs.tf index c1538c08..7a58128b 100644 --- a/modules/integrations/splunk_cloud_conf_shared/dashboard_jobs.tf +++ b/modules/integrations/splunk_cloud_conf_shared/dashboard_jobs.tf @@ -2,7 +2,8 @@ locals { job_definition = templatefile( "${path.module}/template_files/ci_jobs.json.tftpl", { - splunk_index = var.splunk_conf.index + splunk_index = var.splunk_conf.index, + tenants = var.splunk_conf.tenant_names } ) job_eai_data = < + + + + + + + + + +EOF +} + +resource "splunk_data_ui_views" "trust_relationship_validation" { + name = "trust_relationship_validation" + eai_data = local.trust_relationship_validation_eai_data + + acl { + app = var.splunk_conf.acl.app + owner = var.splunk_conf.acl.owner + sharing = var.splunk_conf.acl.sharing + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } +} diff --git a/modules/integrations/splunk_cloud_conf_shared/props_billing_cur.tf b/modules/integrations/splunk_cloud_conf_shared/props_billing_cur.tf index e245a02f..4d21f223 100644 --- a/modules/integrations/splunk_cloud_conf_shared/props_billing_cur.tf +++ b/modules/integrations/splunk_cloud_conf_shared/props_billing_cur.tf @@ -5,9 +5,14 @@ resource "splunk_configs_conf" "forgecicd_aws_billing_cur" { "REPORT-forgecicd_billing_cur_instance_id" = "forgecicd_billing_cur_instance_id" "REPORT-forgecicd_billing_cur_volume_id" = "forgecicd_billing_cur_volume_id" } + + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], diff --git a/modules/integrations/splunk_cloud_conf_shared/props_cloudwatchlogs.tf b/modules/integrations/splunk_cloud_conf_shared/props_cloudwatchlogs.tf index 1f0c1e9e..802b79c1 100644 --- 
a/modules/integrations/splunk_cloud_conf_shared/props_cloudwatchlogs.tf +++ b/modules/integrations/splunk_cloud_conf_shared/props_cloudwatchlogs.tf @@ -6,10 +6,17 @@ resource "splunk_configs_conf" "forgecicd_cloudwatchlogs" { "REPORT-forgecicd_cloudwatchlogs_lambda_tenant_fields" = "forgecicd_cloudwatchlogs_lambda_tenant_fields" "REPORT-forgecicd_cloudwatchlogs_global_lambda_tenant_fields" = "forgecicd_cloudwatchlogs_global_lambda_tenant_fields" "REPORT-forgecicd_extra_lambda_tenant_fields" = "forgecicd_extra_lambda_tenant_fields" + "REPORT-forgecicd_trust_validation" = "forgecicd_trust_validation" + "REPORT-forgecicd_extra_lambda_ec2_tenant_fields" = "forgecicd_extra_lambda_ec2_tenant_fields" } + + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], diff --git a/modules/integrations/splunk_cloud_conf_shared/props_ec2.tf b/modules/integrations/splunk_cloud_conf_shared/props_ec2.tf index 26fc140c..3eb82ebe 100644 --- a/modules/integrations/splunk_cloud_conf_shared/props_ec2.tf +++ b/modules/integrations/splunk_cloud_conf_shared/props_ec2.tf @@ -8,16 +8,14 @@ resource "splunk_configs_conf" "forgecicd_cloudwatchlogs_forgecicd" { "REPORT-forgecicd_cloudwatchlogs_runner_ci_result" = "forgecicd_cloudwatchlogs_runner_ci_result" "REPORT-forgecicd_cloudwatchlogs_runner_gh_runner_version" = "forgecicd_cloudwatchlogs_runner_gh_runner_version" } + acl { - app = var.splunk_conf.acl.app - owner = var.splunk_conf.acl.owner - sharing = var.splunk_conf.acl.sharing - read = var.splunk_conf.acl.read - write = var.splunk_conf.acl.write + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], diff --git a/modules/integrations/splunk_cloud_conf_shared/props_k8s.tf 
b/modules/integrations/splunk_cloud_conf_shared/props_k8s.tf index e47e2812..44440bd5 100644 --- a/modules/integrations/splunk_cloud_conf_shared/props_k8s.tf +++ b/modules/integrations/splunk_cloud_conf_shared/props_k8s.tf @@ -7,9 +7,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_runner" { "REPORT-forgecicd_kube_container_runner_gh_runner_version" = "forgecicd_kube_container_runner_gh_runner_version" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -67,9 +71,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_init_docker_creds" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -125,9 +133,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_init_dind_rootless" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -183,9 +195,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_init_work" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -241,9 +257,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_init_dind_externals" { 
"REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -299,9 +319,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_dind" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -357,9 +381,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_listener" { "REPORT-forgecicd_kube_container_listener_tenant_fields" = "forgecicd_kube_container_listener_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -415,9 +443,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_manager" { "REPORT-forgecicd_kube_container_manager_tenant_fields" = "forgecicd_kube_container_manager_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -472,9 +504,13 @@ resource "splunk_configs_conf" "forgecicd_kube_container_log_worker" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -529,9 +565,13 @@ resource 
"splunk_configs_conf" "forgecicd_kube_container_log_hook" { "REPORT-forgecicd_kube_container_runner_tenant_fields" = "forgecicd_kube_container_runner_tenant_fields" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], diff --git a/modules/integrations/splunk_cloud_conf_shared/props_runner_logs.tf b/modules/integrations/splunk_cloud_conf_shared/props_runner_logs.tf index c5b43240..7382f80b 100644 --- a/modules/integrations/splunk_cloud_conf_shared/props_runner_logs.tf +++ b/modules/integrations/splunk_cloud_conf_shared/props_runner_logs.tf @@ -7,9 +7,13 @@ resource "splunk_configs_conf" "forgecicd_runner_logs_json" { "REPORT-forgecicd_runner_arc" = "forgecicd_runner_arc" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], @@ -68,9 +72,13 @@ resource "splunk_configs_conf" "forgecicd_runner_logs_logs" { "REPORT-forgecicd_runner_arc" = "forgecicd_runner_arc" } + acl { + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { ignore_changes = [ - acl, variables["ADD_EXTRA_TIME_FIELDS"], variables["ANNOTATE_PUNCT"], variables["AUTO_KV_JSON"], diff --git a/modules/integrations/splunk_cloud_conf_shared/template_files/ci_jobs.json.tftpl b/modules/integrations/splunk_cloud_conf_shared/template_files/ci_jobs.json.tftpl index c44b402f..0b9b33ee 100644 --- a/modules/integrations/splunk_cloud_conf_shared/template_files/ci_jobs.json.tftpl +++ b/modules/integrations/splunk_cloud_conf_shared/template_files/ci_jobs.json.tftpl @@ -1,6 +1,6 @@ { "title": "CI Job Result", - "description": "", + "description": "Shows recent Forge GitHub Actions job executions per tenant, including status and timing details.", "inputs": { 
"input_4A2iEpn6": { "options": { @@ -27,12 +27,12 @@ "value": "*" }, { - "label": "Succeeded", - "value": "Succeeded" + "label": "Success", + "value": "success" }, { - "label": "Failed", - "value": "Failed" + "label": "Failure", + "value": "failure" }, { "label": "Cancelled", @@ -45,29 +45,15 @@ "type": "input.dropdown" }, "input_uFiEIG6X": { - "context": { - "formattedConfig": { - "number": { - "prefix": "" - } - }, - "formattedStatics": ">statics | formatByType(formattedConfig)", - "label": ">primary | seriesByName(\"forgecicd_tenant\") | renameSeries(\"label\") | formatByType(formattedConfig)", - "statics": [ - [ - "Select Tenant" - ], - [ - "-" - ] - ], - "value": ">primary | seriesByName(\"forgecicd_tenant\") | renameSeries(\"value\") | formatByType(formattedConfig)" - }, - "dataSources": { - "primary": "ds_x8AMw4ri" - }, + "context": {}, + "dataSources": {}, "options": { - "items": ">frame(label, value) | prepend(formattedStatics) | objects()", + "items": ${jsonencode( + concat( + [{ label = "Select Tenant", value = "-" }], + [for tenant in tenants : { label = tenant, value = tenant }] + ) + )}, "selectFirstSearchResult": true, "token": "dd_8enHUmpH" }, @@ -124,40 +110,11 @@ } }, "dataSources": { - "ds_5a4R8xaz": { - "name": "log type list", - "options": { - "query": "index=\"${splunk_index}\" | stats count by forgecicd_log_type | table forgecicd_log_type", - "queryParameters": { - "earliest": "$global_time.earliest$", - "latest": "$global_time.latest$" - } - }, - "type": "ds.search" - }, - "ds_iONxCEsT": { - "name": "logs", - "options": { - "query": "index=\"${splunk_index}\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_log_type\n=\"$dd_ZlvrLiNG$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n| sort _time asc" - }, - "type": "ds.search" - }, - "ds_x8AMw4ri": { - "name": "tenant list", - "options": { - "query": "index=\"${splunk_index}\" | stats count by forgecicd_tenant | table forgecicd_tenant", - "queryParameters": { - "earliest": "$global_time.earliest$", - 
"latest": "$global_time.latest$" - } - }, - "type": "ds.search" - }, "ds_yDvZOb30": { "name": "GH Job Details search", "options": { "enableSmartSources": true, - "query": "index=\"${splunk_index}\" forgecicd_log_type=hook type=\"completed\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n| eval repo_url = GITHUB_SERVER_URL . \"/\" . GITHUB_REPOSITORY\n| eval run_url = repo_url . \"/actions/runs/\" . GITHUB_RUN_ID . \"/attempts/\" . GITHUB_RUN_ATTEMPT\n| fields \n GITHUB_SHA,\n run_url,\n repo_url,\n GITHUB_RUN_ID,\n GITHUB_RUN_ATTEMPT,\n GITHUB_ACTOR,\n GITHUB_TRIGGERING_ACTOR,\n GITHUB_WORKFLOW,\n GITHUB_HEAD_REF,\n GITHUB_BASE_REF,\n GITHUB_EVENT_NAME,\n forgecicd_instance_id\n| join forgecicd_instance_id type=left [\n search index=\"${splunk_index}\" forgecicd_log_type=runner forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\" ci_result=*\n | fields forgecicd_instance_id, ci_result, job_name\n]\n| join forgecicd_instance_id type=left [\n search index=\"${splunk_index}\" forgecicd_log_type=hook (type=\"started\" OR type=\"completed\") forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n | stats earliest(_time) as start latest(_time) as end by forgecicd_instance_id\n | eval duration_minutes = round((end - start) / 60, 2)\n | fields forgecicd_instance_id, duration_minutes\n]\n| search ci_result=\"$dd_sHd7IGav$\"\n| table \n GITHUB_SHA,\n duration_minutes,\n run_url,\n repo_url,\n GITHUB_RUN_ID,\n GITHUB_RUN_ATTEMPT,\n GITHUB_ACTOR,\n GITHUB_TRIGGERING_ACTOR,\n GITHUB_WORKFLOW,\n GITHUB_HEAD_REF,\n GITHUB_BASE_REF,\n GITHUB_EVENT_NAME,\n forgecicd_instance_id,\n job_name,\n ci_result\n", + "query": "index=\"${splunk_index}\" forgecicd_log_type=\"runner-job-event\" \nconclusion=\"$dd_sHd7IGav$\"\nforgecicd_instance_id=\"$text_9yxn14bn$\"\nforgecicd_tenant=\"$dd_8enHUmpH$\"\n| table \n workflow_job.html_url \n workflow_job.conclusion\n workflow_job.name \n workflow_job.run_id \n 
workflow_job.run_attempt \n workflow_job.id \n workflow_job.created_at \n workflow_job.started_at\n workflow_job.completed_at \n workflow_job.runner_name \n forgecicd_instance_id\n| rename \n workflow_job.html_url as run_url\n workflow_job.conclusion as ci_result\n workflow_job.name as workflow_name\n workflow_job.run_id as workflow_run_id\n workflow_job.run_attempt as workflow_run_attempt\n workflow_job.id as workflow_job_id\n workflow_job.created_at as created_at\n workflow_job.started_at as started_at\n workflow_job.completed_at as completed_at\n workflow_job.runner_name as runner_name\n| eval created_epoch = strptime(created_at, \"%Y-%m-%dT%H:%M:%SZ\")\n| eval started_epoch = strptime(started_at, \"%Y-%m-%dT%H:%M:%SZ\")\n| eval completed_epoch = strptime(completed_at, \"%Y-%m-%dT%H:%M:%SZ\")\n| eval duration_created_to_started = tostring(started_epoch - created_epoch, \"duration\")\n| eval duration_started_to_completed = tostring(completed_epoch - started_epoch, \"duration\")\n| eval duration_total = tostring(completed_epoch - created_epoch, \"duration\")\n| table \n run_url \n ci_result \n workflow_name \n workflow_run_id \n workflow_run_attempt \n workflow_job_id \n created_at \n started_at \n completed_at \n duration_created_to_started \n duration_started_to_completed \n duration_total \n runner_name \n forgecicd_instance_id", "queryParameters": { "earliest": "$global_time.earliest$", "latest": "$global_time.latest$" diff --git a/modules/integrations/splunk_cloud_conf_shared/template_files/tenant.json.tftpl b/modules/integrations/splunk_cloud_conf_shared/template_files/tenant.json.tftpl index a3ece1fd..e7d2e32e 100644 --- a/modules/integrations/splunk_cloud_conf_shared/template_files/tenant.json.tftpl +++ b/modules/integrations/splunk_cloud_conf_shared/template_files/tenant.json.tftpl @@ -1,6 +1,6 @@ { "title": "Tenant Logs", - "description": "", + "description": "Displays Forge runner logs for a selected tenant, log type, and time range.", "inputs": { 
"input_4A2iEpn6": { "options": { @@ -36,29 +36,15 @@ "type": "input.timerange" }, "input_uFiEIG6X": { - "context": { - "formattedConfig": { - "number": { - "prefix": "" - } - }, - "formattedStatics": ">statics | formatByType(formattedConfig)", - "label": ">primary | seriesByName(\"forgecicd_tenant\") | renameSeries(\"label\") | formatByType(formattedConfig)", - "statics": [ - [ - "Select Tenant" - ], - [ - "-" - ] - ], - "value": ">primary | seriesByName(\"forgecicd_tenant\") | renameSeries(\"value\") | formatByType(formattedConfig)" - }, - "dataSources": { - "primary": "ds_x8AMw4ri" - }, + "context": {}, + "dataSources": {}, "options": { - "items": ">frame(label, value) | prepend(formattedStatics) | objects()", + "items": ${jsonencode( + concat( + [{ label = "Select Tenant", value = "-" }], + [for tenant in tenants : { label = tenant, value = tenant }] + ) + )}, "selectFirstSearchResult": true, "token": "dd_8enHUmpH" }, @@ -114,7 +100,7 @@ "ds_iONxCEsT": { "name": "logs", "options": { - "query": "index=\"${splunk_index}\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_log_type\n=\"$dd_ZlvrLiNG$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n| sort _time asc" + "query": "index=\"${splunk_index}\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_log_type=\"$dd_ZlvrLiNG$\"\n| where \"$text_9yxn14bn$\"=\"*\" OR forgecicd_instance_id=\"$text_9yxn14bn$\"\n| sort _time asc" }, "type": "ds.search" }, @@ -128,18 +114,6 @@ } }, "type": "ds.search" - }, - "ds_yDvZOb30": { - "name": "GH Job Details search", - "options": { - "enableSmartSources": true, - "query": "index=\"${splunk_index}\" forgecicd_log_type=hook type=\"completed\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n| eval repo_url = GITHUB_SERVER_URL . \"/\" . GITHUB_REPOSITORY\n| eval run_url = repo_url . \"/actions/runs/\" . GITHUB_RUN_ID . \"/attempts/\" . 
GITHUB_RUN_ATTEMPT\n| fields \n GITHUB_SHA,\n run_url,\n repo_url,\n GITHUB_RUN_ID,\n GITHUB_RUN_ATTEMPT,\n GITHUB_ACTOR,\n GITHUB_TRIGGERING_ACTOR,\n GITHUB_WORKFLOW,\n GITHUB_HEAD_REF,\n GITHUB_BASE_REF,\n GITHUB_EVENT_NAME,\n forgecicd_instance_id\n| join forgecicd_instance_id type=left [\n search index=\"${splunk_index}\" forgecicd_log_type=runner forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\" ci_result=*\n | fields forgecicd_instance_id, ci_result, job_name\n]\n| join forgecicd_instance_id type=left [\n search index=\"${splunk_index}\" forgecicd_log_type=hook (type=\"started\" OR type=\"completed\") forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_instance_id=\"$text_9yxn14bn$\"\n | stats earliest(_time) as start latest(_time) as end by forgecicd_instance_id\n | eval duration_minutes = round((end - start) / 60, 2)\n | fields forgecicd_instance_id, duration_minutes\n]\n| table \n GITHUB_SHA,\n duration_minutes,\n run_url,\n repo_url,\n GITHUB_RUN_ID,\n GITHUB_RUN_ATTEMPT,\n GITHUB_ACTOR,\n GITHUB_TRIGGERING_ACTOR,\n GITHUB_WORKFLOW,\n GITHUB_HEAD_REF,\n GITHUB_BASE_REF,\n GITHUB_EVENT_NAME,\n forgecicd_instance_id,\n job_name,\n ci_result\n", - "queryParameters": { - "earliest": "$global_time.earliest$", - "latest": "$global_time.latest$" - } - }, - "type": "ds.search" } }, "layout": { diff --git a/modules/integrations/splunk_cloud_conf_shared/template_files/trust_relationship_validation.json.tftpl b/modules/integrations/splunk_cloud_conf_shared/template_files/trust_relationship_validation.json.tftpl new file mode 100644 index 00000000..5ef62189 --- /dev/null +++ b/modules/integrations/splunk_cloud_conf_shared/template_files/trust_relationship_validation.json.tftpl @@ -0,0 +1,121 @@ +{ + "title": "Trust Relationship", + "description": "", + "inputs": { + "input_global_trp": { + "options": { + "defaultValue": "-7d@h,now", + "token": "global_time" + }, + "title": "Global Time Range", + "type": "input.timerange" + }, + "input_uFiEIG6X": { 
+ "context": {}, + "dataSources": {}, + "options": { + "items": ${jsonencode( + concat( + [{ label = "Select Tenant", value = "-" }], + [for tenant in tenants : { label = tenant, value = tenant }] + ) + )}, + "selectFirstSearchResult": true, + "token": "dd_8enHUmpH" + }, + "title": "Forge Tenant", + "type": "input.dropdown" + } + }, + "defaults": { + "dataSources": { + "ds.o11y": { + "options": { + "queryParameters": { + "earliest": "$global_time.earliest$", + "latest": "$global_time.latest$" + } + } + }, + "ds.search": { + "options": { + "queryParameters": { + "earliest": "$global_time.earliest$", + "latest": "$global_time.latest$" + } + } + } + } + }, + "visualizations": { + "viz_YqZBTBIB": { + "containerOptions": {}, + "dataSources": { + "primary": "ds_yDvZOb30" + }, + "eventHandlers": [], + "options": { + "count": 20 + }, + "showLastUpdated": false, + "showProgressBar": false, + "title": "", + "type": "splunk.table" + } + }, + "dataSources": { + "ds_yDvZOb30": { + "name": "Trust relationship validation", + "options": { + "enableSmartSources": true, + "query": "index=\"${splunk_index}\" forgecicd_tenant=\"$dd_8enHUmpH$\" forgecicd_log_type=\"forge-trust-validator\" \"Validation complete\"\n| eventstats max(_time) as tenant_last_seen by forgecicd_tenant\n| where _time=tenant_last_seen\n\n| mvexpand forgecicd_trust_validation\n| eval trust_json=tostring(forgecicd_trust_validation)\n| spath input=trust_json path={} output=item\n| mvexpand item\n| spath input=item path=forge_role_arn output=forge_role\n| spath input=item path=tenant_results{} output=tenant_results\n| mvexpand tenant_results\n| spath input=tenant_results\n\n| eval assume_status=if(tostring(assume_role_success)==\"true\",\"assumed\",\"not assumed\")\n| eval tag_status=if(tostring(tag_session_success)==\"true\",\"session tagged\",\"session not tagged\")\n\n| table forgecicd_tenant forge_role tenant_role_arn assume_status assume_role_error tag_status tag_session_error\n| sort forge_role 
tenant_role_arn", + "queryParameters": { + "earliest": "$global_time.earliest$", + "latest": "$global_time.latest$" + } + }, + "type": "ds.search" + } + }, + "layout": { + "globalInputs": [ + "input_global_trp", + "input_uFiEIG6X" + ], + "layoutDefinitions": { + "layout_6CHuojec": { + "options": { + "gutterSize": 9 + }, + "structure": [ + { + "item": "viz_YqZBTBIB", + "position": { + "h": 974, + "w": 1200, + "x": 0, + "y": 0 + }, + "type": "block" + } + ], + "type": "grid" + } + }, + "options": {}, + "tabs": { + "items": [ + { + "label": "Trust Relationship Validation", + "layoutId": "layout_6CHuojec" + } + ] + } + }, + "applicationProperties": { + "collapseNavigation": true, + "hideEdit": false, + "hideExport": false, + "hideOpenInSearch": false + } +} diff --git a/modules/integrations/splunk_cloud_conf_shared/transforms_lambda.tf b/modules/integrations/splunk_cloud_conf_shared/transforms_lambda.tf index 96f40eab..9f2d0c58 100644 --- a/modules/integrations/splunk_cloud_conf_shared/transforms_lambda.tf +++ b/modules/integrations/splunk_cloud_conf_shared/transforms_lambda.tf @@ -2,7 +2,7 @@ resource "splunk_configs_conf" "forgecicd_extra_lambda_tenant_fields" { name = "transforms/forgecicd_extra_lambda_tenant_fields" variables = { - "REGEX" = "(?[^:]+):\\/aws\\/lambda\\/(?[a-z0-9]+)-(?[a-z0-9]+)-(?[a-z0-9]+)-(?github-app-runner-group|github-clean-global-lock)" + "REGEX" = "(?[^:]+):\\/aws\\/lambda\\/(?[a-z0-9]+)-(?[a-z0-9]+)-(?[a-z0-9]+)-(?register-github-app-runner-group|github-webhook-relay|clean-global-lock|job-log-archiver|job-log-dispatcher|forge-trust-validator|redrive-deadletter)" "FORMAT" = "aws_region::$1 forgecicd_tenant::$2 forgecicd_region_alias::$3 forgecicd_vpc_alias::$4 forgecicd_log_type::$5" "SOURCE_KEY" = "source" "CLEAN_KEYS" = "0" @@ -29,3 +29,68 @@ resource "splunk_configs_conf" "forgecicd_extra_lambda_tenant_fields" { ] } } + +resource "splunk_configs_conf" "forgecicd_extra_lambda_ec2_tenant_fields" { + name = 
"transforms/forgecicd_extra_lambda_ec2_tenant_fields" + + variables = { + "REGEX" = "(?[^:]+):\\/aws\\/lambda\\/(?[a-z0-9]+)-(?[a-z0-9]+)-(?[a-z0-9]+)-(?|ec2-update-runner-ssm-ami|ec2-update-runner-tags)" + "FORMAT" = "aws_region::$1 forgecicd_tenant::$2 forgecicd_region_alias::$3 forgecicd_vpc_alias::$4 forgecicd_log_type::$5 forgecicd_type::ec2" + "SOURCE_KEY" = "source" + "CLEAN_KEYS" = "0" + } + acl { + app = var.splunk_conf.acl.app + owner = var.splunk_conf.acl.owner + sharing = var.splunk_conf.acl.sharing + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { + ignore_changes = [ + variables["CAN_OPTIMIZE"], + variables["DEFAULT_VALUE"], + variables["DEPTH_LIMIT"], + variables["DEST_KEY"], + variables["KEEP_EMPTY_VALS"], + variables["LOOKAHEAD"], + variables["MATCH_LIMIT"], + variables["MV_ADD"], + variables["WRITE_META"], + variables["disabled"] + ] + } +} + +resource "splunk_configs_conf" "forgecicd_trust_validation" { + name = "transforms/forgecicd_trust_validation" + + variables = { + REGEX = "Validation complete:\\s*(\\[[^\\r\\n]+])" + FORMAT = "forgecicd_trust_validation::$1" + SOURCE_KEY = "_raw" + CLEAN_KEYS = "0" + } + + acl { + app = var.splunk_conf.acl.app + owner = var.splunk_conf.acl.owner + sharing = var.splunk_conf.acl.sharing + read = var.splunk_conf.acl.read + write = var.splunk_conf.acl.write + } + lifecycle { + ignore_changes = [ + variables["CAN_OPTIMIZE"], + variables["DEFAULT_VALUE"], + variables["DEPTH_LIMIT"], + variables["DEST_KEY"], + variables["KEEP_EMPTY_VALS"], + variables["LOOKAHEAD"], + variables["MATCH_LIMIT"], + variables["MV_ADD"], + variables["WRITE_META"], + variables["disabled"] + ] + } +} diff --git a/modules/integrations/splunk_cloud_conf_shared/variables.tf b/modules/integrations/splunk_cloud_conf_shared/variables.tf index e35c6b6f..2e3a7d93 100644 --- a/modules/integrations/splunk_cloud_conf_shared/variables.tf +++ b/modules/integrations/splunk_cloud_conf_shared/variables.tf @@ -1,6 
+1,6 @@ variable "aws_profile" { type = string - description = "AWS profile (i.e. generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { @@ -23,6 +23,7 @@ variable "splunk_conf" { read = list(string) write = list(string) }) - index = string + index = string + tenant_names = list(string) }) } diff --git a/modules/integrations/splunk_cloud_conf_shared/versions.tf b/modules/integrations/splunk_cloud_conf_shared/versions.tf index 7b0eef9a..790b95f2 100644 --- a/modules/integrations/splunk_cloud_conf_shared/versions.tf +++ b/modules/integrations/splunk_cloud_conf_shared/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } splunk = { source = "splunk/splunk" @@ -11,5 +11,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_data_manager/cloudwatch.tf b/modules/integrations/splunk_cloud_data_manager/cloudwatch.tf index 37bc02f7..a2a0a1e2 100644 --- a/modules/integrations/splunk_cloud_data_manager/cloudwatch.tf +++ b/modules/integrations/splunk_cloud_data_manager/cloudwatch.tf @@ -35,7 +35,7 @@ locals { iamRegion = var.aws_region regions = var.cloudwatch_log_groups_config.regions datasetInfo = local.dataset_info_cloudwatch - dataAccounts = [var.aws_account_id] + dataAccounts = [data.aws_caller_identity.current.account_id] resourceTags = local.resource_tags } } diff --git a/modules/integrations/splunk_cloud_data_manager/custom_cloudwatch.tf b/modules/integrations/splunk_cloud_data_manager/custom_cloudwatch.tf index 2138ab7c..fcc4adfb 100644 --- a/modules/integrations/splunk_cloud_data_manager/custom_cloudwatch.tf +++ b/modules/integrations/splunk_cloud_data_manager/custom_cloudwatch.tf @@ -46,7 +46,7 @@ locals { } } } - dataAccounts = [var.aws_account_id] + dataAccounts = [data.aws_caller_identity.current.account_id] resourceTags = 
local.resource_tags } } diff --git a/modules/integrations/splunk_cloud_data_manager/data.tf b/modules/integrations/splunk_cloud_data_manager/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/integrations/splunk_cloud_data_manager/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/integrations/splunk_cloud_data_manager/data_input/locals.tf b/modules/integrations/splunk_cloud_data_manager/data_input/locals.tf index a3855193..790ca9f9 100644 --- a/modules/integrations/splunk_cloud_data_manager/data_input/locals.tf +++ b/modules/integrations/splunk_cloud_data_manager/data_input/locals.tf @@ -1,7 +1,7 @@ locals { name = "SplunkDMDataIngest-${random_uuid.splunk_input_uuid.result}" - template_url = "https://${var.cloudformation_s3_config.bucket}.s3.amazonaws.com/${var.cloudformation_s3_config.key}${random_uuid.splunk_input_uuid.result}/template.json" + template_url = "https://${var.cloudformation_s3_config.bucket}.s3.amazonaws.com/${var.cloudformation_s3_config.key}${random_uuid.splunk_input_uuid.result}/${data.external.splunk_dm_version.result.template_hash}/template.json" tags = merge( var.tags_all, @@ -15,5 +15,4 @@ locals { SplunkDMVersion = data.external.splunk_dm_version.result["version"] } ) - } diff --git a/modules/integrations/splunk_cloud_data_manager/data_input/s3_template.tf b/modules/integrations/splunk_cloud_data_manager/data_input/s3_template.tf index cda178d1..036c7f55 100644 --- a/modules/integrations/splunk_cloud_data_manager/data_input/s3_template.tf +++ b/modules/integrations/splunk_cloud_data_manager/data_input/s3_template.tf @@ -1,10 +1,11 @@ resource "aws_s3_object" "cloudformation_template" { bucket = var.cloudformation_s3_config.bucket - key = "${var.cloudformation_s3_config.key}${random_uuid.splunk_input_uuid.result}/template.json" + key = 
"${var.cloudformation_s3_config.key}${random_uuid.splunk_input_uuid.result}/${data.external.splunk_dm_version.result.template_hash}/template.json" source = "/tmp/${random_uuid.splunk_input_uuid.result}_template.json" depends_on = [ null_resource.create_integration, + data.external.splunk_dm_version, random_uuid.splunk_input_uuid, ] } diff --git a/modules/integrations/splunk_cloud_data_manager/data_input/scripts/get_splunk_integration.sh b/modules/integrations/splunk_cloud_data_manager/data_input/scripts/get_splunk_integration.sh index 180e42d3..c2516aa3 100755 --- a/modules/integrations/splunk_cloud_data_manager/data_input/scripts/get_splunk_integration.sh +++ b/modules/integrations/splunk_cloud_data_manager/data_input/scripts/get_splunk_integration.sh @@ -118,5 +118,8 @@ curl "${splunk_cloud}/en-GB/splunkd/__raw/servicesNS/nobody/data_manager/cloudin -H "X-Splunk-Form-Key: $SPLUNKWEB_CSRF_TOKEN_8443" \ -o /tmp/${splunk_input_uuid}_template.json >>/tmp/${splunk_input_uuid}_logs.txt 2>&1 -# Output the version -cat /tmp/${splunk_input_uuid}_input.json | jq -c '{version: .details.version}' +# Calculate the hash of the template +TEMPLATE_HASH=$(shasum -a 256 /tmp/${splunk_input_uuid}_template.json | awk '{print $1}') + +# Output the version and template hash +cat /tmp/${splunk_input_uuid}_input.json | jq -c --arg hash "$TEMPLATE_HASH" '{version: .details.version, template_hash: $hash}' diff --git a/modules/integrations/splunk_cloud_data_manager/data_input/versions.tf b/modules/integrations/splunk_cloud_data_manager/data_input/versions.tf index 6ed9baaa..86cb8743 100644 --- a/modules/integrations/splunk_cloud_data_manager/data_input/versions.tf +++ b/modules/integrations/splunk_cloud_data_manager/data_input/versions.tf @@ -3,11 +3,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } random = { source = "hashicorp/random" - version = ">= 3.7.1" + version = ">= 3.6" } external = { source = 
"hashicorp/external" @@ -19,7 +19,7 @@ terraform { } local = { source = "hashicorp/local" - version = ">= 2.5.2" + version = ">= 2.5" } time = { source = "hashicorp/time" @@ -28,5 +28,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_data_manager/sec_meta.tf b/modules/integrations/splunk_cloud_data_manager/sec_meta.tf index a3e83845..a073f315 100644 --- a/modules/integrations/splunk_cloud_data_manager/sec_meta.tf +++ b/modules/integrations/splunk_cloud_data_manager/sec_meta.tf @@ -40,7 +40,7 @@ locals { iamRegion = var.aws_region regions = var.security_metadata_config.regions datasetInfo = local.dataset_info_security_metadata - dataAccounts = [var.aws_account_id] + dataAccounts = [data.aws_caller_identity.current.account_id] resourceTags = local.resource_tags } } diff --git a/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/main.tf b/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/main.tf index 1e80c58c..45d84394 100644 --- a/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/main.tf +++ b/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/main.tf @@ -1,6 +1,6 @@ module "splunk_dm_metadata_ec2inst_pattern_tags_lambda" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" role_name = "SplunkDMMetadataEC2InstPatternTags-${var.region}" diff --git a/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/versions.tf b/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/versions.tf +++ b/modules/integrations/splunk_cloud_data_manager/sec_meta_ec2_tags/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_data_manager/variables.tf b/modules/integrations/splunk_cloud_data_manager/variables.tf index fde61872..d7427534 100644 --- a/modules/integrations/splunk_cloud_data_manager/variables.tf +++ b/modules/integrations/splunk_cloud_data_manager/variables.tf @@ -1,11 +1,6 @@ -variable "aws_account_id" { - description = "AWS account ID (not SL AWS account ID) associated with the infra/backend." - type = string -} - variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { diff --git a/modules/integrations/splunk_cloud_data_manager/versions.tf b/modules/integrations/splunk_cloud_data_manager/versions.tf index 378b9c90..86cb8743 100644 --- a/modules/integrations/splunk_cloud_data_manager/versions.tf +++ b/modules/integrations/splunk_cloud_data_manager/versions.tf @@ -3,11 +3,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } random = { source = "hashicorp/random" - version = ">=3.7.1" + version = ">= 3.6" } external = { source = "hashicorp/external" @@ -19,7 +19,7 @@ terraform { } local = { source = "hashicorp/local" - version = ">= 2.5.2" + version = ">= 2.5" } time = { source = "hashicorp/time" @@ -28,5 +28,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_data_manager_common/README.md b/modules/integrations/splunk_cloud_data_manager_common/README.md index 4a28a663..78899e79 100644 --- a/modules/integrations/splunk_cloud_data_manager_common/README.md +++ b/modules/integrations/splunk_cloud_data_manager_common/README.md @@ -3,15 +3,15 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | ## Modules @@ -24,6 +24,7 @@ No modules. |------|------| | [aws_iam_role.splunk_dm_read_only](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.splunk_dm_policy_attachment](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.splunk_dm_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | | [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | @@ -34,8 +35,7 @@ No modules. 
| Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_account\_id](#input\_aws\_account\_id) | AWS account ID (not SL AWS account ID) associated with the infra/backend. | `string` | n/a | yes | -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [splunk\_cloud](#input\_splunk\_cloud) | Splunk Cloud endpoint. | `string` | n/a | yes | diff --git a/modules/integrations/splunk_cloud_data_manager_common/data.tf b/modules/integrations/splunk_cloud_data_manager_common/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/integrations/splunk_cloud_data_manager_common/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/integrations/splunk_cloud_data_manager_common/role.tf b/modules/integrations/splunk_cloud_data_manager_common/role.tf index 66601716..53c9e0c3 100644 --- a/modules/integrations/splunk_cloud_data_manager_common/role.tf +++ b/modules/integrations/splunk_cloud_data_manager_common/role.tf @@ -11,15 +11,15 @@ data "aws_iam_policy_document" "splunk_dm_policy" { "iam:GetPolicyVersion" ] resources = [ - "arn:aws:iam::${var.aws_account_id}:role/SplunkDM*", - "arn:aws:iam::${var.aws_account_id}:policy/*" + "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/SplunkDM*", + "arn:aws:iam::${data.aws_caller_identity.current.account_id}:policy/*" ] } statement { effect = "Allow" actions = ["guardduty:GetMasterAccount"] - resources = ["arn:aws:guardduty:*:${var.aws_account_id}:detector/*"] + resources = ["arn:aws:guardduty:*:${data.aws_caller_identity.current.account_id}:detector/*"] } 
statement { @@ -30,7 +30,7 @@ data "aws_iam_policy_document" "splunk_dm_policy" { "securityhub:ListMembers", "securityhub:ListInvitations" ] - resources = ["arn:aws:securityhub:*:${var.aws_account_id}:hub/default"] + resources = ["arn:aws:securityhub:*:${data.aws_caller_identity.current.account_id}:hub/default"] } statement { @@ -39,7 +39,7 @@ data "aws_iam_policy_document" "splunk_dm_policy" { "cloudformation:DescribeStacks", "cloudformation:GetTemplate" ] - resources = ["arn:aws:cloudformation:*:${var.aws_account_id}:stack/SplunkDM*/*"] + resources = ["arn:aws:cloudformation:*:${data.aws_caller_identity.current.account_id}:stack/SplunkDM*/*"] } statement { @@ -65,19 +65,19 @@ data "aws_iam_policy_document" "splunk_dm_policy" { "logs:DescribeLogGroups", "logs:DescribeSubscriptionFilters" ] - resources = ["arn:aws:logs:*:${var.aws_account_id}:log-group:*"] + resources = ["arn:aws:logs:*:${data.aws_caller_identity.current.account_id}:log-group:*"] } statement { effect = "Allow" actions = ["firehose:DescribeDeliveryStream"] - resources = ["arn:aws:firehose:*:${var.aws_account_id}:deliverystream/SplunkDM*"] + resources = ["arn:aws:firehose:*:${data.aws_caller_identity.current.account_id}:deliverystream/SplunkDM*"] } statement { effect = "Allow" actions = ["events:DescribeRule"] - resources = ["arn:aws:events:*:${var.aws_account_id}:rule/SplunkDM*"] + resources = ["arn:aws:events:*:${data.aws_caller_identity.current.account_id}:rule/SplunkDM*"] } statement { @@ -92,7 +92,7 @@ data "aws_iam_policy_document" "splunk_dm_policy" { statement { effect = "Allow" actions = ["lambda:GetFunction"] - resources = ["arn:aws:lambda:*:${var.aws_account_id}:function:SplunkDM*"] + resources = ["arn:aws:lambda:*:${data.aws_caller_identity.current.account_id}:function:SplunkDM*"] } } diff --git a/modules/integrations/splunk_cloud_data_manager_common/variables.tf b/modules/integrations/splunk_cloud_data_manager_common/variables.tf index 044d828e..72155f3d 100644 --- 
a/modules/integrations/splunk_cloud_data_manager_common/variables.tf +++ b/modules/integrations/splunk_cloud_data_manager_common/variables.tf @@ -1,11 +1,8 @@ -variable "aws_account_id" { - description = "AWS account ID (not SL AWS account ID) associated with the infra/backend." - type = string -} + variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { diff --git a/modules/integrations/splunk_cloud_data_manager_common/versions.tf b/modules/integrations/splunk_cloud_data_manager_common/versions.tf index 288ee2ec..fae886cc 100644 --- a/modules/integrations/splunk_cloud_data_manager_common/versions.tf +++ b/modules/integrations/splunk_cloud_data_manager_common/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -12,5 +12,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_cloud_s3_runner_logs/README.md b/modules/integrations/splunk_cloud_s3_runner_logs/README.md index aa302d93..0e575d04 100644 --- a/modules/integrations/splunk_cloud_s3_runner_logs/README.md +++ b/modules/integrations/splunk_cloud_s3_runner_logs/README.md @@ -3,22 +3,22 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | ## Modules | Name | Source | Version | |------|--------|---------| -| [splunk\_s3\_runner\_logs\_lambda](#module\_splunk\_s3\_runner\_logs\_lambda) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [splunk\_s3\_runner\_logs\_lambda](#module\_splunk\_s3\_runner\_logs\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources @@ -55,7 +55,7 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. 
| `map(string)` | n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | diff --git a/modules/integrations/splunk_cloud_s3_runner_logs/lambda.tf b/modules/integrations/splunk_cloud_s3_runner_logs/lambda.tf index 6415ad4d..f2e21678 100644 --- a/modules/integrations/splunk_cloud_s3_runner_logs/lambda.tf +++ b/modules/integrations/splunk_cloud_s3_runner_logs/lambda.tf @@ -4,7 +4,7 @@ locals { module "splunk_s3_runner_logs_lambda" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = "${local.prefix_lambda}-lambda-${var.aws_region}" handler = "splunk_s3_runner_logs.lambda_handler" diff --git a/modules/integrations/splunk_cloud_s3_runner_logs/variables.tf b/modules/integrations/splunk_cloud_s3_runner_logs/variables.tf index 35ec4e31..56e0b4d6 100644 --- a/modules/integrations/splunk_cloud_s3_runner_logs/variables.tf +++ b/modules/integrations/splunk_cloud_s3_runner_logs/variables.tf @@ -1,6 +1,6 @@ variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { diff --git a/modules/integrations/splunk_cloud_s3_runner_logs/versions.tf b/modules/integrations/splunk_cloud_s3_runner_logs/versions.tf index 288ee2ec..fae886cc 100644 --- a/modules/integrations/splunk_cloud_s3_runner_logs/versions.tf +++ b/modules/integrations/splunk_cloud_s3_runner_logs/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -12,5 +12,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_o11y_aws_integration/README.md b/modules/integrations/splunk_o11y_aws_integration/README.md index 7146a2d8..662d0e95 100644 --- a/modules/integrations/splunk_o11y_aws_integration/README.md +++ b/modules/integrations/splunk_o11y_aws_integration/README.md @@ -3,14 +3,14 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules @@ -28,7 +28,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [splunk\_ingest\_url](#input\_splunk\_ingest\_url) | URL for Splunk Ingest. | `string` | n/a | yes | diff --git a/modules/integrations/splunk_o11y_aws_integration/variables.tf b/modules/integrations/splunk_o11y_aws_integration/variables.tf index fa794b49..bf70cca8 100644 --- a/modules/integrations/splunk_o11y_aws_integration/variables.tf +++ b/modules/integrations/splunk_o11y_aws_integration/variables.tf @@ -1,6 +1,6 @@ variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." 
} variable "aws_region" { diff --git a/modules/integrations/splunk_o11y_aws_integration/versions.tf b/modules/integrations/splunk_o11y_aws_integration/versions.tf index 3353a29f..0631ae67 100644 --- a/modules/integrations/splunk_o11y_aws_integration/versions.tf +++ b/modules/integrations/splunk_o11y_aws_integration/versions.tf @@ -3,10 +3,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_o11y_aws_integration_common/README.md b/modules/integrations/splunk_o11y_aws_integration_common/README.md index 759d5ecf..e10c1bf8 100644 --- a/modules/integrations/splunk_o11y_aws_integration_common/README.md +++ b/modules/integrations/splunk_o11y_aws_integration_common/README.md @@ -3,8 +3,8 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [signalfx](#requirement\_signalfx) | < 10.0.0 | | [time](#requirement\_time) | >= 0.13 | @@ -12,8 +12,8 @@ | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [signalfx](#provider\_signalfx) | 9.22.2 | +| [aws](#provider\_aws) | 6.35.1 | +| [signalfx](#provider\_signalfx) | 9.25.1 | | [time](#provider\_time) | 0.13.1 | ## Modules @@ -30,6 +30,7 @@ No modules. 
| [signalfx_aws_external_integration.integration](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/aws_external_integration) | resource | | [signalfx_aws_integration.integration](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/aws_integration) | resource | | [time_sleep.wait_30_seconds](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.splunk_integration](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.splunk_managed_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | @@ -40,8 +41,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_account\_id](#input\_aws\_account\_id) | AWS account ID (not SL AWS account ID) associated with the infra/backend. | `string` | n/a | yes | -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e., generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [integration\_name](#input\_integration\_name) | Name of the integration. 
| `string` | n/a | yes | diff --git a/modules/integrations/splunk_o11y_aws_integration_common/data.tf b/modules/integrations/splunk_o11y_aws_integration_common/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/integrations/splunk_o11y_aws_integration_common/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/integrations/splunk_o11y_aws_integration_common/role.tf b/modules/integrations/splunk_o11y_aws_integration_common/role.tf index 658aa38e..6213dc6d 100644 --- a/modules/integrations/splunk_o11y_aws_integration_common/role.tf +++ b/modules/integrations/splunk_o11y_aws_integration_common/role.tf @@ -122,7 +122,7 @@ data "aws_iam_policy_document" "splunk_managed_policy" { effect = "Allow" actions = ["iam:PassRole"] # Role to be created by Cloudformation stack - resources = ["arn:aws:iam::${var.aws_account_id}:role/splunk-metric-streams*"] + resources = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/splunk-metric-streams*"] } } diff --git a/modules/integrations/splunk_o11y_aws_integration_common/variables.tf b/modules/integrations/splunk_o11y_aws_integration_common/variables.tf index 18e1d44e..f2f6b20f 100644 --- a/modules/integrations/splunk_o11y_aws_integration_common/variables.tf +++ b/modules/integrations/splunk_o11y_aws_integration_common/variables.tf @@ -1,11 +1,6 @@ -variable "aws_account_id" { - description = "AWS account ID (not SL AWS account ID) associated with the infra/backend." - type = string -} - variable "aws_profile" { type = string - description = "AWS profile (i.e., generated via 'sl aws session generate') to use." + description = "AWS profile to use." 
} variable "aws_region" { diff --git a/modules/integrations/splunk_o11y_aws_integration_common/versions.tf b/modules/integrations/splunk_o11y_aws_integration_common/versions.tf index 6452cee6..fb9236a1 100644 --- a/modules/integrations/splunk_o11y_aws_integration_common/versions.tf +++ b/modules/integrations/splunk_o11y_aws_integration_common/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } signalfx = { source = "splunk-terraform/signalfx" @@ -16,5 +16,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_o11y_conf_shared/README.md b/modules/integrations/splunk_o11y_conf_shared/README.md new file mode 100644 index 00000000..3fbdddfd --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/README.md @@ -0,0 +1,52 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 6.35.1 | +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [dashboard\_billing](#module\_dashboard\_billing) | ./dashboards/billing | n/a | +| [dashboard\_dynamodb](#module\_dashboard\_dynamodb) | ./dashboards/dynamodb | n/a | +| [dashboard\_ebs](#module\_dashboard\_ebs) | ./dashboards/ebs | n/a | +| [dashboard\_lambda](#module\_dashboard\_lambda) | ./dashboards/lambda | n/a | +| [dashboard\_runner\_ec2](#module\_dashboard\_runner\_ec2) | ./dashboards/runner_ec2 | n/a | +| [dashboard\_runner\_k8s](#module\_dashboard\_runner\_k8s) | ./dashboards/runner_k8s | n/a | +| [dashboard\_sqs](#module\_dashboard\_sqs) | ./dashboards/sqs | n/a | + +## Resources + +| Name | Type | +|------|------| +| 
[signalfx_dashboard_group.forgecicd](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard_group) | resource | +| [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | +| [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | +| [aws\_region](#input\_aws\_region) | Default AWS region. | `string` | n/a | yes | +| [dashboard\_variables](#input\_dashboard\_variables) | Variables for Dashboards |
object({
runner_k8s = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
runner_ec2 = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
billing = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
sqs = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
ebs = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
lambda = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
dynamodb = object({
tenant_names = list(string)
dynamic_variables = list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}
))
})
})
| n/a | yes | +| [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | +| [splunk\_api\_url](#input\_splunk\_api\_url) | URL for Splunk Observability Cloud API. | `string` | n/a | yes | +| [splunk\_organization\_id](#input\_splunk\_organization\_id) | Organization ID for Splunk Observability Cloud. | `string` | n/a | yes | +| [team](#input\_team) | Team ID | `string` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/backend.tf b/modules/integrations/splunk_o11y_conf_shared/backend.tf new file mode 100644 index 00000000..ca3c8dcb --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/backend.tf @@ -0,0 +1,4 @@ +terraform { + # Intentionally empty. Will be filled by Terragrunt. + backend "s3" {} +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards.tf new file mode 100644 index 00000000..ebe831e1 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards.tf @@ -0,0 +1,98 @@ +resource "signalfx_dashboard_group" "forgecicd" { + name = "ForgeCICD Dashboards" + description = "" + teams = [var.team] + + lifecycle { + ignore_changes = [ + import_qualifier + ] + } +} + +# Core platform health +module "dashboard_runner_ec2" { + source = "./dashboards/runner_ec2" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.runner_ec2.tenant_names + dynamic_variables = var.dashboard_variables.runner_ec2.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +module "dashboard_runner_k8s" { + source = "./dashboards/runner_k8s" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.runner_k8s.tenant_names + dynamic_variables = var.dashboard_variables.runner_k8s.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +module "dashboard_lambda" { + source = 
"./dashboards/lambda" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.lambda.tenant_names + dynamic_variables = var.dashboard_variables.lambda.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +# Messaging and storage +module "dashboard_sqs" { + source = "./dashboards/sqs" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.sqs.tenant_names + dynamic_variables = var.dashboard_variables.sqs.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +module "dashboard_dynamodb" { + source = "./dashboards/dynamodb" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.dynamodb.tenant_names + dynamic_variables = var.dashboard_variables.dynamodb.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +module "dashboard_ebs" { + source = "./dashboards/ebs" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.ebs.tenant_names + dynamic_variables = var.dashboard_variables.ebs.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} + +# Cost and usage +module "dashboard_billing" { + source = "./dashboards/billing" + + providers = { + signalfx = signalfx + } + + tenant_names = var.dashboard_variables.billing.tenant_names + dynamic_variables = var.dashboard_variables.billing.dynamic_variables + dashboard_group = signalfx_dashboard_group.forgecicd.id +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/README.md new file mode 100644 index 00000000..1b9696a5 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/README.md @@ -0,0 +1,42 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## 
Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.billing](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_time_chart.cost_per_service](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.cost_per_tenant](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.net_cost_per_service](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.net_cost_per_tenant](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.total_cost](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.total_net_cost](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/main.tf new file mode 100644 index 00000000..fd136209 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/main.tf @@ -0,0 +1,289 @@ +resource "signalfx_time_chart" "cost_per_service" { + name = "Cost per service" + description = "" + + program_text = <<-EOF +A = data('forge.per_service.cost_usd') +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) +C = B.sum(by=['service', 'forgecicd_tenant','usage_month', 'usage_year']) + +# publish both current and baseline +C.publish(label='current') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + on_chart_legend_dimension = "service" + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } +} + +resource "signalfx_time_chart" "net_cost_per_service" { + name = "Net Cost per service" + description = "" + + program_text = <<-EOF +A = data('forge.per_service.net_cost_usd') +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) +C = B.sum(by=['service', 'forgecicd_tenant','usage_month', 'usage_year']) # removes usage_date from label + +# publish both current and baseline +C.publish(label='current') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } +} + +resource "signalfx_time_chart" "net_cost_per_tenant" { + name = "Net Cost per tenant" + description = "" + + program_text = <<-EOF +A = 
data('forge.per_service.net_cost_usd') +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) +C = B.sum(by=['forgecicd_tenant','usage_month', 'usage_year']) +D = C.timeshift('29d') + +# publish both current and baseline +C.publish(label='current') +#D.publish(label='baseline') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } +} + +resource "signalfx_time_chart" "cost_per_tenant" { + name = "Cost per tenant" + description = "" + + program_text = <<-EOF +A = data('forge.per_service.cost_usd') +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) +C = B.sum(by=['forgecicd_tenant','usage_month', 'usage_year']) +D = C.timeshift('29d') + +# publish both current and baseline +C.publish(label='current') +#D.publish(label='baseline') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + on_chart_legend_dimension = "service" + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } +} + +resource "signalfx_time_chart" "total_cost" { + name = "Total Cost" + description = "" + + program_text = <<-EOF +A = data('forge.per_service.cost_usd') + +# Take max per day/service/tenant, carrying forward last value if missing +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) + +# Sum by month, still carrying forward where needed +C = B.sum(by=['usage_month', 'usage_year']) + +# Shift by 29 days to get a baseline comparison +D = C.timeshift('29d') + +# Publish both +C.publish(label='current') +D.publish(label='baseline') +EOF + + plot_type = "AreaChart" + axes_precision = 0 + on_chart_legend_dimension = "service" + + + time_range = 3600 + histogram_options { + color_theme = "gold" + } + + 
viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } + viz_options { + axis = "left" + color = "red" + display_name = "baseline" + label = "baseline" + } +} + +resource "signalfx_time_chart" "total_net_cost" { + name = "Total Net Cost" + description = "" + + program_text = <<-EOF +A = data('forge.per_service.net_cost_usd') + +# Take max per day/service/tenant, carrying forward last value if missing +B = A.max(by=['usage_date', 'service', 'forgecicd_tenant','usage_month', 'usage_year']) + +# Sum by month, still carrying forward where needed +C = B.sum(by=['usage_month', 'usage_year']) + +# Shift by 29 days to get a baseline comparison +D = C.timeshift('29d') + +# Publish both +C.publish(label='current') +# D.publish(label='baseline') +EOF + + plot_type = "AreaChart" + axes_precision = 0 + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "current" + label = "current" + } +} + +resource "signalfx_dashboard" "billing" { + name = "Billing" + description = "Forge CICD cost and net cost by service and tenant." 
+ dashboard_group = var.dashboard_group + + time_range = "-31d" + + variable { + property = "forgecicd_tenant" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + + chart { + chart_id = signalfx_time_chart.cost_per_service.id + row = 0 + column = 0 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.net_cost_per_service.id + row = 0 + column = 6 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.cost_per_tenant.id + row = 1 + column = 0 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.net_cost_per_tenant.id + row = 1 + column = 6 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.total_cost.id + row = 2 + column = 0 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.total_net_cost.id + row = 2 + column = 6 + width = 6 + height = 1 + } +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." 
+ type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." + type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/billing/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/README.md new file mode 100644 index 00000000..09ad4894 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/README.md @@ -0,0 +1,49 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.dynamodb](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_single_value_chart.avg_request_latency_single](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.system_errors_single](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.throttled_requests_single](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.user_errors_single](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.avg_request_latency_ts](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_capacity_percentage](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_throttle_events](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.returned_item_count](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.system_errors_ts](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.throttled_requests_ts](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| 
[signalfx_time_chart.user_errors_ts](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.write_capacity_percentage](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.write_throttle_events](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/main.tf new file mode 100644 index 00000000..9cd58ffd --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/main.tf @@ -0,0 +1,631 @@ +resource "signalfx_time_chart" "write_throttle_events" { + name = "Write throttle events" + description = "Requests to DynamoDB that exceed the provisioned write capacity units for a table or a global secondary index." + program_text = <<-EOF +A = data('WriteThrottleEvents', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum') and filter('TableName', '*'), rollup='sum').publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = true + + axes_precision = 0 + + show_data_markers = true + + time_range = 900 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "orange" + display_name = "Writethrottleevents" + label = "A" + } +} + +resource "signalfx_time_chart" "system_errors_ts" { + name = "System errors" + description = "Requests to DynamoDB or Amazon DynamoDB streams that generate an HTTP 500 status code during the specified time period." 
+ program_text = <<-EOF +A = data('SystemErrors', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum') and filter('TableName', '*') and filter('Operation', '*'), rollup='sum').publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = true + + axes_precision = 0 + + time_range = 900 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + display_name = "Systemerrors" + label = "A" + } +} + +resource "signalfx_time_chart" "read_capacity_percentage" { + name = "Percentage of read capacity consumed" + description = "The percentage of read capacity units consumed over the specified time period, so you can track how much of your provisioned throughput is used." + program_text = <<-EOF +A = data('ProvisionedReadCapacityUnits', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean') and filter('TableName', '*')).publish(label='A', enable=False) +B = data('ConsumedReadCapacityUnits', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean') and filter('TableName', '*')).publish(label='B', enable=False) +C = ((B/A)*100).publish(label='C') +EOF + + plot_type = "LineChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = false + + axes_precision = 0 + on_chart_legend_dimension = "TableName" + show_data_markers = true + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "TableName" + } + + viz_options { + axis = "left" + display_name = "Consumedreadcapacityunits" + label = "B" + } + viz_options { + 
axis = "left" + display_name = "Percentage of read capacity consumed" + label = "C" + } + viz_options { + axis = "left" + display_name = "Provisionedreadcapacityunits" + label = "A" + } +} + +resource "signalfx_time_chart" "returned_item_count" { + name = "Returned item count" + description = "The number of items returned by query or scan operations during the specified time period." + program_text = <<-EOF +A = data('ReturnedItemCount', filter=filter('namespace', 'AWS/DynamoDB') and filter('TableName', '*') and filter('Operation', '*') and filter('stat', 'count'), rollup='sum').publish(label='A') +EOF + + plot_type = "LineChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = false + + axes_precision = 0 + + on_chart_legend_dimension = "Operation" + + show_data_markers = true + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = true + property = "Operation" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "TableName" + } + + viz_options { + axis = "left" + display_name = "Returneditemcount" + label = "A" + } +} + +resource "signalfx_single_value_chart" "avg_request_latency_single" { + name = "Average request latency (ms)" + description = "Successful requests to DynamoDB or Amazon DynamoDB streams during the specified time period." 
+ unit_prefix = "Metric" + color_by = "Dimension" + + max_precision = 3 + + program_text = <<-EOF +A = data('SuccessfulRequestLatency', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean')).mean().publish(label='A') +EOF + + viz_options { + display_name = "Successfulrequestlatency - mean" + label = "A" + } +} + +resource "signalfx_time_chart" "avg_request_latency_ts" { + name = "Average request latency (ms)" + description = "Successful requests to DynamoDB or Amazon DynamoDB streams during the specified time period." + program_text = <<-EOF +A = data('SuccessfulRequestLatency', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean')).publish(label='A') +EOF + + axes_precision = 0 + + on_chart_legend_dimension = "Operation" + + plot_type = "LineChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = false + + show_data_markers = true + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = true + property = "Operation" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "TableName" + } + + viz_options { + axis = "left" + display_name = "Successfulrequestlatency" + label = "A" + } +} + +resource "signalfx_single_value_chart" "throttled_requests_single" { + name = "Throttled requests" + description = "Requests to DynamoDB that exceed the provisioned throughput limits on a resource (such as a table or an index)." 
+ unit_prefix = "Metric" + color_by = "Dimension" + + program_text = <<-EOF +A = data('ThrottledRequests', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum'), rollup='sum').sum().publish(label='A') +EOF + viz_options { + color = "yellow" + display_name = "Throttledrequests - sum" + label = "A" + } +} + +resource "signalfx_single_value_chart" "system_errors_single" { + name = "System errors" + description = "Requests to DynamoDB or Amazon DynamoDB streams that generate an HTTP 500 status code during the specified time period." + unit_prefix = "Metric" + color_by = "Dimension" + + # The filters below match every table and operation, so sum() collapses the matching series into the single total this single-value chart displays. + program_text = <<-EOF +A = data('SystemErrors', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum') and filter('TableName', '*') and filter('Operation', '*'), rollup='sum').sum().publish(label='A') +EOF + + viz_options { + color = "brown" + display_name = "Systemerrors" + label = "A" + } +} + +resource "signalfx_time_chart" "user_errors_ts" { + name = "User errors" + description = "Requests to DynamoDB or Amazon DynamoDB streams that generate an HTTP 400 status code during the specified time period." + program_text = <<-EOF +A = data('UserErrors', filter=filter('namespace', 'AWS/DynamoDB') and filter('sf_metric', 'UserErrors') and filter('stat', 'sum'), rollup='sum').publish(label='A') +EOF + + plot_type = "ColumnChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = false + + axes_precision = 0 + + show_data_markers = true + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "brown" + display_name = "Usererrors" + label = "A" + } +} + +resource "signalfx_time_chart" "read_throttle_events" { + name = "Read throttle events" + description = "Requests to DynamoDB that exceed the provisioned read capacity units for a table or a global secondary index."
+ # Scope the query to the AWS/DynamoDB namespace, as the other DynamoDB charts do, so ReadThrottleEvents series that other namespaces publish with a TableName dimension are not plotted here. + program_text = <<-EOF +A = data('ReadThrottleEvents', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum') and filter('TableName', '*'), rollup='sum').publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = true + + axes_precision = 0 + on_chart_legend_dimension = "TableName" + show_data_markers = true + time_range = 3600 + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "purple" + display_name = "Readthrottleevents" + label = "A" + } +} + +resource "signalfx_time_chart" "throttled_requests_ts" { + name = "Throttled requests" + description = "Requests to DynamoDB that exceed the provisioned throughput limits on a resource (such as a table or an index)." + program_text = <<-EOF +A = data('ThrottledRequests', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'sum'), rollup='sum').publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = true + + axes_precision = 0 + + on_chart_legend_dimension = "Operation" + + show_data_markers = true + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = true + property = "Operation" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "TableName" + } + + viz_options { + axis = "left" + display_name = "Throttledrequests" + label = "A" + } +} + +resource "signalfx_time_chart" "write_capacity_percentage" { + name = "Percentage of write capacity consumed" + description = "The percentage of write capacity units consumed over the specified time period, so you can
track how much of your provisioned throughput is used." + program_text = <<-EOF +A = data('ProvisionedWriteCapacityUnits', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean') and filter('TableName', '*')).publish(label='A', enable=False) +B = data('ConsumedWriteCapacityUnits', filter=filter('namespace', 'AWS/DynamoDB') and filter('stat', 'mean') and filter('TableName', '*')).publish(label='B', enable=False) +C = ((B/A)*100).publish(label='C') +EOF + + plot_type = "LineChart" + unit_prefix = "Metric" + color_by = "Dimension" + stacked = false + + axes_precision = 0 + + on_chart_legend_dimension = "TableName" + + show_data_markers = true + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "TableName" + } + + viz_options { + axis = "left" + display_name = "Consumedwritecapacityunits" + label = "B" + } + viz_options { + axis = "left" + display_name = "Percentage of write capacity consumed" + label = "C" + } + viz_options { + axis = "left" + color = "chartreuse" + display_name = "Provisionedwritecapacityunits" + label = "A" + } +} + +resource "signalfx_single_value_chart" "user_errors_single" { + name = "User errors" + description = "Requests to DynamoDB or DynamoDB streams that generate an HTTP 400 status code during the specified time period." 
+ unit_prefix = "Metric" + color_by = "Dimension" + + # sum() collapses every matching UserErrors series into the single total this single-value chart displays. + program_text = <<-EOF +A = data('UserErrors', filter=filter('namespace', 'AWS/DynamoDB') and filter('sf_metric', 'UserErrors') and filter('stat', 'sum'), rollup='sum').sum().publish(label='A') +EOF + + viz_options { + color = "brown" + display_name = "Usererrors" + label = "A" + } +} + + +# DynamoDB dashboard: a tenant-name variable, optional caller-supplied variables, and one chart block per chart positioned by column/row/width/height. +resource "signalfx_dashboard" "dynamodb" { + name = "DynamoDBs" + description = "Forge CICD DynamoDB table performance, capacity, and throttling." + + dashboard_group = var.dashboard_group + + variable { + property = "aws_tag_TenantName" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + + chart { + chart_id = signalfx_single_value_chart.avg_request_latency_single.id + column = 0 + row = 0 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.avg_request_latency_ts.id + column = 3 + row = 0 + width = 9 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.throttled_requests_single.id + column = 0 + row = 1 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.throttled_requests_ts.id + column = 3 + row = 1 + width = 9 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.user_errors_single.id + column = 9 + row = 2 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.user_errors_ts.id + column = 0 + row = 2 + width = 9 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.system_errors_ts.id + column = 0 + row = 3 +
width = 9 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.system_errors_single.id + column = 9 + row = 3 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_capacity_percentage.id + column = 0 + row = 4 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.write_capacity_percentage.id + column = 6 + row = 4 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_throttle_events.id + column = 0 + row = 5 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.write_throttle_events.id + column = 6 + row = 5 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.returned_item_count.id + column = 0 + row = 6 + width = 12 + height = 1 + } +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." 
+ type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/dynamodb/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/README.md new file mode 100644 index 00000000..7ac85a61 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/README.md @@ -0,0 +1,51 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.ebs](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_single_value_chart.state](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.avg_queue_length](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.byte_utilization_pct](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.idle_time](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.latency_op](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_latency](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_ops](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_throughput](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.read_vs_write_ops](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.rw_bytes_breakdown](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.total_read_time](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| 
[signalfx_time_chart.total_write_time](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.write_latency](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.write_ops](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.write_throughput](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/main.tf new file mode 100644 index 00000000..8fb6dcd9 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/main.tf @@ -0,0 +1,677 @@ +resource "signalfx_time_chart" "byte_utilization_pct" { + name = "Byte utilization %" + + description = "Compares delivered bytes to maximum allowed for Nitro volumes" + program_text = <<-EOF +A = data('EBSByteBalance%').mean(by=['AWSUniqueId']).publish(label='A') +EOF + plot_type = "LineChart" + color_by = "Dimension" + unit_prefix = "Metric" + + axes_precision = 4 + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Percent" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = 
-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + display_name = "Byte utilization percentage" + label = "A" + value_suffix = "%" + } +} + +resource "signalfx_time_chart" "write_latency" { + name = "Write latency (ms/op)" + description = "Estimates average latency per write operation for troubleshooting" + program_text = <<-EOF +A = data('VolumeTotalWriteTime').sum(by=['VolumeId']).publish(label='A', enable=False) +B = data('VolumeWriteOps').sum(by=['VolumeId']).publish(label='B', enable=False) +C = (A/B * 1000).publish(label='C', enable=True) +EOF + plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Ms/Op" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 
179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Intermediate: Write operations count" + label = "B" + value_suffix = "No of ops" + } + viz_options { + axis = "left" + color = "emerald" + display_name = "Average write latency (ms/op)" + label = "C" + value_unit = "Millisecond" + } + viz_options { + axis = "left" + color = "gray" + display_name = "Intermediate: Total write time" + label = "A" + value_unit = "Millisecond" + } +} + +resource "signalfx_time_chart" "read_ops" { + name = "# Read ops" + description = "Displays reads performed per interval for EBS volume" + program_text = <<-EOF +A = data('VolumeReadOps').sum().publish(label='A', enable=True) +EOF + plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + disable_sampling = true + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Ops" + low_watermark = 
-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Read operations count" + label = "A" + value_suffix = "No of ops" + } +} + +resource "signalfx_time_chart" "write_throughput" { + name = "Write throughput" + description = "Shows throughput of writes in bytes per time interval" + program_text = <<-EOF +A = data('VolumeWriteBytes').rate().publish(label='A', enable=True) +EOF + plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Bytes" + low_watermark = 
-179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Write throughput (bytes/sec)" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" "rw_bytes_breakdown" { + name = "Read/write bytes breakdown" + description = "Compares total bytes read to bytes written across timeline" + program_text = <<-EOF +A = data('VolumeReadBytes').sum().publish(label='A', enable=True) +B = data('VolumeWriteBytes').sum().publish(label='B', enable=True) +EOF + plot_type = "AreaChart" + color_by = "Dimension" + stacked = true + + axes_precision = 4 + disable_sampling = false + on_chart_legend_dimension = "plot_label" + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Bytes" + 
low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Total bytes read" + label = "A" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "brown" + display_name = "Total bytes written" + label = "B" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" "read_latency" { + name = "Read latency (ms/op)" + description = "Estimates average latency per read operation as efficiency measure" + program_text = <<-EOF +A = data('VolumeTotalReadTime').sum(by=['VolumeId']).publish(label='A', enable=False) +B = data('VolumeReadOps').sum(by=['VolumeId']).publish(label='B', enable=False) +C = (A/B*1000).publish(label='C', enable=True) +EOF + plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + time_range = 900 + + axis_left { + high_watermark = 
179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Ms/Op" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Read ops per volume (raw)" + label = "B" + value_suffix = "No of ops" + } + viz_options { + axis = "left" + color = "brown" + display_name = "Average read latency (ms/op)" + label = "C" + value_unit = "Millisecond" + } + viz_options { + axis = "left" + color = "gray" + display_name = "Read time per volume (raw)" + label = "A" + value_unit = "Millisecond" + } +} + +resource "signalfx_time_chart" "read_throughput" { + name = "Read throughput" + description = "Shows throughput of reads in bytes per time interval" + program_text = <<-EOF +A = data('VolumeReadBytes').rate().publish(label='A', enable=True) +EOF + 
plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + disable_sampling = true + + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Bytes" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Read throughput (bytes/sec)" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_single_value_chart" "state" { + name = "State" + description = "Indicates availability and current status for workload visibility" + program_text = <<-EOF +A = data('VolumeReadOps').count(by=['aws_state']).publish(label='A', enable=True) +EOF + color_by = "Dimension" + + max_precision = 4 + secondary_visualization = "Sparkline" + + viz_options { + display_name = "Volume 
state reporting count"
+    label        = "A"
+    value_suffix = "No of ops"
+  }
+}
+
+# Total time spent servicing read operations. CloudWatch reports
+# VolumeTotalReadTime in seconds, so the plot's value unit is "Second" to agree
+# with the description and the axis label.
+resource "signalfx_time_chart" "total_read_time" {
+  name         = "Total read time"
+  description  = "Total seconds spent in servicing read operations"
+  program_text = <<-EOF
+A = data('VolumeTotalReadTime').sum().publish(label='A', enable=True)
+EOF
+  plot_type = "LineChart"
+  color_by  = "Dimension"
+
+  axes_precision = 4
+  time_range     = 900
+
+  # No explicit bounds or watermarks: the previous values were +/- max-float64
+  # sentinels, i.e. effectively unbounded.
+  axis_left {
+    label = "Seconds"
+  }
+
+  viz_options {
+    axis         = "left"
+    color        = "blue"
+    display_name = "Total time spent reading"
+    label        = "A"
+    value_unit   = "Second"
+  }
+}
+
+# Average latency per operation. A/C are write/read ops per minute and B/D the
+# matching total write/read time per minute (rate rollup scaled by 60); E = B/A
+# and F = D/C are scaled by 1000 to give milliseconds per write (left axis) and
+# per read (right axis). No VolumeId filter is applied, so the chart follows the
+# volumes selected by the dashboard filters instead of one fixed volume.
+resource "signalfx_time_chart" "latency_op" {
+  name         = "Latency/op (ms)"
+  description  = "Average milliseconds per write (left axis) and per read (right axis) operation"
+  program_text = <<-EOF
+A = data('VolumeWriteOps', filter=filter('namespace', 'AWS/EBS') and filter('stat', 'sum'), extrapolation='zero', rollup='rate').scale(60).publish(label='A')
+B = data('VolumeTotalWriteTime', filter=filter('namespace', 'AWS/EBS') and filter('stat', 'sum'), extrapolation='zero', rollup='rate').scale(60).publish(label='B', enable=False)
+C = data('VolumeReadOps', filter=filter('namespace', 'AWS/EBS') and filter('stat', 'sum'), extrapolation='zero', rollup='rate').scale(60).publish(label='C')
+D = data('VolumeTotalReadTime', filter=filter('namespace', 'AWS/EBS') and filter('stat', 'sum'), extrapolation='zero', rollup='rate').scale(60).publish(label='D', enable=False)
+E = (B/A).scale(1000).publish(label='E')
+F = (D/C).scale(1000).publish(label='F')
+EOF
+  plot_type = "ColumnChart"
+  color_by  = "Dimension"
+
+  axes_precision = 0
+  time_range     = 7200
+
+  # Latency cannot be negative, so both axes start at zero; the upper bound and
+  # watermarks stay unset (the previous values were max-float64 sentinels).
+  axis_left {
+    label     = "Ms/write - BLUE"
+    min_value = 0
+  }
+
+  axis_right {
+    label     = "Ms/read - RED"
+    min_value = 0
+  }
+
+  histogram_options {
+    color_theme = "gold"
+  }
+
+  viz_options {
+    axis         = "left"
+    display_name = "Volumereadops - scale:60"
+    label        = "C"
+  }
+  viz_options {
+    axis         = "left"
+    display_name = "Volumetotalreadtime - scale:60"
+    label        = "D"
+  }
+  viz_options {
+    axis         = "left"
+    display_name = "Volumetotalwritetime - scale:60"
+    label        = "B"
+  }
+  viz_options {
+    axis         = "left"
+    display_name = "Volumewriteops - scale:60"
+    label        = "A"
+  }
+  viz_options {
+    axis         = "left"
+    color        = "blue"
+    display_name = "Millisec/write"
+    label        = "E"
+  }
+  viz_options {
+    axis         = "right"
+    color        = "brown"
+    display_name = "Millisec/read"
+    label        = "F"
+  }
+}
+
+# Total time spent servicing write operations. CloudWatch reports
+# VolumeTotalWriteTime in seconds, so the plot's value unit is "Second" to agree
+# with the description and the axis label.
+resource "signalfx_time_chart" "total_write_time" {
+  name         = "Total write time"
+  description  = "Total seconds spent in servicing write operations"
+  program_text = <<-EOF
+A = data('VolumeTotalWriteTime').sum().publish(label='A', enable=True)
+EOF
+  plot_type = "LineChart"
+  color_by  = "Dimension"
+
+  axes_precision = 4
+  time_range     = 900
+
+  # No explicit bounds or watermarks: the previous values were +/- max-float64
+  # sentinels, i.e. effectively unbounded.
+  axis_left {
+    label = "Seconds"
+  }
+
+  viz_options {
+    axis         = "left"
+    color        = "azure"
+    display_name = "Total write time"
+    label        = "A"
+    value_unit   = "Second"
+  }
+}
+
+resource "signalfx_time_chart" "read_vs_write_ops" {
+  name         = "Read vs write ops"
+  description  = "Visualizes relative load of
read and write ops over time" + program_text = <<-EOF +A = data('VolumeReadOps').sum().publish(label='A', enable=True) +B = data('VolumeWriteOps').sum().publish(label='B', enable=True) +EOF + plot_type = "AreaChart" + color_by = "Dimension" + + axes_precision = 4 + disable_sampling = false + on_chart_legend_dimension = "plot_label" + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Ops" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Read operations" + label = "A" + value_suffix = "No of ops" + } + viz_options { + axis = "left" + color = "brown" + display_name = "Write operations" + label = "B" + value_suffix = "No of ops" + } +} + +resource "signalfx_time_chart" 
"avg_queue_length" { + name = "Average queue length" + description = "Measures operations awaiting completion, highlighting saturation" + program_text = <<-EOF +A = data('VolumeQueueLength').mean().publish(label='A', enable=True) +EOF + plot_type = "LineChart" + color_by = "Dimension" + + axes_precision = 4 + time_range = 900 + + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Queue length" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Average volume queue length" + label = "A" + value_suffix = "No of ops" + } +} + +resource "signalfx_time_chart" "idle_time" { + name = "Idle time" + description = "Indicates when disk is idle and not serving I/O" + program_text = <<-EOF +A = 
data('VolumeIdleTime').sum().publish(label='A', enable=True)
+EOF
+  plot_type = "LineChart"
+  color_by  = "Dimension"
+
+  axes_precision = 4
+  time_range     = 900
+
+  # No explicit bounds or watermarks: the previous values were +/- max-float64
+  # sentinels, i.e. effectively unbounded.
+  axis_left {
+    label = "Seconds"
+  }
+
+  # CloudWatch reports VolumeIdleTime in seconds; the value unit matches the
+  # "Seconds" axis label instead of relabelling the values as milliseconds.
+  viz_options {
+    axis         = "left"
+    color        = "blue"
+    display_name = "Total idle time"
+    label        = "A"
+    value_unit   = "Second"
+  }
+}
+
+# Sum of VolumeWriteOps across the selected volumes per reporting interval.
+resource "signalfx_time_chart" "write_ops" {
+  name         = "# Write ops"
+  description  = "Displays writes performed per interval for EBS volume"
+  program_text = <<-EOF
+A = data('VolumeWriteOps').sum().publish(label='A', enable=True)
+EOF
+  plot_type = "LineChart"
+  color_by  = "Dimension"
+
+  axes_precision = 4
+  time_range     = 900
+
+  axis_left {
+
high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Ops" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Number of write operations" + label = "A" + value_suffix = "No of ops" + } +} +resource "signalfx_dashboard" "ebs" { + name = "EBS" + description = "EC2/EBS volume throughput, IOPS, and latency for Forge runners." 
+ dashboard_group = var.dashboard_group + + variable { + property = "aws_tag_TenantName" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + + chart { + chart_id = signalfx_single_value_chart.state.id + column = 0 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_ops.id + column = 4 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.write_ops.id + column = 8 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.write_latency.id + column = 8 + row = 1 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_latency.id + column = 4 + row = 1 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.latency_op.id + column = 0 + row = 1 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_vs_write_ops.id + column = 0 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.write_throughput.id + column = 8 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.read_throughput.id + column = 4 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.rw_bytes_breakdown.id + column = 0 + row = 3 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.total_read_time.id + column = 4 + row = 3 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.total_write_time.id + column = 8 + 
row = 3 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.byte_utilization_pct.id + column = 0 + row = 4 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.idle_time.id + column = 8 + row = 4 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.avg_queue_length.id + column = 4 + row = 4 + width = 4 + height = 1 + } + +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." + type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/ebs/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. 
+ required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/README.md new file mode 100644 index 00000000..20982d27 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/README.md @@ -0,0 +1,51 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.lambda](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_list_chart.avg_duration_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.percent_invocations_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_single_value_chart.avg_invocation_duration](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.total_errors](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.total_invocations](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.total_spillover_invocations](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| 
[signalfx_single_value_chart.total_throttles](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.errors_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.invocations](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.invocations_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.provisioned_concurrency_invocations_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.provisioned_concurrency_spillover_invocations_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.provisioned_concurrency_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.provisioned_concurrent_executions_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.throttles_by_version](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/main.tf new file mode 100644 index 00000000..5839b4dc --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/main.tf @@ -0,0 +1,824 @@ +resource "signalfx_time_chart" "provisioned_concurrent_executions_by_version" { + name = "Provisioned concurrent executions by version" + description = "The number of events that are being processed on provisioned concurrency. For each invocation of an alias or version with provisioned concurrency, Lambda emits the current count." + program_text = <<-EOF +A = data('ProvisionedConcurrentExecutions', filter=filter('stat', 'upper') and filter('Resource', '*') and filter('ExecutedVersion', '*')).sum(by=['ExecutedVersion']).publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = true + + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + + time_range = 900 + + viz_options { + axis = "left" + display_name = "Provisioned concurrent executions" + label = "A" + } + +} + +resource "signalfx_time_chart" "provisioned_concurrency_invocations_by_version" { + name = "Provisioned concurrency invocations by version" + description = "The number of invocations that are run on provisioned concurrency. Lambda increments the count once for each invocation that runs on provisioned concurrency." 
+ program_text = <<-EOF +A = data('ProvisionedConcurrencyInvocations', filter=filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='rate').sum(by=['ExecutedVersion']).publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = true + + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + time_range = 900 + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + + viz_options { + axis = "left" + display_name = "Provisioned concurrent invocations" + label = "A" + } +} + +resource "signalfx_time_chart" "provisioned_concurrency_spillover_invocations_by_version" { + name = "Provisioned concurrency spillover invocations by version" + description = "The number of invocations that are run on nonprovisioned concurrency, when all provisioned concurrency is in use. For a version or alias that is configured to use provisioned concurrency, Lambda increments the count once for each invocation that runs on non-provisioned concurrency." 
+  # Rate of spillover invocations, summed per executed version.
+  program_text = <<-EOF
+A = data('ProvisionedConcurrencySpilloverInvocations', filter=filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='rate').sum(by=['ExecutedVersion']).publish(label='A')
+EOF
+
+  plot_type   = "AreaChart"
+  unit_prefix = "Metric"
+  color_by    = "Dimension"
+  timezone    = "UTC"
+  stacked     = true
+
+  axes_precision            = 0
+  on_chart_legend_dimension = "ExecutedVersion"
+  time_range                = 900
+
+  # Only the version and function name are shown in the data table legend.
+  legend_options_fields {
+    enabled  = false
+    property = "AWSUniqueId"
+  }
+  legend_options_fields {
+    enabled  = true
+    property = "ExecutedVersion"
+  }
+  legend_options_fields {
+    enabled  = true
+    property = "FunctionName"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "sf_originatingMetric"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "namespace"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "sf_metric"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "Resource"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "stat"
+  }
+
+  # The plot shows the spillover metric, so it is labelled as such rather than
+  # reusing the label of the plain provisioned-concurrency invocations chart.
+  viz_options {
+    axis         = "left"
+    display_name = "Provisioned concurrency spillover invocations"
+    label        = "A"
+  }
+}
+
+resource "signalfx_single_value_chart" "total_spillover_invocations" {
+  name        = "Total spillover invocations"
+  description = "Over 5m | Spillover invocations are run on nonprovisioned concurrency, when all provisioned concurrency is in use."
+  unit_prefix = "Metric"
+  color_by    = "Dimension"
+
+  # Spillover invocation rate summed over a 5 minute window, then across series.
+  program_text = <<-EOF
+A = data('ProvisionedConcurrencySpilloverInvocations', filter=filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='rate').sum(over='5m').sum().publish(label='A')
+EOF
+
+  # Labelled after the spillover metric it plots.
+  viz_options {
+    display_name = "Provisioned concurrency spillover invocations"
+    label        = "A"
+  }
+}
+
+# Share of all invocations handled by each executed version, in percent.
+resource "signalfx_list_chart" "percent_invocations_by_version" {
+  name                    = "% invocations by version"
+  description             = "The % of total invocations handled by version"
+  unit_prefix             = "Metric"
+  color_by                = "Dimension"
+  secondary_visualization = "Sparkline"
+  sort_by                 = "-value"
+
+  # A is the overall invocation total and B the per-version total; both are
+  # hidden helper streams. They are assigned before C, which divides them, so
+  # the program never references a name ahead of its definition.
+  program_text = <<-EOF
+A = data('Invocations', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='sum', extrapolation='zero').sum().publish(label='A', enable=False)
+B = data('Invocations', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='sum', extrapolation='zero').sum(by=['ExecutedVersion']).publish(label='B', enable=False)
+C = (B/A).scale(100).publish(label='C')
+EOF
+
+  time_range = 900
+
+  legend_options_fields {
+    enabled  = true
+    property = "sf_metric"
+  }
+  legend_options_fields {
+    enabled  = true
+    property = "ExecutedVersion"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "AWSUniqueId"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "FunctionName"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "sf_originatingMetric"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "namespace"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "Resource"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "stat"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "aws_function_version"
+  }
+
+  viz_options {
+    display_name = "A"
+    label        = "A"
+  }
+  viz_options {
+
display_name = "B" + label = "B" + } + viz_options { + display_name = "Version" + label = "C" + value_suffix = "%" + } +} + +resource "signalfx_time_chart" "errors_by_version" { + name = "Errors by version" + description = "The number of invocations that failed due to errors in the function (response code 4XX)." + program_text = <<-EOF +A = data('Errors', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('ExecutedVersion', '*') and filter('Resource', '*'), rollup='sum').sum(by=['ExecutedVersion']).publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = false + + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + + time_range = 900 + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = false + property = "aws_function_version" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + + viz_options { + axis = "left" + display_name = "Errors by version" + label = "A" + value_suffix = "-errors" + } +} + +resource "signalfx_single_value_chart" "total_throttles" { + name = "Total throttles" + description = "Over 5m" + unit_prefix = "Metric" + color_by = "Dimension" + + program_text = <<-EOF +A = data('Throttles', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and (not filter('ExecutedVersion', '*')), rollup='sum', extrapolation='zero').sum(over='5m').sum().publish(label='A') +EOF + 
+  viz_options {
+    color        = "yellow"
+    display_name = "Throttles - sum(5m) - sum"
+    label        = "A"
+  }
+}
+
+# Mean invocation duration per executed version. The per-series means are
+# combined with mean(by=...) so that several series sharing a version yield an
+# average rather than a sum of averages.
+resource "signalfx_list_chart" "avg_duration_by_version" {
+  name                    = "Average duration by version"
+  unit_prefix             = "Metric"
+  color_by                = "Dimension"
+  secondary_visualization = "Sparkline"
+  sort_by                 = "-value"
+
+  disable_sampling = true
+
+  program_text = <<-EOF
+A = data('Duration', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'mean') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='average').mean(by=['ExecutedVersion']).publish(label='A')
+EOF
+
+  time_range = 900
+
+  # Only the metric name and the executed version are shown in the legend.
+  legend_options_fields {
+    enabled  = false
+    property = "AWSUniqueId"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "FunctionName"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "sf_originatingMetric"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "namespace"
+  }
+  legend_options_fields {
+    enabled  = true
+    property = "sf_metric"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "Resource"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "stat"
+  }
+  legend_options_fields {
+    enabled  = false
+    property = "aws_function_version"
+  }
+  legend_options_fields {
+    enabled  = true
+    property = "ExecutedVersion"
+  }
+
+  viz_options {
+    display_name = "Version"
+    label        = "A"
+    value_unit   = "Millisecond"
+  }
+}
+
+# Mean duration for series that carry no ExecutedVersion dimension.
+resource "signalfx_single_value_chart" "avg_invocation_duration" {
+  name        = "Average invocation duration"
+  unit_prefix = "Metric"
+  color_by    = "Metric"
+
+  program_text = <<-EOF
+A = data('Duration', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'mean') and filter('Resource', '*') and (not filter('ExecutedVersion', '*')), rollup='average').publish(label='A')
+EOF
+
+  max_precision = 5
+
+  viz_options {
+    display_name = "Duration (ms)"
+    label        = "A"
+    value_unit   = "Millisecond"
+  }
+}
+
+resource "signalfx_time_chart" "throttles_by_version" {
+  name         = "Throttles by version"
+
description = "The number of Lambda function invocation attempts that were throttled due to invocation rates exceeding the customer’s concurrent limits (error code 429)." + program_text = <<-EOF +A = data('Throttles', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='sum').sum(by=['ExecutedVersion']).publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = true + + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + + time_range = 900 + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "aws_function_version" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + + viz_options { + axis = "left" + display_name = "Throttles by version" + label = "A" + } + +} + +resource "signalfx_time_chart" "invocations_by_version" { + name = "Invocations by version" + description = "The number of times a function is invoked in response to an event or invocation API call." 
+ program_text = <<-EOF +A = data('Invocations', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and filter('ExecutedVersion', '*'), rollup='sum').sum(by=['ExecutedVersion']).publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = true + + + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + time_range = 900 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = false + property = "aws_function_version" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + + viz_options { + axis = "left" + display_name = "Invocations by version" + label = "A" + value_suffix = "-invocations" + } + +} + +resource "signalfx_time_chart" "invocations" { + name = "Invocations" + description = "The number of times a function is invoked in response to an event or invocation API call and associated errors or throttles." 
+ program_text = <<-EOF +A = data('Invocations', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and (not filter('ExecutedVersion', '*')), rollup='sum').sum().publish(label='A') +EOF + + plot_type = "AreaChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = false + + axes_precision = 0 + on_chart_legend_dimension = "plot_label" + + time_range = 900 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = false + property = "aws_function_version" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + + viz_options { + axis = "left" + display_name = "Invocations" + label = "A" + } +} + +resource "signalfx_single_value_chart" "total_errors" { + name = "Total errors" + description = "Over 5m" + unit_prefix = "Metric" + color_by = "Dimension" + + program_text = <<-EOF +A = data('Errors', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and (not filter('ExecutedVersion', '*')), rollup='sum', extrapolation='zero').sum(over='5m').sum().publish(label='A') +EOF + + viz_options { + color = "brown" + display_name = "Errors" + label = "A" + } +} + +resource "signalfx_time_chart" "provisioned_concurrency_utilization" { + name = "Provisioned concurrency utilization" + description = "The number of events that are being processed on provisioned concurrency, divided by the total amount 
of provisioned concurrency allocated. For example, .5 indicates that 50 percent of allocated provisioned concurrency is in use. For each invocation of an alias or version with provisioned concurrency, Lambda emits the current count." + program_text = <<-EOF +A = data('ProvisionedConcurrencyUtilization', filter=filter('stat', 'upper') and filter('Resource', '*') and filter('ExecutedVersion', '*')).scale(100).publish(label='A') +EOF + + plot_type = "LineChart" + unit_prefix = "Metric" + color_by = "Dimension" + timezone = "UTC" + stacked = false + + axes_include_zero = true + axes_precision = 0 + on_chart_legend_dimension = "ExecutedVersion" + + time_range = 900 + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "FunctionName" + } + legend_options_fields { + enabled = true + property = "ExecutedVersion" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "Resource" + } + legend_options_fields { + enabled = false + property = "stat" + } + + viz_options { + axis = "left" + display_name = "Provisioned concurrency utilization" + label = "A" + value_suffix = "%" + } +} + +resource "signalfx_single_value_chart" "total_invocations" { + name = "Total invocations" + description = "Over 5m" + unit_prefix = "Metric" + color_by = "Dimension" + + program_text = <<-EOF +A = data('Invocations', filter=filter('namespace', 'AWS/Lambda') and filter('stat', 'sum') and filter('Resource', '*') and (not filter('ExecutedVersion', '*')), rollup='sum').sum(over='5m').sum().publish(label='A') +EOF + + viz_options { + color = "chartreuse" + display_name = "Invocations - sum(5m) - sum" + label = "A" + } +} + +resource "signalfx_dashboard" "lambda" { + name = "Lambdas" + 
description = "Forge CICD Lambda invocation rate, errors, duration, and concurrency." + dashboard_group = var.dashboard_group + + variable { + property = "aws_tag_TenantName" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + + time_range = "-1h" + + chart { + chart_id = signalfx_time_chart.invocations.id + column = 0 + row = 1 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.total_invocations.id + column = 0 + row = 0 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.avg_invocation_duration.id + column = 3 + row = 0 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.total_spillover_invocations.id + column = 6 + row = 0 + width = 2 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.total_errors.id + column = 8 + row = 0 + width = 2 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.total_throttles.id + column = 10 + row = 0 + width = 2 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.invocations_by_version.id + column = 3 + row = 1 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.provisioned_concurrency_invocations_by_version.id + column = 6 + row = 1 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.provisioned_concurrent_executions_by_version.id + column = 9 + row = 1 + width = 3 + height = 1 + } + + chart { + chart_id = 
signalfx_list_chart.avg_duration_by_version.id + column = 0 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.errors_by_version.id + column = 4 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.throttles_by_version.id + column = 8 + row = 2 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.percent_invocations_by_version.id + column = 0 + row = 3 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.provisioned_concurrency_spillover_invocations_by_version.id + column = 4 + row = 3 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.provisioned_concurrency_utilization.id + column = 8 + row = 3 + width = 4 + height = 1 + } +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." 
+ type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/lambda/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/README.md new file mode 100644 index 00000000..bbb56528 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/README.md @@ -0,0 +1,58 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.runner_ec2](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_list_chart.chart_active_hosts_by_availability_zone](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_active_hosts_per_instance_type](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_disk_metrics_24h_change](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_disk_summary_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_top_5_network_in_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_top_5_network_out_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_top_images_by_mean_cpu_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_top_instances_by_cpu_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_top_memory_page_swaps_sec](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.chart_total_network_errors](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| 
[signalfx_single_value_chart.chart_active_hosts](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.chart_hosts_with_agent_installed](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.chart_cpu_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_disk_io_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_disk_ops](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_disk_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_memory_utilization](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_network_in_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_network_in_bytes_vs_24h_change](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_network_out_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_network_out_bytes_vs_24h_change](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.chart_total_memory_overview_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + 
+## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. | <pre>list(object({<br/>    property = string<br/>    alias = string<br/>    description = string<br/>    values = list(string)<br/>    value_required = bool<br/>    values_suggested = list(string)<br/>    restricted_suggestions = bool<br/>  }))</pre> | `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/main.tf new file mode 100644 index 00000000..2068d11d --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/main.tf @@ -0,0 +1,1703 @@ +resource "signalfx_time_chart" "chart_disk_ops" { + name = "# Disk ops" + + program_text = <<-EOF +A = data('^aws.ec2.disk.ops.write.total', extrapolation='last_value', maxExtrapolations=5).sum().publish(label='A') +B = data('^aws.ec2.disk.ops.read.total', extrapolation='last_value', maxExtrapolations=5).sum().publish(label='B') +EOF + + plot_type = "ColumnChart" + on_chart_legend_dimension = "plot_label" + time_range = 3600 + + axes_precision = 0 + + axis_left { + min_value = 0 + } + axis_right { + min_value = 0 + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Write ops" + label = "A" + } + viz_options { + axis = "right" + color = "orange" + display_name = "Read ops" + label = "B" + } +} + +resource "signalfx_time_chart" "chart_total_memory_overview_bytes" { + name = "Total memory overview (bytes)" + description = "From hosts with agent installed" + + program_text = <<-EOF +C = data('system.memory.usage', filter=filter('state', 'free') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='C') +F = data('system.memory.usage', filter=filter('state', 'used') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='F') +A = data('system.memory.usage', filter=filter('state', 'buffered') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='A') +B = data('system.memory.usage', 
filter=filter('state', 'cached') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='B') +D = data('system.memory.usage', filter=filter('state', 'slab_reclaimable') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='D') +E = data('system.memory.usage', filter=filter('state', 'slab_unreclaimable') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).sum().publish(label='E') +EOF + + plot_type = "AreaChart" + + axes_precision = 4 + on_chart_legend_dimension = "plot_label" + stacked = true + unit_prefix = "Binary" + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + color = "azure" + display_name = "Cached" + label = "B" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "emerald" + display_name = "Free" + label = "C" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "pink" + display_name = "Unreclaimable" + label = "E" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "red" + display_name = "Used" + label = "F" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "violet" + display_name = "Reclaimable" + label = "D" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "yellow" + display_name = "Buffered" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" "chart_network_out_bytes_vs_24h_change" { + name = "Network out (bytes) vs. 
24h change (%)" + + program_text = <<-EOF +A = data('^aws.ec2.network.io.transmit.total', extrapolation='last_value', maxExtrapolations=5).sum().mean(over='1h').publish(label='A') +B = (A).timeshift('1d').publish(label='B', enable=False) +C = (A/B-1).scale(100).publish(label='C') +EOF + + plot_type = "ColumnChart" + + axes_precision = 0 + unit_prefix = "Binary" + on_chart_legend_dimension = "plot_label" + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + display_name = "A - timeshift 1d" + label = "B" + } + viz_options { + axis = "left" + color = "blue" + display_name = "Network out" + label = "A" + value_unit = "Byte" + } + viz_options { + axis = "right" + color = "orange" + display_name = "24h change (%)" + label = "C" + plot_type = "LineChart" + } +} + +resource "signalfx_time_chart" "chart_network_out_bytes" { + name = "Network out (bytes)" + description = "Percentile distribution across all active hosts" + + program_text = <<-EOF +A = data('^aws.ec2.network.io.transmit.total', extrapolation='last_value', maxExtrapolations=5).publish(label='A', enable=False) +B = (A).min().publish(label='B') +C = (A).percentile(pct=10).publish(label='C') +D = (A).percentile(pct=50).publish(label='D') +E = (A).percentile(pct=90).publish(label='E') +F = (A).max().publish(label='F') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + unit_prefix = "Binary" + on_chart_legend_dimension = "plot_label" + + time_range = 3600 + + axis_left { + min_value = 0 + label = "Bytes" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + display_name = "Network bytes out" + label = "A" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "azure" + display_name = "Median" + label = "D" + value_unit = "Byte" + } + viz_options { + axis = 
"left" + color = "chartreuse" + display_name = "Min" + label = "B" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "pink" + display_name = "P90" + label = "E" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "red" + display_name = "Max" + label = "F" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "yellowgreen" + display_name = "P10" + label = "C" + value_unit = "Byte" + } +} + +resource "signalfx_list_chart" "chart_top_instances_by_cpu_utilization" { + name = "Top instances by CPU utilization (%)" + description = "By AWSUniqueId" + + program_text = "A = data('^aws.ec2.cpu.utilization', extrapolation='last_value', maxExtrapolations=5).mean(by=['AWSUniqueId']).top(count=5).publish(label='A')" + + sort_by = "-value" + + color_by = "Scale" + refresh_interval = 60 + max_precision = 4 + time_range = 900 + secondary_visualization = "None" + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "aws_instance_id" + } + + viz_options { + display_name = "Top instances by CPU utilization" + label = "A" + value_suffix = "%" + } +} + +resource "signalfx_time_chart" "chart_disk_utilization" { + name = "Disk utilization (%)" + description = "Percentile distribution across active hosts with agent installed" + + program_text = <<-EOF +B = data('system.filesystem.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'used')).publish(label='B', enable=False) +C = data('system.filesystem.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'free')).publish(label='C', enable=False) +D = ((B/(B+C))*100).mean(by=['AWSUniqueId']).publish(label='D', enable=False) +E = (D).min().publish(label='E') +F = 
(D).percentile(pct=10).publish(label='F') +G = (D).percentile(pct=50).publish(label='G') +H = (D).percentile(pct=90).publish(label='H') +I = (D).max().publish(label='I') +A = alerts(autodetect_id='F6cykK5AYAA', filter=filter('aws_tag_ProductFamilyName', 'Forge MT')).publish(label='A') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + + on_chart_legend_dimension = "plot_label" + time_range = 3600 + + event_options { + display_name = "Autodetect alerts" + label = "A" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = true + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "aws_instance_id" + } + legend_options_fields { + enabled = true + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = true + property = "host.image.id" + } + legend_options_fields { + enabled = true + property = "os.type" + } + legend_options_fields { + enabled = true + property = "type" + } + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "host.type" + } + legend_options_fields { + enabled = true + property = "cloud.availability_zone" + } + legend_options_fields { + enabled = true + property = "mountpoint" + } + legend_options_fields { + enabled = true + property = "mode" + } + legend_options_fields { + enabled = true + property = "host.name" + } + legend_options_fields { + enabled = true + property = "cloud.platform" + } + legend_options_fields { + enabled = true + property = "host.id" + } + legend_options_fields { + enabled = true + property = "cloud.region" + } + legend_options_fields { + enabled = true + property = "cloud.provider" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = true + property = "cloud.account.id" + } + legend_options_fields { + enabled = 
true + property = "device" + } + legend_options_fields { + enabled = true + property = "deployment.environment" + } + legend_options_fields { + enabled = true + property = "state" + } + legend_options_fields { + enabled = true + property = "azure.resourcegroup.name" + } + legend_options_fields { + enabled = true + property = "azure.vm.name" + } + legend_options_fields { + enabled = true + property = "azure.vm.size" + } + legend_options_fields { + enabled = true + property = "azure_resource_id" + } + legend_options_fields { + enabled = true + property = "azure.vm.scaleset.name" + } + legend_options_fields { + enabled = true + property = "gcp_id" + } + + viz_options { + axis = "left" + display_name = "Disk utilization" + label = "D" + value_suffix = "%" + } + viz_options { + axis = "left" + display_name = "Free disk" + label = "C" + value_suffix = "%" + } + viz_options { + axis = "left" + display_name = "Used disk" + label = "B" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "azure" + display_name = "P50" + label = "G" + } + viz_options { + axis = "left" + color = "chartreuse" + display_name = "Min" + label = "E" + } + viz_options { + axis = "left" + color = "pink" + display_name = "P90" + label = "H" + } + viz_options { + axis = "left" + color = "red" + display_name = "Max" + label = "I" + } + viz_options { + axis = "left" + color = "yellowgreen" + display_name = "P10" + label = "F" + } +} + +resource "signalfx_list_chart" "chart_disk_metrics_24h_change" { + name = "Disk metrics 24h change (%)" + description = "Change over 24h" + + program_text = <<-EOF +A = data('^aws.ec2.disk.ops.read.total').sum().mean(over='1h').scale(60).publish(label='A', enable=False) +B = (A).timeshift('1d').publish(label='B', enable=False) +C = (A/B-1).scale(100).publish(label='C') +D = data('^aws.ec2.disk.ops.write.total').sum().mean(over='1h').scale(60).publish(label='D', enable=False) +E = (D).timeshift('1d').publish(label='E', enable=False) +F = 
(D/E-1).scale(100).publish(label='F') +G = data('^aws.ec2.disk.io.read.total').sum().mean(over='1h').scale(60).publish(label='G', enable=False) +H = (G).timeshift('1d').publish(label='H', enable=False) +I = (G/H-1).scale(100).publish(label='I') +J = data('^aws.ec2.disk.io.write.total').sum().mean(over='1h').scale(60).publish(label='J', enable=False) +K = (J).timeshift('1d').publish(label='K', enable=False) +L = (J/K-1).scale(100).publish(label='L') +EOF + + sort_by = "-value" + + color_by = "Scale" + unit_prefix = "Binary" + max_precision = 4 + secondary_visualization = "Sparkline" + time_range = 900 + refresh_interval = 60 + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + + viz_options { + display_name = "A - timeshift 1d" + label = "B" + } + viz_options { + display_name = "D - timeshift 1d" + label = "E" + } + viz_options { + display_name = "Disk I/O read" + label = "I" + value_suffix = "%" + } + viz_options { + display_name = "Disk I/O write" + label = "L" + value_suffix = "%" + } + viz_options { + display_name = "Disk ops read" + label = "C" + value_suffix = "%" + } + viz_options { + display_name = "Disk ops write" + label = "F" + value_suffix = "%" + } + viz_options { + display_name = "G - timeshift 1d" + label = "H" + } + viz_options { + display_name = "J - timeshift 1d" + label = "K" + } + viz_options { + display_name = "^aws.ec2.disk.io.read.total - sum - mean(1h) - scale:60" + label = "G" + } + viz_options { + display_name = "^aws.ec2.disk.io.write.total - sum - mean(1h) - scale:60" + label = "J" + } + viz_options { + display_name = "^aws.ec2.disk.ops.read.total - sum - mean(1h) - scale:60" + label = "A" + } + viz_options { + display_name = "^aws.ec2.disk.ops.write.total - sum - mean(1h) - scale:60" + label = "D" + } +} + +resource "signalfx_list_chart" 
"chart_top_images_by_mean_cpu_utilization" { + name = "Top images by mean CPU utilization (%)" + description = "By aws_image_id" + + program_text = <<-EOF +A = data('CPUUtilization', filter=filter('namespace', 'AWS/EC2') and filter('stat', 'mean'), extrapolation='last_value', maxExtrapolations=5).mean(by=['aws_image_id']).top(count=5).publish(label='A',enable=False) +B = data('cpu.utilization', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks'), extrapolation='last_value', maxExtrapolations=5).dimensions(renames={'aws_image_id':'host.image.id'}).mean(by=['aws_image_id']).top(count=5).publish(label='B',enable=False) +C = union(A,B).top(count=5).publish("C") +EOF + + sort_by = "-value" + + color_by = "Scale" + time_range = 900 + refresh_interval = 60 + max_precision = 4 + secondary_visualization = "None" + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "aws_image_id" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "CPU utilization - OTel" + label = "B" + } + viz_options { + display_name = "CPU utilization - cloudWatch" + label = "A" + value_suffix = "%" + } + viz_options { + display_name = "Union" + label = "C" + value_suffix = "%" + } +} + +resource "signalfx_time_chart" "chart_network_in_bytes" { + name = "Network in (bytes)" + description = "Percentile distribution across all active hosts" + + program_text = <<-EOF +A = data('^aws.ec2.network.io.receive.total', extrapolation='last_value', maxExtrapolations=5).publish(label='A', enable=False) +B = (A).min().publish(label='B') +C = (A).percentile(pct=10).publish(label='C') +D = (A).percentile(pct=50).publish(label='D') +E = (A).percentile(pct=90).publish(label='E') +F = (A).max().publish(label='F') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + unit_prefix = "Binary" + on_chart_legend_dimension = "plot_label" + + time_range = 3600 + + 
axis_left { + min_value = 0 + label = "Bytes" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + display_name = "Network bytes in" + label = "A" + } + viz_options { + axis = "left" + color = "azure" + display_name = "Median" + label = "D" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "chartreuse" + display_name = "Min" + label = "B" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "pink" + display_name = "P90" + label = "E" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "red" + display_name = "Max" + label = "F" + value_unit = "Byte" + } + viz_options { + axis = "left" + color = "yellowgreen" + display_name = "P10" + label = "C" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" "chart_memory_utilization" { + name = "Memory utilization (%)" + description = "Percentile distribution across active hosts with agent installed" + + program_text = <<-EOF +H = data('system.memory.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'used')).sum(by=['host.name']).publish(label='H', enable=False) +I = data('system.memory.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'used', 'free', 'cached', 'buffered')).sum(by=['host.name']).publish(label='I', enable=False) +J = ((H/I)*100).publish(label='J', enable=False) +C = (J).min().publish(label='C') +D = (J).percentile(pct=10).publish(label='D') +E = (J).percentile(pct=50).publish(label='E') +F = (J).percentile(pct=90).publish(label='F') +G = (J).max().publish(label='G') +A = alerts(autodetect_id='F7vC_VlAYAI', filter=filter('aws_tag_ProductFamilyName', 'Forge MT')).publish(label='A') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + + on_chart_legend_dimension = "plot_label" + time_range = 3600 + + axis_left { + max_value = 120 + min_value = 0 + } + + event_options { + 
display_name = "Autodetect alerts" + label = "A" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = true + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "cloud.region" + } + legend_options_fields { + enabled = true + property = "host.image.id" + } + legend_options_fields { + enabled = true + property = "os.type" + } + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "host.type" + } + legend_options_fields { + enabled = true + property = "cloud.availability_zone" + } + legend_options_fields { + enabled = true + property = "cloud.provider" + } + legend_options_fields { + enabled = true + property = "cloud.account.id" + } + legend_options_fields { + enabled = true + property = "host.name" + } + legend_options_fields { + enabled = true + property = "state" + } + legend_options_fields { + enabled = true + property = "cloud.platform" + } + legend_options_fields { + enabled = true + property = "host.id" + } + legend_options_fields { + enabled = true + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = true + property = "deployment.environment" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = true + property = "azure.resourcegroup.name" + } + legend_options_fields { + enabled = true + property = "azure.vm.name" + } + legend_options_fields { + enabled = true + property = "azure.vm.size" + } + legend_options_fields { + enabled = true + property = "azure_resource_id" + } + legend_options_fields { + enabled = true + property = "azure.vm.scaleset.name" + } + legend_options_fields { + enabled = true + property = "gcp_id" + } + legend_options_fields { + enabled = true + property = "telemetry.sdk.name" + } + legend_options_fields { + enabled = true + 
property = "telemetry.sdk.language" + } + legend_options_fields { + enabled = true + property = "telemetry.sdk.version" + } + legend_options_fields { + enabled = true + property = "service.name" + } + + viz_options { + axis = "left" + display_name = "Memory total" + label = "I" + } + viz_options { + axis = "left" + display_name = "Memory usage" + label = "H" + } + viz_options { + axis = "left" + display_name = "Memory utilization" + label = "J" + } + viz_options { + axis = "left" + color = "azure" + display_name = "Median" + label = "E" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "chartreuse" + display_name = "Min" + label = "C" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "pink" + display_name = "P90" + label = "F" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "red" + display_name = "Max" + label = "G" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "yellowgreen" + display_name = "P10" + label = "D" + value_suffix = "%" + } +} + +resource "signalfx_time_chart" "chart_disk_io_bytes" { + name = "Disk I/O (bytes)" + + program_text = <<-EOF +A = data('^aws.ec2.disk.io.write.total', extrapolation='last_value', maxExtrapolations=5).sum().publish(label='A') +B = data('^aws.ec2.disk.io.read.total', extrapolation='last_value', maxExtrapolations=5).sum().publish(label='B') +EOF + + plot_type = "ColumnChart" + + axes_precision = 0 + unit_prefix = "Binary" + on_chart_legend_dimension = "plot_label" + time_range = 3600 + + axis_left { + min_value = 0 + } + axis_right { + min_value = 0 + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Bytes written" + label = "A" + value_unit = "Byte" + } + viz_options { + axis = "right" + color = "orange" + display_name = "Bytes read" + label = "B" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" 
"chart_network_in_bytes_vs_24h_change" { + name = "Network in (bytes) vs. 24h change (%)" + + program_text = <<-EOF +A = data('^aws.ec2.network.io.receive.total', extrapolation='last_value', maxExtrapolations=5).sum().publish(label='A') +B = (A).mean(over='1h').publish(label='B', enable=False) +C = (B).timeshift('1d').publish(label='C', enable=False) +D = (B/C-1).scale(100).publish(label='D') +EOF + + plot_type = "ColumnChart" + + axes_precision = 0 + unit_prefix = "Binary" + + on_chart_legend_dimension = "plot_label" + + time_range = 3600 + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + display_name = "A - mean(1h)" + label = "B" + } + viz_options { + axis = "left" + color = "blue" + display_name = "Network in" + label = "A" + value_unit = "Byte" + } + viz_options { + axis = "right" + color = "orange" + display_name = "24h change (%)" + label = "D" + plot_type = "LineChart" + } + viz_options { + axis = "right" + color = "yellow" + display_name = "C" + label = "C" + plot_type = "LineChart" + } +} + +resource "signalfx_list_chart" "chart_total_network_errors" { + name = "# Total network errors" + + program_text = <<-EOF +A = data('system.network.errors', filter=filter('direction', 'receive') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).count().publish(label='A') +B = data('system.network.errors', filter=filter('direction', 'transmit') and filter('cloud.platform', 'aws_ec2', 'aws_eks')).count().publish(label='B') +EOF + + sort_by = "-value" + + color_by = "Metric" + max_precision = 4 + refresh_interval = 60 + time_range = 900 + + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + color = "blue" + display_name = "Errors with bytes in" + label = "A" + } + viz_options { + color = "orange" + display_name = "Errors with bytes out" +
label = "B" + } +} + +resource "signalfx_list_chart" "chart_top_memory_page_swaps_sec" { + name = "Top memory page swaps/sec" + description = "From hosts with agent installed" + + program_text = <<-EOF +A = data('vmpage_io.swap.in', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks')).mean(by=['host.name']).top(count=5).publish(label='A') +B = data('vmpage_io.swap.out', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks'), rollup='rate').mean(by=['host.name']).top(count=5).publish(label='B') +EOF + + sort_by = "-value" + + color_by = "Scale" + max_precision = 4 + time_range = 900 + refresh_interval = 60 + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "host.name" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "Pages swapped in" + label = "A" + } + viz_options { + display_name = "Pages swapped out" + label = "B" + } +} + +resource "signalfx_list_chart" "chart_active_hosts_per_instance_type" { + name = "# Active hosts per instance type" + description = "That reported in the last hour" + + program_text = <<-EOF +A = data('CPUUtilization', filter=filter('namespace', 'AWS/EC2') and filter('stat', 'mean'), extrapolation='last_value', maxExtrapolations=5).max(over='1h').count(by=['aws_instance_type']).publish(label='A',enable=False) +A.publish("C") +EOF + + sort_by = "-value" + + color_by = "Scale" + max_precision = 0 + secondary_visualization = "Sparkline" + time_range = 900 + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "aws_instance_type" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "CPU utilization - cloudWatch" + label = "A" + } + viz_options { + 
display_name = "Union" + label = "C" + } +} + +resource "signalfx_time_chart" "chart_cpu_utilization" { + name = "CPU utilization (%)" + description = "Percentile distribution across all active hosts" + + program_text = <<-EOF +AB = alerts(autodetect_id='F7vDCq0AgAE', filter=filter('aws_tag_ProductFamilyName', 'Forge MT')).publish(label='Autodetect alerts') +A = data('^aws.ec2.cpu.utilization', extrapolation='last_value', maxExtrapolations=5).publish(label='A', enable=False) +B = (A).min().publish(label='B') +C = (A).percentile(pct=10).publish(label='C') +D = (A).percentile(pct=50).publish(label='D') +E = (A).percentile(pct=90).publish(label='E') +F = (A).max().publish(label='F') +EOF + + plot_type = "AreaChart" + + axes_precision = 0 + on_chart_legend_dimension = "plot_label" + time_range = 3600 + + axis_left { + max_value = 110 + } + + event_options { + display_name = "Autodetect alerts" + label = "Autodetect alerts" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + axis = "left" + display_name = "CPU utilization" + label = "A" + } + viz_options { + axis = "left" + color = "azure" + display_name = "Median" + label = "D" + } + viz_options { + axis = "left" + color = "chartreuse" + display_name = "Min" + label = "B" + } + viz_options { + axis = "left" + color = "pink" + display_name = "P90" + label = "E" + } + viz_options { + axis = "left" + color = "red" + display_name = "Max" + label = "F" + } + viz_options { + axis = "left" + color = "yellowgreen" + display_name = "P10" + label = "C" + } +} + +resource "signalfx_list_chart" "chart_active_hosts_by_availability_zone" { + name = "# Active hosts by availability zone" + description = "That reported in the last hour" + + program_text = <<-EOF +A = data('CPUUtilization', filter=filter('namespace', 'AWS/EC2') and filter('stat', 'mean'), extrapolation='last_value', 
maxExtrapolations=5).max(over='1h').count(by=['aws_availability_zone']).publish(label='A',enable=False) +B = data('cpu.utilization', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks'), extrapolation='last_value', maxExtrapolations=5).dimensions(renames={'aws_availability_zone':'cloud.availability_zone'}).max(over='1h').count(by=['aws_availability_zone']).publish(label='B',enable=False) +C = union(A,B).publish("C") +EOF + + sort_by = "-value" + + color_by = "Scale" + time_range = 900 + secondary_visualization = "None" + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "aws_availability_zone" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "CPU utilization - OTel" + label = "B" + } + viz_options { + display_name = "CPU utilization - cloudWatch" + label = "A" + } + viz_options { + display_name = "Union" + label = "C" + } +} + +resource "signalfx_list_chart" "chart_disk_summary_utilization" { + name = "Disk summary utilization (%)" + description = "Percent of disk space utilized on all volumes on active hosts with agent installed. 
Instance id | Host" + + program_text = <<-EOF +A = data('system.filesystem.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'used')).publish(label='A', enable=False) +B = data('system.filesystem.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks') and filter('state', 'free')).publish(label='B', enable=False) +C = ((A/(A+B))*100).mean(by=['host.name', 'AWSUniqueId']).publish(label='C') +EOF + + sort_by = "-value" + + max_precision = 4 + time_range = 3600 + + + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "aws_instance_id" + } + legend_options_fields { + enabled = true + property = "host.name" + } + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = true + property = "host.image.id" + } + legend_options_fields { + enabled = true + property = "os.type" + } + legend_options_fields { + enabled = true + property = "type" + } + legend_options_fields { + enabled = true + property = "host.type" + } + legend_options_fields { + enabled = true + property = "cloud.availability_zone" + } + legend_options_fields { + enabled = true + property = "mountpoint" + } + legend_options_fields { + enabled = true + property = "mode" + } + legend_options_fields { + enabled = true + property = "state" + } + legend_options_fields { + enabled = true + property = "cloud.platform" + } + legend_options_fields { + enabled = true + property = "host.id" + } + legend_options_fields { + enabled = true + property = "cloud.region" + } + legend_options_fields { + enabled = true + property = "cloud.provider" + } + legend_options_fields { + enabled = true + property = "cloud.account.id" + } + legend_options_fields { + enabled = true + property = "device" + } + legend_options_fields { + enabled = true + property = "k8s.cluster.name" + } + 
legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = true + property = "deployment.environment" + } + + viz_options { + display_name = "Disk free" + label = "B" + value_suffix = "%" + } + viz_options { + display_name = "Disk summary utilization" + label = "C" + value_suffix = "%" + } + viz_options { + display_name = "Disk used" + label = "A" + value_suffix = "%" + } +} + +resource "signalfx_single_value_chart" "chart_hosts_with_agent_installed" { + name = "# Hosts with agent installed" + description = "Splunk OTel connector installed" + + program_text = "A = data('system.memory.usage', filter=filter('cloud.platform', 'aws_ec2', 'aws_eks'), rollup='average').sum(by=['AWSUniqueId']).count().publish(label='A')" + + color_by = "Dimension" + max_precision = 4 + refresh_interval = 60 + + viz_options { + display_name = "Hosts with agent installed" + label = "A" + } +} + +resource "signalfx_list_chart" "chart_top_5_network_out_bytes" { + name = "Top 5 network out (bytes)" + description = "By AWSUniqueId" + + program_text = "A = data('^aws.ec2.network.io.transmit.total', extrapolation='last_value', maxExtrapolations=5).mean(by=['AWSUniqueId']).top(count=5).publish(label='A')" + + sort_by = "-value" + + color_by = "Scale" + unit_prefix = "Binary" + time_range = 900 + max_precision = 4 + refresh_interval = 60 + secondary_visualization = "None" + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "aws_instance_id" + } + + viz_options { + display_name = "Network out" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_single_value_chart" "chart_active_hosts" { + name = "# Active hosts" + + program_text = "A = data('^aws.ec2.cpu.utilization',
extrapolation='last_value', maxExtrapolations=2).sum(by=['AWSUniqueId']).count().publish(label='A')" + + color_by = "Dimension" + max_precision = 4 + refresh_interval = 60 + + viz_options { + display_name = "# Hosts" + label = "A" + } +} + +resource "signalfx_list_chart" "chart_top_5_network_in_bytes" { + name = "Top 5 network in (bytes)" + description = "By AWSUniqueId" + + program_text = "A = data('^aws.ec2.network.io.receive.total', extrapolation='last_value', maxExtrapolations=5).mean(by=['AWSUniqueId']).top(count=5).publish(label='A')" + + sort_by = "-value" + + color_by = "Scale" + unit_prefix = "Binary" + max_precision = 4 + refresh_interval = 60 + time_range = 900 + secondary_visualization = "None" + + color_scale { + color = "blue" + gt = 0 + } + color_scale { + color = "red" + lte = 0 + } + + legend_options_fields { + enabled = true + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = false + property = "aws_instance_id" + } + + viz_options { + display_name = "Network in" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_dashboard" "runner_ec2" { + name = "EC2 Runners" + description = "EC2-based GitHub Actions runners: CPU, memory, disk, and network." 
+ dashboard_group = var.dashboard_group + + variable { + property = "aws_tag_TenantName" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + variable { + property = "aws_instance_id" + alias = "ForgeCICD Instance ID" + description = "" + values = [] + value_required = false + values_suggested = [] + restricted_suggestions = false + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + chart { + chart_id = signalfx_single_value_chart.chart_active_hosts.id + row = 0 + column = 0 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_single_value_chart.chart_hosts_with_agent_installed.id + row = 0 + column = 3 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_active_hosts_per_instance_type.id + row = 0 + column = 9 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_active_hosts_by_availability_zone.id + row = 0 + column = 6 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_cpu_utilization.id + row = 1 + column = 0 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_top_instances_by_cpu_utilization.id + row = 1 + column = 4 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_top_images_by_mean_cpu_utilization.id + row = 1 + column = 8 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_total_memory_overview_bytes.id + row = 2 + column = 4 + width = 4 + height = 1 + } + chart { + chart_id = 
signalfx_list_chart.chart_top_memory_page_swaps_sec.id + row = 2 + column = 8 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_memory_utilization.id + row = 2 + column = 0 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_disk_metrics_24h_change.id + row = 3 + column = 9 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_disk_io_bytes.id + row = 3 + column = 6 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_disk_utilization.id + row = 3 + column = 0 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_disk_ops.id + row = 3 + column = 3 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_top_5_network_in_bytes.id + row = 4 + column = 6 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_network_in_bytes_vs_24h_change.id + row = 4 + column = 9 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_disk_summary_utilization.id + row = 4 + column = 0 + width = 6 + height = 2 + } + chart { + chart_id = signalfx_list_chart.chart_top_5_network_out_bytes.id + row = 5 + column = 6 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_network_out_bytes_vs_24h_change.id + row = 5 + column = 9 + width = 3 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_network_out_bytes.id + row = 6 + column = 8 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_time_chart.chart_network_in_bytes.id + row = 6 + column = 0 + width = 4 + height = 1 + } + chart { + chart_id = signalfx_list_chart.chart_total_network_errors.id + row = 6 + column = 4 + width = 4 + height = 1 + } + +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ 
b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." + type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_ec2/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/README.md new file mode 100644 index 00000000..0dd4bf76 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/README.md @@ -0,0 +1,46 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.runner_k8s](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_list_chart.k8s_network_errors_per_sec](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.k8s_pods_by_phase](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.k8s_top_10_cpu_usage_per_pod](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.k8s_top_10_pods_by_avg_memory_usage](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_single_value_chart.k8s_active_pods](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.k8s_available_pods_by_deployments](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_single_value_chart.k8s_desired_pods_by_deployments](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.k8s_memory_usage_bytes](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.k8s_memory_usage_pct](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.k8s_network_bytes_per_sec](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | 
+|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/main.tf new file mode 100644 index 00000000..35c559e2 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/main.tf @@ -0,0 +1,555 @@ +resource "signalfx_single_value_chart" "k8s_available_pods_by_deployments" { + name = "# Available pods by deployments" + description = "Number of pods ready by deployments" + + program_text = "A = data('k8s.deployment.available', rollup='latest').sum(by=['k8s.cluster.name', 'k8s.namespace.name', 'k8s.deployment.name']).sum().publish(label='A')" + + color_by = "Dimension" + refresh_interval = 5 + + viz_options { + display_name = "Available pods" + label = "A" + } +} + +resource "signalfx_list_chart" "k8s_top_10_cpu_usage_per_pod" { + name = "Top 10 CPU usage per pod (CPU units)" + description = "Pod name | Node name" + + program_text = <<-EOF +A = data('container_cpu_utilization', rollup='rate').mean(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).scale(0.01).top(count=10).publish(label='A') +B = data('container.cpu.time').mean(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).top(count=10).publish(label='B') +EOF + + sort_by = "-value" + + disable_sampling = true + hide_missing_values = true + max_precision = 5 + refresh_interval = 5 + time_range = 900 + + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "k8s.pod.name" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "k8s.cluster.name" + } + 
legend_options_fields { + enabled = false + property = "k8s.pod.uid" + } + + viz_options { + display_name = "OTel pre translated CPU usage" + label = "B" + } + viz_options { + display_name = "Top 10 pods by average CPU usage" + label = "A" + } +} + +resource "signalfx_time_chart" "k8s_network_bytes_per_sec" { + name = "Network bytes / sec" + description = "" + + program_text = "A = data('k8s.pod.network.io', filter=filter('k8s.cluster.name', '*') and filter('k8s.namespace.name', '*') and filter('sf_tags', '*', match_missing=True) and filter('k8s.deployment.name', '*', match_missing=True), rollup='rate', extrapolation='zero').sum(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).publish(label='A')" + + plot_type = "ColumnChart" + + time_range = 900 + unit_prefix = "Binary" + + axes_precision = 0 + + disable_sampling = true + axis_left { + high_watermark = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Rx bytes /sec (RED)" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = 0 + } + + axis_right { + high_watermark = 
179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "Tx bytes /sec (BLUE)" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + min_value = 0 + } + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "brown" + display_name = "Network bytes / sec" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_single_value_chart" "k8s_desired_pods_by_deployments" { + name = "# Desired pods by deployments" + description = "Number of pods that should be created by deployments" + + program_text = "A = data('k8s.deployment.desired', rollup='latest').sum(by=['k8s.cluster.name', 'k8s.namespace.name', 'k8s.deployment.name']).sum().publish(label='A')" + + color_by = "Dimension" + refresh_interval = 5 + + viz_options { + display_name = "Desired pods by deployments" + label = "A" + } +} + +resource "signalfx_list_chart" "k8s_network_errors_per_sec" { + name = "Network errors / sec" + description = "" + + program_text = "A = data('k8s.pod.network.errors', filter=filter('k8s.cluster.name', '*') and filter('k8s.namespace.name', '*') and filter('k8s.deployment.name', '*', 
match_missing=True) and filter('sf_tags', '*', match_missing=True), rollup='rate').sum(by=['k8s.pod.name', 'k8s.cluster.name', 'k8s.node.name', 'k8s.pod.uid']).publish(label='A')" + + sort_by = "-value" + + disable_sampling = true + max_precision = 4 + refresh_interval = 5 + time_range = 900 + + legend_options_fields { + enabled = true + property = "k8s.pod.name" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = false + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = false + property = "k8s.pod.uid" + } + + viz_options { + color = "brown" + display_name = "Network error / sec" + label = "A" + } +} + +resource "signalfx_time_chart" "k8s_memory_usage_pct" { + name = "Memory usage (%)" + description = "With EKS/Fargate metric data can possibly go >100%" + + program_text = <<-EOF +A = data('container.memory.usage', filter=filter('k8s.cluster.name', '*') and filter('k8s.namespace.name', '*') and filter('k8s.deployment.name', '*', match_missing=True) and filter('sf_tags', '*', match_missing=True)).sum(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).publish(label='A', enable=False) +B = data('k8s.container.memory_limit', filter=filter('k8s.cluster.name', '*') and filter('k8s.namespace.name', '*') and filter('k8s.deployment.name', '*', match_missing=True) and filter('sf_tags', '*', match_missing=True)).sum(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).above(0, inclusive=True).publish(label='B', enable=False) +C = (A/B*100).publish(label='C') +EOF + + plot_type = "LineChart" + + + time_range = 900 + disable_sampling = true + axes_precision = 0 + axes_include_zero = true + + + axis_left { + high_watermark = 
179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + label = "% memory used" + low_watermark = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + max_value = 110 + min_value = -179769313486231570000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "k8s.pod.name" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = true + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = true + property = "k8s.pod.uid" + } + + viz_options { + axis = "left" + display_name = "Memory used (%)" + label = "C" + value_suffix = "%" + } + viz_options { + axis = "left" + color = "blue" + display_name = "Container" + label = "A" + } + viz_options { + axis = "left" + color = "yellow" + display_name = "Limit" + label = "B" + } +} + +resource "signalfx_single_value_chart" "k8s_active_pods" { + name = "# Active pods" + description = "This may include \"pause\" containers used internally by k8s" + + program_text = "A = data('k8s.pod.phase').between(1.5, 2.5, low_inclusive=True, 
high_inclusive=True).count().publish(label='A')" + + color_by = "Dimension" + refresh_interval = 5 + + viz_options { + display_name = "Number of pods" + label = "A" + } +} + +resource "signalfx_list_chart" "k8s_top_10_pods_by_avg_memory_usage" { + name = "Top 10 pods by average memory usage (bytes)" + description = "Pod name | Node name" + + program_text = "A = data('container.memory.usage', filter=filter('k8s.cluster.name', '*') and filter('k8s.namespace.name', '*') and filter('k8s.deployment.name', '*', match_missing=True) and filter('sf_tags', '*', match_missing=True)).mean(by=['k8s.pod.name', 'k8s.node.name', 'k8s.cluster.name', 'k8s.pod.uid']).top(count=10).publish(label='A')" + + sort_by = "-value" + + disable_sampling = true + unit_prefix = "Binary" + refresh_interval = 5 + max_precision = 4 + secondary_visualization = "None" + time_range = 900 + + legend_options_fields { + enabled = true + property = "k8s.pod.name" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + legend_options_fields { + enabled = false + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = false + property = "k8s.pod.uid" + } + + viz_options { + display_name = "Top 10 pods by average memory usage" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_list_chart" "k8s_pods_by_phase" { + name = "# Pods by phase" + description = "" + + program_text = <<-EOF +B = data('k8s.pod.phase', rollup='latest').between(1.5, 2.5, low_inclusive=True, high_inclusive=True).count().publish(label='B') +A = data('k8s.pod.phase', rollup='latest').between(0, 1.5, low_inclusive=True, high_inclusive=True).count().publish(label='A') +C = data('k8s.pod.phase', rollup='latest').between(2.5, 3.5, low_inclusive=True, high_inclusive=True).count().publish(label='C') +D = data('k8s.pod.phase', 
rollup='latest').between(3.5, 4.5, low_inclusive=True, high_inclusive=True).count().publish(label='D') +E = data('k8s.pod.phase', rollup='latest').between(4.5, 5.5, low_inclusive=True, high_inclusive=True).count().publish(label='E') +EOF + + sort_by = "+sf_originatingMetric" + + disable_sampling = true + max_precision = 4 + refresh_interval = 5 + time_range = 900 + + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = true + property = "sf_metric" + } + + viz_options { + color = "azure" + display_name = "Succeeded" + label = "C" + value_suffix = "pods" + } + viz_options { + color = "brown" + display_name = "Failed" + label = "D" + value_suffix = "pods" + } + viz_options { + color = "purple" + display_name = "Unknown" + label = "E" + value_suffix = "pods" + } + viz_options { + color = "yellow" + display_name = "Pending" + label = "A" + value_suffix = "pods" + } + viz_options { + color = "yellowgreen" + display_name = "Running" + label = "B" + value_suffix = "pods" + } +} + +resource "signalfx_time_chart" "k8s_memory_usage_bytes" { + name = "Memory usage (bytes)" + description = "" + + program_text = "A = data('container.memory.usage', filter=filter('k8s.node.name', '*')).sum(by=['k8s.cluster.name', 'k8s.namespace.name', 'k8s.pod.uid', 'k8s.pod.name', 'k8s.node.name']).publish(label='A')" + + plot_type = "LineChart" + + + axes_precision = 0 + disable_sampling = true + time_range = 900 + unit_prefix = "Binary" + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = true + property = "kubernetes_pod_name" + } + legend_options_fields { + enabled = true + property = "kubernetes_namespace" + } + legend_options_fields { + enabled = true + property = "kubernetes_cluster" + } + legend_options_fields { + enabled = true + property = "kubernetes_pod_uid" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled 
= false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "k8s.namespace.name" + } + legend_options_fields { + enabled = true + property = "k8s.pod.name" + } + legend_options_fields { + enabled = true + property = "k8s.cluster.name" + } + legend_options_fields { + enabled = true + property = "k8s.pod.uid" + } + legend_options_fields { + enabled = true + property = "k8s.node.name" + } + + viz_options { + axis = "left" + display_name = "Memory usage per pod" + label = "A" + value_unit = "Byte" + } +} + + +resource "signalfx_dashboard" "runner_k8s" { + name = "K8S Runners" + description = "Kubernetes-based runners: pod states, CPU, memory, and network health." + dashboard_group = var.dashboard_group + + variable { + property = "k8s.namespace.name" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + variable { + property = "k8s.pod.name" + alias = "ForgeCICD Instance Id" + description = "" + values = [] + value_required = false + values_suggested = [] + restricted_suggestions = false + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + chart { + chart_id = signalfx_single_value_chart.k8s_active_pods.id + row = 0 + column = 0 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.k8s_available_pods_by_deployments.id + row = 0 + column = 3 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.k8s_top_10_pods_by_avg_memory_usage.id + row = 0 + column = 9 + width = 3 + height = 2 + } + + chart { + chart_id = 
signalfx_single_value_chart.k8s_desired_pods_by_deployments.id + row = 0 + column = 6 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.k8s_pods_by_phase.id + row = 1 + column = 0 + width = 3 + height = 2 + } + + chart { + chart_id = signalfx_time_chart.k8s_memory_usage_pct.id + row = 1 + column = 3 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.k8s_memory_usage_bytes.id + row = 1 + column = 6 + width = 3 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.k8s_network_errors_per_sec.id + row = 2 + column = 3 + width = 5 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.k8s_network_bytes_per_sec.id + row = 2 + column = 8 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.k8s_top_10_cpu_usage_per_pod.id + row = 3 + column = 0 + width = 3 + height = 2 + } +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." 
+ type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/runner_k8s/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/README.md b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/README.md new file mode 100644 index 00000000..0ccf41e5 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/README.md @@ -0,0 +1,45 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [signalfx](#requirement\_signalfx) | < 10.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [signalfx](#provider\_signalfx) | 9.25.1 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [signalfx_dashboard.sqs](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/dashboard) | resource | +| [signalfx_list_chart.oldest_message_age](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.top_queues_by_message_received](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_list_chart.top_queues_by_message_sent](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/list_chart) | resource | +| [signalfx_single_value_chart.queues](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/single_value_chart) | resource | +| [signalfx_time_chart.empty_receives](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.message_processing_trend](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.messages_by_state](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.messages_deleted](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | +| [signalfx_time_chart.sent_message_size](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/time_chart) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [dashboard\_group](#input\_dashboard\_group) | Dashboard group name for organizing dashboards. | `string` | n/a | yes | +| [dynamic\_variables](#input\_dynamic\_variables) | Additional dynamic variable definitions for the dashboard. |
list(object({
property = string
alias = string
description = string
values = list(string)
value_required = bool
values_suggested = list(string)
restricted_suggestions = bool
}))
| `[]` | no | +| [tenant\_names](#input\_tenant\_names) | List of tenant names used for the dashboard. | `list(string)` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/main.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/main.tf new file mode 100644 index 00000000..3806a766 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/main.tf @@ -0,0 +1,452 @@ +resource "signalfx_single_value_chart" "queues" { + name = "# Queues" + description = "Shows how many SQS queues are being monitored" + + program_text = "A = data('ApproximateAgeOfOldestMessage', rollup='latest').count(by=['QueueName']).count().publish(label='A')" + + max_precision = 4 + unit_prefix = "Metric" + color_by = "Dimension" + show_spark_line = false + + viz_options { + display_name = "Number of SQS queues monitored" + label = "A" + } +} + +resource "signalfx_list_chart" "top_queues_by_message_sent" { + name = "Top queues by message sent" + description = "Ranks queues by number of sent messages" + + program_text = "A = data('NumberOfMessagesSent', rollup='latest').sum(by=['QueueName']).top(count=5).publish(label='A')" + sort_by = "-value" + + disable_sampling = false + hide_missing_values = true + max_precision = 4 + time_range = 900 + unit_prefix = "Metric" + secondary_visualization = "None" + + legend_options_fields { + enabled = true + property = "QueueName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "Top queues by messages sent" + label = "A" + } +} + +resource "signalfx_time_chart" "sent_message_size" { + name = "Sent message size" + description = "Tracks the size of sent messages over time."
+ + program_text = <<-EOF +A = data('SentMessageSize', filter=filter('namespace', 'AWS/SQS')).sum(over=Args.get('ui.dashboard_window', '15m')).publish(label='A') +EOF + + plot_type = "LineChart" + disable_sampling = true + show_event_lines = false + stacked = false + time_range = 900 + unit_prefix = "Metric" + + axes_precision = 0 + + axis_left { + label = "Bytes" + } + + histogram_options { + color_theme = "gold" + } + + legend_options_fields { + enabled = false + property = "AWSUniqueId" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "QueueName" + } + legend_options_fields { + enabled = false + property = "stat" + } + + viz_options { + display_name = "Sent message size over time" + label = "A" + value_unit = "Byte" + } +} + +resource "signalfx_time_chart" "messages_by_state" { + name = "Messages by state" + description = "Shows delayed, visible, and in-flight message breakdown." 
+ + program_text = <<-EOF +A = data('ApproximateNumberOfMessagesDelayed', rollup='latest').sum().publish(label='A') +B = data('ApproximateNumberOfMessagesVisible', rollup='latest').sum().publish(label='B') +C = data('ApproximateNumberOfMessagesNotVisible', rollup='latest').sum().publish(label='C') +EOF + + plot_type = "AreaChart" + disable_sampling = false + show_event_lines = false + stacked = true + time_range = 900 + unit_prefix = "Metric" + axes_precision = 0 + + on_chart_legend_dimension = "plot_label" + + axis_left { + label = "#Messages" + } + + histogram_options { + color_theme = "gold" + } + + viz_options { + axis = "left" + color = "blue" + display_name = "Delayed messages" + label = "A" + value_suffix = "No of messages" + } + viz_options { + axis = "left" + color = "emerald" + display_name = "Visible messages" + label = "B" + value_suffix = "No of messages" + } + viz_options { + axis = "left" + color = "pink" + display_name = "In-flight messages" + label = "C" + value_suffix = "No of messages" + } +} + +resource "signalfx_list_chart" "oldest_message_age" { + name = "Oldest message age" + description = "Displays the max age of the oldest unprocessed message in seconds" + + program_text = "A = data('ApproximateAgeOfOldestMessage', filter=filter('namespace', 'AWS/SQS') and filter('stat', 'mean')).sum(by=['QueueName', 'aws_region']).publish(label='A')" + + disable_sampling = false + hide_missing_values = true + time_range = 900 + unit_prefix = "Metric" + + secondary_visualization = "None" + sort_by = "-value" + + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "namespace" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + legend_options_fields { + enabled = true + property = "QueueName" + } + legend_options_fields { + enabled = false + property = "stat" + } + legend_options_fields { + enabled = true + property = "aws_region" + } + + 
viz_options { + display_name = "ApproximateAgeOfOldestMessage - Sum by QueueName,aws_region" + label = "A" + } +} + +resource "signalfx_time_chart" "empty_receives" { + name = "# Empty receives" + description = "Tracks ReceiveMessage API calls returning zero messages" + + program_text = "A = data('NumberOfEmptyReceives', rollup='latest').sum().publish(label='A')" + + plot_type = "LineChart" + disable_sampling = false + show_event_lines = false + stacked = false + time_range = 900 + unit_prefix = "Metric" + axes_precision = 0 + + histogram_options { + color_theme = "gold" + } + + axis_left { + label = "# Calls" + } + + viz_options { + axis = "left" + color = "brown" + display_name = "Number of empty receives" + label = "A" + value_suffix = "No of receives" + } +} + +resource "signalfx_list_chart" "top_queues_by_message_received" { + name = "Top queues by message received" + description = "Ranks queues by number of received messages" + + program_text = "A = data('NumberOfMessagesReceived', rollup='latest').sum(by=['QueueName']).top(count=5).publish(label='A')" + + sort_by = "-value" + + disable_sampling = false + hide_missing_values = true + max_precision = 4 + time_range = 900 + unit_prefix = "Metric" + secondary_visualization = "None" + + legend_options_fields { + enabled = true + property = "QueueName" + } + legend_options_fields { + enabled = false + property = "sf_originatingMetric" + } + legend_options_fields { + enabled = false + property = "sf_metric" + } + + viz_options { + display_name = "Top queues by messages received" + label = "A" + } +} + +resource "signalfx_time_chart" "message_processing_trend" { + name = "Message processing trend" + description = "Tracks messages sent, received, and deleted over time."
+ + program_text = <<-EOF +A = data('NumberOfMessagesSent', rollup='latest').sum().publish(label='A') +B = data('NumberOfMessagesReceived', rollup='latest').sum().publish(label='B') +C = data('NumberOfMessagesDeleted', rollup='latest').sum().publish(label='C') +EOF + + plot_type = "AreaChart" + disable_sampling = false + show_event_lines = false + time_range = 900 + unit_prefix = "Metric" + axes_precision = 0 + + on_chart_legend_dimension = "plot_label" + + axis_left { + label = "Count" + } + + histogram_options { + color_theme = "gold" + } + + + viz_options { + axis = "left" + color = "azure" + display_name = "Messages received" + label = "B" + value_suffix = "No of messages" + } + viz_options { + axis = "left" + color = "blue" + display_name = "Messages sent" + label = "A" + value_suffix = "No of messages" + } + viz_options { + axis = "left" + color = "orange" + display_name = "Messages deleted" + label = "C" + value_suffix = "No of messages" + } +} + +resource "signalfx_time_chart" "messages_deleted" { + name = "# Messages deleted" + description = "Displays messages successfully deleted from queues" + + program_text = "A = data('NumberOfMessagesDeleted', rollup='latest').sum().publish(label='A')" + + plot_type = "LineChart" + disable_sampling = false + show_event_lines = false + stacked = false + time_range = 900 + unit_prefix = "Metric" + axes_precision = 0 + + axis_left { + label = "# Messages" + } + + histogram_options { + color_theme = "gold" + } + viz_options { + axis = "left" + color = "emerald" + display_name = "Number of messages deleted" + label = "A" + value_suffix = "No of messages" + } +} + +resource "signalfx_dashboard" "sqs" { + name = "SQS" + description = "SQS queue counts, message states, sizes, and processing trends." 
+ + dashboard_group = var.dashboard_group + + variable { + property = "aws_tag_TenantName" + alias = "ForgeCICD Tenant Name" + description = "" + values = [] + value_required = false + values_suggested = var.tenant_names + restricted_suggestions = true + } + + dynamic "variable" { + for_each = var.dynamic_variables + iterator = var_def + + content { + property = var_def.value.property + alias = var_def.value.alias + description = var_def.value.description + values = var_def.value.values + value_required = var_def.value.value_required + values_suggested = var_def.value.values_suggested + restricted_suggestions = var_def.value.restricted_suggestions + } + } + + chart { + chart_id = signalfx_time_chart.message_processing_trend.id + column = 4 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.sent_message_size.id + column = 8 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_single_value_chart.queues.id + column = 0 + row = 0 + width = 4 + height = 1 + } + + chart { + chart_id = signalfx_list_chart.top_queues_by_message_received.id + column = 0 + row = 1 + width = 4 + height = 2 + } + + chart { + chart_id = signalfx_list_chart.oldest_message_age.id + column = 8 + row = 1 + width = 4 + height = 2 + } + + chart { + chart_id = signalfx_list_chart.top_queues_by_message_sent.id + column = 4 + row = 1 + width = 4 + height = 2 + } + + chart { + chart_id = signalfx_time_chart.messages_deleted.id + column = 6 + row = 3 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.messages_by_state.id + column = 0 + row = 3 + width = 6 + height = 1 + } + + chart { + chart_id = signalfx_time_chart.empty_receives.id + column = 0 + row = 4 + width = 12 + height = 1 + } +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/variables.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/variables.tf new file mode 100644 index 00000000..d926894e --- /dev/null +++ 
b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/variables.tf @@ -0,0 +1,24 @@ + +variable "tenant_names" { + description = "List of tenant names used for the dashboard." + type = list(string) +} + +variable "dynamic_variables" { + description = "Additional dynamic variable definitions for the dashboard." + type = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + })) + default = [] +} + +variable "dashboard_group" { + description = "Dashboard group name for organizing dashboards." + type = string +} diff --git a/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/versions.tf b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/versions.tf new file mode 100644 index 00000000..dded539f --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/dashboards/sqs/versions.tf @@ -0,0 +1,12 @@ +terraform { + # Provider versions. + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/providers.tf b/modules/integrations/splunk_o11y_conf_shared/providers.tf new file mode 100644 index 00000000..6ac7d5f0 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/providers.tf @@ -0,0 +1,17 @@ +provider "aws" { + region = var.aws_region + profile = var.aws_profile + + # Required, as per security guidelines. 
+ default_tags { + tags = merge(var.default_tags, ) + } +} + +provider "signalfx" { + api_url = var.splunk_api_url + + email = data.aws_secretsmanager_secret_version.secrets["splunk_o11y_username"].secret_string + password = data.aws_secretsmanager_secret_version.secrets["splunk_o11y_password"].secret_string + organization_id = var.splunk_organization_id +} diff --git a/modules/integrations/splunk_o11y_conf_shared/secrets.tf b/modules/integrations/splunk_o11y_conf_shared/secrets.tf new file mode 100644 index 00000000..42b5e7c9 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/secrets.tf @@ -0,0 +1,20 @@ +locals { + secrets = { + splunk_o11y_username = { + name = "/cicd/common/splunk_o11y_username" + } + splunk_o11y_password = { + name = "/cicd/common/splunk_o11y_password" + } + } +} + +data "aws_secretsmanager_secret" "secrets" { + for_each = local.secrets + name = each.value.name +} + +data "aws_secretsmanager_secret_version" "secrets" { + for_each = data.aws_secretsmanager_secret.secrets + secret_id = each.value.id +} diff --git a/modules/integrations/splunk_o11y_conf_shared/variables.tf b/modules/integrations/splunk_o11y_conf_shared/variables.tf new file mode 100644 index 00000000..5bed82d1 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/variables.tf @@ -0,0 +1,126 @@ +variable "aws_profile" { + type = string + description = "AWS profile to use." +} + +variable "aws_region" { + type = string + description = "Default AWS region." +} + +variable "default_tags" { + type = map(string) + description = "A map of tags to apply to resources." +} + +variable "splunk_api_url" { + description = "URL for Splunk Observability Cloud API." + type = string +} + +variable "splunk_organization_id" { + description = "organization ID for Splunk Observability Cloud."
+ type = string +} + +variable "team" { + description = "Team ID" + type = string +} + +variable "dashboard_variables" { + type = object({ + runner_k8s = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + runner_ec2 = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + billing = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + sqs = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + ebs = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + lambda = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + dynamodb = object({ + tenant_names = list(string) + dynamic_variables = list(object({ + property = string + alias = string + description = string + values = list(string) + value_required = bool + 
values_suggested = list(string) + restricted_suggestions = bool + } + )) + }) + }) + description = "Variables for Dashboards" +} diff --git a/modules/integrations/splunk_o11y_conf_shared/versions.tf b/modules/integrations/splunk_o11y_conf_shared/versions.tf new file mode 100644 index 00000000..50012663 --- /dev/null +++ b/modules/integrations/splunk_o11y_conf_shared/versions.tf @@ -0,0 +1,16 @@ +terraform { + # Provider versions. + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 6.25" + } + signalfx = { + source = "splunk-terraform/signalfx" + version = "< 10.0.0" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/integrations/splunk_otel_eks/README.md b/modules/integrations/splunk_otel_eks/README.md index 9f1be39c..b7c2ac02 100644 --- a/modules/integrations/splunk_otel_eks/README.md +++ b/modules/integrations/splunk_otel_eks/README.md @@ -3,17 +3,17 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [helm](#provider\_helm) | 3.1.0 | +| [aws](#provider\_aws) | 6.35.1 | +| [helm](#provider\_helm) | 3.1.1 | ## Modules @@ -23,9 +23,16 @@ No modules. 
| Name | Type | |------|------| +| [aws_eks_pod_identity_association.eks_pod_identity](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_pod_identity_association) | resource | +| [aws_iam_policy.ec2_describe_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.splunk_otel_ec2_describe](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.splunk_otel_ec2_describe](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [helm_release.splunk_otel_collector](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | | [aws_eks_cluster.cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_eks_cluster_auth.cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_iam_openid_connect_provider.cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_openid_connect_provider) | data source | +| [aws_iam_policy_document.ec2_describe_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.splunk_otel_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | | [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | @@ -38,6 +45,7 @@ No modules. 
| [cluster\_name](#input\_cluster\_name) | The name of the EKS cluster | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [splunk\_otel\_collector](#input\_splunk\_otel\_collector) | Configuration for the Splunk OpenTelemetry Collector |
object({
splunk_observability_realm = string
splunk_platform_endpoint = string
splunk_platform_index = string
gateway = bool
splunk_observability_profiling = bool
environment = string
discovery = bool
})
| n/a | yes | +| [tags](#input\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | ## Outputs diff --git a/modules/integrations/splunk_otel_eks/iam.tf b/modules/integrations/splunk_otel_eks/iam.tf new file mode 100644 index 00000000..0792e394 --- /dev/null +++ b/modules/integrations/splunk_otel_eks/iam.tf @@ -0,0 +1,84 @@ +data "aws_iam_openid_connect_provider" "cluster" { + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +} + +locals { + oidc_provider_arn = data.aws_iam_openid_connect_provider.cluster.arn +} + +data "aws_iam_policy_document" "splunk_otel_assume_role" { + + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["pods.eks.amazonaws.com"] + } + + actions = [ + "sts:AssumeRole", + "sts:TagSession" + ] + } + + statement { + effect = "Allow" + + principals { + type = "Federated" + identifiers = [ + local.oidc_provider_arn + ] + } + + actions = ["sts:AssumeRoleWithWebIdentity"] + + condition { + test = "StringEquals" + variable = "${regex("^arn:aws:iam::\\d+:oidc-provider/(.+)$", local.oidc_provider_arn)[0]}:sub" + values = [ + "system:serviceaccount:splunk-otel-collector:splunk-otel-collector", + ] + } + } +} + +resource "aws_iam_role" "splunk_otel_ec2_describe" { + name = "splunk-otel-${var.cluster_name}-ec2-describe-role" + assume_role_policy = data.aws_iam_policy_document.splunk_otel_assume_role.json + + tags = local.all_security_tags + tags_all = local.all_security_tags +} + +data "aws_iam_policy_document" "ec2_describe_instances" { + statement { + sid = "AllowDescribeInstances" + effect = "Allow" + actions = ["ec2:DescribeInstances"] + resources = ["*"] + } +} + +resource "aws_iam_policy" "ec2_describe_instances" { + name = "splunk-otel-${var.cluster_name}-ec2-describe-policy" + policy = data.aws_iam_policy_document.ec2_describe_instances.json + + tags = local.all_security_tags +} + +resource "aws_iam_role_policy_attachment" "splunk_otel_ec2_describe" { + role = 
aws_iam_role.splunk_otel_ec2_describe.name + policy_arn = aws_iam_policy.ec2_describe_instances.arn +} + + +resource "aws_eks_pod_identity_association" "eks_pod_identity" { + cluster_name = var.cluster_name + namespace = "splunk-otel-collector" + service_account = "splunk-otel-collector" + role_arn = aws_iam_role.splunk_otel_ec2_describe.arn + + tags = local.all_security_tags +} diff --git a/modules/integrations/splunk_otel_eks/otel.tf b/modules/integrations/splunk_otel_eks/otel.tf index 4038f1e3..e58ce71b 100644 --- a/modules/integrations/splunk_otel_eks/otel.tf +++ b/modules/integrations/splunk_otel_eks/otel.tf @@ -2,7 +2,7 @@ resource "helm_release" "splunk_otel_collector" { name = "splunk-otel-collector" repository = "https://signalfx.github.io/splunk-otel-collector-chart" chart = "splunk-otel-collector" - version = "0.140.0" + version = "0.145.1" namespace = "splunk-otel-collector" create_namespace = true diff --git a/modules/integrations/splunk_otel_eks/tags.tf b/modules/integrations/splunk_otel_eks/tags.tf new file mode 100644 index 00000000..a527e72f --- /dev/null +++ b/modules/integrations/splunk_otel_eks/tags.tf @@ -0,0 +1,4 @@ +# Common tags we propagate project-wide. +locals { + all_security_tags = merge(var.default_tags, var.tags) +} diff --git a/modules/integrations/splunk_otel_eks/variables.tf b/modules/integrations/splunk_otel_eks/variables.tf index b26f5d83..fc5eb9eb 100644 --- a/modules/integrations/splunk_otel_eks/variables.tf +++ b/modules/integrations/splunk_otel_eks/variables.tf @@ -26,6 +26,11 @@ variable "splunk_otel_collector" { }) } +variable "tags" { + type = map(string) + description = "A map of tags to apply to resources." +} + variable "default_tags" { type = map(string) description = "A map of tags to apply to resources." 
diff --git a/modules/integrations/splunk_otel_eks/versions.tf b/modules/integrations/splunk_otel_eks/versions.tf index 21e524f6..b00cedb6 100644 --- a/modules/integrations/splunk_otel_eks/versions.tf +++ b/modules/integrations/splunk_otel_eks/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } helm = { source = "hashicorp/helm" @@ -11,10 +11,10 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/splunk_secrets/versions.tf b/modules/integrations/splunk_secrets/versions.tf index 04078419..cd0b6f7a 100644 --- a/modules/integrations/splunk_secrets/versions.tf +++ b/modules/integrations/splunk_secrets/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } time = { source = "hashicorp/time" @@ -12,5 +12,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/teleport/README.md b/modules/integrations/teleport/README.md index cebb0ce4..8ca03bb4 100644 --- a/modules/integrations/teleport/README.md +++ b/modules/integrations/teleport/README.md @@ -3,17 +3,17 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [kubernetes](#provider\_kubernetes) | 2.38.0 | +| [aws](#provider\_aws) | 6.35.1 | +| [kubernetes](#provider\_kubernetes) | 3.0.1 | ## Modules @@ -25,23 +25,33 @@ | Name | Type | |------|------| -| [kubernetes_config_map.aws_auth_teleport](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource | +| [aws_iam_policy.eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.teleport_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.attach_eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [kubernetes_config_map_v1.aws_auth_teleport](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map_v1) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_eks_cluster.cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | 
[aws_eks_cluster_auth.cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_iam_policy_document.eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.trust_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e. generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | | [aws\_region](#input\_aws\_region) | Assuming single region for now. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [tags](#input\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | | [teleport\_config](#input\_teleport\_config) | Map of IAM roles to assume for teleport access, including EKS cluster ARNs and other roles. |
object({
cluster_name = string
teleport_iam_role_to_assume = string
})
| n/a | yes | -| [tenant\_prefix](#input\_tenant\_prefix) | Name of the EKS cluster | `string` | n/a | yes | | [tenants](#input\_tenants) | List of tenants to create roles for. | `list(string)` | n/a | yes | ## Outputs -No outputs. +| Name | Description | +|------|-------------| +| [teleport\_account\_id](#output\_teleport\_account\_id) | AWS account ID where Teleport role and resources are created. | +| [teleport\_cluster\_name](#output\_teleport\_cluster\_name) | EKS cluster name used by the Teleport integration. | +| [teleport\_role\_arn](#output\_teleport\_role\_arn) | ARN of the IAM role created for Teleport access to the EKS cluster. | +| [teleport\_tenant\_groups](#output\_teleport\_tenant\_groups) | Map of tenant name to Kubernetes group name used in aws-auth (teleport-). | diff --git a/modules/integrations/teleport/main.tf b/modules/integrations/teleport/main.tf index a880f88b..e664d56c 100644 --- a/modules/integrations/teleport/main.tf +++ b/modules/integrations/teleport/main.tf @@ -5,7 +5,7 @@ module "tenant" { namespace = each.value } -resource "kubernetes_config_map" "aws_auth_teleport" { +resource "kubernetes_config_map_v1" "aws_auth_teleport" { count = length(var.tenants) > 0 ? 
1 : 0 metadata { name = "aws-auth" diff --git a/modules/integrations/teleport/tenant/README.md b/modules/integrations/teleport/tenant/README.md index bbaa4e99..89a70897 100644 --- a/modules/integrations/teleport/tenant/README.md +++ b/modules/integrations/teleport/tenant/README.md @@ -3,17 +3,16 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.90 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [helm](#requirement\_helm) | >= 3.0.0 | -| [kubernetes](#requirement\_kubernetes) | >= 2.36.0 | +| [kubernetes](#requirement\_kubernetes) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [kubernetes](#provider\_kubernetes) | 2.38.0 | +| [kubernetes](#provider\_kubernetes) | 3.0.1 | ## Modules @@ -23,28 +22,18 @@ No modules. | Name | Type | |------|------| -| [aws_iam_policy.eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | -| [aws_iam_role.teleport_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | -| [aws_iam_role_policy_attachment.attach_eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | -| [kubernetes_cluster_role.impersonate](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | -| [kubernetes_cluster_role.pods](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | -| [kubernetes_cluster_role_binding.impersonate](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | -| [kubernetes_role_binding.pods](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding) | resource | -| 
[aws_iam_policy_document.eks_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | -| [aws_iam_policy_document.trust_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [kubernetes_cluster_role_binding_v1.impersonate](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding_v1) | resource | +| [kubernetes_cluster_role_v1.impersonate](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_v1) | resource | +| [kubernetes_cluster_role_v1.pods](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_v1) | resource | +| [kubernetes_role_binding_v1.pods](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/role_binding_v1) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [namespace](#input\_namespace) | Namespace for chart installation | `string` | n/a | yes | -| [release\_name](#input\_release\_name) | Name of the Helm release | `string` | n/a | yes | -| [tags](#input\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | -| [teleport\_config](#input\_teleport\_config) | Map of IAM roles to assume for teleport access, including EKS cluster ARNs and other roles. |
object({
teleport_iam_role_to_assume = string
})
| n/a | yes | ## Outputs -| Name | Description | -|------|-------------| -| [iam\_role\_arn](#output\_iam\_role\_arn) | n/a | +No outputs. diff --git a/modules/integrations/teleport/tenant/main.tf b/modules/integrations/teleport/tenant/main.tf index 76945d46..f3819aa8 100644 --- a/modules/integrations/teleport/tenant/main.tf +++ b/modules/integrations/teleport/tenant/main.tf @@ -1,4 +1,4 @@ -resource "kubernetes_cluster_role" "impersonate" { +resource "kubernetes_cluster_role_v1" "impersonate" { metadata { name = "teleport-${var.namespace}-impersonate" } @@ -16,7 +16,7 @@ resource "kubernetes_cluster_role" "impersonate" { } } -resource "kubernetes_cluster_role" "pods" { +resource "kubernetes_cluster_role_v1" "pods" { metadata { name = "teleport-${var.namespace}-pods" } @@ -29,7 +29,7 @@ resource "kubernetes_cluster_role" "pods" { } -resource "kubernetes_cluster_role_binding" "impersonate" { +resource "kubernetes_cluster_role_binding_v1" "impersonate" { metadata { name = "teleport-${var.namespace}-impersonate-binding" } @@ -48,7 +48,7 @@ resource "kubernetes_cluster_role_binding" "impersonate" { } } -resource "kubernetes_role_binding" "pods" { +resource "kubernetes_role_binding_v1" "pods" { metadata { name = "teleport-${var.namespace}-pods-binding" namespace = var.namespace diff --git a/modules/integrations/teleport/tenant/versions.tf b/modules/integrations/teleport/tenant/versions.tf index 5163401e..056af10a 100644 --- a/modules/integrations/teleport/tenant/versions.tf +++ b/modules/integrations/teleport/tenant/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.90" + version = ">= 6.25" } helm = { source = "hashicorp/helm" @@ -10,10 +10,10 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/integrations/teleport/variables.tf b/modules/integrations/teleport/variables.tf index e4213aa6..ae97a383 100644 --- a/modules/integrations/teleport/variables.tf +++ b/modules/integrations/teleport/variables.tf @@ -1,6 +1,6 @@ variable "aws_profile" { type = string - description = "AWS profile (i.e. generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { diff --git a/modules/integrations/teleport/versions.tf b/modules/integrations/teleport/versions.tf index 06035e89..056af10a 100644 --- a/modules/integrations/teleport/versions.tf +++ b/modules/integrations/teleport/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } helm = { source = "hashicorp/helm" @@ -10,10 +10,10 @@ terraform { } kubernetes = { source = "hashicorp/kubernetes" - version = ">= 2.36.0" + version = ">= 3.0" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/arc_deployment/README.md b/modules/platform/arc_deployment/README.md index 9d57f2b1..5df617f1 100644 --- a/modules/platform/arc_deployment/README.md +++ b/modules/platform/arc_deployment/README.md @@ -3,7 +3,7 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | +| [terraform](#requirement\_terraform) | ~> 1.11 | ## Providers @@ -32,6 +32,7 @@ No resources. | Name | Description | |------|-------------| -| [arc\_runners\_arn\_map](#output\_arc\_runners\_arn\_map) | n/a | -| [subnet\_cidr\_blocks](#output\_subnet\_cidr\_blocks) | n/a | +| [arc\_cluster\_name](#output\_arc\_cluster\_name) | Name of the Kubernetes cluster used for ARC runners. | +| [arc\_runners\_arn\_map](#output\_arc\_runners\_arn\_map) | Map of ARC runner keys to their IAM role ARNs. 
| +| [subnet\_cidr\_blocks](#output\_subnet\_cidr\_blocks) | Map of ARC runner subnet IDs to their CIDR blocks. | diff --git a/modules/platform/arc_deployment/main.tf b/modules/platform/arc_deployment/main.tf index 74e00565..b07c2440 100644 --- a/modules/platform/arc_deployment/main.tf +++ b/modules/platform/arc_deployment/main.tf @@ -22,7 +22,7 @@ module "arc" { release_name = "${var.runner_configs.prefix}-${key}" namespace = var.tenant_configs.name chart_name = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set" - chart_version = "0.13.0" + chart_version = "0.13.1" } runner_config = { runner_size = val.runner_size @@ -50,7 +50,7 @@ module "arc" { release_name = var.runner_configs.prefix namespace = var.tenant_configs.name chart_name = "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller" - chart_version = "0.13.0" + chart_version = "0.13.1" name = "${var.runner_configs.prefix}-gha-rs-controller" } diff --git a/modules/platform/arc_deployment/outputs.tf b/modules/platform/arc_deployment/outputs.tf index 8866d2e4..81d04b70 100644 --- a/modules/platform/arc_deployment/outputs.tf +++ b/modules/platform/arc_deployment/outputs.tf @@ -2,8 +2,15 @@ output "arc_runners_arn_map" { value = { for runner_key, runner in module.arc.runners_map : runner_key => runner.runner_role_arn } + description = "Map of ARC runner keys to their IAM role ARNs." } output "subnet_cidr_blocks" { - value = module.arc.subnet_cidr_blocks + value = module.arc.subnet_cidr_blocks + description = "Map of ARC runner subnet IDs to their CIDR blocks." +} + +output "arc_cluster_name" { + value = var.runner_configs.arc_cluster_name + description = "Name of the Kubernetes cluster used for ARC runners." 
} diff --git a/modules/platform/arc_deployment/versions.tf b/modules/platform/arc_deployment/versions.tf index 5a407547..c4822b74 100644 --- a/modules/platform/arc_deployment/versions.tf +++ b/modules/platform/arc_deployment/versions.tf @@ -1,4 +1,4 @@ terraform { # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/ec2_deployment/README.md b/modules/platform/ec2_deployment/README.md index 0f62868c..13165123 100644 --- a/modules/platform/ec2_deployment/README.md +++ b/modules/platform/ec2_deployment/README.md @@ -3,47 +3,36 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | +| [terraform](#requirement\_terraform) | ~> 1.11 | | [archive](#requirement\_archive) | >= 2.7.0 | -| [aws](#requirement\_aws) | >= 5.27 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [external](#provider\_external) | 2.3.5 | ## Modules | Name | Source | Version | |------|--------|---------| -| [runners](#module\_runners) | git::https://github.com/github-aws-runners/terraform-aws-github-runner.git//modules/multi-runner | v6.7.8 | -| [update\_ec2\_tags](#module\_update\_ec2\_tags) | terraform-aws-modules/lambda/aws | 8.1.0 | -| [update\_runner\_ami\_lambda](#module\_update\_runner\_ami\_lambda) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [ec2\_update\_runner\_ssm\_ami](#module\_ec2\_update\_runner\_ssm\_ami) | ./ec2_update_runner_ssm_ami | n/a | +| [ec2\_update\_runner\_tags](#module\_ec2\_update\_runner\_tags) | ./ec2_update_runner_tags | n/a | +| [runners](#module\_runners) | git::https://github.com/edersonbrilhante/terraform-aws-github-runner.git//modules/multi-runner | feat-macos-support | ## Resources | Name | Type | |------|------| -| 
[aws_cloudwatch_event_rule.update_ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | -| [aws_cloudwatch_event_rule.update_runner_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | -| [aws_cloudwatch_event_target.update_ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | -| [aws_cloudwatch_event_target.update_runner_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | -| [aws_cloudwatch_log_group.update_ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | -| [aws_cloudwatch_log_group.update_runner_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_iam_policy.ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_kms_alias.github](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_alias) | resource | | [aws_kms_key.github](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | -| [aws_lambda_permission.update_ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | -| [aws_lambda_permission.update_runner_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | -| [aws_security_group.gh_runner_egress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | [aws_security_group.gh_runner_lambda_egress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | | 
[aws_ami.runner_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source | | [aws_iam_policy_document.ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | -| [aws_iam_policy_document.update_ec2_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | -| [aws_iam_policy_document.update_runner_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_ssm_parameter.ami_id](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameter) | data source | | [aws_subnet.runner_subnet](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/subnet) | data source | | [external_external.download_lambdas](https://registry.terraform.io/providers/hashicorp/external/latest/docs/data-sources/external) | data source | @@ -53,17 +42,17 @@ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [aws\_region](#input\_aws\_region) | Assuming single region for now. | `string` | n/a | yes | -| [network\_configs](#input\_network\_configs) | n/a |
object({
vpc_id = string
subnet_ids = list(string)
lambda_subnet_ids = list(string)
})
| n/a | yes | -| [runner\_configs](#input\_runner\_configs) | n/a |
object({
env = string
prefix = string
ghes_url = string
ghes_org = string
log_level = string
logging_retention_in_days = string
github_app = object({
key_base64 = string
id = string
webhook_secret = string
})
runner_iam_role_managed_policy_arns = list(string)
runner_group_name = string
runner_specs = map(object({
ami_filter = object({
name = list(string)
state = list(string)
})
ami_kms_key_arn = string
ami_owners = list(string)
runner_labels = list(string)
runner_os = string
runner_architecture = string
extra_labels = list(string)
max_instances = number
min_run_time = number
instance_types = list(string)
pool_config = list(object({
size = number
schedule_expression = string
schedule_expression_timezone = string
}))
runner_user = string
enable_userdata = bool
instance_target_capacity_type = string
block_device_mappings = list(object({
delete_on_termination = bool
device_name = string
encrypted = bool
iops = number
kms_key_id = string
snapshot_id = string
throughput = number
volume_size = number
volume_type = string
}))
}))
})
| n/a | yes | +| [network\_configs](#input\_network\_configs) | n/a |
object({
vpc_id = string
subnet_ids = list(string)
lambda_vpc_id = string
lambda_subnet_ids = list(string)
})
| n/a | yes | +| [runner\_configs](#input\_runner\_configs) | n/a |
object({
env = string
prefix = string
ghes_url = string
ghes_org = string
log_level = string
logging_retention_in_days = string
github_app = object({
key_base64 = string
id = string
webhook_secret = string
})
runner_iam_role_managed_policy_arns = list(string)
runner_group_name = string
scale_errors = optional(list(string), [])
runner_specs = map(object({
ami_filter = object({
name = list(string)
state = list(string)
})
ami_kms_key_arn = string
ami_owners = list(string)
runner_labels = list(string)
runner_os = string
runner_architecture = string
extra_labels = list(string)
max_instances = number
min_run_time = number
instance_types = list(string)
license_specifications = optional(list(object({
license_configuration_arn = string
})), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(string)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
pool_config = list(object({
size = number
schedule_expression = string
schedule_expression_timezone = string
}))
runner_user = string
enable_userdata = bool
instance_target_capacity_type = string
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
block_device_mappings = list(object({
delete_on_termination = bool
device_name = string
encrypted = bool
iops = number
kms_key_id = string
snapshot_id = string
throughput = number
volume_size = number
volume_type = string
}))
}))
})
| n/a | yes | | [tenant\_configs](#input\_tenant\_configs) | n/a |
object({
ecr_registries = list(string)
tags = map(string)
})
| n/a | yes | ## Outputs | Name | Description | |------|-------------| -| [ec2\_runners\_ami\_name\_map](#output\_ec2\_runners\_ami\_name\_map) | n/a | -| [ec2\_runners\_arn\_map](#output\_ec2\_runners\_arn\_map) | n/a | -| [event\_bus\_name](#output\_event\_bus\_name) | n/a | -| [subnet\_cidr\_blocks](#output\_subnet\_cidr\_blocks) | n/a | -| [webhook\_endpoint](#output\_webhook\_endpoint) | n/a | +| [ec2\_runners\_ami\_name\_map](#output\_ec2\_runners\_ami\_name\_map) | Map of EC2 runner keys to the AMI names used for each runner. | +| [ec2\_runners\_arn\_map](#output\_ec2\_runners\_arn\_map) | Map of EC2 runner keys to their IAM role ARNs. | +| [event\_bus\_name](#output\_event\_bus\_name) | Name of the EventBridge event bus used by the webhook relay. | +| [subnet\_cidr\_blocks](#output\_subnet\_cidr\_blocks) | Map of EC2 runner subnet IDs to their CIDR blocks. | +| [webhook\_endpoint](#output\_webhook\_endpoint) | Public HTTPS endpoint URL for the GitHub Actions webhook relay. | diff --git a/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami.tf b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami.tf new file mode 100644 index 00000000..7cdfe919 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami.tf @@ -0,0 +1,23 @@ +module "ec2_update_runner_ssm_ami" { + source = "./ec2_update_runner_ssm_ami" + + providers = { + aws = aws + } + + prefix = var.runner_configs.prefix + logging_retention_in_days = var.runner_configs.logging_retention_in_days + log_level = var.runner_configs.log_level + tags = var.tenant_configs.tags + + runner_ami_map = { + for key in keys(var.runner_configs.runner_specs) : + key => { + resource_ssm_id = replace(module.runners.runners_map[key].launch_template_ami_id, "resolve:ssm:", "") + ssm_id = split("parameter", module.runners.runners_map[key].launch_template_ami_id)[1] + ami_filter = var.runner_configs.runner_specs[key].ami_filter + ami_owners = var.runner_configs.runner_specs[key].ami_owners + } + } + 
+} diff --git a/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/README.md b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/README.md new file mode 100644 index 00000000..3a4ab931 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/README.md @@ -0,0 +1,44 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 6.35.1 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [ec2\_update\_runner\_ssm\_ami\_lambda](#module\_ec2\_update\_runner\_ssm\_ami\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.ec2_update_runner_ssm_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.ec2_update_runner_ssm_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.ec2_update_runner_ssm_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_lambda_permission.ec2_update_runner_ssm_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.ec2_update_runner_ssm_ami_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | +| 
[logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | +| [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | +| [runner\_ami\_map](#input\_runner\_ami\_map) | n/a |
map(object({
resource_ssm_id = string
ssm_id = string
ami_filter = object({
name = list(string)
state = list(string)
})
ami_owners = list(string)
}))
| n/a | yes | +| [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | + +## Outputs + +No outputs. + diff --git a/modules/platform/ec2_deployment/lambda/update_ssm_ami_id.py b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/lambda/ec2_update_runner_ssm_ami.py similarity index 100% rename from modules/platform/ec2_deployment/lambda/update_ssm_ami_id.py rename to modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/lambda/ec2_update_runner_ssm_ami.py diff --git a/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/main.tf b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/main.tf new file mode 100644 index 00000000..4d8d88e9 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/main.tf @@ -0,0 +1,94 @@ +module "ec2_update_runner_ssm_ami_lambda" { + source = "terraform-aws-modules/lambda/aws" + version = "8.7.0" + + function_name = "${var.prefix}-ec2-update-runner-ssm-ami" + handler = "ec2_update_runner_ssm_ami.lambda_handler" + runtime = "python3.12" + timeout = 900 + architectures = ["x86_64"] + + source_path = [{ + path = "${path.module}/lambda" + }] + + logging_log_group = aws_cloudwatch_log_group.ec2_update_runner_ssm_ami_lambda.name + use_existing_cloudwatch_log_group = true + + trigger_on_package_timestamp = false + + environment_variables = { + RUNNER_AMI_MAP = jsonencode(var.runner_ami_map) + LOG_LEVEL = var.log_level + } + + attach_policy_json = true + + policy_json = data.aws_iam_policy_document.ec2_update_runner_ssm_ami_lambda.json + + function_tags = var.tags + role_tags = var.tags + tags = var.tags + + depends_on = [aws_cloudwatch_log_group.ec2_update_runner_ssm_ami_lambda] +} + +data "aws_iam_policy_document" "ec2_update_runner_ssm_ami_lambda" { + statement { + effect = "Allow" + actions = [ + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:PutParameter", + "ssm:AddTagsToResource" + ] + + resources = [ + for key, cfg in var.runner_ami_map : + cfg.resource_ssm_id 
+ ] + } + + statement { + effect = "Allow" + actions = [ + "ec2:DescribeImages", + ] + resources = ["*"] + } +} + +resource "aws_cloudwatch_log_group" "ec2_update_runner_ssm_ami_lambda" { + name = "/aws/lambda/${var.prefix}-ec2-update-runner-ssm-ami" + retention_in_days = var.logging_retention_in_days + tags = var.tags + tags_all = var.tags +} + +resource "aws_cloudwatch_event_rule" "ec2_update_runner_ssm_ami_lambda" { + name = "${var.prefix}-ec2-update-runner-ssm-ami" + description = "Trigger Lambda every 10 minutes" + schedule_expression = "cron(*/10 * * * ? *)" + + tags = var.tags + tags_all = var.tags + + depends_on = [module.ec2_update_runner_ssm_ami_lambda] +} + +resource "aws_cloudwatch_event_target" "ec2_update_runner_ssm_ami_lambda" { + rule = aws_cloudwatch_event_rule.ec2_update_runner_ssm_ami_lambda.name + arn = module.ec2_update_runner_ssm_ami_lambda.lambda_function_arn + + depends_on = [module.ec2_update_runner_ssm_ami_lambda] +} + +resource "aws_lambda_permission" "ec2_update_runner_ssm_ami_lambda" { + action = "lambda:InvokeFunction" + function_name = "${var.prefix}-ec2-update-runner-ssm-ami" + principal = "events.amazonaws.com" + statement_id = "AllowExecutionFromCloudWatch" + source_arn = aws_cloudwatch_event_rule.ec2_update_runner_ssm_ami_lambda.arn + + depends_on = [module.ec2_update_runner_ssm_ami_lambda] +} diff --git a/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/variables.tf b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/variables.tf new file mode 100644 index 00000000..1654e1de --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/variables.tf @@ -0,0 +1,34 @@ +variable "prefix" { + description = "Prefix for all resources" + type = string +} + +variable "tags" { + description = "Tags to apply to created resources." + type = map(string) + default = {} +} + +variable "logging_retention_in_days" { + description = "Retention in days for CloudWatch Log Group for the Lambdas." 
+ type = number + default = 30 +} + +variable "log_level" { + type = string + description = "Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR)" + default = "INFO" +} + +variable "runner_ami_map" { + type = map(object({ + resource_ssm_id = string + ssm_id = string + ami_filter = object({ + name = list(string) + state = list(string) + }) + ami_owners = list(string) + })) +} diff --git a/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/versions.tf b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/versions.tf new file mode 100644 index 00000000..7ce2660e --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_ssm_ami/versions.tf @@ -0,0 +1,11 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 6.25" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/platform/ec2_deployment/ec2_update_runner_tags.tf b/modules/platform/ec2_deployment/ec2_update_runner_tags.tf new file mode 100644 index 00000000..3c35f414 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_tags.tf @@ -0,0 +1,14 @@ +module "ec2_update_runner_tags" { + source = "./ec2_update_runner_tags" + + providers = { + aws = aws + } + + prefix = var.runner_configs.prefix + logging_retention_in_days = var.runner_configs.logging_retention_in_days + log_level = var.runner_configs.log_level + tags = var.tenant_configs.tags + + event_bus = module.runners.webhook.eventbridge.event_bus.name +} diff --git a/modules/platform/ec2_deployment/ec2_update_runner_tags/README.md b/modules/platform/ec2_deployment/ec2_update_runner_tags/README.md new file mode 100644 index 00000000..e6238c75 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_tags/README.md @@ -0,0 +1,44 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | + +## Providers + +| Name | Version | 
+|------|---------| +| [aws](#provider\_aws) | 6.35.1 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [ec2\_update\_runner\_tags\_lambda](#module\_ec2\_update\_runner\_tags\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.ec2_update_runner_tags_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.ec2_update_runner_tags_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.ec2_update_runner_tags_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_lambda_permission.ec2_update_runner_tags_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.ec2_update_runner_tags_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [event\_bus](#input\_event\_bus) | The name of the EventBridge event bus to subscribe to. | `string` | n/a | yes | +| [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | +| [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | +| [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | + +## Outputs + +No outputs. 
+ diff --git a/modules/platform/ec2_deployment/ec2_update_runner_tags/lambda/ec2_update_runner_tags.py b/modules/platform/ec2_deployment/ec2_update_runner_tags/lambda/ec2_update_runner_tags.py new file mode 100644 index 00000000..6e534b8f --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_tags/lambda/ec2_update_runner_tags.py @@ -0,0 +1,64 @@ +import json +import logging +import os + +import boto3 + +LOG = logging.getLogger() +level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() +LOG.setLevel(getattr(logging, level_str, logging.INFO)) + +ssm = boto3.client('ssm') +ec2 = boto3.client('ec2') + + +def lambda_handler(event, context): + try: + LOG.debug('Received event') + + if event.get('detail-type') != 'workflow_job': + LOG.info('Ignoring non-workflow_job event: %s', + event.get('detail-type')) + return {'statusCode': 200, 'body': json.dumps({'message': 'ignored event'})} + + detail = event.get('detail', {}) + + runner_name = (detail.get('workflow_job') or {}).get('runner_name') + if not runner_name: + LOG.error('runner_name missing in event detail: %s', detail) + raise ValueError('runner_name missing') + + if not (isinstance(runner_name, str) and runner_name.startswith('i-')): + LOG.info( + 'Runner name %s is not an EC2 instance ID, ignoring', runner_name) + return {'statusCode': 200, 'body': json.dumps({'message': 'ignored non-EC2 runner'})} + + LOG.info('Looking up EC2 instance by ID: %s', runner_name) + resp = ec2.describe_instances(InstanceIds=[runner_name]) + instance_ids = [inst['InstanceId'] for res in resp.get( + 'Reservations', []) for inst in res.get('Instances', [])] + + LOG.info('Described instances, found IDs: %s', instance_ids) + if not instance_ids: + LOG.info('No instances found with ID %s', runner_name) + return {'statusCode': 200, 'body': json.dumps({'message': 'no instances found'})} + + job_url = detail.get('workflow_job', {}).get('html_url', '') + job_id = str(detail.get('workflow_job', {}).get('id', '')) + LOG.info('GitHub
job URL: %s, job ID: %s', job_url, job_id) + + # Tag instances with the GitHub job ID and job URL + LOG.info( + 'Tagging instances %s with tags job_url, job_id', instance_ids) + ec2.create_tags(Resources=instance_ids, Tags=[ + {'Key': 'ghr:job_id', 'Value': job_id}, + {'Key': 'ghr:job_url', 'Value': job_url} + ]) + LOG.info('Successfully tagged instances: %s', instance_ids) + return {'statusCode': 200, 'body': json.dumps({'tagged_instances': instance_ids})} + except Exception as e: + LOG.exception( + 'Unhandled exception in ec2_update_runner_tags lambda. Error: %s', + str(e), + ) + raise diff --git a/modules/platform/ec2_deployment/ec2_update_runner_tags/main.tf b/modules/platform/ec2_deployment/ec2_update_runner_tags/main.tf new file mode 100644 index 00000000..4763ccc9 --- /dev/null +++ b/modules/platform/ec2_deployment/ec2_update_runner_tags/main.tf @@ -0,0 +1,101 @@ +module "ec2_update_runner_tags_lambda" { + source = "terraform-aws-modules/lambda/aws" + version = "8.7.0" + + function_name = "${var.prefix}-ec2-update-runner-tags" + handler = "ec2_update_runner_tags.lambda_handler" + runtime = "python3.12" + timeout = 900 + architectures = ["x86_64"] + + source_path = [{ + path = "${path.module}/lambda" + }] + + logging_log_group = aws_cloudwatch_log_group.ec2_update_runner_tags_lambda.name + use_existing_cloudwatch_log_group = true + + trigger_on_package_timestamp = false + + environment_variables = { + LOG_LEVEL = var.log_level + } + + + attach_policy_json = true + + policy_json = data.aws_iam_policy_document.ec2_update_runner_tags_lambda.json + + function_tags = var.tags + role_tags = var.tags + tags = var.tags + + depends_on = [aws_cloudwatch_log_group.ec2_update_runner_tags_lambda] +} + +data "aws_iam_policy_document" "ec2_update_runner_tags_lambda" { + + # Allow DescribeInstances without condition + statement { + effect = "Allow" + actions = ["ec2:DescribeInstances"] + resources = ["*"] + } + + # Allow tagging operations conditioned on environment tag +
statement { + effect = "Allow" + actions = [ + "ec2:CreateTags", + "ec2:DeleteTags" + ] + resources = ["*"] + + condition { + test = "StringLike" + variable = "ec2:ResourceTag/ghr:environment" + values = ["${var.prefix}-*"] + } + } +} + +resource "aws_cloudwatch_log_group" "ec2_update_runner_tags_lambda" { + name = "/aws/lambda/${var.prefix}-ec2-update-runner-tags" + retention_in_days = var.logging_retention_in_days + tags = var.tags + tags_all = var.tags +} + +resource "aws_lambda_permission" "ec2_update_runner_tags_lambda" { + action = "lambda:InvokeFunction" + function_name = "${var.prefix}-ec2-update-runner-tags" + principal = "events.amazonaws.com" + statement_id = "AllowExecutionFromCloudWatch" + source_arn = aws_cloudwatch_event_rule.ec2_update_runner_tags_lambda.arn + + depends_on = [module.ec2_update_runner_tags_lambda] +} + +resource "aws_cloudwatch_event_rule" "ec2_update_runner_tags_lambda" { + name = "${var.prefix}-ec2-update-runner-tags" + description = "Workflow job event rule to update EC2 tags." + event_bus_name = var.event_bus + + tags = var.tags + tags_all = var.tags + + event_pattern = < runner.role_runner.arn } + description = "Map of EC2 runner keys to their IAM role ARNs." } output "ec2_runners_ami_name_map" { value = { for runner_key, runner in module.runners.runners_map : runner_key => data.aws_ami.runner_ami[runner_key].name } + description = "Map of EC2 runner keys to the AMI names used for each runner." } output "subnet_cidr_blocks" { - value = { for id, subnet in data.aws_subnet.runner_subnet : id => subnet.cidr_block } + value = { for id, subnet in data.aws_subnet.runner_subnet : id => subnet.cidr_block } + description = "Map of EC2 runner subnet IDs to their CIDR blocks." } output "event_bus_name" { - value = module.runners.webhook.eventbridge.event_bus.name + value = module.runners.webhook.eventbridge.event_bus.name + description = "Name of the EventBridge event bus used by the webhook relay." 
} diff --git a/modules/platform/ec2_deployment/security_group.tf b/modules/platform/ec2_deployment/security_group.tf index 932c749d..1f1c74e0 100644 --- a/modules/platform/ec2_deployment/security_group.tf +++ b/modules/platform/ec2_deployment/security_group.tf @@ -1,7 +1,7 @@ # Allow the lambda to egress to any destination via any protocol. resource "aws_security_group" "gh_runner_lambda_egress" { name = "${var.runner_configs.prefix}-gh-runner-lambda-egress-all" - vpc_id = var.network_configs.vpc_id + vpc_id = var.network_configs.lambda_vpc_id egress { from_port = 0 @@ -19,25 +19,3 @@ resource "aws_security_group" "gh_runner_lambda_egress" { tags_all = var.tenant_configs.tags } - -# Policy for the runners (not the lambdas). -resource "aws_security_group" "gh_runner_egress" { - name = "${var.runner_configs.prefix}-gh-runner-lambda-egress" - vpc_id = var.network_configs.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = merge( - var.tenant_configs.tags, - { - Name = "${var.runner_configs.prefix}-gh-runner-lambda-egress" - } - ) - - tags_all = var.tenant_configs.tags -} diff --git a/modules/platform/ec2_deployment/template_files/user_data.tftpl b/modules/platform/ec2_deployment/template_files/user_data_linux.tftpl similarity index 100% rename from modules/platform/ec2_deployment/template_files/user_data.tftpl rename to modules/platform/ec2_deployment/template_files/user_data_linux.tftpl diff --git a/modules/platform/ec2_deployment/template_files/user_data_osx.tftpl b/modules/platform/ec2_deployment/template_files/user_data_osx.tftpl new file mode 100644 index 00000000..84f97620 --- /dev/null +++ b/modules/platform/ec2_deployment/template_files/user_data_osx.tftpl @@ -0,0 +1,27 @@ +#!/bin/bash + +set +x +%{ if enable_debug_logging } +set -x +%{ endif } + +# Just a dummy value (user-supplied string; does nothing). +${pre_install} + +# Set defaults for macOS EC2 instances. 
+# Default user on EC2 macOS AMIs is typically "ec2-user". +user_name=ec2-user + +touch .env +chmod 0644 .env + +grep -q "^PATH=" .env || echo "PATH=$PATH" >> .env + +# Configure CloudWatch logging agent if installed and configured for macOS. +if command -v /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl >/dev/null 2>&1; then + /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c ssm:${ssm_key_cloudwatch_agent_config} || true +fi + +cd /opt/actions-runner + +${start_runner} diff --git a/modules/platform/ec2_deployment/template_files/user_data_windows.tftpl b/modules/platform/ec2_deployment/template_files/user_data_windows.tftpl new file mode 100644 index 00000000..0cc35b11 --- /dev/null +++ b/modules/platform/ec2_deployment/template_files/user_data_windows.tftpl @@ -0,0 +1,23 @@ + + +$logPath = 'C:\UserData.log' + +Start-Transcript -Path $logPath -Append + +$ssm = Get-Service AmazonSSMAgent -ErrorAction SilentlyContinue +if ($ssm) { + Set-Service AmazonSSMAgent -StartupType Automatic + if ($ssm.Status -ne 'Running') { + Start-Service AmazonSSMAgent + } +} + +${pre_install} + +Set-Location C:\actions-runner + +${start_runner} + +Stop-Transcript + + diff --git a/modules/platform/ec2_deployment/update-ec2-tags.tf b/modules/platform/ec2_deployment/update-ec2-tags.tf deleted file mode 100644 index 3f08f967..00000000 --- a/modules/platform/ec2_deployment/update-ec2-tags.tf +++ /dev/null @@ -1,101 +0,0 @@ -module "update_ec2_tags" { - source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" - - function_name = "${var.runner_configs.prefix}-update-ec2-tags" - handler = "update_ec2_tags.lambda_handler" - runtime = "python3.12" - timeout = 900 - architectures = ["x86_64"] - - source_path = [{ - path = "${path.module}/lambda" - }] - - logging_log_group = aws_cloudwatch_log_group.update_ec2_tags.name - use_existing_cloudwatch_log_group = true - - trigger_on_package_timestamp = false - - environment_variables = { 
- LOG_LEVEL = var.runner_configs.log_level - } - - - attach_policy_json = true - - policy_json = data.aws_iam_policy_document.update_ec2_tags.json - - function_tags = var.tenant_configs.tags - role_tags = var.tenant_configs.tags - tags = var.tenant_configs.tags - - depends_on = [aws_cloudwatch_log_group.update_ec2_tags] -} - -data "aws_iam_policy_document" "update_ec2_tags" { - - # Allow DescribeInstances without condition - statement { - effect = "Allow" - actions = ["ec2:DescribeInstances"] - resources = ["*"] - } - - # Allow tagging operations conditioned on environment tag - statement { - effect = "Allow" - actions = [ - "ec2:CreateTags", - "ec2:DeleteTags" - ] - resources = ["*"] - - condition { - test = "StringLike" - variable = "ec2:ResourceTag/ghr:environment" - values = ["${var.runner_configs.prefix}-*"] - } - } -} - -resource "aws_cloudwatch_log_group" "update_ec2_tags" { - name = "/aws/lambda/${var.runner_configs.prefix}-update-ec2-tags" - retention_in_days = var.runner_configs.logging_retention_in_days - tags = var.tenant_configs.tags - tags_all = var.tenant_configs.tags -} - -resource "aws_lambda_permission" "update_ec2_tags" { - action = "lambda:InvokeFunction" - function_name = "${var.runner_configs.prefix}-update-ec2-tags" - principal = "events.amazonaws.com" - statement_id = "AllowExecutionFromCloudWatch" - source_arn = aws_cloudwatch_event_rule.update_ec2_tags.arn - - depends_on = [module.update_ec2_tags] -} - -resource "aws_cloudwatch_event_rule" "update_ec2_tags" { - name = "${var.runner_configs.prefix}-update-ec2-tags" - description = "Workflow job event rule to update EC2 tags." 
- event_bus_name = module.runners.webhook.eventbridge.event_bus.name - - tags = var.tenant_configs.tags - tags_all = var.tenant_configs.tags - - event_pattern = < { - ssm_id = split("parameter", module.runners.runners_map[key].launch_template_ami_id)[1] - ami_filter = var.runner_configs.runner_specs[key].ami_filter - ami_owners = var.runner_configs.runner_specs[key].ami_owners - } - } - - runner_ami_map_json = jsonencode(local.runner_ami_map) -} - -module "update_runner_ami_lambda" { - source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" - - function_name = "${var.runner_configs.prefix}-update-runner-ami" - handler = "update_ssm_ami_id.lambda_handler" - runtime = "python3.12" - timeout = 900 - architectures = ["x86_64"] - - source_path = [{ - path = "${path.module}/lambda" - }] - - logging_log_group = aws_cloudwatch_log_group.update_runner_ami_lambda.name - use_existing_cloudwatch_log_group = true - - trigger_on_package_timestamp = false - - environment_variables = { - RUNNER_AMI_MAP = local.runner_ami_map_json - LOG_LEVEL = var.runner_configs.log_level - } - - attach_policy_json = true - - policy_json = data.aws_iam_policy_document.update_runner_ami_lambda.json - - function_tags = var.tenant_configs.tags - role_tags = var.tenant_configs.tags - tags = var.tenant_configs.tags - - depends_on = [aws_cloudwatch_log_group.update_runner_ami_lambda] -} - -data "aws_iam_policy_document" "update_runner_ami_lambda" { - statement { - effect = "Allow" - actions = [ - "ssm:GetParameter", - "ssm:GetParameters", - "ssm:PutParameter", - "ssm:AddTagsToResource" - ] - - resources = [ - for key in keys(var.runner_configs.runner_specs) : - replace(module.runners.runners_map[key].launch_template_ami_id, "resolve:ssm:", "") - ] - } - - statement { - effect = "Allow" - actions = [ - "ec2:DescribeImages" - ] - resources = ["*"] - } -} - -resource "aws_cloudwatch_log_group" "update_runner_ami_lambda" { - name = "/aws/lambda/${var.runner_configs.prefix}-update-runner-ami" - 
retention_in_days = var.runner_configs.logging_retention_in_days - tags = var.tenant_configs.tags - tags_all = var.tenant_configs.tags -} - -resource "aws_cloudwatch_event_rule" "update_runner_ami_lambda" { - name = "${var.runner_configs.prefix}-update-runner-ami" - description = "Trigger Lambda every 10 minutes" - schedule_expression = "cron(*/10 * * * ? *)" - - tags = var.tenant_configs.tags - tags_all = var.tenant_configs.tags - - depends_on = [module.update_runner_ami_lambda] -} - -resource "aws_cloudwatch_event_target" "update_runner_ami_lambda" { - rule = aws_cloudwatch_event_rule.update_runner_ami_lambda.name - arn = module.update_runner_ami_lambda.lambda_function_arn - - depends_on = [module.update_runner_ami_lambda] -} - -resource "aws_lambda_permission" "update_runner_ami_lambda" { - action = "lambda:InvokeFunction" - function_name = "${var.runner_configs.prefix}-update-runner-ami" - principal = "events.amazonaws.com" - statement_id = "AllowExecutionFromCloudWatch" - source_arn = aws_cloudwatch_event_rule.update_runner_ami_lambda.arn - - depends_on = [module.update_runner_ami_lambda] -} diff --git a/modules/platform/ec2_deployment/variables.tf b/modules/platform/ec2_deployment/variables.tf index 2e1983b1..3831e0f2 100644 --- a/modules/platform/ec2_deployment/variables.tf +++ b/modules/platform/ec2_deployment/variables.tf @@ -18,6 +18,7 @@ variable "runner_configs" { }) runner_iam_role_managed_policy_arns = list(string) runner_group_name = string + scale_errors = optional(list(string), []) runner_specs = map(object({ ami_filter = object({ name = list(string) @@ -32,6 +33,20 @@ variable "runner_configs" { max_instances = number min_run_time = number instance_types = list(string) + license_specifications = optional(list(object({ + license_configuration_arn = string + })), null) + placement = optional(object({ + affinity = optional(string) + availability_zone = optional(string) + group_id = optional(string) + group_name = optional(string) + host_id = 
optional(string) + host_resource_group_arn = optional(string) + spread_domain = optional(string) + tenancy = optional(string) + partition_number = optional(number) + }), null) pool_config = list(object({ size = number schedule_expression = string @@ -40,6 +55,8 @@ variable "runner_configs" { runner_user = string enable_userdata = bool instance_target_capacity_type = string + vpc_id = optional(string, null) + subnet_ids = optional(list(string), null) block_device_mappings = list(object({ delete_on_termination = bool device_name = string @@ -59,6 +76,7 @@ variable "network_configs" { type = object({ vpc_id = string subnet_ids = list(string) + lambda_vpc_id = string lambda_subnet_ids = list(string) }) } diff --git a/modules/platform/ec2_deployment/versions.tf b/modules/platform/ec2_deployment/versions.tf index f0429d46..dc020a24 100644 --- a/modules/platform/ec2_deployment/versions.tf +++ b/modules/platform/ec2_deployment/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } external = { source = "hashicorp/external" @@ -15,5 +15,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/forge_runners/README.md b/modules/platform/forge_runners/README.md index a951d979..93b741a0 100644 --- a/modules/platform/forge_runners/README.md +++ b/modules/platform/forge_runners/README.md @@ -3,9 +3,9 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | +| [terraform](#requirement\_terraform) | ~> 1.11 | | [archive](#requirement\_archive) | >= 2.7.0 | -| [aws](#requirement\_aws) | >= 5.27 | +| [aws](#requirement\_aws) | >= 6.25 | | [external](#requirement\_external) | >= 2.3 | | [local](#requirement\_local) | >= 2.5 | | [null](#requirement\_null) | >= 3.2 | @@ -16,9 +16,9 @@ | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | | [null](#provider\_null) | 3.2.4 | -| [random](#provider\_random) | 3.7.2 | +| [random](#provider\_random) | 3.8.1 | | [time](#provider\_time) | 0.13.1 | ## Modules @@ -27,10 +27,12 @@ |------|--------|---------| | [arc\_runners](#module\_arc\_runners) | ../arc_deployment | n/a | | [ec2\_runners](#module\_ec2\_runners) | ../ec2_deployment | n/a | +| [forge\_trust\_validator](#module\_forge\_trust\_validator) | ./forge_trust_validator | n/a | | [github\_actions\_job\_logs](#module\_github\_actions\_job\_logs) | ./github_actions_job_logs | n/a | | [github\_app\_runner\_group](#module\_github\_app\_runner\_group) | ./github_app_runner_group | n/a | | [github\_global\_lock](#module\_github\_global\_lock) | ./github_global_lock | n/a | | [github\_webhook\_relay](#module\_github\_webhook\_relay) | ./github_webhook_relay | n/a | +| [redrive\_deadletter](#module\_redrive\_deadletter) | ./redrive_deadletter | n/a | ## Resources @@ -38,44 +40,37 @@ |------|------| | [aws_iam_policy.ecr_access_for_ec2_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | 
[aws_iam_policy.role_assumption_for_forge_runners](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | -| [aws_secretsmanager_secret.cicd_secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | -| [aws_secretsmanager_secret_version.cicd_secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | | [aws_servicecatalogappregistry_application.forge](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/servicecatalogappregistry_application) | resource | +| [aws_ssm_parameter.github_app_client_id](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_ssm_parameter.github_app_id](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_ssm_parameter.github_app_installation_id](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_ssm_parameter.github_app_key](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_ssm_parameter.github_app_name](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_ssm_parameter.github_app_webhook_secret](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | | [null_resource.update_github_app_webhook](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [random_id.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | -| [time_sleep.wait_60_seconds](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| 
[random_password.github_app_webhook_secret](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | +| [time_rotating.every_30_days](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/rotating) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.ecr_access_for_ec2_instances](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.role_assumption_for_forge_runners](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | -| [aws_secretsmanager_random_password.secret_seeds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_random_password) | data source | -| [aws_secretsmanager_secret.data_cicd_secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | -| [aws_secretsmanager_secret_version.data_cicd_secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | +| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | +| [aws_ssm_parameter.github_app_key](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameter) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [arc\_cluster\_name](#input\_arc\_cluster\_name) | Name of the EKS cluster | `string` | n/a | yes | -| [arc\_runner\_specs](#input\_arc\_runner\_specs) | Map of runner specifications |
map(object({
runner_size = object({
max_runners = number
min_runners = number
})
scale_set_name = string
scale_set_type = string
container_actions_runner = string
container_limits_cpu = string
container_limits_memory = string
container_requests_cpu = string
container_requests_memory = string
volume_requests_storage_size = string
volume_requests_storage_type = string
}))
| n/a | yes | -| [aws\_account\_id](#input\_aws\_account\_id) | AWS account ID (not SL AWS account ID) associated with the infra/backend. | `string` | n/a | yes | -| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e. generated via 'sl aws session generate') to use. | `string` | n/a | yes | -| [aws\_region](#input\_aws\_region) | Assuming single region for now. | `string` | n/a | yes | +| [arc\_deployment\_specs](#input\_arc\_deployment\_specs) | Deployment configuration for Actions Runner Controller (ARC) runners.

Top-level fields:
- cluster\_name : Name of the EKS cluster used for ARC runners.
- migrate\_cluster: Optional flag to indicate a one-time migration or
blue/green cutover of the ARC runner cluster.
- runner\_specs : Map of ARC runner pool keys to their sizing and
container resource settings.

runner\_specs[*] object fields:
- runner\_size.max\_runners: Maximum concurrent ARC runners for this pool.
- runner\_size.min\_runners: Minimum number of warm runners.
- scale\_set\_name : Logical name for the scale set / pool.
- scale\_set\_type : Backing type for the scale set (for example,
kubernetes or containerapp, depending on integration).
- container\_actions\_runner : Container image used for the ARC runner.
- container\_limits\_cpu : CPU limit for the runner container.
- container\_limits\_memory : Memory limit for the runner container.
- container\_requests\_cpu : CPU request (baseline reservation).
- container\_requests\_memory : Memory request (baseline reservation).
- volume\_requests\_storage\_size: Size of attached storage for the runner.
- volume\_requests\_storage\_type: Storage class or type for attached volume. |
object({
cluster_name = string
migrate_cluster = optional(bool, false)
runner_specs = map(object({
runner_size = object({
max_runners = number
min_runners = number
})
scale_set_name = string
scale_set_type = string
container_actions_runner = string
container_limits_cpu = string
container_limits_memory = string
container_requests_cpu = string
container_requests_memory = string
volume_requests_storage_size = string
volume_requests_storage_type = string
}))
})
| n/a | yes | +| [aws\_profile](#input\_aws\_profile) | AWS profile to use. | `string` | n/a | yes | +| [aws\_region](#input\_aws\_region) | AWS region where Forge runners and supporting infrastructure are deployed. | `string` | n/a | yes | | [default\_tags](#input\_default\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | -| [deployment\_config](#input\_deployment\_config) | Prefix for the deployment, used to distinguish resources. |
object({
prefix = string
secret_suffix = string
})
| n/a | yes | -| [ec2\_runner\_specs](#input\_ec2\_runner\_specs) | Map of runner specifications |
map(object({
ami_filter = object({
name = list(string)
state = list(string)
})
ami_kms_key_arn = string
ami_owners = list(string)
runner_labels = list(string)
runner_os = string
runner_architecture = string
extra_labels = list(string)
max_instances = number
min_run_time = number
instance_types = list(string)
pool_config = list(object({
size = number
schedule_expression = string
schedule_expression_timezone = string
}))
runner_user = string
enable_userdata = bool
instance_target_capacity_type = string
block_device_mappings = list(object({
delete_on_termination = bool
device_name = string
encrypted = bool
iops = number
kms_key_id = string
snapshot_id = string
throughput = number
volume_size = number
volume_type = string
}))
}))
| n/a | yes | -| [env](#input\_env) | Deployment environments. | `string` | n/a | yes | -| [ghes\_org](#input\_ghes\_org) | GitHub organization. | `string` | n/a | yes | -| [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. | `string` | n/a | yes | +| [deployment\_config](#input\_deployment\_config) | High-level deployment configuration for a Forge runner installation.

Top-level fields:
- deployment\_prefix: Prefix used when naming resources (for example,
log groups, KMS keys, and SSM parameters).
- env : Logical environment name (for example, dev, stage,
prod). Used for tagging and dashboards.

github\_app object:
- id : Numeric GitHub App ID.
- client\_id : OAuth client ID for the app.
- installation\_id: GitHub App installation ID for this tenant.
- name : GitHub App name, used to build URLs and logs.

github object:
- ghes\_org : GitHub organization that owns the repos where
runners will be used.
- ghes\_url : GitHub.com or GHES base URL. Empty string implies
public github.com.
- repository\_selection: Scope for runners (all or selected repositories).
- runner\_group\_name : GitHub runner group to attach new runners to.

tenant object:
- name : Tenant identifier used in naming and
tagging.
- iam\_roles\_to\_assume : Optional list of IAM role ARNs that
runners are allowed to assume for workload execution.
- ecr\_registries : Optional list of ECR registry URLs that
runners may need to pull images from.
- github\_logs\_reader\_role\_arns: Optional list of IAM roles that can read
GitHub Actions logs for this tenant. |
object({
deployment_prefix = string
secret_suffix = string
env = string
github_app = object({
id = string
client_id = string
installation_id = string
name = string
})
github = object({
ghes_org = string
ghes_url = string
repository_selection = string
runner_group_name = string
})
tenant = object({
name = string
iam_roles_to_assume = optional(list(string), [])
ecr_registries = optional(list(string), [])
github_logs_reader_role_arns = optional(list(string), [])
})
})
| n/a | yes | +| [ec2\_deployment\_specs](#input\_ec2\_deployment\_specs) | EC2 deployment configuration for GitHub Actions runners.

Top-level fields:
- lambda\_subnet\_ids: Subnets where runner-related lambdas execute.
These can be more permissive than the runner subnets.
- subnet\_ids : Subnets where the EC2 runners are launched.
- vpc\_id : VPC that contains both runner and lambda subnets.
- runner\_specs : Map of runner pool keys to their EC2 sizing and
scheduling configuration.

runner\_specs[*] object fields:
- ami\_filter : Name/state filters used to select the runner AMI.
- ami\_kms\_key\_arn : KMS key ARN used to encrypt AMI EBS volumes.
- ami\_owners : List of AWS account IDs that own the AMI.
- runner\_labels : Base GitHub labels applied to jobs for this pool.
- runner\_os : Runner operating system (for example, linux).
- runner\_architecture: CPU architecture (for example, x86\_64 or arm64).
- extra\_labels : Additional GitHub labels that further specialize
this runner pool.
- max\_instances : Maximum number of EC2 runners in this pool.
- min\_run\_time : Minimum job run time (in minutes) before a runner
is eligible for scale-down.
- instance\_types : Allowed EC2 instance types for runners in this pool.
- pool\_config : List of pool size schedules (size + cron expression
and optional time zone) controlling baseline capacity.
- runner\_user : OS user under which the GitHub runner process runs.
- enable\_userdata : Whether the module should inject its standard
userdata to configure the runner VM.
- instance\_target\_capacity\_type: EC2 capacity type to use (spot or
on-demand).
- block\_device\_mappings: EBS volume configuration for the runner
instances, including size, type, encryption, and KMS. |
object({
lambda_subnet_ids = list(string)
subnet_ids = list(string)
lambda_vpc_id = string
vpc_id = string
scale_errors = optional(list(string), [])
runner_specs = map(object({
ami_filter = object({
name = list(string)
state = list(string)
})
ami_kms_key_arn = string
ami_owners = list(string)
runner_labels = list(string)
runner_os = string
runner_architecture = string
extra_labels = list(string)
max_instances = number
min_run_time = number
instance_types = list(string)
license_specifications = optional(list(object({
license_configuration_arn = string
})), null)
placement = optional(object({
affinity = optional(string)
availability_zone = optional(string)
group_id = optional(string)
group_name = optional(string)
host_id = optional(string)
host_resource_group_arn = optional(string)
spread_domain = optional(string)
tenancy = optional(string)
partition_number = optional(number)
}), null)
pool_config = list(object({
size = number
schedule_expression = string
schedule_expression_timezone = string
}))
runner_user = string
enable_userdata = bool
instance_target_capacity_type = string
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
block_device_mappings = list(object({
delete_on_termination = bool
device_name = string
encrypted = bool
iops = number
kms_key_id = string
snapshot_id = string
throughput = number
volume_size = number
volume_type = string
}))
}))
})
| n/a | yes | | [github\_webhook\_relay](#input\_github\_webhook\_relay) | Configuration for the (optional) webhook relay source module.
If enabled=true we provision the API Gateway + source EventBridge forwarding rule.
destination\_event\_bus\_name must already exist or be created in the destination account (or via the destination submodule run there). |
object({
enabled = bool
destination_account_id = optional(string)
destination_event_bus_name = optional(string)
destination_region = optional(string)
destination_reader_role_arn = optional(string)
})
|
{
"destination_account_id": "",
"destination_event_bus_name": "",
"destination_reader_role_arn": "",
"destination_region": "",
"enabled": false
}
| no | -| [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | So the lambdas can run in our pre-determined subnets. They don't require the same security policy as the runners though. | `list(string)` | n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | n/a | yes | | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Logging retention period in days. | `string` | n/a | yes | -| [migrate\_arc\_cluster](#input\_migrate\_arc\_cluster) | Flag to indicate if the cluster is being migrated. | `bool` | `false` | no | -| [repository\_selection](#input\_repository\_selection) | Repository selection type. | `string` | n/a | yes | -| [runner\_group\_name](#input\_runner\_group\_name) | Name of the group applied to all runners. | `string` | n/a | yes | -| [subnet\_ids](#input\_subnet\_ids) | Subnet(s) in which our runners will be deployed. Supplied by the underlying AWS-based CI/CD stack. | `list(string)` | n/a | yes | | [tags](#input\_tags) | A map of tags to apply to resources. | `map(string)` | n/a | yes | -| [tenant](#input\_tenant) | Map of tenant configs |
object({
name = string
iam_roles_to_assume = optional(list(string), [])
ecr_registries = optional(list(string), [])
github_logs_reader_role_arns = optional(list(string), [])
})
| n/a | yes | -| [vpc\_id](#input\_vpc\_id) | VPC in which our runners will be deployed. Supplied by the underlying AWS-based CI/CD stack. | `string` | n/a | yes | ## Outputs diff --git a/modules/platform/forge_runners/arc_runners.tf b/modules/platform/forge_runners/arc_runners.tf index 6a614233..b5bd44e3 100644 --- a/modules/platform/forge_runners/arc_runners.tf +++ b/modules/platform/forge_runners/arc_runners.tf @@ -8,25 +8,24 @@ module "arc_runners" { aws_region = var.aws_region tenant_configs = { - ecr_registries = var.tenant.ecr_registries - name = var.tenant.name + ecr_registries = var.deployment_config.tenant.ecr_registries + name = var.deployment_config.tenant.name tags = local.all_security_tags } runner_configs = { - arc_cluster_name = var.arc_cluster_name - migrate_arc_cluster = var.migrate_arc_cluster - prefix = var.deployment_config.prefix - ghes_url = var.ghes_url - ghes_org = var.ghes_org - log_level = var.log_level + arc_cluster_name = var.arc_deployment_specs.cluster_name + migrate_arc_cluster = var.arc_deployment_specs.migrate_cluster + prefix = var.deployment_config.deployment_prefix + ghes_url = var.deployment_config.github.ghes_url + ghes_org = var.deployment_config.github.ghes_org runner_iam_role_managed_policy_arns = local.runner_iam_role_managed_policy_arns github_app = { - key_base64 = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_key"].secret_string - id = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_id"].secret_string - installation_id = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_installation_id"].secret_string + key_base64 = data.aws_ssm_parameter.github_app_key.value + id = var.deployment_config.github_app.id + installation_id = var.deployment_config.github_app.installation_id } - runner_group_name = var.runner_group_name - runner_specs 
= var.arc_runner_specs + runner_group_name = var.deployment_config.github.runner_group_name + runner_specs = var.arc_deployment_specs.runner_specs } } diff --git a/modules/platform/forge_runners/data.tf b/modules/platform/forge_runners/data.tf new file mode 100644 index 00000000..f4693af5 --- /dev/null +++ b/modules/platform/forge_runners/data.tf @@ -0,0 +1,3 @@ +data "aws_caller_identity" "current" {} + +data "aws_region" "current" {} diff --git a/modules/platform/forge_runners/ec2_runners.tf b/modules/platform/forge_runners/ec2_runners.tf index 4dbdc503..009c0af6 100644 --- a/modules/platform/forge_runners/ec2_runners.tf +++ b/modules/platform/forge_runners/ec2_runners.tf @@ -2,42 +2,48 @@ # For generating a webhook secret. Apparently this is a cryptographically secure # PRNG. resource "random_id" "random" { - count = length(var.ec2_runner_specs) > 0 ? 1 : 0 + count = length(var.ec2_deployment_specs.runner_specs) > 0 ? 1 : 0 byte_length = 20 } module "ec2_runners" { - count = length(var.ec2_runner_specs) > 0 ? 1 : 0 + count = length(var.ec2_deployment_specs.runner_specs) > 0 ? 1 : 0 # Using multi-runner example as a baseline. 
source = "../ec2_deployment" aws_region = var.aws_region + providers = { + aws = aws + } + network_configs = { - vpc_id = var.vpc_id - subnet_ids = var.subnet_ids - lambda_subnet_ids = var.lambda_subnet_ids + vpc_id = var.ec2_deployment_specs.vpc_id + subnet_ids = var.ec2_deployment_specs.subnet_ids + lambda_subnet_ids = var.ec2_deployment_specs.lambda_subnet_ids + lambda_vpc_id = var.ec2_deployment_specs.lambda_vpc_id } tenant_configs = { - ecr_registries = var.tenant.ecr_registries + ecr_registries = var.deployment_config.tenant.ecr_registries tags = local.all_security_tags } runner_configs = { - env = var.env - prefix = var.deployment_config.prefix - ghes_url = var.ghes_url - ghes_org = var.ghes_org + env = var.deployment_config.env + prefix = var.deployment_config.deployment_prefix + ghes_url = var.deployment_config.github.ghes_url + ghes_org = var.deployment_config.github.ghes_org log_level = var.log_level logging_retention_in_days = var.logging_retention_in_days runner_iam_role_managed_policy_arns = local.runner_iam_role_managed_policy_arns github_app = { - key_base64 = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_key"].secret_string - id = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_id"].secret_string - webhook_secret = random_id.random[0].hex + key_base64 = data.aws_ssm_parameter.github_app_key.value + id = var.deployment_config.github_app.id + webhook_secret = aws_ssm_parameter.github_app_webhook_secret.value } - runner_group_name = var.runner_group_name - runner_specs = var.ec2_runner_specs + runner_group_name = var.deployment_config.github.runner_group_name + scale_errors = var.ec2_deployment_specs.scale_errors + runner_specs = var.ec2_deployment_specs.runner_specs } } diff --git a/modules/platform/forge_runners/forge_trust_validator.tf b/modules/platform/forge_runners/forge_trust_validator.tf new file mode 100644 index 
00000000..42c3981c --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator.tf @@ -0,0 +1,28 @@ +module "forge_trust_validator" { + count = length(var.deployment_config.tenant.iam_roles_to_assume) > 0 ? 1 : 0 + source = "./forge_trust_validator" + + providers = { + aws = aws + } + + aws_profile = var.aws_profile + prefix = var.deployment_config.deployment_prefix + logging_retention_in_days = var.logging_retention_in_days + log_level = var.log_level + tags = local.all_security_tags + + forge_iam_roles = { + for idx, arn in values(merge( + try(module.ec2_runners[0].ec2_runners_arn_map, {}), + try(module.arc_runners.arc_runners_arn_map, {}), + )) : + idx => arn + } + number_forge_iram_roles = ( + length(var.ec2_deployment_specs.runner_specs) + + length(var.arc_deployment_specs.runner_specs) + ) + + tenant_iam_roles = var.deployment_config.tenant.iam_roles_to_assume +} diff --git a/modules/platform/forge_runners/forge_trust_validator/README.md b/modules/platform/forge_runners/forge_trust_validator/README.md new file mode 100644 index 00000000..59aa56c0 --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/README.md @@ -0,0 +1,52 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | +| [null](#requirement\_null) | >= 3.2 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 6.35.1 | +| [null](#provider\_null) | 3.2.4 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [forge\_trust\_validator\_lambda](#module\_forge\_trust\_validator\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.forge_trust_validator_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| 
[aws_cloudwatch_event_target.forge_trust_validator_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.forge_trust_validator_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_lambda_permission.forge_trust_validator_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [null_resource.update_forge_role_trust](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.forge_trust_validator_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_role.forge](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_role) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_profile](#input\_aws\_profile) | AWS profile (i.e. generated via 'sl aws session generate') to use. | `string` | n/a | yes | +| [forge\_iam\_roles](#input\_forge\_iam\_roles) | List of IAM role ARNs for Forge runners. | `map(string)` | n/a | yes | +| [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. 
| `number` | `30` | no | +| [number\_forge\_iram\_roles](#input\_number\_forge\_iram\_roles) | Number of Iam roles ARNs for Forge runners | `number` | n/a | yes | +| [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | +| [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | +| [tenant\_iam\_roles](#input\_tenant\_iam\_roles) | List of IAM role ARNs that the runners will assume to test trust relationships. | `list(string)` | `[]` | no | + +## Outputs + +No outputs. + diff --git a/modules/platform/forge_runners/forge_trust_validator/data.tf b/modules/platform/forge_runners/forge_trust_validator/data.tf new file mode 100644 index 00000000..8fc4b38c --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/data.tf @@ -0,0 +1 @@ +data "aws_caller_identity" "current" {} diff --git a/modules/platform/forge_runners/forge_trust_validator/forge_roles.tf b/modules/platform/forge_runners/forge_trust_validator/forge_roles.tf new file mode 100644 index 00000000..8620ef40 --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/forge_roles.tf @@ -0,0 +1,124 @@ +data "aws_iam_role" "forge" { + count = var.number_forge_iram_roles + + name = regex("([^/]+)$", var.forge_iam_roles[count.index])[0] + + depends_on = [module.forge_trust_validator_lambda] +} + +locals { + + # Statement we want to add to EVERY forge IAM role + lambda_trust_statement = { + Sid = "AllowLambdaValidationAssume" + Effect = "Allow" + Principal = { + AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${var.prefix}-forge-trust-validator" + } + Action = "sts:AssumeRole" + } + + # original_trust[arn] = decoded assume_role_policy JSON for each role + original_trust = { + for idx, role in data.aws_iam_role.forge : + var.forge_iam_roles[idx] => jsondecode(role.assume_role_policy) + } + + # original_statements[arn] = existing Statements list (or []) + original_statements = { + for arn, trust in 
local.original_trust : + arn => try(trust.Statement, []) + } + + # updated_statements[arn]: ensure exactly one statement with this Sid + updated_statements = { + for arn, stmts in local.original_statements : + arn => concat( + [ + for s in stmts : + s if !(can(s.Sid) && s.Sid == local.lambda_trust_statement.Sid) + ], + [local.lambda_trust_statement] + ) + } + + # concatenated_trust_object[arn] = full updated policy for each role; Version is taken from the role's original trust document (the statement list itself carries no Version), defaulting to 2012-10-17 + concatenated_trust_object = { + for arn, stmts in local.updated_statements : + arn => { + Version = try(local.original_trust[arn].Version, "2012-10-17") + Statement = stmts + } + } + + # concatenated_trust_json[arn] = final JSON string for each role + concatenated_trust_json = { + for arn, obj in local.concatenated_trust_object : + arn => jsonencode(obj) + } + + original_statements_trust_json = { + for arn, obj in local.original_statements : + arn => jsonencode(obj) + } +} + +resource "null_resource" "update_forge_role_trust" { + count = var.number_forge_iram_roles + + triggers = { + role_name = data.aws_iam_role.forge[count.index].id + original_policy = jsonencode(local.original_statements_trust_json[var.forge_iam_roles[count.index]]) + future_policy = jsonencode(local.concatenated_trust_json[var.forge_iam_roles[count.index]]) + } + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + + command = <<-EOT + set -euo pipefail + + retry_with_backoff() { + local max_attempts=10 + local attempt=1 + local delay=2 + + while true; do + # Capture stderr so we can inspect it + if output=$(aws iam update-assume-role-policy \ + --role-name "$${ROLE_NAME}" \ + --policy-document "file://$${TMP_FILE}" \ + --profile "${var.aws_profile}" 2>&1); then + echo "$output" + return 0 + fi + + # If it's not a throttling / rate exceeded error, fail fast + if ! 
echo "$output" | grep -q "Rate exceeded"; then + echo "$output" >&2 + return 1 + fi + + # Throttling: if we've hit max attempts, print and fail + if [ "$attempt" -ge "$max_attempts" ]; then + echo "$output" >&2 + return 1 + fi + + sleep "$delay" + attempt=$((attempt + 1)) + delay=$((delay * 2)) + done + } + + ROLE_NAME="${data.aws_iam_role.forge[count.index].name}" + TMP_FILE="/tmp/${data.aws_iam_role.forge[count.index].name}-trust.json" + + cat > "$${TMP_FILE}" << 'JSON' +${local.concatenated_trust_json[var.forge_iam_roles[count.index]]} +JSON + + retry_with_backoff + EOT + } +} diff --git a/modules/platform/forge_runners/forge_trust_validator/lambda/forge_trust_validator.py b/modules/platform/forge_runners/forge_trust_validator/lambda/forge_trust_validator.py new file mode 100644 index 00000000..7a483017 --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/lambda/forge_trust_validator.py @@ -0,0 +1,245 @@ +import json +import logging +import os +from typing import Any, Dict, List + +import boto3 +from botocore.exceptions import ClientError + +sts = boto3.client('sts') + +LOG = logging.getLogger() +level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() +LOG.setLevel(getattr(logging, level_str, logging.INFO)) + + +def parse_env_list(name: str) -> List[str]: + """ + Parse an environment variable Comma-separated string: 'a,b' + """ + LOG.info(f"Parsing environment variable: {name}") + value = os.environ.get(name, '') + if not value: + LOG.warning(f"Environment variable {name} is empty or missing") + return [] + + items = [v.strip() for v in value.split(',') if v.strip()] + LOG.info(f"Parsed {len(items)} items from {name}") + return items + + +def build_session_policy_for_tenants(tenant_role_arns: List[str]) -> str: + """ + Restrictive inline session policy: only allow sts:AssumeRole on tenant roles. + This means the forge-role session can't do anything else. 
+ """ + policy = { + 'Version': '2012-10-17', + 'Statement': [ + { + 'Sid': 'AllowAssumeTenantRolesForValidation', + 'Effect': 'Allow', + 'Action': [ + 'sts:AssumeRole', + 'sts:TagSession', + ], + 'Resource': tenant_role_arns, + } + ], + } + return json.dumps(policy) + + +def assume_role( + role_arn: str, + session_name: str, + session_policy: str | None = None, +) -> Dict[str, Any]: + """ + Wrapper around sts.assume_role that optionally applies a restrictive session policy. + """ + LOG.info( + f"Attempting to assume role: {role_arn} (Session: {session_name})") + kwargs: Dict[str, Any] = { + 'RoleArn': role_arn, + 'RoleSessionName': session_name, + 'DurationSeconds': 900, # 15 minutes is plenty for validation + } + if session_policy: + kwargs['Policy'] = session_policy + + return sts.assume_role(**kwargs) + + +def build_sts_client_from_creds(creds: Dict[str, Any]): + """ + Given STS credentials from assume_role, build an STS client using them. + """ + return boto3.client( + 'sts', + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + ) + + +def validate_forge_role_against_tenants( + forge_role_arn: str, + tenant_role_arns: List[str], +) -> Dict[str, Any]: + """ + For a single Forge role: + - assume forge role (with restrictive session policy) + - using that session, try to assume each tenant role + - return per-tenant results + Assumes: + - Lambda execution role already has sts:AssumeRole on forge_role_arn + - Forge role trust already allows the Lambda execution role + """ + LOG.info(f"Starting validation for Forge role: {forge_role_arn}") + result: Dict[str, Any] = { + 'forge_role_arn': forge_role_arn, + 'tenant_results': [], + 'errors': [], + } + + try: + # 1) Assume the Forge role with a restrictive policy + session_policy = build_session_policy_for_tenants(tenant_role_arns) + forge_assume_resp = assume_role( + role_arn=forge_role_arn, + session_name='ForgeValidation', + 
session_policy=session_policy, + ) + LOG.info(f"Successfully assumed Forge role: {forge_role_arn}") + + forge_creds = forge_assume_resp['Credentials'] + sts_as_forge = build_sts_client_from_creds(forge_creds) + + # 2) From the forge session, attempt to assume each tenant role + for tenant_arn in tenant_role_arns: + LOG.info( + f"Attempting to assume Tenant role: {tenant_arn} from Forge role: {forge_role_arn}") + tenant_entry = { + 'tenant_role_arn': tenant_arn, + 'assume_role_success': False, + 'assume_role_error': None, + 'tag_session_success': False, + 'tag_session_error': None + } + + # --- Test 1: Basic AssumeRole (no tags) --- + try: + sts_as_forge.assume_role( + RoleArn=tenant_arn, + RoleSessionName='TenantValidation-Basic', + ) + LOG.info(f"Basic AssumeRole successful for {tenant_arn}") + tenant_entry['assume_role_success'] = True + except ClientError as e: + LOG.error(f"Basic AssumeRole failed for {tenant_arn}: {e}") + tenant_entry['assume_role_error'] = str(e) + except Exception as e: + LOG.error( + f"Unexpected error in Basic AssumeRole for {tenant_arn}: {e}") + tenant_entry['assume_role_error'] = f"Unexpected error: {e}" + + # --- Test 2: AssumeRole WITH Tags (only if basic succeeded) --- + if tenant_entry['assume_role_success']: + try: + tenant_resp = sts_as_forge.assume_role( + RoleArn=tenant_arn, + RoleSessionName='TenantValidation-Tags', + Tags=[ + {'Key': 'CreatedBy', 'Value': 'ForgeTrustValidator'}, + {'Key': 'Validation', 'Value': 'True'} + ] + ) + + tenant_creds = tenant_resp['Credentials'] + sts_as_tenant = boto3.client( + 'sts', + aws_access_key_id=tenant_creds['AccessKeyId'], + aws_secret_access_key=tenant_creds['SecretAccessKey'], + aws_session_token=tenant_creds['SessionToken'], + ) + identity = sts_as_tenant.get_caller_identity() + LOG.info( + f"AssumeRole WITH Tags successful for {tenant_arn}. 
Identity: {identity['Arn']}") + + tenant_entry['tag_session_success'] = True + except ClientError as e: + LOG.error( + f"AssumeRole WITH Tags failed for {tenant_arn}: {e}") + tenant_entry['tag_session_error'] = str(e) + except Exception as e: + LOG.error( + f"Unexpected error in AssumeRole WITH Tags for {tenant_arn}: {e}") + tenant_entry['tag_session_error'] = f"Unexpected error: {e}" + else: + tenant_entry['tag_session_error'] = 'Skipped because basic AssumeRole failed' + + result['tenant_results'].append(tenant_entry) + + except ClientError as e: + LOG.error(f"IAM/STS error for Forge role {forge_role_arn}: {e}") + result['errors'].append( + f"IAM/STS error for forge role {forge_role_arn}: {e}" + ) + except Exception as e: + LOG.error(f"Unexpected error for Forge role {forge_role_arn}: {e}") + result['errors'].append( + f"Unexpected error for forge role {forge_role_arn}: {e}" + ) + + return result + + +def lambda_handler(event, context): + """ + Main Lambda entrypoint. + + Configuration: + - FORGE_IAM_ROLES: env var, comma-separated forge role ARNs (JSON is not parsed) + - TENANT_IAM_ROLES: env var, comma-separated tenant role ARNs (JSON is not parsed) + + Example: + FORGE_IAM_ROLES="arn:aws:iam::123:role/forge-1,arn:aws:iam::123:role/forge-2" + TENANT_IAM_ROLES="arn:aws:iam::456:role/tenant-1,arn:aws:iam::789:role/tenant-2" + """ + try: + LOG.info('Lambda handler started') + forge_role_arns = parse_env_list('FORGE_IAM_ROLES') + tenant_role_arns = parse_env_list('TENANT_IAM_ROLES') + + if not forge_role_arns or not tenant_role_arns: + msg = ( + 'Missing forge_role_arns or tenant_role_arns ' + '(check env variables FORGE_IAM_ROLES and TENANT_IAM_ROLES).' 
+ ) + LOG.error( + 'Missing required environment variables: FORGE_IAM_ROLES or TENANT_IAM_ROLES', + ) + raise RuntimeError(msg) + + LOG.info( + f"Loaded configuration: {len(forge_role_arns)} Forge roles, {len(tenant_role_arns)} Tenant roles") + all_results: List[Dict[str, Any]] = [] + + for forge_role_arn in forge_role_arns: + res = validate_forge_role_against_tenants( + forge_role_arn=forge_role_arn, + tenant_role_arns=tenant_role_arns, + ) + all_results.append(res) + + LOG.info('Validation complete: %s', json.dumps(all_results)) + + return { + 'statusCode': 200, + 'body': json.dumps(all_results), + } + except Exception as e: + LOG.exception( + f'Unhandled exception in forge_trust_validator lambda. Error: {str(e)}') + raise diff --git a/modules/platform/forge_runners/forge_trust_validator/main.tf b/modules/platform/forge_runners/forge_trust_validator/main.tf new file mode 100644 index 00000000..2c7cc083 --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/main.tf @@ -0,0 +1,82 @@ +module "forge_trust_validator_lambda" { + source = "terraform-aws-modules/lambda/aws" + version = "8.7.0" + + function_name = "${var.prefix}-forge-trust-validator" + handler = "forge_trust_validator.lambda_handler" + runtime = "python3.12" + timeout = 900 + architectures = ["x86_64"] + + source_path = [{ + path = "${path.module}/lambda" + }] + + logging_log_group = aws_cloudwatch_log_group.forge_trust_validator_lambda.name + use_existing_cloudwatch_log_group = true + + trigger_on_package_timestamp = false + + environment_variables = { + FORGE_IAM_ROLES = join(",", [for key, arn in var.forge_iam_roles : arn]) + TENANT_IAM_ROLES = join(",", var.tenant_iam_roles) + LOG_LEVEL = var.log_level + } + + attach_policy_json = true + + policy_json = data.aws_iam_policy_document.forge_trust_validator_lambda.json + + function_tags = var.tags + role_tags = var.tags + tags = var.tags + + depends_on = [aws_cloudwatch_log_group.forge_trust_validator_lambda] +} + +data 
"aws_iam_policy_document" "forge_trust_validator_lambda" { + statement { + # Least privilege: the validator Lambda only calls sts:AssumeRole on the Forge roles. + # Trust policies are rewritten by the local-exec provisioner, so no iam:* access is granted here. + actions = [ + "sts:AssumeRole", + ] + effect = "Allow" + resources = [for key, arn in var.forge_iam_roles : arn] + } +} + +resource "aws_cloudwatch_log_group" "forge_trust_validator_lambda" { + name = "/aws/lambda/${var.prefix}-forge-trust-validator" + retention_in_days = var.logging_retention_in_days + tags = var.tags + tags_all = var.tags +} + +resource "aws_cloudwatch_event_rule" "forge_trust_validator_lambda" { + name = "${var.prefix}-forge-trust-validator" + description = "Trigger Lambda every 10 minutes" + schedule_expression = "cron(*/10 * * * ? *)" + + tags = var.tags + tags_all = var.tags + + depends_on = [module.forge_trust_validator_lambda] +} + +resource "aws_cloudwatch_event_target" "forge_trust_validator_lambda" { + rule = aws_cloudwatch_event_rule.forge_trust_validator_lambda.name + arn = module.forge_trust_validator_lambda.lambda_function_arn + + depends_on = [module.forge_trust_validator_lambda] +} + +resource "aws_lambda_permission" "forge_trust_validator_lambda" { + action = "lambda:InvokeFunction" + function_name = "${var.prefix}-forge-trust-validator" + principal = "events.amazonaws.com" + statement_id = "AllowExecutionFromCloudWatch" + source_arn = aws_cloudwatch_event_rule.forge_trust_validator_lambda.arn + + depends_on = [module.forge_trust_validator_lambda] +} diff --git a/modules/platform/forge_runners/forge_trust_validator/variables.tf b/modules/platform/forge_runners/forge_trust_validator/variables.tf new file mode 100644 index 00000000..de321229 --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/variables.tf @@ -0,0 +1,43 @@ +variable "aws_profile" { + type = string + description = "AWS profile (i.e. generated via 'sl aws session generate') to use." 
+} + +variable "prefix" { + description = "Prefix for all resources" + type = string +} + +variable "tags" { + description = "Tags to apply to created resources." + type = map(string) + default = {} +} + +variable "logging_retention_in_days" { + description = "Retention in days for CloudWatch Log Group for the Lambdas." + type = number + default = 30 +} + +variable "log_level" { + type = string + description = "Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR)" + default = "INFO" +} + +variable "forge_iam_roles" { + type = map(string) + description = "List of IAM role ARNs for Forge runners." +} + +variable "number_forge_iram_roles" { + type = number + description = "Number of Iam roles ARNs for Forge runners" +} + +variable "tenant_iam_roles" { + type = list(string) + description = "List of IAM role ARNs that the runners will assume to test trust relationships." + default = [] +} diff --git a/modules/platform/forge_runners/forge_trust_validator/versions.tf b/modules/platform/forge_runners/forge_trust_validator/versions.tf new file mode 100644 index 00000000..7fcd170f --- /dev/null +++ b/modules/platform/forge_runners/forge_trust_validator/versions.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 6.25" + } + null = { + source = "hashicorp/null" + version = ">= 3.2" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/platform/forge_runners/github_actions_job_log.tf b/modules/platform/forge_runners/github_actions_job_log.tf index 9bf67d04..5a31aba6 100644 --- a/modules/platform/forge_runners/github_actions_job_log.tf +++ b/modules/platform/forge_runners/github_actions_job_log.tf @@ -1,19 +1,28 @@ module "github_actions_job_logs" { - count = length(var.ec2_runner_specs) > 0 ? 1 : 0 + count = length(var.ec2_deployment_specs.runner_specs) > 0 ? 
1 : 0 source = "./github_actions_job_logs" providers = { aws = aws } - prefix = var.deployment_config.prefix - secrets_prefix = local.cicd_secrets_prefix - shared_role_arns = var.tenant.github_logs_reader_role_arns + prefix = var.deployment_config.deployment_prefix + github_app = { + key_base64_ssm = { + arn = aws_ssm_parameter.github_app_key.arn + } + id_ssm = { + arn = aws_ssm_parameter.github_app_id.arn + } + installation_id_ssm = { + arn = aws_ssm_parameter.github_app_installation_id.arn + } + } + shared_role_arns = var.deployment_config.tenant.github_logs_reader_role_arns logging_retention_in_days = var.logging_retention_in_days log_level = var.log_level tags = local.all_security_tags event_bus_name = module.ec2_runners[0].event_bus_name - ghes_url = var.ghes_url + ghes_url = var.deployment_config.github.ghes_url - depends_on = [data.aws_secretsmanager_secret_version.data_cicd_secrets] } diff --git a/modules/platform/forge_runners/github_actions_job_logs/README.md b/modules/platform/forge_runners/github_actions_job_logs/README.md index 9f539840..fbdf2772 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/README.md +++ b/modules/platform/forge_runners/github_actions_job_logs/README.md @@ -123,21 +123,21 @@ See parent repository license. 
| Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [job\_log\_archiver](#module\_job\_log\_archiver) | terraform-aws-modules/lambda/aws | 8.1.0 | -| [job\_log\_dispatcher](#module\_job\_log\_dispatcher) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [job\_log\_archiver](#module\_job\_log\_archiver) | terraform-aws-modules/lambda/aws | 8.7.0 | +| [job\_log\_dispatcher](#module\_job\_log\_dispatcher) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources @@ -170,8 +170,6 @@ See parent repository license. | [aws_iam_policy_document.job_log_archiver](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.job_log_dispatcher](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | -| [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | -| [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | ## Inputs @@ -179,10 +177,10 @@ See parent repository license. |------|-------------|------|---------|:--------:| | [event\_bus\_name](#input\_event\_bus\_name) | Name of the EventBridge event bus to listen for workflow job events. | `string` | n/a | yes | | [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. 
| `string` | `""` | no | +| [github\_app](#input\_github\_app) | GitHub App configuration |
object({
key_base64_ssm = object({
arn = string
})
id_ssm = object({
arn = string
})
installation_id_ssm = object({
arn = string
})
})
| n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | | [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | -| [secrets\_prefix](#input\_secrets\_prefix) | Prefix for all secrets | `string` | n/a | yes | | [shared\_role\_arns](#input\_shared\_role\_arns) | Optional list of consumer identifier to IAM Role ARN granted read/list on tenant's github job logs. | `list(string)` | `[]` | no | | [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | diff --git a/modules/platform/forge_runners/github_actions_job_logs/job_log_archiver.tf b/modules/platform/forge_runners/github_actions_job_logs/job_log_archiver.tf index 0d585e48..d0c6e56e 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/job_log_archiver.tf +++ b/modules/platform/forge_runners/github_actions_job_logs/job_log_archiver.tf @@ -5,11 +5,12 @@ locals { module "job_log_archiver" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = local.resource_name_archiver handler = "job_log_archiver.lambda_handler" runtime = "python3.12" + memory_size = 1024 timeout = 900 architectures = ["x86_64"] @@ -30,9 +31,9 @@ module "job_log_archiver" { environment_variables = { GITHUB_API = local.github_api - SECRET_NAME_APP_ID = local.secrets["github_actions_runners_app_id"].name - SECRET_NAME_PRIVATE_KEY = local.secrets["github_actions_runners_app_key"].name - SECRET_NAME_INSTALLATION_ID = local.secrets["github_actions_runners_app_installation_id"].name + SECRET_NAME_APP_ID = var.github_app.id_ssm.arn + SECRET_NAME_PRIVATE_KEY = var.github_app.key_base64_ssm.arn + SECRET_NAME_INSTALLATION_ID = var.github_app.installation_id_ssm.arn BUCKET_NAME = 
aws_s3_bucket.gh_logs.id KMS_KEY_ARN = aws_kms_key.gh_logs.arn LOG_LEVEL = var.log_level @@ -82,14 +83,18 @@ data "aws_iam_policy_document" "job_log_archiver" { statement { effect = "Allow" + actions = [ - "secretsmanager:GetSecretValue", - "secretsmanager:DescribeSecret", + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:GetParameterHistory", + "ssm:DescribeParameters", ] + resources = [ - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_key"].arn, - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_id"].arn, - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_installation_id"].arn, + var.github_app.id_ssm.arn, + var.github_app.key_base64_ssm.arn, + var.github_app.installation_id_ssm.arn, ] } diff --git a/modules/platform/forge_runners/github_actions_job_logs/job_log_dispatcher.tf b/modules/platform/forge_runners/github_actions_job_logs/job_log_dispatcher.tf index bdb85959..ce863004 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/job_log_dispatcher.tf +++ b/modules/platform/forge_runners/github_actions_job_logs/job_log_dispatcher.tf @@ -4,7 +4,7 @@ locals { module "job_log_dispatcher" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = local.resource_name_dispatcher handler = "job_log_dispatcher.lambda_handler" diff --git a/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_archiver/job_log_archiver.py b/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_archiver/job_log_archiver.py index fb3578d9..208c4fc9 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_archiver/job_log_archiver.py +++ b/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_archiver/job_log_archiver.py @@ -15,7 +15,7 @@ level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() LOG.setLevel(getattr(logging, level_str, logging.INFO)) -SECRETS = 
boto3.client('secretsmanager') +SSM = boto3.client('ssm') S3 = boto3.client('s3') MAX_S3_TAGS = 10 @@ -23,11 +23,9 @@ GITHUB_RETRY_DELAY = 2 -def _get_secret_value(secret_id: str) -> str: - resp = SECRETS.get_secret_value(SecretId=secret_id) - if 'SecretString' in resp: - return resp['SecretString'] - return base64.b64decode(resp['SecretBinary']).decode() +def _get_secret_value(parameter_name: str) -> str: + resp = SSM.get_parameter(Name=parameter_name, WithDecryption=True) + return resp['Parameter']['Value'] def _generate_jwt(app_id: str, private_key_pem: str) -> str: @@ -157,63 +155,81 @@ def _tags(wf: Dict[str, Any]) -> Dict[str, str]: def lambda_handler(event: Dict[str, Any], _context: Any) -> Dict[str, Any]: # pragma: no cover - LOG.debug('Event: %s', json.dumps(event)) try: - gh_event, workflow_job = _parse_event(event) - except ValueError: - LOG.error('Invalid JSON in SQS body') - return {'status': 'error', 'error': 'invalid_json'} - - detail = gh_event.get('detail', {}) - if detail.get('action') != 'completed' or not workflow_job: - return {'status': 'ignored'} - - repo_full_name = (detail.get('repository') or {}).get('full_name') - if not repo_full_name: - return {'status': 'error', 'error': 'missing_repository'} - - try: - env = _get_env() - except RuntimeError as e: - LOG.error(str(e)) - return {'status': 'error', 'error': 'missing_env'} - - owner, repo = repo_full_name.split('/', 1) - job_id = workflow_job.get('id') - run_id = workflow_job.get('run_id') - runner_name = workflow_job.get('runner_name') - run_attempt = workflow_job.get('run_attempt') - workflow_name = workflow_job.get('workflow_name') + LOG.debug('Event: %s', json.dumps(event)) + try: + gh_event, workflow_job = _parse_event(event) + except Exception as e: + raise ValueError(f'invalid_json Error: {e}') from e + + detail = gh_event.get('detail', {}) + if detail.get('action') != 'completed' or not workflow_job: + LOG.info( + 'Event action is not completed or workflow_job is missing, 
ignoring.') + return {'status': 'ignored'} + + conclusion = workflow_job.get('conclusion') + + if conclusion in ('skipped', 'cancelled'): + LOG.info( + 'Job conclusion is %s, skipping log archival. Workflow job: %s', + conclusion, + workflow_job, + ) + return {'status': 'ignored'} + + repo_full_name = (detail.get('repository') or {}).get('full_name') + if not repo_full_name: + LOG.info( + 'Missing repository full_name in event detail. Detail event: %s', detail) + raise ValueError('missing_repository') - if not all([runner_name, run_id, job_id]): - return {'status': 'error', 'error': 'missing_ids'} + try: + env = _get_env() + except Exception as e: + raise ValueError(f'missing_env. Error: {e}') from e + + owner, repo = repo_full_name.split('/', 1) + job_id = workflow_job.get('id') + run_id = workflow_job.get('run_id') + runner_name = workflow_job.get('runner_name') + run_attempt = workflow_job.get('run_attempt') + workflow_name = workflow_job.get('workflow_name') + + if not all([runner_name, run_id, job_id]): + LOG.info('Missing required IDs: runner_name=%s run_id=%s job_id=%s. 
Workflow job: %s', + runner_name, run_id, job_id, workflow_job) + raise ValueError('missing_ids') - try: - install_token = _github_auth( - env['SECRET_NAME_APP_ID'], env['SECRET_NAME_PRIVATE_KEY'], env['SECRET_NAME_INSTALLATION_ID'], env['GITHUB_API'] - ) - _, log_key, event_key = _keys( - repo_full_name, run_id, run_attempt, job_id) - obj_tags = _tags(workflow_job) - body = _download_job_logs(owner, repo, int( - job_id), install_token, env['GITHUB_API']) - _put_log_object(env['BUCKET_NAME'], log_key, body, - env['KMS_KEY_ARN'], obj_tags) - size = len(body) - _put_json_object(env['BUCKET_NAME'], event_key, - detail, env['KMS_KEY_ARN'], obj_tags) - - return { - 'status': 'ok', - 'job_id': job_id, - 'run_id': run_id, - 'run_attempt': run_attempt, - 'workflow_name': workflow_name, - 'repository': repo_full_name, - 'log_key': log_key, - 'event_key': event_key, - 'size': size - } - except Exception: - LOG.exception('archive_failed job_id=%s run_id=%s', job_id, run_id) - return {'status': 'error', 'job_id': job_id, 'run_id': run_id, 'error': 'see logs'} + try: + install_token = _github_auth( + env['SECRET_NAME_APP_ID'], env['SECRET_NAME_PRIVATE_KEY'], env['SECRET_NAME_INSTALLATION_ID'], env['GITHUB_API'] + ) + _, log_key, event_key = _keys( + repo_full_name, run_id, run_attempt, job_id) + obj_tags = _tags(workflow_job) + body = _download_job_logs(owner, repo, int( + job_id), install_token, env['GITHUB_API']) + _put_log_object(env['BUCKET_NAME'], log_key, body, + env['KMS_KEY_ARN'], obj_tags) + size = len(body) + _put_json_object(env['BUCKET_NAME'], event_key, + detail, env['KMS_KEY_ARN'], obj_tags) + + return { + 'status': 'ok', + 'job_id': job_id, + 'run_id': run_id, + 'run_attempt': run_attempt, + 'workflow_name': workflow_name, + 'repository': repo_full_name, + 'log_key': log_key, + 'event_key': event_key, + 'size': size + } + except Exception as e: + raise ValueError( + f'archiver_error: job_id={job_id} run_id={run_id}. 
Error: {e}') from e + except Exception as e: + LOG.exception( + 'Unhandled exception in job_log_archiver lambda. Error: %s', str(e)) diff --git a/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_dispatcher/job_log_dispatcher.py b/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_dispatcher/job_log_dispatcher.py index f665bfe7..46f988ab 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_dispatcher/job_log_dispatcher.py +++ b/modules/platform/forge_runners/github_actions_job_logs/lambda/job_log_dispatcher/job_log_dispatcher.py @@ -15,39 +15,44 @@ def lambda_handler(event, context): - LOG.debug('Received event') - - if event.get('detail-type') != 'workflow_job': - LOG.info('Ignoring non-workflow_job event: %s', - event.get('detail-type')) - return {'statusCode': 200, 'body': json.dumps({'message': 'ignored event'})} - sqs.send_message(QueueUrl=QUEUE_URL, MessageBody=json.dumps(event)) - - detail = event.get('detail', {}) - workflow_job = detail.get('workflow_job', {}) - repo = detail.get('repository', {}).get('full_name') - - payload = { - 'repository': repo, - 'job_id': workflow_job.get('id'), - 'run_id': workflow_job.get('run_id'), - 'workflow': workflow_job.get('workflow_name'), - 'attempt': workflow_job.get('run_attempt', 1), - 'job_name': workflow_job.get('name'), - 'status': workflow_job.get('status'), - 'conclusion': workflow_job.get('conclusion'), - 'branch': workflow_job.get('head_branch'), - 'sha': (workflow_job.get('head_sha') or '')[:12], - 'labels': workflow_job.get('labels', []), - 'action': detail.get('action'), - } - - LOG.info( - 'Enqueued workflow_job action=%s repo=%s job_id=%s run_id=%s workflow=%s job_name=%s status=%s conclusion=%s attempt=%s branch=%s sha=%s labels=%s', - payload['action'], payload['repository'], payload['job_id'], payload['run_id'], - payload['workflow'], payload['job_name'], payload['status'], payload['conclusion'], - payload['attempt'], 
payload['branch'], payload['sha'], ','.join( - payload['labels']) - ) - - return {'enqueued': True} + try: + LOG.debug('Received event') + + if event.get('detail-type') != 'workflow_job': + LOG.info('Ignoring non-workflow_job event: %s', + event.get('detail-type')) + return {'statusCode': 200, 'body': json.dumps({'message': 'ignored event'})} + sqs.send_message(QueueUrl=QUEUE_URL, MessageBody=json.dumps(event)) + + detail = event.get('detail', {}) + workflow_job = detail.get('workflow_job', {}) + repo = detail.get('repository', {}).get('full_name') + + payload = { + 'repository': repo, + 'job_id': workflow_job.get('id'), + 'run_id': workflow_job.get('run_id'), + 'workflow': workflow_job.get('workflow_name'), + 'attempt': workflow_job.get('run_attempt', 1), + 'job_name': workflow_job.get('name'), + 'status': workflow_job.get('status'), + 'conclusion': workflow_job.get('conclusion'), + 'branch': workflow_job.get('head_branch'), + 'sha': (workflow_job.get('head_sha') or '')[:12], + 'labels': workflow_job.get('labels', []), + 'action': detail.get('action'), + } + + LOG.info( + 'Enqueued workflow_job action=%s repo=%s job_id=%s run_id=%s workflow=%s job_name=%s status=%s conclusion=%s attempt=%s branch=%s sha=%s labels=%s', + payload['action'], payload['repository'], payload['job_id'], payload['run_id'], + payload['workflow'], payload['job_name'], payload['status'], payload['conclusion'], + payload['attempt'], payload['branch'], payload['sha'], ','.join( + payload['labels']) + ) + + return {'enqueued': True} + except Exception as e: + LOG.exception( + f'Unhandled exception in job_log_dispatcher lambda. 
Error: {str(e)}') + raise diff --git a/modules/platform/forge_runners/github_actions_job_logs/s3.tf b/modules/platform/forge_runners/github_actions_job_logs/s3.tf index 5192baa1..a1e35415 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/s3.tf +++ b/modules/platform/forge_runners/github_actions_job_logs/s3.tf @@ -1,7 +1,6 @@ resource "aws_s3_bucket" "gh_logs" { - bucket = "${var.prefix}-forge-gh-logs-${data.aws_caller_identity.current.account_id}" - tags = var.tags - tags_all = var.tags + bucket = "${var.prefix}-forge-gh-logs-${data.aws_caller_identity.current.account_id}" + tags = var.tags } resource "aws_s3_bucket_ownership_controls" "gh_logs" { @@ -80,6 +79,8 @@ resource "aws_s3_bucket_public_access_block" "gh_logs" { block_public_policy = true ignore_public_acls = true restrict_public_buckets = true + + skip_destroy = true } diff --git a/modules/platform/forge_runners/github_actions_job_logs/secrets.tf b/modules/platform/forge_runners/github_actions_job_logs/secrets.tf deleted file mode 100644 index 75be70cf..00000000 --- a/modules/platform/forge_runners/github_actions_job_logs/secrets.tf +++ /dev/null @@ -1,23 +0,0 @@ -locals { - secrets = { - github_actions_runners_app_key = { - name = "${var.secrets_prefix}github_actions_runners_app_key" - } - github_actions_runners_app_installation_id = { - name = "${var.secrets_prefix}github_actions_runners_app_installation_id" - } - github_actions_runners_app_id = { - name = "${var.secrets_prefix}github_actions_runners_app_id" - } - } -} - -data "aws_secretsmanager_secret" "secrets" { - for_each = local.secrets - name = each.value.name -} - -data "aws_secretsmanager_secret_version" "secrets" { - for_each = data.aws_secretsmanager_secret.secrets - secret_id = each.value.id -} diff --git a/modules/platform/forge_runners/github_actions_job_logs/variables.tf b/modules/platform/forge_runners/github_actions_job_logs/variables.tf index cdd28c87..c6159d06 100644 --- 
a/modules/platform/forge_runners/github_actions_job_logs/variables.tf +++ b/modules/platform/forge_runners/github_actions_job_logs/variables.tf @@ -38,7 +38,17 @@ variable "prefix" { type = string } -variable "secrets_prefix" { - description = "Prefix for all secrets" - type = string +variable "github_app" { + description = "GitHub App configuration" + type = object({ + key_base64_ssm = object({ + arn = string + }) + id_ssm = object({ + arn = string + }) + installation_id_ssm = object({ + arn = string + }) + }) } diff --git a/modules/platform/forge_runners/github_actions_job_logs/versions.tf b/modules/platform/forge_runners/github_actions_job_logs/versions.tf index d193e844..7ce2660e 100644 --- a/modules/platform/forge_runners/github_actions_job_logs/versions.tf +++ b/modules/platform/forge_runners/github_actions_job_logs/versions.tf @@ -2,10 +2,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/forge_runners/github_app_runner_group.tf b/modules/platform/forge_runners/github_app_runner_group.tf index 52abb701..7d6cebbe 100644 --- a/modules/platform/forge_runners/github_app_runner_group.tf +++ b/modules/platform/forge_runners/github_app_runner_group.tf @@ -5,15 +5,24 @@ module "github_app_runner_group" { aws = aws } - prefix = var.deployment_config.prefix - secrets_prefix = local.cicd_secrets_prefix + prefix = var.deployment_config.deployment_prefix + github_app = { + key_base64_ssm = { + arn = aws_ssm_parameter.github_app_key.arn + } + id_ssm = { + arn = aws_ssm_parameter.github_app_id.arn + } + installation_id_ssm = { + arn = aws_ssm_parameter.github_app_installation_id.arn + } + } logging_retention_in_days = var.logging_retention_in_days log_level = var.log_level tags = local.all_security_tags github_api = local.github_api - ghes_org = var.ghes_org - runner_group_name = 
var.runner_group_name - repository_selection = var.repository_selection + ghes_org = var.deployment_config.github.ghes_org + runner_group_name = var.deployment_config.github.runner_group_name + repository_selection = var.deployment_config.github.repository_selection - depends_on = [data.aws_secretsmanager_secret_version.data_cicd_secrets] } diff --git a/modules/platform/forge_runners/github_app_runner_group/README.md b/modules/platform/forge_runners/github_app_runner_group/README.md index 730d72c3..54893a7d 100644 --- a/modules/platform/forge_runners/github_app_runner_group/README.md +++ b/modules/platform/forge_runners/github_app_runner_group/README.md @@ -3,20 +3,20 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [register\_github\_app\_runner\_group\_lambda](#module\_register\_github\_app\_runner\_group\_lambda) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [register\_github\_app\_runner\_group\_lambda](#module\_register\_github\_app\_runner\_group\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources @@ -28,8 +28,6 @@ | [aws_lambda_permission.register_github_app_runner_group_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_iam_policy_document.register_github_app_runner_group_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | -| 
[aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | -| [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | ## Inputs @@ -37,12 +35,12 @@ |------|-------------|------|---------|:--------:| | [ghes\_org](#input\_ghes\_org) | GitHub organization (GHES or GitHub.com). | `string` | n/a | yes | | [github\_api](#input\_github\_api) | Base URL for the GitHub API (set to GHES API endpoint if using Enterprise). | `string` | `"https://api.github.com"` | no | +| [github\_app](#input\_github\_app) | GitHub App configuration |
object({
key_base64_ssm = object({
arn = string
})
id_ssm = object({
arn = string
})
installation_id_ssm = object({
arn = string
})
})
| n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | | [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | | [repository\_selection](#input\_repository\_selection) | Repository selection type: 'all' or 'selected'. | `string` | n/a | yes | | [runner\_group\_name](#input\_runner\_group\_name) | Name of the GitHub Actions runner group to create/update and attach repositories to. | `string` | n/a | yes | -| [secrets\_prefix](#input\_secrets\_prefix) | Prefix for all secrets | `string` | n/a | yes | | [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | ## Outputs diff --git a/modules/platform/forge_runners/github_app_runner_group/lambda/github_app_runner_group.py b/modules/platform/forge_runners/github_app_runner_group/lambda/github_app_runner_group.py index 6e450dcf..96a068f5 100644 --- a/modules/platform/forge_runners/github_app_runner_group/lambda/github_app_runner_group.py +++ b/modules/platform/forge_runners/github_app_runner_group/lambda/github_app_runner_group.py @@ -20,6 +20,8 @@ level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() LOG.setLevel(getattr(logging, level_str, logging.INFO)) +SSM = boto3.client('ssm') + def generate_jwt(app_id: str, private_key: str) -> str: """Generate a JWT for GitHub App authentication.""" @@ -144,11 +146,10 @@ def save_to_runner_group(access_token: str, github_api: str, organization: str, f"Added repository {repo['full_name']} to runner group {runner_group_name}.") -def get_secret(secret_name: str) -> Dict[str, Any]: - """Retrieve secrets from AWS Secrets Manager.""" - client = boto3.client('secretsmanager') - response = client.get_secret_value(SecretId=secret_name) - return response['SecretString'] +def 
get_secret(secret_name: str) -> str: + """Retrieve secrets from AWS Systems Manager Parameter Store.""" + response = SSM.get_parameter(Name=secret_name, WithDecryption=True) + return response['Parameter']['Value'] def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]: @@ -192,8 +193,6 @@ def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]: 'body': json.dumps({'message': 'Repositories added to runner group successfully.'}) } except Exception as e: - LOG.error(f'Error: {str(e)}') - return { - 'statusCode': 500, - 'body': json.dumps({'message': 'An error occurred', 'error': str(e)}) - } + LOG.exception( + f'Unhandled exception in github_app_runner_group lambda. Error: {str(e)}') + raise diff --git a/modules/platform/forge_runners/github_app_runner_group/main.tf b/modules/platform/forge_runners/github_app_runner_group/main.tf index 12babbca..8348a0e5 100644 --- a/modules/platform/forge_runners/github_app_runner_group/main.tf +++ b/modules/platform/forge_runners/github_app_runner_group/main.tf @@ -1,6 +1,6 @@ module "register_github_app_runner_group_lambda" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = "${var.prefix}-register-github-app-runner-group" handler = "github_app_runner_group.lambda_handler" @@ -28,9 +28,9 @@ module "register_github_app_runner_group_lambda" { ORGANIZATION = var.ghes_org RUNNER_GROUP_NAME = var.runner_group_name REPOSITORY_SELECTION = var.repository_selection - SECRET_NAME_APP_ID = local.secrets.github_actions_runners_app_id.name - SECRET_NAME_PRIVATE_KEY = local.secrets.github_actions_runners_app_key.name - SECRET_NAME_INSTALLATION_ID = local.secrets.github_actions_runners_app_installation_id.name + SECRET_NAME_APP_ID = var.github_app.id_ssm.arn + SECRET_NAME_PRIVATE_KEY = var.github_app.key_base64_ssm.arn + SECRET_NAME_INSTALLATION_ID = var.github_app.installation_id_ssm.arn LOG_LEVEL = var.log_level } @@ -47,15 +47,19 @@ module 
"register_github_app_runner_group_lambda" { data "aws_iam_policy_document" "register_github_app_runner_group_lambda" { statement { + effect = "Allow" + actions = [ - "secretsmanager:GetSecretValue", - "secretsmanager:DescribeSecret", + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:GetParameterHistory", + "ssm:DescribeParameters", ] - effect = "Allow" + resources = [ - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_key"].arn, - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_id"].arn, - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_installation_id"].arn, + var.github_app.id_ssm.arn, + var.github_app.key_base64_ssm.arn, + var.github_app.installation_id_ssm.arn, ] } } diff --git a/modules/platform/forge_runners/github_app_runner_group/secrets.tf b/modules/platform/forge_runners/github_app_runner_group/secrets.tf deleted file mode 100644 index d412e54f..00000000 --- a/modules/platform/forge_runners/github_app_runner_group/secrets.tf +++ /dev/null @@ -1,23 +0,0 @@ -locals { - secrets = { - github_actions_runners_app_key = { - name = "${var.secrets_prefix}github_actions_runners_app_key" - } - github_actions_runners_app_id = { - name = "${var.secrets_prefix}github_actions_runners_app_id" - } - github_actions_runners_app_installation_id = { - name = "${var.secrets_prefix}github_actions_runners_app_installation_id" - } - } -} - -data "aws_secretsmanager_secret" "secrets" { - for_each = local.secrets - name = each.value.name -} - -data "aws_secretsmanager_secret_version" "secrets" { - for_each = data.aws_secretsmanager_secret.secrets - secret_id = each.value.id -} diff --git a/modules/platform/forge_runners/github_app_runner_group/variables.tf b/modules/platform/forge_runners/github_app_runner_group/variables.tf index 5868d297..0d4883c9 100644 --- a/modules/platform/forge_runners/github_app_runner_group/variables.tf +++ b/modules/platform/forge_runners/github_app_runner_group/variables.tf 
@@ -9,11 +9,6 @@ variable "tags" { default = {} } -variable "secrets_prefix" { - description = "Prefix for all secrets" - type = string -} - variable "logging_retention_in_days" { description = "Retention in days for CloudWatch Log Group for the Lambdas." type = number @@ -50,3 +45,18 @@ variable "repository_selection" { error_message = "repository_selection must be 'all' or 'selected'." } } + +variable "github_app" { + description = "GitHub App configuration" + type = object({ + key_base64_ssm = object({ + arn = string + }) + id_ssm = object({ + arn = string + }) + installation_id_ssm = object({ + arn = string + }) + }) +} diff --git a/modules/platform/forge_runners/github_app_runner_group/versions.tf b/modules/platform/forge_runners/github_app_runner_group/versions.tf index d193e844..7ce2660e 100644 --- a/modules/platform/forge_runners/github_app_runner_group/versions.tf +++ b/modules/platform/forge_runners/github_app_runner_group/versions.tf @@ -2,10 +2,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/forge_runners/github_global_lock.tf b/modules/platform/forge_runners/github_global_lock.tf index 000abf86..17c72986 100644 --- a/modules/platform/forge_runners/github_global_lock.tf +++ b/modules/platform/forge_runners/github_global_lock.tf @@ -5,11 +5,19 @@ module "github_global_lock" { aws = aws } - prefix = var.deployment_config.prefix - secrets_prefix = local.cicd_secrets_prefix + prefix = var.deployment_config.deployment_prefix + github_app = { + key_base64_ssm = { + arn = aws_ssm_parameter.github_app_key.arn + } + id_ssm = { + arn = aws_ssm_parameter.github_app_id.arn + } + installation_id_ssm = { + arn = aws_ssm_parameter.github_app_installation_id.arn + } + } logging_retention_in_days = var.logging_retention_in_days log_level = var.log_level tags = local.all_security_tags - - depends_on = [data.aws_secretsmanager_secret_version.data_cicd_secrets] } diff --git a/modules/platform/forge_runners/github_global_lock/README.md b/modules/platform/forge_runners/github_global_lock/README.md index dbd96c6b..d90b611a 100644 --- a/modules/platform/forge_runners/github_global_lock/README.md +++ b/modules/platform/forge_runners/github_global_lock/README.md @@ -3,20 +3,20 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | +| [aws](#provider\_aws) | 6.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| -| [clean\_global\_lock\_lambda](#module\_clean\_global\_lock\_lambda) | terraform-aws-modules/lambda/aws | 8.1.0 | +| [clean\_global\_lock\_lambda](#module\_clean\_global\_lock\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | ## Resources @@ -31,17 +31,15 @@ | 
[aws_iam_policy_document.clean_global_lock_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.dynamodb_policy_document](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | -| [aws_secretsmanager_secret.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | -| [aws_secretsmanager_secret_version.secrets](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [github\_app](#input\_github\_app) | GitHub App configuration |
<pre>object({<br/>    key_base64_ssm = object({<br/>      arn = string<br/>    })<br/>    id_ssm = object({<br/>      arn = string<br/>    })<br/>    installation_id_ssm = object({<br/>      arn = string<br/>    })<br/>  })</pre>
| n/a | yes | | [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | | [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | -| [secrets\_prefix](#input\_secrets\_prefix) | Prefix for all secrets | `string` | n/a | yes | | [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | ## Outputs diff --git a/modules/platform/forge_runners/github_global_lock/lambda/github_clean_global_lock.py b/modules/platform/forge_runners/github_global_lock/lambda/github_clean_global_lock.py index d25e57b5..54aa4f5c 100644 --- a/modules/platform/forge_runners/github_global_lock/lambda/github_clean_global_lock.py +++ b/modules/platform/forge_runners/github_global_lock/lambda/github_clean_global_lock.py @@ -23,6 +23,7 @@ DYNAMODB_TABLE = os.getenv('DYNAMODB_TABLE') +SSM = boto3.client('ssm') dynamodb = boto3.resource('dynamodb') table = dynamodb.Table(DYNAMODB_TABLE) @@ -49,11 +50,10 @@ def get_installation_access_token(jwt_token: str, installation_id: str) -> str: return response.json()['token'] -def get_secret(secret_name: str) -> Dict[str, Any]: - """Retrieve secrets from AWS Secrets Manager.""" - client = boto3.client('secretsmanager') - response = client.get_secret_value(SecretId=secret_name) - return response['SecretString'] +def get_secret(secret_name: str) -> str: + """Retrieve secrets from AWS Systems Manager Parameter Store.""" + response = SSM.get_parameter(Name=secret_name, WithDecryption=True) + return response['Parameter']['Value'] def parse_github_url(workflow_run_url: str) -> Tuple[str, str, str]: @@ -144,8 +144,6 @@ def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]: 'body': json.dumps({'message': 'Cleaned lock successfully.'}) } except Exception as e: - 
LOG.error(f'Error: {str(e)}') - return { - 'statusCode': 500, - 'body': json.dumps({'message': 'An error occurred', 'error': str(e)}) - } + LOG.exception( + f'Unhandled exception in github_global_lock lambda. Error: {str(e)}') + raise diff --git a/modules/platform/forge_runners/github_global_lock/main.tf b/modules/platform/forge_runners/github_global_lock/main.tf index ce4b3ec4..5bde0895 100644 --- a/modules/platform/forge_runners/github_global_lock/main.tf +++ b/modules/platform/forge_runners/github_global_lock/main.tf @@ -70,7 +70,7 @@ resource "aws_iam_policy" "dynamodb_policy" { ## GitHub Clean Global Lock Lambda module "clean_global_lock_lambda" { source = "terraform-aws-modules/lambda/aws" - version = "8.1.2" + version = "8.7.0" function_name = "${var.prefix}-clean-global-lock" handler = "github_clean_global_lock.lambda_handler" @@ -95,9 +95,9 @@ module "clean_global_lock_lambda" { environment_variables = { DYNAMODB_TABLE = "${var.prefix}-gh-actions-lock" - SECRET_NAME_APP_ID = local.secrets.github_actions_runners_app_id.name - SECRET_NAME_PRIVATE_KEY = local.secrets.github_actions_runners_app_key.name - SECRET_NAME_INSTALLATION_ID = local.secrets.github_actions_runners_app_installation_id.name + SECRET_NAME_APP_ID = var.github_app.id_ssm.arn + SECRET_NAME_PRIVATE_KEY = var.github_app.key_base64_ssm.arn + SECRET_NAME_INSTALLATION_ID = var.github_app.installation_id_ssm.arn LOG_LEVEL = var.log_level } @@ -114,15 +114,19 @@ module "clean_global_lock_lambda" { data "aws_iam_policy_document" "clean_global_lock_lambda" { statement { + effect = "Allow" + actions = [ - "secretsmanager:GetSecretValue", - "secretsmanager:DescribeSecret", + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:GetParameterHistory", + "ssm:DescribeParameters", ] - effect = "Allow" + resources = [ - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_key"].arn, - data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_id"].arn, - 
data.aws_secretsmanager_secret_version.secrets["github_actions_runners_app_installation_id"].arn, + var.github_app.id_ssm.arn, + var.github_app.key_base64_ssm.arn, + var.github_app.installation_id_ssm.arn, ] } statement { diff --git a/modules/platform/forge_runners/github_global_lock/secrets.tf b/modules/platform/forge_runners/github_global_lock/secrets.tf deleted file mode 100644 index d412e54f..00000000 --- a/modules/platform/forge_runners/github_global_lock/secrets.tf +++ /dev/null @@ -1,23 +0,0 @@ -locals { - secrets = { - github_actions_runners_app_key = { - name = "${var.secrets_prefix}github_actions_runners_app_key" - } - github_actions_runners_app_id = { - name = "${var.secrets_prefix}github_actions_runners_app_id" - } - github_actions_runners_app_installation_id = { - name = "${var.secrets_prefix}github_actions_runners_app_installation_id" - } - } -} - -data "aws_secretsmanager_secret" "secrets" { - for_each = local.secrets - name = each.value.name -} - -data "aws_secretsmanager_secret_version" "secrets" { - for_each = data.aws_secretsmanager_secret.secrets - secret_id = each.value.id -} diff --git a/modules/platform/forge_runners/github_global_lock/variables.tf b/modules/platform/forge_runners/github_global_lock/variables.tf index 0ea9e192..ab464b50 100644 --- a/modules/platform/forge_runners/github_global_lock/variables.tf +++ b/modules/platform/forge_runners/github_global_lock/variables.tf @@ -9,11 +9,6 @@ variable "tags" { default = {} } -variable "secrets_prefix" { - description = "Prefix for all secrets" - type = string -} - variable "logging_retention_in_days" { description = "Retention in days for CloudWatch Log Group for the Lambdas." 
type = number @@ -25,3 +20,19 @@ variable "log_level" { description = "Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR)" default = "INFO" } + + +variable "github_app" { + description = "GitHub App configuration" + type = object({ + key_base64_ssm = object({ + arn = string + }) + id_ssm = object({ + arn = string + }) + installation_id_ssm = object({ + arn = string + }) + }) +} diff --git a/modules/platform/forge_runners/github_global_lock/versions.tf b/modules/platform/forge_runners/github_global_lock/versions.tf index d193e844..7ce2660e 100644 --- a/modules/platform/forge_runners/github_global_lock/versions.tf +++ b/modules/platform/forge_runners/github_global_lock/versions.tf @@ -2,10 +2,10 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/forge_runners/github_webhook_relay.tf b/modules/platform/forge_runners/github_webhook_relay.tf index 354cb10e..d855b9a5 100644 --- a/modules/platform/forge_runners/github_webhook_relay.tf +++ b/modules/platform/forge_runners/github_webhook_relay.tf @@ -6,13 +6,11 @@ module "github_webhook_relay" { aws = aws } - prefix = var.deployment_config.prefix - secret_prefix = "/cicd/common/${var.tenant.name}/${var.deployment_config.secret_suffix}" + prefix = var.deployment_config.deployment_prefix + secret_prefix = "/cicd/common/${var.deployment_config.tenant.name}/${var.deployment_config.secret_suffix}" logging_retention_in_days = var.logging_retention_in_days log_level = var.log_level tags = local.all_security_tags github_webhook_relay = var.github_webhook_relay - - depends_on = [data.aws_secretsmanager_secret_version.data_cicd_secrets] } diff --git a/modules/platform/forge_runners/github_webhook_relay/README.md b/modules/platform/forge_runners/github_webhook_relay/README.md index a302b9c6..544b7b06 100644 --- 
a/modules/platform/forge_runners/github_webhook_relay/README.md +++ b/modules/platform/forge_runners/github_webhook_relay/README.md @@ -3,16 +3,16 @@ | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.9.1 | -| [aws](#requirement\_aws) | >= 5.27 | +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | | [random](#requirement\_random) | >= 3.6 | ## Providers | Name | Version | |------|---------| -| [aws](#provider\_aws) | 6.19.0 | -| [random](#provider\_random) | 3.7.2 | +| [aws](#provider\_aws) | 6.35.1 | +| [random](#provider\_random) | 3.8.1 | ## Modules diff --git a/modules/platform/forge_runners/github_webhook_relay/versions.tf b/modules/platform/forge_runners/github_webhook_relay/versions.tf index edde8e7b..05f1b0ca 100644 --- a/modules/platform/forge_runners/github_webhook_relay/versions.tf +++ b/modules/platform/forge_runners/github_webhook_relay/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } random = { source = "hashicorp/random" @@ -11,5 +11,5 @@ terraform { } # OpenTofu version. - required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/modules/platform/forge_runners/locals.tf b/modules/platform/forge_runners/locals.tf index f1697a62..e36beb2b 100644 --- a/modules/platform/forge_runners/locals.tf +++ b/modules/platform/forge_runners/locals.tf @@ -1,13 +1,13 @@ locals { runner_iam_role_managed_policy_arns = concat( # If the policy exists, include it, otherwise skip it - length(var.tenant.iam_roles_to_assume) > 0 ? [aws_iam_policy.role_assumption_for_forge_runners[0].arn] : [], + length(var.deployment_config.tenant.iam_roles_to_assume) > 0 ? [aws_iam_policy.role_assumption_for_forge_runners[0].arn] : [], [ aws_iam_policy.ecr_access_for_ec2_instances.arn, module.github_global_lock.dynamodb_policy_arn, ] ) - github_app_installation = "${var.ghes_url == "" ? 
"https://github.com" : var.ghes_url}/apps/${data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_name"].secret_string}/installations/${data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_installation_id"].secret_string}" - github_api = var.ghes_url == "" ? "https://api.github.com" : "https://api.${replace(var.ghes_url, "https://", "")}" + github_app_installation = "${var.deployment_config.github.ghes_url == "" ? "https://github.com" : var.deployment_config.github.ghes_url}/apps/${var.deployment_config.github_app.name}/installations/${var.deployment_config.github_app.installation_id}" + github_api = var.deployment_config.github.ghes_url == "" ? "https://api.github.com" : "https://api.${replace(var.deployment_config.github.ghes_url, "https://", "")}" } diff --git a/modules/platform/forge_runners/outputs.tf b/modules/platform/forge_runners/outputs.tf index 9378a35a..8f491699 100644 --- a/modules/platform/forge_runners/outputs.tf +++ b/modules/platform/forge_runners/outputs.tf @@ -1,8 +1,8 @@ output "forge_core" { description = "Core tenant-level metadata (non-sensitive)." value = { - tenant = var.tenant - runner_group_name = var.runner_group_name + tenant = var.deployment_config.tenant + runner_group_name = var.deployment_config.github.runner_group_name } } @@ -15,6 +15,7 @@ output "forge_runners" { subnet_cidr_blocks = try(module.ec2_runners[0].subnet_cidr_blocks, []) } arc = { + cluster_name = try(module.arc_runners.arc_cluster_name, {}) runners_arn_map = try(module.arc_runners.arc_runners_arn_map, {}) subnet_cidr_blocks = try(module.arc_runners.subnet_cidr_blocks, []) } @@ -42,8 +43,8 @@ output "forge_github_app" { description = "GitHub App related outputs." 
value = { installation_url = local.github_app_installation - installation_id = try(data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_installation_id"].secret_string, null) - name = try(data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_name"].secret_string, null) + installation_id = var.deployment_config.github_app.installation_id + name = var.deployment_config.github_app.name } sensitive = true } diff --git a/modules/platform/forge_runners/redrive_deadletter.tf b/modules/platform/forge_runners/redrive_deadletter.tf new file mode 100644 index 00000000..35ee1a78 --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter.tf @@ -0,0 +1,32 @@ +locals { + sqs_prefix_arn = "arn:aws:sqs:${data.aws_region.current.region}:${data.aws_caller_identity.current.account_id}" +} + +module "redrive_deadletter" { + source = "./redrive_deadletter" + + providers = { + aws = aws + } + + prefix = var.deployment_config.deployment_prefix + logging_retention_in_days = var.logging_retention_in_days + log_level = var.log_level + tags = local.all_security_tags + + sqs_map = merge( + { + for key in keys(var.ec2_deployment_specs.runner_specs) : + key => { + dlq = "${local.sqs_prefix_arn}:${var.deployment_config.deployment_prefix}-${key}-queued-builds_dead_letter" + main = "${local.sqs_prefix_arn}:${var.deployment_config.deployment_prefix}-${key}-queued-builds" + } + }, + { + "gha-job-logs" = { + dlq = "${local.sqs_prefix_arn}:${var.deployment_config.deployment_prefix}-gha-job-logs-dead-letter" + main = "${local.sqs_prefix_arn}:${var.deployment_config.deployment_prefix}-gha-job-logs" + } + }, + ) +} diff --git a/modules/platform/forge_runners/redrive_deadletter/README.md b/modules/platform/forge_runners/redrive_deadletter/README.md new file mode 100644 index 00000000..d3094f16 --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter/README.md @@ 
-0,0 +1,44 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.11 | +| [aws](#requirement\_aws) | >= 6.25 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | 6.35.1 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [redrive\_deadletter\_lambda](#module\_redrive\_deadletter\_lambda) | terraform-aws-modules/lambda/aws | 8.7.0 | + +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.redrive_deadletter_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.redrive_deadletter_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.redrive_deadletter_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_lambda_permission.redrive_deadletter_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.redrive_deadletter_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [log\_level](#input\_log\_level) | Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR) | `string` | `"INFO"` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Retention in days for CloudWatch Log Group for the Lambdas. | `number` | `30` | no | +| [prefix](#input\_prefix) | Prefix for all resources | `string` | n/a | yes | +| [sqs\_map](#input\_sqs\_map) | Map of runner SQS queue names. |
<pre>map(object({<br/>    main = string<br/>    dlq = string<br/>  }))</pre>
| n/a | yes | +| [tags](#input\_tags) | Tags to apply to created resources. | `map(string)` | `{}` | no | + +## Outputs + +No outputs. + diff --git a/modules/platform/forge_runners/redrive_deadletter/lambda/redrive_deadletter.py b/modules/platform/forge_runners/redrive_deadletter/lambda/redrive_deadletter.py new file mode 100644 index 00000000..84182926 --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter/lambda/redrive_deadletter.py @@ -0,0 +1,150 @@ +import json +import logging +import os +from typing import Dict, List + +import boto3 +from botocore.exceptions import ClientError + +LOG = logging.getLogger() +level_str = os.environ.get('LOG_LEVEL', 'INFO').upper() +LOG.setLevel(getattr(logging, level_str, logging.INFO)) + +sqs = boto3.client('sqs') + + +def parse_sqs_map(raw: str) -> List[Dict[str, str]]: + """ + Expected SQS_MAP env var (from Terraform map), now using ARNs: + + { + "runner-a": { + "main": "arn:aws:sqs:us-east-1:111122223333:queue-a-main", + "dlq": "arn:aws:sqs:us-east-1:111122223333:queue-a-dlq" + }, + "runner-b": { + "main": "arn:aws:sqs:us-east-1:111122223333:queue-b-main", + "dlq": "arn:aws:sqs:us-east-1:111122223333:queue-b-dlq" + } + } + + Returns a list of: + [{"key": "runner-a", "main": "", "dlq": ""}, ...] + """ + if not raw.strip(): + return [] + + try: + parsed = json.loads(raw) + except json.JSONDecodeError as e: + raise Exception(f"Invalid SQS_MAP JSON: {e}. 
Value: {raw}") from e + + if not isinstance(parsed, dict): + raise Exception( + f"SQS_MAP must be a JSON object/map, got: {type(parsed)}" + ) + + mappings: List[Dict[str, str]] = [] + for key, value in parsed.items(): + if not isinstance(value, dict): + raise Exception( + f"SQS_MAP['{key}'] must be an object with 'main' and 'dlq'" + ) + if 'main' not in value or 'dlq' not in value: + raise Exception( + f"SQS_MAP['{key}'] missing 'main' or 'dlq' keys: {value}" + ) + mappings.append( + { + 'key': key, + 'main': str(value['main']), + 'dlq': str(value['dlq']), + } + ) + + return mappings + + +def start_dlq_redrive_to_source(sqs_client, dlq_identifier: str) -> Dict[str, str]: + """ + Start an SQS message move task from the given DLQ to its *source* queue. + + Uses StartMessageMoveTask with only SourceArn so that SQS + uses the DLQ's redrive policy to determine the destination. + """ + LOG.info('Starting message move task for DLQ ARN=%s', dlq_identifier) + + try: + resp = sqs_client.start_message_move_task( + SourceArn=dlq_identifier + ) + except ClientError as e: + LOG.error( + 'Failed to start message move task for DLQ ARN=%s: %s', + dlq_identifier, + e, + exc_info=True, + ) + return { + 'status': 'error', + 'dlq_identifier': dlq_identifier, + 'error': str(e), + } + + task_handle = resp.get('TaskHandle') + LOG.info( + 'Started message move task for DLQ ARN=%s task_handle=%s', + dlq_identifier, + task_handle, + ) + + return { + 'status': 'started', + 'dlq_identifier': dlq_identifier, + 'task_handle': task_handle, + } + + +def lambda_handler(event, context): + try: + raw_sqs_map = os.getenv('SQS_MAP', '') + mappings = parse_sqs_map(raw_sqs_map) + + if not mappings: + LOG.warning('SQS_MAP is empty; nothing to do.') + return {'status': 'noop', 'message': 'SQS_MAP is empty', 'results': []} + + LOG.info( + 'Starting DLQ redrive (StartMessageMoveTask) for %d mapping(s)', + len(mappings), + ) + + results = [] + for entry in mappings: + key = entry['key'] + dlq_identifier = 
entry['dlq'] + main_identifier = entry['main'] + + LOG.info( + 'Processing SQS mapping key=%s dlq=%s main=%s', + key, + dlq_identifier, + main_identifier, + ) + + redrive_result = start_dlq_redrive_to_source(sqs, dlq_identifier) + + results.append( + { + 'key': key, + 'dlq': dlq_identifier, + 'main': main_identifier, + **redrive_result, + } + ) + + return {'status': 'ok', 'results': results} + except Exception as e: + LOG.exception( + f'Unhandled exception in redrive_deadletter lambda. Error: {str(e)}') + raise diff --git a/modules/platform/forge_runners/redrive_deadletter/main.tf b/modules/platform/forge_runners/redrive_deadletter/main.tf new file mode 100644 index 00000000..80d7cea6 --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter/main.tf @@ -0,0 +1,104 @@ +module "redrive_deadletter_lambda" { + source = "terraform-aws-modules/lambda/aws" + version = "8.7.0" + + function_name = "${var.prefix}-redrive-deadletter" + handler = "redrive_deadletter.lambda_handler" + runtime = "python3.12" + timeout = 900 + architectures = ["x86_64"] + + source_path = [{ + path = "${path.module}/lambda" + }] + + logging_log_group = aws_cloudwatch_log_group.redrive_deadletter_lambda.name + use_existing_cloudwatch_log_group = true + + trigger_on_package_timestamp = false + + environment_variables = { + SQS_MAP = jsonencode(var.sqs_map) + LOG_LEVEL = var.log_level + } + + attach_policy_json = true + + policy_json = data.aws_iam_policy_document.redrive_deadletter_lambda.json + + function_tags = var.tags + role_tags = var.tags + tags = var.tags + + depends_on = [aws_cloudwatch_log_group.redrive_deadletter_lambda] +} + +data "aws_iam_policy_document" "redrive_deadletter_lambda" { + statement { + sid = "SQSReceiveFromDLQ" + effect = "Allow" + + actions = [ + "sqs:StartMessageMoveTask", + "sqs:ReceiveMessage", + "sqs:GetQueueAttributes", + "sqs:DeleteMessage", + ] + + resources = [ + for key, cfg in var.sqs_map : + cfg.dlq + ] + } + + statement { + sid = 
"SQSSendToMainQueue" + effect = "Allow" + + actions = [ + "sqs:SendMessage", + ] + + resources = [ + for key, cfg in var.sqs_map : + cfg.main + ] + } +} + + + +resource "aws_cloudwatch_log_group" "redrive_deadletter_lambda" { + name = "/aws/lambda/${var.prefix}-redrive-deadletter" + retention_in_days = var.logging_retention_in_days + tags = var.tags + tags_all = var.tags +} + +resource "aws_cloudwatch_event_rule" "redrive_deadletter_lambda" { + name = "${var.prefix}-redrive-deadletter" + description = "Trigger Lambda every 10 minutes" + schedule_expression = "cron(*/10 * * * ? *)" + + tags = var.tags + tags_all = var.tags + + depends_on = [module.redrive_deadletter_lambda] +} + +resource "aws_cloudwatch_event_target" "redrive_deadletter_lambda" { + rule = aws_cloudwatch_event_rule.redrive_deadletter_lambda.name + arn = module.redrive_deadletter_lambda.lambda_function_arn + + depends_on = [module.redrive_deadletter_lambda] +} + +resource "aws_lambda_permission" "redrive_deadletter_lambda" { + action = "lambda:InvokeFunction" + function_name = "${var.prefix}-redrive-deadletter" + principal = "events.amazonaws.com" + statement_id = "AllowExecutionFromCloudWatch" + source_arn = aws_cloudwatch_event_rule.redrive_deadletter_lambda.arn + + depends_on = [module.redrive_deadletter_lambda] +} diff --git a/modules/platform/forge_runners/redrive_deadletter/variables.tf b/modules/platform/forge_runners/redrive_deadletter/variables.tf new file mode 100644 index 00000000..3d715477 --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter/variables.tf @@ -0,0 +1,30 @@ +variable "prefix" { + description = "Prefix for all resources" + type = string +} + +variable "tags" { + description = "Tags to apply to created resources." + type = map(string) + default = {} +} + +variable "logging_retention_in_days" { + description = "Retention in days for CloudWatch Log Group for the Lambdas." 
+ type = number + default = 30 +} + +variable "log_level" { + type = string + description = "Log level for application logging (e.g., INFO, DEBUG, WARN, ERROR)" + default = "INFO" +} + +variable "sqs_map" { + description = "Map of runner SQS queue names." + type = map(object({ + main = string + dlq = string + })) +} diff --git a/modules/platform/forge_runners/redrive_deadletter/versions.tf b/modules/platform/forge_runners/redrive_deadletter/versions.tf new file mode 100644 index 00000000..7ce2660e --- /dev/null +++ b/modules/platform/forge_runners/redrive_deadletter/versions.tf @@ -0,0 +1,11 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 6.25" + } + } + + # OpenTofu version. + required_version = "~> 1.11" +} diff --git a/modules/platform/forge_runners/roles.tf b/modules/platform/forge_runners/roles.tf index a038d809..ea7aa743 100644 --- a/modules/platform/forge_runners/roles.tf +++ b/modules/platform/forge_runners/roles.tf @@ -1,6 +1,6 @@ # Define the IAM policy for role assumption data "aws_iam_policy_document" "role_assumption_for_forge_runners" { - count = length(var.tenant.iam_roles_to_assume) > 0 ? 1 : 0 + count = length(var.deployment_config.tenant.iam_roles_to_assume) > 0 ? 1 : 0 statement { effect = "Allow" @@ -8,15 +8,15 @@ data "aws_iam_policy_document" "role_assumption_for_forge_runners" { "sts:AssumeRole", "sts:TagSession", ] - resources = var.tenant.iam_roles_to_assume + resources = var.deployment_config.tenant.iam_roles_to_assume } } # Define the actual IAM policy for role assumption resource "aws_iam_policy" "role_assumption_for_forge_runners" { - count = length(var.tenant.iam_roles_to_assume) > 0 ? 1 : 0 + count = length(var.deployment_config.tenant.iam_roles_to_assume) > 0 ? 
1 : 0 - name = "${var.deployment_config.prefix}-policy-for-role-assumption-for-forge_runners" + name = "${var.deployment_config.deployment_prefix}-policy-for-role-assumption-for-forge_runners" description = "Managed policy for IAM role assumption." policy = element(data.aws_iam_policy_document.role_assumption_for_forge_runners[*].json, 0) @@ -43,7 +43,7 @@ data "aws_iam_policy_document" "ecr_access_for_ec2_instances" { # Define the actual IAM policy for ECR access resource "aws_iam_policy" "ecr_access_for_ec2_instances" { - name = "${var.deployment_config.prefix}-policy-for-ecr-access-for-ec2-instances" + name = "${var.deployment_config.deployment_prefix}-policy-for-ecr-access-for-ec2-instances" description = "Managed policy for IAM role assumption." policy = data.aws_iam_policy_document.ecr_access_for_ec2_instances.json diff --git a/modules/platform/forge_runners/secrets.tf b/modules/platform/forge_runners/secrets.tf deleted file mode 100644 index ecb6a09d..00000000 --- a/modules/platform/forge_runners/secrets.tf +++ /dev/null @@ -1,111 +0,0 @@ -locals { - cicd_secrets_prefix = "/cicd/common/${var.tenant.name}/${var.deployment_config.secret_suffix}/" - - secrets = [ - # CI/CD runners: secrets used in build/deploy pipelines. - { - name = "${local.cicd_secrets_prefix}github_actions_runners_app_key" - description = "Base64 encoded GitHub App private key for GHA ephemeral runners for Tenant ${var.tenant.name}(${var.deployment_config.secret_suffix})." - recovery_days = 7 - }, - { - name = "${local.cicd_secrets_prefix}github_actions_runners_app_id" - description = "GitHub App ID for GHA ephemeral runners for Tenant ${var.tenant.name}(${var.deployment_config.secret_suffix})." - recovery_days = 7 - }, - { - name = "${local.cicd_secrets_prefix}github_actions_runners_app_client_id" - description = "GitHub App Client ID for GHA ephemeral runners for Tenant ${var.tenant.name}(${var.deployment_config.secret_suffix})." 
- recovery_days = 7 - }, - { - name = "${local.cicd_secrets_prefix}github_actions_runners_app_installation_id" - description = "GitHub App Installation ID for GHA ephemeral runners for Tenant ${var.tenant.name}(${var.deployment_config.secret_suffix})." - recovery_days = 7 - }, - { - name = "${local.cicd_secrets_prefix}github_actions_runners_app_name" - description = "GitHub App Name for GHA ephemeral runners for Tenant ${var.tenant.name}(${var.deployment_config.secret_suffix})." - recovery_days = 7 - } - ] -} - -# Psuedo-random seeds we use for initializing the secrets. If we don't do this, -# then the secret "exists", but has no value or initial version, and "tf apply" -# steps fail, requiring one to manually set the password outside of Terraform. -data "aws_secretsmanager_random_password" "secret_seeds" { - for_each = { - for key, val in local.secrets : val.name => val - } - - password_length = 16 -} - -# Actual object containing the secret. -resource "aws_secretsmanager_secret" "cicd_secrets" { - for_each = { - for key, val in local.secrets : val.name => val - } - - name = each.value.name - description = each.value.description - recovery_window_in_days = each.value.recovery_days - - tags = local.all_security_tags - tags_all = local.all_security_tags -} - -# Force a delay between secret creation and seeding. We only need a few -# seconds, but if we don't do this, we get into a bad state requiring manual -# intervention and/or manual forced-deletion of secrets. -resource "time_sleep" "wait_60_seconds" { - depends_on = [ - aws_secretsmanager_secret.cicd_secrets, - ] - create_duration = "60s" -} - -# Only used for seeding purposes. Will not clobber/overwrite secrets afterward -# (i.e. if/when we set them manually via the AWS CLI or management console). 
-resource "aws_secretsmanager_secret_version" "cicd_secrets" { - depends_on = [time_sleep.wait_60_seconds] - for_each = { - for key, val in local.secrets : val.name => val - } - - secret_id = aws_secretsmanager_secret.cicd_secrets[each.key].id - secret_string = base64encode(data.aws_secretsmanager_random_password.secret_seeds[each.key].random_password) - - # Prevents this seed from being applied more than once (at initial "tf apply" - # time). - lifecycle { - ignore_changes = [secret_string, ] - } -} - -# Critical secrets needed for provisioning the CICD system. -data "aws_secretsmanager_secret" "data_cicd_secrets" { - for_each = { - for key, val in local.secrets : val.name => val - } - - depends_on = [ - aws_secretsmanager_secret.cicd_secrets, - ] - arn = "arn:aws:secretsmanager:${var.aws_region}:${var.aws_account_id}:secret:${each.key}" -} - -# Need both these objects to be able to extract the secrets' respective -# payloads. -data "aws_secretsmanager_secret_version" "data_cicd_secrets" { - for_each = { - for key, val in local.secrets : val.name => val - } - - depends_on = [ - aws_secretsmanager_secret_version.cicd_secrets, - data.aws_secretsmanager_secret.data_cicd_secrets - ] - secret_id = data.aws_secretsmanager_secret.data_cicd_secrets[each.key].id -} diff --git a/modules/platform/forge_runners/service_catalog.tf b/modules/platform/forge_runners/service_catalog.tf index d1e50953..469fe95a 100644 --- a/modules/platform/forge_runners/service_catalog.tf +++ b/modules/platform/forge_runners/service_catalog.tf @@ -1,3 +1,3 @@ resource "aws_servicecatalogappregistry_application" "forge" { - name = var.deployment_config.prefix + name = var.deployment_config.deployment_prefix } diff --git a/modules/platform/forge_runners/ssm.tf b/modules/platform/forge_runners/ssm.tf new file mode 100644 index 00000000..05712dc6 --- /dev/null +++ b/modules/platform/forge_runners/ssm.tf @@ -0,0 +1,77 @@ +resource "aws_ssm_parameter" "github_app_key" { + name = 
"/forge/${var.deployment_config.deployment_prefix}/github_app_key" + description = "Base64 encoded GitHub App private key for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." + type = "SecureString" + value = base64encode("initial-placeholder-value") + tags = local.all_security_tags + + lifecycle { + # Allow operators to rotate the key directly in SSM without Terraform + # forcing it back to the original value. + ignore_changes = [value] + } +} + +resource "aws_ssm_parameter" "github_app_id" { + name = "/forge/${var.deployment_config.deployment_prefix}/github_app_id" + description = "GitHub App ID for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." + type = "SecureString" + value = var.deployment_config.github_app.id + + tags = local.all_security_tags +} + +resource "aws_ssm_parameter" "github_app_client_id" { + name = "/forge/${var.deployment_config.deployment_prefix}/github_app_client_id" + description = "GitHub App Client ID for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." + type = "SecureString" + value = var.deployment_config.github_app.client_id + + tags = local.all_security_tags +} + +resource "aws_ssm_parameter" "github_app_installation_id" { + name = "/forge/${var.deployment_config.deployment_prefix}/github_app_installation_id" + description = "GitHub App Installation ID for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." + type = "SecureString" + value = var.deployment_config.github_app.installation_id + + tags = local.all_security_tags +} + +resource "aws_ssm_parameter" "github_app_name" { + name = "/forge/${var.deployment_config.deployment_prefix}/github_app_name" + description = "GitHub App Name for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." 
+ type = "SecureString" + value = var.deployment_config.github_app.name + + tags = local.all_security_tags +} + +resource "aws_ssm_parameter" "github_app_webhook_secret" { + name = "/forge/${var.deployment_config.deployment_prefix}/github_app_webhook_secret" + description = "GitHub App webhook secret for GHA ephemeral runners for Tenant ${var.deployment_config.tenant.name}." + type = "SecureString" + value = random_password.github_app_webhook_secret.result + + tags = local.all_security_tags +} + + +resource "time_rotating" "every_30_days" { + rotation_days = 30 +} + +resource "random_password" "github_app_webhook_secret" { + length = 20 + + keepers = { + rotation = time_rotating.every_30_days.id + } +} + +data "aws_ssm_parameter" "github_app_key" { + name = aws_ssm_parameter.github_app_key.name + with_decryption = true + depends_on = [aws_ssm_parameter.github_app_key] +} diff --git a/modules/platform/forge_runners/update_gh_app.tf b/modules/platform/forge_runners/update_gh_app.tf index 22076bde..82945e86 100644 --- a/modules/platform/forge_runners/update_gh_app.tf +++ b/modules/platform/forge_runners/update_gh_app.tf @@ -1,28 +1,28 @@ resource "null_resource" "update_github_app_webhook" { triggers = { - ghes_org = var.ghes_org - ghes_url = var.ghes_url - webhook_url = try(module.ec2_runners[0].webhook_endpoint, "https://cisco-open.github.io/forge") - secret = try(random_id.random[0].hex, null) - secret_version = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_key"].id + ghes_org = var.deployment_config.github.ghes_org + ghes_url = var.deployment_config.github.ghes_url + webhook_url = try(module.ec2_runners[0].webhook_endpoint, "https://cisco-open.github.io/forge") + secret = aws_ssm_parameter.github_app_webhook_secret.value + secret_version = aws_ssm_parameter.github_app_webhook_secret.version + id = var.deployment_config.github_app.id + client_id = var.deployment_config.github_app.client_id + 
installation_id = var.deployment_config.github_app.installation_id + name = var.deployment_config.github_app.name } provisioner "local-exec" { environment = { - CLIENT_ID = data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_client_id"].secret_string + CLIENT_ID = var.deployment_config.github_app.client_id PRIVATE_KEY = base64decode( - data.aws_secretsmanager_secret_version.data_cicd_secrets["${local.cicd_secrets_prefix}github_actions_runners_app_key"].secret_string + data.aws_ssm_parameter.github_app_key.value ) WEBHOOK_URL = self.triggers.webhook_url SECRET = self.triggers.secret GITHUB_API = local.github_api - PREFIX = "${var.env}-${var.deployment_config.prefix}" + PREFIX = "${var.deployment_config.env}-${var.deployment_config.deployment_prefix}" } command = "${path.module}/scripts/generate_and_patch_github_app.sh" } - - depends_on = [ - data.aws_secretsmanager_secret_version.data_cicd_secrets, - ] } diff --git a/modules/platform/forge_runners/variables.tf b/modules/platform/forge_runners/variables.tf index 0e083c90..21700267 100644 --- a/modules/platform/forge_runners/variables.tf +++ b/modules/platform/forge_runners/variables.tf @@ -1,147 +1,219 @@ -variable "aws_account_id" { - type = string - description = "AWS account ID (not SL AWS account ID) associated with the infra/backend." -} - variable "aws_profile" { type = string - description = "AWS profile (i.e. generated via 'sl aws session generate') to use." + description = "AWS profile to use." } variable "aws_region" { type = string - description = "Assuming single region for now." + description = "AWS region where Forge runners and supporting infrastructure are deployed." } -variable "env" { - type = string - description = "Deployment environments." 
-} +variable "ec2_deployment_specs" { + type = object({ + lambda_subnet_ids = list(string) + subnet_ids = list(string) + lambda_vpc_id = string + vpc_id = string + scale_errors = optional(list(string), []) + runner_specs = map(object({ + ami_filter = object({ + name = list(string) + state = list(string) + }) + ami_kms_key_arn = string + ami_owners = list(string) + runner_labels = list(string) + runner_os = string + runner_architecture = string + extra_labels = list(string) + max_instances = number + min_run_time = number + instance_types = list(string) + license_specifications = optional(list(object({ + license_configuration_arn = string + })), null) + placement = optional(object({ + affinity = optional(string) + availability_zone = optional(string) + group_id = optional(string) + group_name = optional(string) + host_id = optional(string) + host_resource_group_arn = optional(string) + spread_domain = optional(string) + tenancy = optional(string) + partition_number = optional(number) + }), null) + pool_config = list(object({ + size = number + schedule_expression = string + schedule_expression_timezone = string + })) + runner_user = string + enable_userdata = bool + instance_target_capacity_type = string + vpc_id = optional(string, null) + subnet_ids = optional(list(string), null) + block_device_mappings = list(object({ + delete_on_termination = bool + device_name = string + encrypted = bool + iops = number + kms_key_id = string + snapshot_id = string + throughput = number + volume_size = number + volume_type = string + })) + })) + }) -variable "ghes_org" { - type = string - description = "GitHub organization." -} + description = <<-EOT + EC2 deployment configuration for GitHub Actions runners. -variable "ghes_url" { - type = string - description = "GitHub Enterprise Server URL." + Top-level fields: + - lambda_subnet_ids: Subnets where runner-related lambdas execute. + These can be more permissive than the runner subnets. 
+ - subnet_ids : Subnets where the EC2 runners are launched. + - vpc_id : VPC that contains both runner and lambda subnets. + - runner_specs : Map of runner pool keys to their EC2 sizing and + scheduling configuration. + + runner_specs[*] object fields: + - ami_filter : Name/state filters used to select the runner AMI. + - ami_kms_key_arn : KMS key ARN used to encrypt AMI EBS volumes. + - ami_owners : List of AWS account IDs that own the AMI. + - runner_labels : Base GitHub labels applied to jobs for this pool. + - runner_os : Runner operating system (for example, linux). + - runner_architecture: CPU architecture (for example, x86_64 or arm64). + - extra_labels : Additional GitHub labels that further specialize + this runner pool. + - max_instances : Maximum number of EC2 runners in this pool. + - min_run_time : Minimum job run time (in minutes) before a runner + is eligible for scale-down. + - instance_types : Allowed EC2 instance types for runners in this pool. + - pool_config : List of pool size schedules (size + cron expression + and optional time zone) controlling baseline capacity. + - runner_user : OS user under which the GitHub runner process runs. + - enable_userdata : Whether the module should inject its standard + userdata to configure the runner VM. + - instance_target_capacity_type: EC2 capacity type to use (spot or + on-demand). + - block_device_mappings: EBS volume configuration for the runner + instances, including size, type, encryption, and KMS. + EOT } -variable "repository_selection" { - type = string - description = "Repository selection type." 
+ +variable "deployment_config" { + type = object({ + deployment_prefix = string + secret_suffix = string + env = string + github_app = object({ + id = string + client_id = string + installation_id = string + name = string + }) + github = object({ + ghes_org = string + ghes_url = string + repository_selection = string + runner_group_name = string + }) + tenant = object({ + name = string + iam_roles_to_assume = optional(list(string), []) + ecr_registries = optional(list(string), []) + github_logs_reader_role_arns = optional(list(string), []) + }) + }) validation { - condition = contains(["all", "selected"], var.repository_selection) + condition = contains(["all", "selected"], var.deployment_config.github.repository_selection) error_message = "repository_selection must be 'all' or 'selected'." } -} -variable "lambda_subnet_ids" { - type = list(string) - description = "So the lambdas can run in our pre-determined subnets. They don't require the same security policy as the runners though." -} + description = <<-EOT + High-level deployment configuration for a Forge runner installation. -variable "runner_group_name" { - type = string - description = "Name of the group applied to all runners." -} + Top-level fields: + - deployment_prefix: Prefix used when naming resources (for example, + log groups, KMS keys, and SSM parameters). + - env : Logical environment name (for example, dev, stage, + prod). Used for tagging and dashboards. -variable "deployment_config" { - type = object({ - prefix = string - secret_suffix = string - }) - description = "Prefix for the deployment, used to distinguish resources." -} + github_app object: + - id : Numeric GitHub App ID. + - client_id : OAuth client ID for the app. + - installation_id: GitHub App installation ID for this tenant. + - name : GitHub App name, used to build URLs and logs. 
-variable "ec2_runner_specs" { - description = "Map of runner specifications" - type = map(object({ - ami_filter = object({ - name = list(string) - state = list(string) - }) - ami_kms_key_arn = string - ami_owners = list(string) - runner_labels = list(string) - runner_os = string - runner_architecture = string - extra_labels = list(string) - max_instances = number - min_run_time = number - instance_types = list(string) - pool_config = list(object({ - size = number - schedule_expression = string - schedule_expression_timezone = string - })) - runner_user = string - enable_userdata = bool - instance_target_capacity_type = string - block_device_mappings = list(object({ - delete_on_termination = bool - device_name = string - encrypted = bool - iops = number - kms_key_id = string - snapshot_id = string - throughput = number - volume_size = number - volume_type = string - })) - })) -} + github object: + - ghes_org : GitHub organization that owns the repos where + runners will be used. + - ghes_url : GitHub.com or GHES base URL. Empty string implies + public github.com. + - repository_selection: Scope for runners (all or selected repositories). + - runner_group_name : GitHub runner group to attach new runners to. -variable "subnet_ids" { - type = list(string) - description = "Subnet(s) in which our runners will be deployed. Supplied by the underlying AWS-based CI/CD stack." + tenant object: + - name : Tenant identifier used in naming and + tagging. + - iam_roles_to_assume : Optional list of IAM role ARNs that + runners are allowed to assume for workload execution. + - ecr_registries : Optional list of ECR registry URLs that + runners may need to pull images from. + - github_logs_reader_role_arns: Optional list of IAM roles that can read + GitHub Actions logs for this tenant. 
+ EOT } -variable "tenant" { - description = "Map of tenant configs" +variable "arc_deployment_specs" { type = object({ - name = string - iam_roles_to_assume = optional(list(string), []) - ecr_registries = optional(list(string), []) - github_logs_reader_role_arns = optional(list(string), []) + cluster_name = string + migrate_cluster = optional(bool, false) + runner_specs = map(object({ + runner_size = object({ + max_runners = number + min_runners = number + }) + scale_set_name = string + scale_set_type = string + container_actions_runner = string + container_limits_cpu = string + container_limits_memory = string + container_requests_cpu = string + container_requests_memory = string + volume_requests_storage_size = string + volume_requests_storage_type = string + })) }) -} - -variable "vpc_id" { - type = string - description = "VPC in which our runners will be deployed. Supplied by the underlying AWS-based CI/CD stack." -} -variable "arc_cluster_name" { - description = "Name of the EKS cluster" - type = string -} + description = <<-EOT + Deployment configuration for Azure Container Apps (ARC) runners. -variable "arc_runner_specs" { - description = "Map of runner specifications" - type = map(object({ - runner_size = object({ - max_runners = number - min_runners = number - }) - scale_set_name = string - scale_set_type = string - container_actions_runner = string - container_limits_cpu = string - container_limits_memory = string - container_requests_cpu = string - container_requests_memory = string - volume_requests_storage_size = string - volume_requests_storage_type = string - })) -} + Top-level fields: + - cluster_name : Name of the EKS cluster used for ARC runners. + - migrate_cluster: Optional flag to indicate a one-time migration or + blue/green cutover of the ARC runner cluster. + - runner_specs : Map of ARC runner pool keys to their sizing and + container resource settings. 
-variable "migrate_arc_cluster" { - type = bool - description = "Flag to indicate if the cluster is being migrated." - default = false + runner_specs[*] object fields: + - runner_size.max_runners: Maximum concurrent ARC runners for this pool. + - runner_size.min_runners: Minimum number of warm runners. + - scale_set_name : Logical name for the scale set / pool. + - scale_set_type : Backing type for the scale set (for example, + kubernetes or containerapp, depending on integration). + - container_actions_runner : Container image used for the ARC runner. + - container_limits_cpu : CPU limit for the runner container. + - container_limits_memory : Memory limit for the runner container. + - container_requests_cpu : CPU request (baseline reservation). + - container_requests_memory : Memory request (baseline reservation). + - volume_requests_storage_size: Size of attached storage for the runner. + - volume_requests_storage_type: Storage class or type for attached volume. + EOT } variable "tags" { diff --git a/modules/platform/forge_runners/versions.tf b/modules/platform/forge_runners/versions.tf index 3dc35d43..0ff94680 100644 --- a/modules/platform/forge_runners/versions.tf +++ b/modules/platform/forge_runners/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.27" + version = ">= 6.25" } local = { source = "hashicorp/local" @@ -31,5 +31,5 @@ terraform { } # OpenTofu version. 
- required_version = ">= 1.9.1" + required_version = "~> 1.11" } diff --git a/renovate.json b/renovate.json index bb534b2e..21bb904d 100644 --- a/renovate.json +++ b/renovate.json @@ -1,135 +1,219 @@ { - "$schema": "https://docs.renovatebot.com/renovate-schema.json", - "extends": [ - "config:recommended", - "config:base", - ":rebaseStalePrs", - ":semanticCommits", - ":semanticCommitScope(deps)" - ], - - "pre-commit": { - "enabled": true - }, - - "vulnerabilityAlerts": { - "enabled": true, - "schedule": ["at any time"] - }, - "osvVulnerabilityAlerts": true, - "automerge": false, - "platformAutomerge": true, - "stabilityDays": 0, - "separateMinorPatch": true, - "separateMajorMinor": true, - - "customManagers": [ - { - "customType": "regex", - "fileMatch": ["^*\\.tf$"], - "matchStrings": ["required_version\\s=\\s\">= (?.*?)\""], - "depNameTemplate": "opentofu/opentofu", - "datasourceTemplate": "github-releases" - } - ], - - "packageRules": [ - { - "matchDatasources": ["github-tags"], - "matchPackageNames": ["github-aws-runners/terraform-aws-github-runner"], - "groupName": "terraform-aws-github-runner version", - "separateMajorMinor": false, - "separateMinorPatch": false + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:recommended", + "config:base", + ":rebaseStalePrs", + ":semanticCommits", + ":semanticCommitScope(deps)" + ], + "pre-commit": { + "enabled": true }, - { - "description": "Security updates - immediate processing (override stability days)", - "matchPackagePatterns": ["*"], - "matchUpdateTypes": ["patch", "minor"], - "vulnerabilityAlerts": {"enabled": true}, - "stabilityDays": 0, - "prPriority": 10, - "prCreation": "immediate", - "addLabels": ["security", "critical"] + "vulnerabilityAlerts": { + "enabled": true, + "schedule": [ + "at any time" + ] }, - { - "matchUpdateTypes": ["patch","pin","digest"], - "addLabels": ["simple-review"], - "automerge": true - }, - { - "matchUpdateTypes": ["minor"], - "addLabels": 
["simple-review"] - }, - { - "matchPackagePatterns": ["aws","terraform-aws-modules"], - "groupName": "AWS providers and modules", - "addLabels": ["aws-updates"] - }, - { - "description": "GitHub Actions - pin to commit SHA for security", - "matchManagers": ["github-actions"], - "pinDigests": true, - "groupName": "GitHub Actions", - "addLabels": ["github-actions", "ci-cd"] - }, - { - "matchManagers": ["pre-commit"], - "groupName": "Pre-commit hooks", - "addLabels": ["pre-commit"] - }, - { - "matchPackagePatterns": ["python","pip"], - "groupName": "Python dependencies", - "addLabels": ["python"] - }, - { - "description": "Development dependencies - reduced stability days", - "matchDepTypes": ["devDependencies"], - "matchUpdateTypes": ["patch", "minor"], - "stabilityDays": 1, - "automerge": true, - "addLabels": ["dev-dependencies"] - }, - { - "description": "Docker images - grouped with digest pinning", - "matchManagers": ["docker-compose", "dockerfile"], - "groupName": "Docker images", - "pinDigests": true, - "addLabels": ["docker", "containers"] - }, - { - "description": "Terraform providers from public registry", - "matchDatasources": ["terraform-provider"], - "matchPackagePatterns": ["^hashicorp/.*"], - "registryUrls": ["https://registry.opentofu.org"] - }, - { - "description": "Major version updates - extended stability period", - "matchUpdateTypes": ["major"], - "stabilityDays": 7, - "prPriority": 1, - "addLabels": ["major-update", "breaking-change"] - }, - { - "description": "Critical security patches - auto-merge enabled", - "matchPackagePatterns": ["*"], - "matchUpdateTypes": ["patch"], - "vulnerabilityAlerts": {"enabled": true}, - "automerge": true, - "automergeType": "pr", - "addLabels": ["security", "auto-merge"] - } - ], - - "regexManagers": [ - { - "fileMatch": ["^.*\\.tf$"], - "matchStrings": [ - "download_lambdas\\.sh\"\\s*,\\s*\"[^\"]+\"\\s*,\\s*\"v(?\\d+\\.\\d+\\.\\d+)\"" + "osvVulnerabilityAlerts": true, + "automerge": false, + "platformAutomerge": 
true, + "stabilityDays": 0, + "separateMinorPatch": true, + "separateMajorMinor": true, + "customManagers": [ + { + "customType": "regex", + "fileMatch": [ + "^*\\.tf$" + ], + "matchStrings": [ + "required_version\\s=\\s\">= (?.*?)\"" + ], + "datasourceTemplate": "github-releases", + "depNameTemplate": "opentofu/opentofu", + "extractVersionTemplate": "^v?(?.*)$" + }, + { + "customType": "regex", + "fileMatch": [ + "^.*\\.tf$" + ], + "matchStrings": [ + "download_lambdas\\.sh\"\\s*,\\s*\"[^\"]+\"\\s*,\\s*\"(?v\\d+\\.\\d+\\.\\d+)\"" + ], + "depNameTemplate": "github-aws-runners/terraform-aws-github-runner", + "datasourceTemplate": "github-tags", + "versioningTemplate": "semver-coerced" + }, + { + "fileMatch": ["Dockerfile$"], + "matchStrings": ["# renovate: datasource=(?\\S+) depName=(?\\S+) registryUrl=(?\\S+)( extractVersion=(?.+?))?( versioning=(?.*?))?\\n.*?VERSION=\"(?.*)?\"\\s"], + "versioningTemplate": "{{#if versioning}}{{{versioning}}}{{else}}semver{{/if}}" + } ], - "datasourceTemplate": "github-tags", - "depNameTemplate": "github-aws-runners/terraform-aws-github-runner", - "versioningTemplate": "semver" - } - ] + "packageRules": [ + { + "matchDatasources": [ + "github-tags" + ], + "matchPackageNames": [ + "github-aws-runners/terraform-aws-github-runner" + ], + "groupName": "terraform-aws-github-runner version", + "separateMajorMinor": false, + "separateMinorPatch": false + }, + { + "description": "Security updates - immediate processing (override stability days)", + "matchPackagePatterns": [ + "*" + ], + "matchUpdateTypes": [ + "patch", + "minor" + ], + "vulnerabilityAlerts": { + "enabled": true + }, + "stabilityDays": 0, + "prPriority": 10, + "prCreation": "immediate", + "addLabels": [ + "security", + "critical" + ] + }, + { + "matchUpdateTypes": [ + "patch", + "pin", + "digest" + ], + "addLabels": [ + "simple-review" + ], + "automerge": true + }, + { + "matchUpdateTypes": [ + "minor" + ], + "addLabels": [ + "simple-review" + ] + }, + { + 
"matchPackagePatterns": [ + "aws", + "terraform-aws-modules" + ], + "groupName": "AWS providers and modules", + "addLabels": [ + "aws-updates" + ] + }, + { + "description": "GitHub Actions - pin to commit SHA for security", + "matchManagers": [ + "github-actions" + ], + "pinDigests": true, + "groupName": "GitHub Actions", + "addLabels": [ + "github-actions", + "ci-cd" + ] + }, + { + "matchManagers": [ + "pre-commit" + ], + "groupName": "Pre-commit hooks", + "addLabels": [ + "pre-commit" + ] + }, + { + "matchPackagePatterns": [ + "python", + "pip" + ], + "groupName": "Python dependencies", + "addLabels": [ + "python" + ] + }, + { + "description": "Development dependencies - reduced stability days", + "matchDepTypes": [ + "devDependencies" + ], + "matchUpdateTypes": [ + "patch", + "minor" + ], + "stabilityDays": 1, + "automerge": true, + "addLabels": [ + "dev-dependencies" + ] + }, + { + "description": "Docker images - grouped with digest pinning", + "matchManagers": [ + "docker-compose", + "dockerfile" + ], + "groupName": "Docker images", + "pinDigests": true, + "addLabels": [ + "docker", + "containers" + ] + }, + { + "description": "Terraform providers from public registry", + "matchDatasources": [ + "terraform-provider" + ], + "matchPackagePatterns": [ + "^hashicorp/.*" + ], + "registryUrls": [ + "https://registry.opentofu.org" + ] + }, + { + "description": "Major version updates - extended stability period", + "matchUpdateTypes": [ + "major" + ], + "stabilityDays": 7, + "prPriority": 1, + "addLabels": [ + "major-update", + "breaking-change" + ] + }, + { + "description": "Critical security patches - auto-merge enabled", + "matchPackagePatterns": [ + "*" + ], + "matchUpdateTypes": [ + "patch" + ], + "vulnerabilityAlerts": { + "enabled": true + }, + "automerge": true, + "automergeType": "pr", + "addLabels": [ + "security", + "auto-merge" + ] + } + ] } diff --git a/scripts/migrate-tenant.sh b/scripts/migrate-tenant.sh index a308806c..6cd186ce 100755 --- 
a/scripts/migrate-tenant.sh +++ b/scripts/migrate-tenant.sh @@ -30,7 +30,7 @@ parse_args() { CONFIG_FILE="${TF_DIR}/config.yml" rendered=$(terragrunt render --format json --working-dir "$TF_DIR") - arc_cluster_name=$(echo "$rendered" | jq -r '.inputs.arc_cluster_name') + arc_cluster_name=$(echo "$rendered" | jq -r '.inputs.arc_deployment_specs.cluster_name') aws_profile=$(echo "$rendered" | jq -r '.inputs.aws_profile') aws_region=$(echo "$rendered" | jq -r '.inputs.aws_region') @@ -40,7 +40,7 @@ parse_args() { --alias "${arc_cluster_name}-${aws_profile}-${aws_region}" \ --profile "$aws_profile" - FROM_CTX="$arc_cluster_name" + FROM_CTX="${arc_cluster_name}-${aws_profile}-${aws_region}" } detect_clusters() { @@ -115,6 +115,7 @@ main() { echo "🚀 Enabling ARC for tenant on new cluster '$TO'" update_config false "$TO" terragrunt_apply 'module.arc_runners' + terragrunt_apply 'module.forge_trust_validator' echo "✅ Migration complete. Tenant '$TENANT' is now on '$TO'" } diff --git a/scripts/update-github-app-secrets.sh b/scripts/update-github-app-secrets.sh index ab5dfee8..a00c8837 100755 --- a/scripts/update-github-app-secrets.sh +++ b/scripts/update-github-app-secrets.sh @@ -2,9 +2,9 @@ set -euo pipefail usage() { - echo "Usage: $0 " - echo " : key | id | installation_id | name | client_id" - echo " : For 'key' type, path to PEM file to base64 encode. For others, string value." 
+ echo "Usage: $0 " + echo " : Path to the Terragrunt directory for the tenant/stack" + echo " : Path to the GitHub App PEM file to base64 encode and store in SSM" exit 1 } @@ -24,12 +24,11 @@ validate_pem_file() { } } -get_secret_name() { +get_ssm_name() { local terragrunt_dir="$1" - local type="$2" - tenant_name=$(get_terragrunt_var "var.tenant.name" "$terragrunt_dir") - secret_suffix=$(get_terragrunt_var "var.deployment_config.secret_suffix" "$terragrunt_dir") - echo "/cicd/common/${tenant_name}/${secret_suffix}/github_actions_runners_app_${type}" + local deployment_prefix + deployment_prefix=$(get_terragrunt_var "var.deployment_config.deployment_prefix" "$terragrunt_dir") + echo "/forge/${deployment_prefix}/github_app_key" } get_terragrunt_var() { @@ -54,43 +53,33 @@ encode_pem() { tr -d '\n' <"$pem_file" | base64 } -update_secret() { - local secret_name="$1" - local secret_value="$2" - aws secretsmanager put-secret-value --secret-id "$secret_name" --secret-string "$secret_value" +update_ssm_param() { + local param_name="$1" + local param_value="$2" + aws ssm put-parameter \ + --name "$param_name" \ + --type "SecureString" \ + --value "$param_value" \ + --overwrite } main() { - if [[ $# -ne 3 ]]; then + if [[ $# -ne 2 ]]; then usage fi TERRAGRUNT_DIR="$1" - TYPE="$2" - VALUE="$3" - - case "$TYPE" in - key | id | installation_id | name | client_id) ;; - *) - echo "Error: Invalid type '$TYPE'. 
Allowed: key, id, installation_id, name" - exit 1 - ;; - esac + PEM_FILE="$2" validate_terragrunt_dir "$TERRAGRUNT_DIR" + validate_pem_file "$PEM_FILE" - if [[ "$TYPE" == "key" ]]; then - validate_pem_file "$VALUE" - SECRET_VALUE=$(encode_pem "$VALUE") - else - SECRET_VALUE="$VALUE" - fi - - SECRET_NAME=$(get_secret_name "$TERRAGRUNT_DIR" "$TYPE") + SSM_NAME=$(get_ssm_name "$TERRAGRUNT_DIR") + ENCODED_KEY=$(encode_pem "$PEM_FILE") - update_secret "$SECRET_NAME" "$SECRET_VALUE" + update_ssm_param "$SSM_NAME" "$ENCODED_KEY" - echo "✅ Updated secret '$SECRET_NAME'" + echo "✅ Updated SSM parameter '$SSM_NAME' with base64-encoded GitHub App key" } main "$@"