Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
with:
context: trinity-${{ github.run_id }}
push: true
file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile.uv
file: trinity-${{ github.run_id }}/docker/Dockerfile
shm-size: 128g
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/docker/env
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
TRINITY_NODE1_GPU_0=4
TRINITY_NODE1_GPU_1=5
TRINITY_NODE2_GPU_0=6
TRINITY_NODE2_GPU_1=7
TRINITY_DOCKER_IMAGE=trinity-rft-unittest:20260523
TRINITY_MOUNT_DIR=/mnt1/checkpoints
TRINITY_HF_ENDPOINT=https://hf-mirror.com
TRINITY_PYPI_INDEX_URL=http://mirrors.cloud.aliyuncs.com/pypi/simple/
TRINITY_RAY_DASHBOARD_PORT=8275
59 changes: 39 additions & 20 deletions .github/workflows/unittest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,28 @@ jobs:
path: trinity-${{ github.run_id }}
ref: refs/pull/${{ github.event.issue.number }}/head

- name: Prepare docker env
working-directory: trinity-${{ github.run_id }}
run: |
DASHBOARD_PORT=$((10000 + (GITHUB_RUN_ID % 50000)))
cp .github/workflows/docker/env docker/env
sed -i "s/^TRINITY_RAY_DASHBOARD_PORT=.*/TRINITY_RAY_DASHBOARD_PORT=${DASHBOARD_PORT}/" docker/env

- name: Setup docker compose
working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
working-directory: trinity-${{ github.run_id }}
run: |
docker compose up -d
bash docker/start.sh
sleep 15s

- name: Check ray status
working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
working-directory: trinity-${{ github.run_id }}
run: |
MAX_RETRIES=20
RETRY_INTERVAL=5
for i in $(seq 1 $MAX_RETRIES); do
if docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" \
&& docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then
bash docker/status.sh | tee ray-status.txt
if grep -Fq "[trinity-node-1] Container is running and Ray is healthy." ray-status.txt \
&& grep -Fq "[trinity-node-2] Container is running and Ray is healthy." ray-status.txt; then
break
fi
echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)"
Expand Down Expand Up @@ -74,50 +82,59 @@ jobs:
run: |
git fetch origin main
git diff --name-only origin/main...HEAD > changed_files.txt
awk -F/ '/^(trinity)\// {print $2}' changed_files.txt | sort | uniq > changed_modules.txt
awk '{print "tests/"$1}' changed_modules.txt > test_dirs.txt
awk -F/ '/^(trinity|tests)\// {print $2}' changed_files.txt | sort | uniq > changed_modules.txt
: > test_dirs.txt
while read -r module; do
if [[ -n "$module" && -d "tests/$module" ]]; then
echo "tests/$module" >> test_dirs.txt
fi
done < changed_modules.txt

- name: Run unittest
working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
working-directory: trinity-${{ github.run_id }}
run: |
source docker/common.sh
init_docker_compose
TYPE="${{ steps.test_type.outputs.type }}"
if [ "$TYPE" = "all" ]; then
echo "tests_run=true" >> $GITHUB_ENV
docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json"
"${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json"
elif [ "$TYPE" = "diff" ]; then
if [ -s ../../../test_dirs.txt ]; then
if [ -s test_dirs.txt ]; then
echo "tests_run=true" >> $GITHUB_ENV
TEST_DIRS=$(cat ../../../test_dirs.txt | xargs)
docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json"
TEST_DIRS=$(cat test_dirs.txt | xargs)
"${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json"
else
echo "No changed modules detected, skipping tests."
echo "tests_run=false" >> $GITHUB_ENV
fi
elif [ "$TYPE" = "module" ]; then
MODULE="${{ steps.test_type.outputs.module }}"
if [ -n "$MODULE" ]; then
if [ -n "$MODULE" ] && [ -d "tests/$MODULE" ]; then
echo "tests_run=true" >> $GITHUB_ENV
docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json"
"${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json"
else
echo "No module specified, skipping tests."
echo "No valid module specified, skipping tests."
echo "tests_run=false" >> $GITHUB_ENV
fi
elif [ "$TYPE" = "pattern" ]; then
PATTERN="${{ steps.test_type.outputs.pattern }}"
if [ -n "$PATTERN" ]; then
echo "tests_run=true" >> $GITHUB_ENV
docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s -k '$PATTERN' --ctrf report.json"
"${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s -k '$PATTERN' --ctrf report.json"
else
echo "No pattern specified, skipping tests."
echo "tests_run=false" >> $GITHUB_ENV
fi
fi

- name: Clean checkpoint dir
working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
working-directory: trinity-${{ github.run_id }}
if: always()
run: |
docker compose exec trinity-node-1 rm -rf /mnt/checkpoints/*
source docker/common.sh
init_docker_compose
"${COMPOSE_CMD[@]}" exec trinity-node-1 rm -rf /mnt/checkpoints/*
continue-on-error: true

- name: Upload test results
Expand All @@ -134,17 +151,19 @@ jobs:
with:
report-path: trinity-${{ github.run_id }}/report.json
summary: true
summary-report: true
collapse-large-reports: true
pull-request: false
issue: ${{ github.event.issue.number }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
continue-on-error: true

- name: Remove docker compose
working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
working-directory: trinity-${{ github.run_id }}
if: always()
run: |
docker compose down --remove-orphans
bash docker/stop.sh
continue-on-error: true

- name: Cleanup workspace
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ ipython_config.py

# Environments
.env
docker/env
.venv
env/
venv/
Expand Down
2 changes: 1 addition & 1 deletion scripts/docker/Dockerfile.uv → docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Build and run the docker image with the following command:
#
# cd <Trinity-RFT root dir>
# docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest .
# docker build -f docker/Dockerfile -t trinity-rft:latest .
# docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft:latest
#
# Note:
Expand Down
97 changes: 97 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Docker Test Environment

This document focuses on two things:

1. how to prepare the Docker-based test environment
2. how to run Trinity unit tests inside that environment

## Hardware Requirements

- `trinity-node-1` requires at least 2 GPUs.
- `trinity-node-2` requires at least 2 GPUs.
- Each container requires `64G` shared memory.

Default GPU assignment:

- `trinity-node-1`: GPU `0` and GPU `1`
- `trinity-node-2`: GPU `2` and GPU `3`

If the current machine uses different GPU indices, override them in `docker/env`.

## Test Environment Setup

Before running any Docker test command:

1. Copy `docker/env.example` to `docker/env`.
2. Update `docker/env` for the current machine.
3. Make sure `TRINITY_MOUNT_DIR` points to a host directory that contains the required models, datasets, and checkpoints.

The helper scripts do not read `env.example` automatically. If `docker/env` is missing, they will stop and ask you to create it first.

Required settings in `docker/env`:

- `TRINITY_DOCKER_IMAGE`: Docker image used by both containers.
- `TRINITY_MOUNT_DIR`: Host directory mounted to `/mnt` inside the containers.
- `TRINITY_NODE1_GPU_0`, `TRINITY_NODE1_GPU_1`: GPU indices for `trinity-node-1`.
- `TRINITY_NODE2_GPU_0`, `TRINITY_NODE2_GPU_1`: GPU indices for `trinity-node-2`.
- `TRINITY_HF_ENDPOINT`: Hugging Face mirror or endpoint.
- `TRINITY_PYPI_INDEX_URL`: Python package index used inside containers.
- `TRINITY_RAY_DASHBOARD_PORT`: Host port mapped to the Ray dashboard.

## Start And Check The Environment

Start the Docker test environment:

```bash
bash docker/start.sh
```

Check whether both containers are up and whether Ray is healthy:

```bash
bash docker/status.sh
```

Expected interpretation:

- If a container does not exist, run `bash docker/start.sh` first.
- If a container exists but is stopped, start the environment again before running tests.
- If a container is running but Ray is unhealthy, resolve the container startup problem before running tests.

## Run Tests

Use `bash docker/run.sh` to execute pytest inside `trinity-node-1`.

Run one narrow test module:

```bash
bash docker/run.sh --module common
```

Run a filtered subset when a smaller slice is known:

```bash
bash docker/run.sh --module common --keyword test_config
```

Rules for test execution:

- Always prefer the smallest viable `--module`.
- Add `--keyword` whenever you know the failing test name, keyword, or a smaller slice.
- Do not widen the test scope unless the narrower check is insufficient.
- `run.sh` always executes tests inside `trinity-node-1`.

## Stop The Environment

Stop the Docker test environment after use:

```bash
bash docker/stop.sh
```

## Script Roles

- [start.sh](start.sh): starts the test environment.
- [status.sh](status.sh): checks container state and Ray health.
- [run.sh](run.sh): runs pytest in the Docker test environment.
- [stop.sh](stop.sh): shuts the test environment down cleanly.
64 changes: 64 additions & 0 deletions docker/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
#
# Shared settings for the Docker helper scripts. This part of the file only
# defines path variables and an empty COMPOSE_CMD array; nothing is executed
# against Docker here.

# Absolute path of the directory that contains this file.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"

# Files that live next to this script.
COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.yaml"
ENV_FILE="${SCRIPT_DIR}/env"
ENV_EXAMPLE_FILE="${SCRIPT_DIR}/env.example"

# Starts out empty; init_docker_compose assigns the actual compose command.
COMPOSE_CMD=()

#######################################
# Print an error message to stderr and signal failure to the caller.
# Arguments: $1 - message text (an absent argument prints an empty line)
# Outputs:   the message followed by a newline, on stderr
# Returns:   always 1
#######################################
docker_fail() {
  # printf instead of echo: echo would interpret a message such as "-n" or
  # "-e ..." as an option and drop or mangle it.
  printf '%s\n' "${1-}" >&2
  return 1
}

#######################################
# Source the machine-specific settings file and export everything it defines.
# Globals:   ENV_FILE (read), ENV_EXAMPLE_FILE (read), SCRIPT_DIR (read)
# Outputs:   a diagnostic on stderr when the file is missing or fails to load
# Returns:   0 when ENV_FILE was sourced successfully, 1 otherwise
#######################################
load_docker_env() {
  if [[ ! -f "$ENV_FILE" ]]; then
    if [[ -f "$ENV_EXAMPLE_FILE" ]]; then
      docker_fail "docker/env was not found. Copy docker/env.example to docker/env and adjust the machine-specific settings first."
    else
      docker_fail "docker/env was not found, and docker/env.example is also missing in $SCRIPT_DIR."
    fi
    return 1
  fi

  # allexport makes every assignment in the file an exported variable.
  set -a
  # shellcheck disable=SC1090
  if source "$ENV_FILE"; then
    set +a
    return 0
  fi

  # Turn allexport off on the failure path too, then report the failure
  # instead of letting the status of 'set +a' mask it.
  set +a
  docker_fail "Failed to load $ENV_FILE. Check that it is readable and contains valid shell assignments."
  return 1
}

#######################################
# Validate the Docker prerequisites and prepare the compose command.
# Checks, in order: the docker executable, the compose file, the env file
# (via load_docker_env), a working 'docker compose', and the required
# settings being non-empty.
# Globals:   COMPOSE_FILE (read), SCRIPT_DIR (read), COMPOSE_CMD (written),
#            TRINITY_* settings (read after load_docker_env)
# Outputs:   a diagnostic on stderr for the first failed check
# Returns:   0 when every check passes, 1 otherwise
#######################################
init_docker_compose() {
  # Keep the loop variable out of the shell that sourced this file.
  local required_var

  if ! command -v docker >/dev/null 2>&1; then
    docker_fail "Docker is not installed or not available in PATH."
    return 1
  fi

  if [[ ! -f "$COMPOSE_FILE" ]]; then
    docker_fail "docker-compose.yaml was not found in $SCRIPT_DIR."
    return 1
  fi

  load_docker_env || return 1

  # Stored as an array so callers can run "${COMPOSE_CMD[@]}" <subcommand>
  # without word-splitting the compose file path.
  COMPOSE_CMD=(docker compose -f "$COMPOSE_FILE")
  if ! "${COMPOSE_CMD[@]}" version >/dev/null 2>&1; then
    docker_fail "Docker Compose is not available. Make sure 'docker compose' works on this machine."
    return 1
  fi

  for required_var in \
    TRINITY_DOCKER_IMAGE \
    TRINITY_PYPI_INDEX_URL \
    TRINITY_HF_ENDPOINT \
    TRINITY_MOUNT_DIR \
    TRINITY_RAY_DASHBOARD_PORT \
    TRINITY_NODE1_GPU_0 \
    TRINITY_NODE1_GPU_1 \
    TRINITY_NODE2_GPU_0 \
    TRINITY_NODE2_GPU_1; do
    # Indirect expansion: look up the variable whose name is in required_var.
    if [[ -z "${!required_var:-}" ]]; then
      docker_fail "Required Docker setting '$required_var' is empty. Check docker/env."
      return 1
    fi
  done
}
Loading
Loading