diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 5c928049f0..248500529b 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -60,7 +60,7 @@ jobs: with: context: trinity-${{ github.run_id }} push: true - file: trinity-${{ github.run_id }}/scripts/docker/Dockerfile.uv + file: trinity-${{ github.run_id }}/docker/Dockerfile shm-size: 128g tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/docker/env b/.github/workflows/docker/env new file mode 100644 index 0000000000..35ea11087f --- /dev/null +++ b/.github/workflows/docker/env @@ -0,0 +1,9 @@ +TRINITY_NODE1_GPU_0=4 +TRINITY_NODE1_GPU_1=5 +TRINITY_NODE2_GPU_0=6 +TRINITY_NODE2_GPU_1=7 +TRINITY_DOCKER_IMAGE=trinity-rft-unittest:20260523 +TRINITY_MOUNT_DIR=/mnt1/checkpoints +TRINITY_HF_ENDPOINT=https://hf-mirror.com +TRINITY_PYPI_INDEX_URL=http://mirrors.cloud.aliyuncs.com/pypi/simple/ +TRINITY_RAY_DASHBOARD_PORT=8275 diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 755627a9a5..491a82f585 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -22,20 +22,28 @@ jobs: path: trinity-${{ github.run_id }} ref: refs/pull/${{ github.event.issue.number }}/head + - name: Prepare docker env + working-directory: trinity-${{ github.run_id }} + run: | + DASHBOARD_PORT=$((10000 + (GITHUB_RUN_ID % 50000))) + cp .github/workflows/docker/env docker/env + sed -i "s/^TRINITY_RAY_DASHBOARD_PORT=.*/TRINITY_RAY_DASHBOARD_PORT=${DASHBOARD_PORT}/" docker/env + - name: Setup docker compose - working-directory: trinity-${{ github.run_id }}/.github/workflows/docker + working-directory: trinity-${{ github.run_id }} run: | - docker compose up -d + bash docker/start.sh sleep 15s - name: Check ray status - working-directory: trinity-${{ github.run_id }}/.github/workflows/docker + working-directory: trinity-${{ github.run_id }} run: | MAX_RETRIES=20 RETRY_INTERVAL=5 for i 
in $(seq 1 $MAX_RETRIES); do - if docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && ray status" \ - && docker compose exec trinity-node-2 bash -c "source /opt/venv/bin/activate && ray status"; then + bash docker/status.sh | tee ray-status.txt + if grep -Fq "[trinity-node-1] Container is running and Ray is healthy." ray-status.txt \ + && grep -Fq "[trinity-node-2] Container is running and Ray is healthy." ray-status.txt; then break fi echo "Waiting for ray cluster to be ready... ($i/$MAX_RETRIES)" @@ -74,39 +82,46 @@ jobs: run: | git fetch origin main git diff --name-only origin/main...HEAD > changed_files.txt - awk -F/ '/^(trinity)\// {print $2}' changed_files.txt | sort | uniq > changed_modules.txt - awk '{print "tests/"$1}' changed_modules.txt > test_dirs.txt + awk -F/ '/^(trinity|tests)\// {print $2}' changed_files.txt | sort | uniq > changed_modules.txt + : > test_dirs.txt + while read -r module; do + if [[ -n "$module" && -d "tests/$module" ]]; then + echo "tests/$module" >> test_dirs.txt + fi + done < changed_modules.txt - name: Run unittest - working-directory: trinity-${{ github.run_id }}/.github/workflows/docker + working-directory: trinity-${{ github.run_id }} run: | + source docker/common.sh + init_docker_compose TYPE="${{ steps.test_type.outputs.type }}" if [ "$TYPE" = "all" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json" + "${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s --ctrf report.json" elif [ "$TYPE" = "diff" ]; then - if [ -s ../../../test_dirs.txt ]; then + if [ -s test_dirs.txt ]; then echo "tests_run=true" >> $GITHUB_ENV - TEST_DIRS=$(cat ../../../test_dirs.txt | xargs) - docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json" + TEST_DIRS=$(cat test_dirs.txt | xargs) + "${COMPOSE_CMD[@]}" 
exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest $TEST_DIRS -v -s --ctrf report.json" else echo "No changed modules detected, skipping tests." echo "tests_run=false" >> $GITHUB_ENV fi elif [ "$TYPE" = "module" ]; then MODULE="${{ steps.test_type.outputs.module }}" - if [ -n "$MODULE" ]; then + if [ -n "$MODULE" ] && [ -d "tests/$MODULE" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json" + "${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests/$MODULE -v -s --ctrf report.json" else - echo "No module specified, skipping tests." + echo "No valid module specified, skipping tests." echo "tests_run=false" >> $GITHUB_ENV fi elif [ "$TYPE" = "pattern" ]; then PATTERN="${{ steps.test_type.outputs.pattern }}" if [ -n "$PATTERN" ]; then echo "tests_run=true" >> $GITHUB_ENV - docker compose exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s -k '$PATTERN' --ctrf report.json" + "${COMPOSE_CMD[@]}" exec trinity-node-1 bash -c "source /opt/venv/bin/activate && pytest tests -v -s -k '$PATTERN' --ctrf report.json" else echo "No pattern specified, skipping tests." 
echo "tests_run=false" >> $GITHUB_ENV @@ -114,10 +129,12 @@ jobs: fi - name: Clean checkpoint dir - working-directory: trinity-${{ github.run_id }}/.github/workflows/docker + working-directory: trinity-${{ github.run_id }} if: always() run: | - docker compose exec trinity-node-1 rm -rf /mnt/checkpoints/* + source docker/common.sh + init_docker_compose + "${COMPOSE_CMD[@]}" exec trinity-node-1 rm -rf /mnt/checkpoints/* continue-on-error: true - name: Upload test results @@ -134,6 +151,8 @@ jobs: with: report-path: trinity-${{ github.run_id }}/report.json summary: true + summary-report: true + collapse-large-reports: true pull-request: false issue: ${{ github.event.issue.number }} env: @@ -141,10 +160,10 @@ jobs: continue-on-error: true - name: Remove docker compose - working-directory: trinity-${{ github.run_id }}/.github/workflows/docker + working-directory: trinity-${{ github.run_id }} if: always() run: | - docker compose down --remove-orphans + bash docker/stop.sh continue-on-error: true - name: Cleanup workspace diff --git a/.gitignore b/.gitignore index 65cb8d9f5f..873fff3eef 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ ipython_config.py # Environments .env +docker/env .venv env/ venv/ diff --git a/scripts/docker/Dockerfile.uv b/docker/Dockerfile similarity index 98% rename from scripts/docker/Dockerfile.uv rename to docker/Dockerfile index 93f571b306..c0b3fc9c8d 100644 --- a/scripts/docker/Dockerfile.uv +++ b/docker/Dockerfile @@ -2,7 +2,7 @@ # Build and run the docker image with the following command: # # cd -# docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest . +# docker build -f docker/Dockerfile -t trinity-rft:latest . 
# docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v :/data trinity-rft:latest # # Note: diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000..1217c25822 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,97 @@ +# Docker Test Environment + +This document focuses on two things: + +1. how to prepare the Docker-based test environment +2. how to run Trinity unit tests inside that environment + +## Hardware Requirements + +- `trinity-node-1` requires at least 2 GPUs. +- `trinity-node-2` requires at least 2 GPUs. +- Each container requires `64G` shared memory. + +Default GPU assignment: + +- `trinity-node-1`: GPU `0` and GPU `1` +- `trinity-node-2`: GPU `2` and GPU `3` + +If the current machine uses different GPU indices, override them in `docker/env`. + +## Test Environment Setup + +Before running any Docker test command: + +1. Copy `docker/env.example` to `docker/env`. +2. Update `docker/env` for the current machine. +3. Make sure `TRINITY_MOUNT_DIR` points to a host directory that contains the required models, datasets, and checkpoints. + +The helper scripts do not read `env.example` automatically. If `docker/env` is missing, they will stop and ask you to create it first. + +Required settings in `docker/env`: + +- `TRINITY_DOCKER_IMAGE`: Docker image used by both containers. +- `TRINITY_MOUNT_DIR`: Host directory mounted to `/mnt` inside the containers. +- `TRINITY_NODE1_GPU_0`, `TRINITY_NODE1_GPU_1`: GPU indices for `trinity-node-1`. +- `TRINITY_NODE2_GPU_0`, `TRINITY_NODE2_GPU_1`: GPU indices for `trinity-node-2`. +- `TRINITY_HF_ENDPOINT`: Hugging Face mirror or endpoint. +- `TRINITY_PYPI_INDEX_URL`: Python package index used inside containers. +- `TRINITY_RAY_DASHBOARD_PORT`: Host port mapped to the Ray dashboard. 
+ +## Start And Check The Environment + +Start the Docker test environment: + +```bash +bash docker/start.sh +``` + +Check whether both containers are up and whether Ray is healthy: + +```bash +bash docker/status.sh +``` + +Expected interpretation: + +- If a container does not exist, run `bash docker/start.sh` first. +- If a container exists but is stopped, start the environment again before running tests. +- If a container is running but Ray is unhealthy, resolve the container startup problem before running tests. + +## Run Tests + +Use `bash docker/run.sh` to execute pytest inside `trinity-node-1`. + +Run one narrow test module: + +```bash +bash docker/run.sh --module common +``` + +Run a filtered subset when a smaller slice is known: + +```bash +bash docker/run.sh --module common --keyword test_config +``` + +Rules for test execution: + +- Always prefer the smallest viable `--module`. +- Add `--keyword` whenever you know the failing test name, keyword, or a smaller slice. +- Do not widen the test scope unless the narrower check is insufficient. +- `run.sh` always executes tests inside `trinity-node-1`. + +## Stop The Environment + +Stop the Docker test environment after use: + +```bash +bash docker/stop.sh +``` + +## Script Roles + +- [start.sh](start.sh): starts the test environment. +- [status.sh](status.sh): checks container state and Ray health. +- [run.sh](run.sh): runs pytest in the Docker test environment. +- [stop.sh](stop.sh): shuts the test environment down cleanly. 
diff --git a/docker/common.sh b/docker/common.sh new file mode 100755 index 0000000000..1dd2b530bc --- /dev/null +++ b/docker/common.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yaml" +ENV_EXAMPLE_FILE="$SCRIPT_DIR/env.example" +ENV_FILE="$SCRIPT_DIR/env" +COMPOSE_CMD=() + +docker_fail() { + echo "$1" >&2 + return 1 +} + +load_docker_env() { + if [[ ! -f "$ENV_FILE" ]]; then + if [[ -f "$ENV_EXAMPLE_FILE" ]]; then + docker_fail "docker/env was not found. Copy docker/env.example to docker/env and adjust the machine-specific settings first." + else + docker_fail "docker/env was not found, and docker/env.example is also missing in $SCRIPT_DIR." + fi + return 1 + fi + + set -a + # shellcheck disable=SC1090 + source "$ENV_FILE" + set +a +} + +init_docker_compose() { + if ! command -v docker >/dev/null 2>&1; then + docker_fail "Docker is not installed or not available in PATH." + return 1 + fi + + if [[ ! -f "$COMPOSE_FILE" ]]; then + docker_fail "docker-compose.yaml was not found in $SCRIPT_DIR." + return 1 + fi + + load_docker_env || return 1 + + COMPOSE_CMD=(docker compose -f "$COMPOSE_FILE") + if ! "${COMPOSE_CMD[@]}" version >/dev/null 2>&1; then + docker_fail "Docker Compose is not available. Make sure 'docker compose' works on this machine." + return 1 + fi + + for required_var in \ + TRINITY_DOCKER_IMAGE \ + TRINITY_PYPI_INDEX_URL \ + TRINITY_HF_ENDPOINT \ + TRINITY_MOUNT_DIR \ + TRINITY_RAY_DASHBOARD_PORT \ + TRINITY_NODE1_GPU_0 \ + TRINITY_NODE1_GPU_1 \ + TRINITY_NODE2_GPU_0 \ + TRINITY_NODE2_GPU_1; do + if [[ -z "${!required_var:-}" ]]; then + docker_fail "Required Docker setting '$required_var' is empty. Check docker/env." 
+ return 1 + fi + done +} diff --git a/.github/workflows/docker/docker-compose.yaml b/docker/docker-compose.yaml similarity index 54% rename from .github/workflows/docker/docker-compose.yaml rename to docker/docker-compose.yaml index 75a1d061e7..b3e032f1ce 100644 --- a/.github/workflows/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -1,74 +1,78 @@ services: trinity-node-1: - image: trinity-rft-unittest:20260523 - cap_add: - - SYS_PTRACE + image: ${TRINITY_DOCKER_IMAGE} pull_policy: never command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block" environment: - - HF_ENDPOINT=https://hf-mirror.com + - HF_ENDPOINT=${TRINITY_HF_ENDPOINT} - HF_HUB_DISABLE_PROGRESS_BARS=1 + - UV_DEFAULT_INDEX=${TRINITY_PYPI_INDEX_URL} - RAY_ADDRESS=auto - TRINITY_CHECKPOINT_ROOT_DIR=/mnt/checkpoints - TRINITY_TASKSET_PATH=/mnt/data - TRINITY_EVAL_TASKSET_PATH=/mnt/data - TRINITY_SFT_DATASET_PATH=/mnt/data - - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - - TRINITY_API_MODEL_PATH=/mnt/models/Qwen3.5-0.8B - - TRINITY_MOE_MODEL_PATH=/mnt/models/Qwen3.5-35B-A3B - - TRINITY_VLM_MODEL_PATH=/mnt/models/Qwen3.5-0.8B - - TRINITY_ALTERNATIVE_VLM_MODEL_PATH=/mnt/models/Qwen3.5-0.8B + - TRINITY_MODEL_PATH=/mnt/Qwen3-0.6B + - TRINITY_API_MODEL_PATH=/mnt/Qwen3.5-0.8B + - TRINITY_MOE_MODEL_PATH=/mnt/Qwen3.5-35B-A3B + - TRINITY_VLM_MODEL_PATH=/mnt/Qwen3.5-0.8B + - TRINITY_ALTERNATIVE_VLM_MODEL_PATH=/mnt/Qwen3.5-0.8B - VIRTUAL_ENV=/opt/venv working_dir: /workspace networks: - trinity-network volumes: - - trinity-volume:/mnt - - ../../..:/workspace + - ..:/workspace + - ${TRINITY_MOUNT_DIR}:/mnt + # Hardware constraint: this container must have at least 2 GPUs and 64G shared memory. 
shm_size: "64G" deploy: resources: reservations: devices: - driver: nvidia - device_ids: ['4', '5'] + device_ids: + - "${TRINITY_NODE1_GPU_0}" + - "${TRINITY_NODE1_GPU_1}" capabilities: [gpu] + ports: + - "${TRINITY_RAY_DASHBOARD_PORT}:8265" # Ray dashboard trinity-node-2: - image: trinity-rft-unittest:20260523 + image: ${TRINITY_DOCKER_IMAGE} cap_add: - SYS_PTRACE pull_policy: never command: bash -c "source /opt/venv/bin/activate && uv pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block" environment: - - HF_ENDPOINT=https://hf-mirror.com + - HF_ENDPOINT=${TRINITY_HF_ENDPOINT} - HF_HUB_DISABLE_PROGRESS_BARS=1 + - UV_DEFAULT_INDEX=${TRINITY_PYPI_INDEX_URL} - TRINITY_CHECKPOINT_ROOT_DIR=/mnt/checkpoints - TRINITY_TASKSET_PATH=/mnt/data - - TRINITY_MODEL_PATH=/mnt/models/Qwen3-0.6B - - TRINITY_MOE_MODEL_PATH=/mnt/models/Qwen3.5-35B-A3B + - TRINITY_MODEL_PATH=/mnt/Qwen3-0.6B + - TRINITY_MOE_MODEL_PATH=/mnt/Qwen3.5-35B-A3B - VIRTUAL_ENV=/opt/venv working_dir: /workspace volumes: - - trinity-volume:/mnt - - ../../..:/workspace + - ..:/workspace + - ${TRINITY_MOUNT_DIR}:/mnt depends_on: - trinity-node-1 networks: - trinity-network + # Hardware constraint: this container must have at least 2 GPUs and 64G shared memory. shm_size: "64G" deploy: resources: reservations: devices: - driver: nvidia - device_ids: ['6', '7'] + device_ids: + - "${TRINITY_NODE2_GPU_0}" + - "${TRINITY_NODE2_GPU_1}" capabilities: [gpu] networks: trinity-network: driver: bridge - -volumes: - trinity-volume: - external: true diff --git a/docker/env.example b/docker/env.example new file mode 100644 index 0000000000..2c6cf26f58 --- /dev/null +++ b/docker/env.example @@ -0,0 +1,28 @@ +# Copy this file to docker/env and adjust the values for the current machine. +# docker/run.sh and docker/status.sh will automatically load docker/env when it exists. +# +# Hardware constraints: +# - trinity-node-1 requires at least 2 GPUs and 64G shared memory. 
+# - trinity-node-2 requires at least 2 GPUs and 64G shared memory. +# - shm_size is fixed at 64G in docker-compose.yaml and should not be reduced. +# +# GPU defaults are [0,1] for trinity-node-1 and [2,3] for trinity-node-2. +# Override these values if the target machine uses different GPU indices. + +TRINITY_NODE1_GPU_0=0 +TRINITY_NODE1_GPU_1=1 +TRINITY_NODE2_GPU_0=2 +TRINITY_NODE2_GPU_1=3 + +# Local Docker image to use for both containers. +TRINITY_DOCKER_IMAGE=ghcr.io/agentscope-ai/trinity-rft:latest + +# Host path that will be mounted to /mnt inside the containers. +TRINITY_MOUNT_DIR=/nas/checkpoints + +# External endpoints that may vary by machine or network environment. +TRINITY_HF_ENDPOINT=https://hf-mirror.com +TRINITY_PYPI_INDEX_URL=http://mirrors.cloud.aliyuncs.com/pypi/simple/ + +# Host port mapped to the Ray dashboard on trinity-node-1. +TRINITY_RAY_DASHBOARD_PORT=8275 diff --git a/docker/run.sh b/docker/run.sh new file mode 100755 index 0000000000..8d64832805 --- /dev/null +++ b/docker/run.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" +DEFAULT_MODULE="common" +SERVICE_NAME="trinity-node-1" + +print_help() { + cat < Test module under tests/. Default: ${DEFAULT_MODULE} + -k, --keyword Pytest -k expression used to filter tests + -h, --help Show this help message and exit + +Examples: + bash docker/run.sh + bash docker/run.sh --module buffer + bash docker/run.sh --module common --keyword test_config +EOF +} + +fail() { + echo "$1" >&2 + exit 1 +} + +require_arg() { + local option="$1" + local value="$2" + + if [[ -z "$value" ]]; then + fail "Missing value for ${option}. Use --help for usage information." 
+ fi +} + +module_name="$DEFAULT_MODULE" +keyword_expr="" + +while [[ $# -gt 0 ]]; do + case "$1" in + -m|--module) + require_arg "$1" "$2" + module_name="$2" + shift 2 + ;; + -k|--keyword) + require_arg "$1" "$2" + keyword_expr="$2" + shift 2 + ;; + -h|--help) + print_help + exit 0 + ;; + --) + shift + break + ;; + -*) + fail "Unknown option: $1. Use --help for usage information." + ;; + *) + fail "Unexpected positional argument: $1. Use --help for usage information." + ;; + esac +done + +if [[ $# -gt 0 ]]; then + fail "Unexpected positional arguments: $*. Use --help for usage information." +fi + +if ! init_docker_compose; then + exit 1 +fi + +if [[ ! -d "$SCRIPT_DIR/../tests/${module_name}" ]]; then + fail "Test module 'tests/${module_name}' does not exist." +fi + +container_id="$("${COMPOSE_CMD[@]}" ps -a -q "$SERVICE_NAME" 2>/dev/null)" +if [[ -z "$container_id" ]]; then + fail "Container '${SERVICE_NAME}' does not exist. Run 'bash docker/start.sh' first." +fi + +running_id="$("${COMPOSE_CMD[@]}" ps -q "$SERVICE_NAME" 2>/dev/null)" +if [[ -z "$running_id" ]]; then + fail "Container '${SERVICE_NAME}' exists but is not running. Start it before running tests." +fi + +pytest_args=(pytest "tests/${module_name}" -v -s) +if [[ -n "$keyword_expr" ]]; then + pytest_args+=(-k "$keyword_expr") +fi + +printf -v pytest_cmd '%q ' "${pytest_args[@]}" +pytest_cmd="source /opt/venv/bin/activate && ${pytest_cmd% }" + +echo "Running tests in ${SERVICE_NAME}: ${pytest_cmd}" +"${COMPOSE_CMD[@]}" exec "$SERVICE_NAME" bash -c "$pytest_cmd" diff --git a/docker/start.sh b/docker/start.sh new file mode 100755 index 0000000000..c5c375e42a --- /dev/null +++ b/docker/start.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +if ! init_docker_compose; then + exit 1 +fi + +echo "Starting Trinity Docker services with 2 GPUs per container and 64G shm." 
+"${COMPOSE_CMD[@]}" up -d + +echo "Docker services started. Run 'bash docker/status.sh' to check Ray status." diff --git a/docker/status.sh b/docker/status.sh new file mode 100755 index 0000000000..abdd072ce5 --- /dev/null +++ b/docker/status.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" +SERVICES=(trinity-node-1 trinity-node-2) + +if ! init_docker_compose; then + exit 1 +fi + +check_service() { + local service="$1" + local container_id + local running_id + local ray_output + + container_id="$("${COMPOSE_CMD[@]}" ps -a -q "$service" 2>/dev/null)" + if [[ -z "$container_id" ]]; then + echo "[$service] Container does not exist. Run 'bash docker/start.sh' first." + return + fi + + running_id="$("${COMPOSE_CMD[@]}" ps -q "$service" 2>/dev/null)" + if [[ -z "$running_id" ]]; then + echo "[$service] Container exists but is not running. Start it before checking Ray status." + return + fi + + if ray_output="$("${COMPOSE_CMD[@]}" exec -T "$service" bash -c 'source /opt/venv/bin/activate && ray status' 2>&1)"; then + echo "[$service] Container is running and Ray is healthy." + echo "$ray_output" + else + echo "[$service] Container is running, but Ray does not appear to be started or healthy." + echo "$ray_output" + fi +} + +for service in "${SERVICES[@]}"; do + check_service "$service" + echo +done diff --git a/docker/stop.sh b/docker/stop.sh new file mode 100755 index 0000000000..398e2eca67 --- /dev/null +++ b/docker/stop.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +if ! init_docker_compose; then + exit 1 +fi + +echo "Stopping Trinity Docker services." +# Keep --remove-orphans: the CI cleanup step previously ran 'docker compose down --remove-orphans' and now delegates to this script. +"${COMPOSE_CMD[@]}" down --remove-orphans + +echo "Docker services stopped." 
diff --git a/docs/sphinx_doc/source/tutorial/example_megatron.md b/docs/sphinx_doc/source/tutorial/example_megatron.md index e852cfd8f3..2f3d780f18 100644 --- a/docs/sphinx_doc/source/tutorial/example_megatron.md +++ b/docs/sphinx_doc/source/tutorial/example_megatron.md @@ -42,7 +42,7 @@ We provide a Docker setup to simplify environment management. Trinity-RFT's provided Docker already has Megatron-LM related dependencies pre-installed. You can either use our provided Docker image directly or customize the Dockerfile to build your own image as needed. ```bash -docker build -f scripts/docker/Dockerfile.uv -t trinity-rft-megatron:latest . +docker build -f docker/Dockerfile -t trinity-rft-megatron:latest . ``` > 💡 You can customize the Dockerfile before building — for example, to add pip mirrors or set API keys. diff --git a/docs/sphinx_doc/source/tutorial/trinity_installation.md b/docs/sphinx_doc/source/tutorial/trinity_installation.md index 42c8ed35e2..11ff36a5d8 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source/tutorial/trinity_installation.md @@ -126,7 +126,7 @@ cd Trinity-RFT # Build the Docker image ## Tip: You can modify the Dockerfile to add mirrors or set API keys -docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest . +docker build -f docker/Dockerfile -t trinity-rft:latest . # Run the container, replacing with your actual path docker run -it \ diff --git a/docs/sphinx_doc/source_zh/tutorial/example_megatron.md b/docs/sphinx_doc/source_zh/tutorial/example_megatron.md index fc671ba5bf..dc3aa1df0c 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_megatron.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_megatron.md @@ -47,7 +47,7 @@ pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ Trinity-RFT 提供的 Docker 已经预装了 Megatron-LM 相关依赖。你可以直接使用我们提供的 Docker 镜像,或者根据需要自定义 Dockerfile 来构建镜像。 ```bash -docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest . 
+docker build -f docker/Dockerfile -t trinity-rft:latest . ``` > 💡 你可以在构建前自定义 Dockerfile —— 例如添加 pip 镜像源或设置 API 密钥。 diff --git a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md index 556cbf187b..42730f003b 100644 --- a/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md +++ b/docs/sphinx_doc/source_zh/tutorial/trinity_installation.md @@ -127,7 +127,7 @@ cd Trinity-RFT # 构建 Docker 镜像 ## 提示:可根据需要修改 Dockerfile 添加镜像源或设置 API 密钥 -docker build -f scripts/docker/Dockerfile.uv -t trinity-rft:latest . +docker build -f docker/Dockerfile -t trinity-rft:latest . # 运行容器,请将 替换为实际需要挂载的路径 docker run -it \