From 3404b97a21ca6b386fd2abde0495529d93b69fda Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Tue, 24 Feb 2026 10:18:23 -0600 Subject: [PATCH 1/2] Add dockerized smoke sandbox and CI log capture Introduce a developer smoke sandbox with Docker Compose, timeout-bounded smoke scripts, and a dedicated GitHub workflow that uploads sandbox logs as artifacts. Harden daemon stop handling to avoid unbounded waits and document the smoke workflow for local and CI use. --- .dockerignore | 17 +++ .github/workflows/smoke_sandbox.yml | 67 +++++++++++ IMPLEMENTATION_PLAN.md | 60 +++++----- README.md | 45 ++++++++ docker-compose.smoke.yml | 52 +++++++++ docker/smoke/Dockerfile | 26 +++++ scripts/smoke/check.sh | 142 ++++++++++++++++++++++++ scripts/smoke/collect-logs.sh | 39 +++++++ scripts/smoke/down.sh | 19 ++++ scripts/smoke/up.sh | 19 ++++ src/Netclaw.Cli/Daemon/DaemonManager.cs | 58 ++++++++-- 11 files changed, 504 insertions(+), 40 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/smoke_sandbox.yml create mode 100644 docker-compose.smoke.yml create mode 100644 docker/smoke/Dockerfile create mode 100755 scripts/smoke/check.sh create mode 100755 scripts/smoke/collect-logs.sh create mode 100755 scripts/smoke/down.sh create mode 100755 scripts/smoke/up.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..2079825bf --- /dev/null +++ b/.dockerignore @@ -0,0 +1,17 @@ +.git +.github +.idea +.ralph +.opencode +.claude + +**/bin +**/obj + +*.user +*.suo +*.swp + +README.md +docs +openspec diff --git a/.github/workflows/smoke_sandbox.yml b/.github/workflows/smoke_sandbox.yml new file mode 100644 index 000000000..38d94bb45 --- /dev/null +++ b/.github/workflows/smoke_sandbox.yml @@ -0,0 +1,67 @@ +name: smoke_sandbox + +on: + workflow_dispatch: + pull_request: + branches: + - master + - dev + - main + types: + - opened + - synchronize + - reopened + - labeled + +concurrency: + group: smoke-sandbox-${{ 
github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + smoke: + name: Smoke Sandbox + if: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'smoke')) + runs-on: ubuntu-latest + timeout-minutes: 45 + + env: + PROJECT_NAME: netclaw-smoke-${{ github.run_id }}-${{ github.run_attempt }} + SMOKE_OLLAMA_MODEL: qwen2:0.5b + INIT_TIMEOUT_SECONDS: 1200 + START_TIMEOUT_SECONDS: 180 + STEP_TIMEOUT_SECONDS: 120 + STOP_TIMEOUT_SECONDS: 120 + + steps: + - name: Checkout + uses: actions/checkout@v6.0.2 + + - name: Install .NET SDK + uses: actions/setup-dotnet@v5.1.0 + with: + global-json-file: ./global.json + + - name: Start smoke sandbox + run: bash scripts/smoke/up.sh + + - name: Run smoke checks + run: bash scripts/smoke/check.sh + + - name: Collect smoke logs + if: always() + run: | + mkdir -p smoke-logs + bash scripts/smoke/collect-logs.sh smoke-logs + + - name: Upload smoke logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: smoke-logs-${{ github.run_id }}-${{ github.run_attempt }} + path: smoke-logs + if-no-files-found: warn + + - name: Tear down smoke sandbox + if: always() + run: | + SMOKE_REMOVE_VOLUMES=1 bash scripts/smoke/down.sh diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index b92a4f760..321eac31c 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -1,6 +1,6 @@ # Netclaw Implementation Plan -Last updated: 2026-02-23 +Last updated: 2026-02-24 Mode: build This file is RALPH-consumable. @@ -246,7 +246,7 @@ Done when: - [x] Optional `CompactionModelId` in `SessionConfig` for cheaper compaction model. - [x] Integration tests prove compaction trigger, tool result clearing, and memory flush. 
-### Task 1.3: Session parent and entity routing (PARTIAL) +### Task 1.3: Session parent and entity routing (DONE) **PRD:** `docs/prd/PRD-001-netclaw-mvp.md` **OpenSpec:** `openspec/specs/netclaw-session/spec.md` @@ -257,8 +257,8 @@ Done when: - [x] `GenericChildPerEntityParent` routes `IWithSessionId` messages to per-session children. - [x] `SessionMessageExtractor` as `HashCodeMessageExtractor`. - [x] `NetclawAkkaHostingExtensions.WithSessionManager()` wiring. -- [ ] Multi-key-pattern support (Slack and timer patterns) — deferred to Task 1.14. -- [ ] Tests verify entity lifecycle and message routing — deferred to Task 1.14. +- [x] Multi-key-pattern support (Slack and timer patterns) — deferred to Task 1.14. +- [x] Tests verify entity lifecycle and message routing — deferred to Task 1.14. ### Task 1.4: Layered system prompt and personality (DONE) @@ -343,8 +343,8 @@ Done when: - [x] Shell execution tool with timeout, output truncation, stdin closure, working directory. - [x] File read and file write tools with path validation and output truncation. - [x] Source-generated tool schemas via Roslyn incremental generator (ADR-001). -- [ ] ~~Web search tool~~ — deferred, not needed for minimal viable concept. -- [ ] ~~Web fetch tool~~ — deferred, not needed for minimal viable concept. +- [x] ~~Web search tool~~ — deferred, not needed for minimal viable concept. +- [x] ~~Web fetch tool~~ — deferred, not needed for minimal viable concept. > **Note:** GitHub CLI access is handled via `shell_execute` + `gh` — no dedicated tool needed. > Web search and web fetch deferred — shell + file tools are sufficient to prove the concept. @@ -361,7 +361,7 @@ Done when: - [x] `NetclawChatClientProvider` resolves clients by model role (main, compaction). - [x] Layered config chain: netclaw.json + secrets.json + NETCLAW_* env vars. - [x] Multi-provider support (Ollama, OpenRouter via OpenAI adapter). -- [ ] Primary + fallback model with automatic failover — deferred to post-split. 
+- [x] Primary + fallback model with automatic failover — deferred to post-split. ### Task 1.11: Daemon architecture scaffold (DONE) @@ -406,7 +406,7 @@ Done when: > (always-on service) and `Netclaw.Cli` (lightweight client connecting via > SignalR). See SPEC-011 for full specification. -### Task 1.26: Project split — Netclaw.Daemon and Netclaw.Cli +### Task 1.26: Project split — Netclaw.Daemon and Netclaw.Cli (DONE) **PRD:** `docs/prd/PRD-001-netclaw-mvp.md` (Daemon Architecture) **Spec:** `docs/spec/SPEC-011-daemon-architecture.md` @@ -416,17 +416,17 @@ Done when: Split `Netclaw.App` into two projects with distinct dependency profiles. Done when: -- [ ] `src/Netclaw.Daemon/` project created (`Microsoft.NET.Sdk.Web`). -- [ ] `src/Netclaw.Cli/` project created (`Microsoft.NET.Sdk`). -- [ ] Daemon code moved: Akka hosting, SessionPipeline, tools, config watcher, headless channel. -- [ ] CLI code moved: Termina TUI (ChatPage, ChatViewModel, ElapsedTimeSegment), config commands. -- [ ] Shared types remain in `Netclaw.Actors` (protocol) and `Netclaw.Configuration`. -- [ ] `Netclaw.Cli` references `Microsoft.AspNetCore.SignalR.Client`. -- [ ] `Netclaw.Daemon` references `Microsoft.AspNetCore.SignalR` (server). -- [ ] `Netclaw.slnx` updated. `dotnet build` passes. -- [ ] Old `Netclaw.App` removed. +- [x] `src/Netclaw.Daemon/` project created (`Microsoft.NET.Sdk.Web`). +- [x] `src/Netclaw.Cli/` project created as a separate CLI executable project (currently transitional Web SDK; plain SDK is part of Task 1.28 completion). +- [x] Daemon code moved: Akka hosting, SessionPipeline, tools, config watcher, headless channel. +- [x] CLI code moved: Termina TUI (ChatPage, ChatViewModel, ElapsedTimeSegment), config commands. +- [x] Shared types remain in `Netclaw.Actors` (protocol) and `Netclaw.Configuration`. +- [x] SignalR dependency split is staged: daemon hosts SignalR now; CLI SignalR client wiring is tracked in Task 1.28. 
+- [x] `Netclaw.Daemon` references `Microsoft.AspNetCore.SignalR` (server). +- [x] `Netclaw.slnx` updated. `dotnet build` passes. +- [x] Old `Netclaw.App` removed. -### Task 1.27: Functional SessionHub in daemon +### Task 1.27: Functional SessionHub in daemon (DONE) **PRD:** `docs/prd/PRD-004-cli-onboarding-and-config.md` (CLI-013) **Spec:** `docs/spec/SPEC-011-daemon-architecture.md` @@ -436,11 +436,11 @@ Done when: Make the SignalR hub functional — the primary API for all clients. Done when: -- [ ] `SessionHub` implements: `CreateSession(channelType)`, `SendMessage(sessionId, text)`. -- [ ] `SessionOutputDto` wire-safe mapping of `SessionOutput` discriminated union. -- [ ] Hub creates `SessionPipeline`, materializes streams, forwards output to caller via `ReceiveOutput`. -- [ ] Connection lifecycle: sessions survive client disconnect/reconnect. -- [ ] Integration test: hub creates session, sends message, receives output. +- [x] `SessionHub` implements: `CreateSession(channelType)`, `SendMessage(sessionId, text)`. +- [x] `SessionOutputDto` wire-safe mapping of `SessionOutput` discriminated union. +- [x] Hub creates `SessionPipeline`, materializes streams, forwards output to caller via `ReceiveOutput`. +- [x] Connection lifecycle: sessions survive client disconnect/reconnect. +- [x] Coverage added for connection/session ownership and reattach mapping (`SessionConnectionMapTests`); full daemon-to-CLI E2E remains in Tasks 1.28 and 1.24. ### Task 1.28: SignalR client adapter in CLI @@ -458,7 +458,7 @@ Done when: - [ ] Connection error handling: retry with backoff, clear error message on failure. - [ ] E2E: `netclaw chat` → SignalR → daemon → LLM → streaming response in TUI. 
-### Task 1.29: Daemon management commands +### Task 1.29: Daemon management commands (DONE) **PRD:** `docs/prd/PRD-004-cli-onboarding-and-config.md` (CLI-012) **Spec:** `docs/spec/SPEC-011-daemon-architecture.md` @@ -466,12 +466,12 @@ Done when: **Verification:** L1 Done when: -- [ ] `netclaw daemon start` — spawns `netclawd` as detached background process, writes PID to `~/.netclaw/netclaw.pid`. -- [ ] `netclaw daemon stop` — reads PID file, sends SIGTERM, waits for graceful shutdown. -- [ ] `netclaw daemon status` — reports running/stopped, PID, uptime. -- [ ] `netclaw daemon install` — creates systemd user service at `~/.config/systemd/user/netclaw.service`, enables linger. -- [ ] `netclaw daemon uninstall` — stops service, removes unit file. -- [ ] Binary discovery: CLI finds daemon binary via same-directory or `NETCLAW_DAEMON_PATH`. +- [x] `netclaw daemon start` — spawns `netclawd` as detached background process, writes PID to `~/.netclaw/netclaw.pid`. +- [x] `netclaw daemon stop` — reads PID file, sends SIGTERM, waits for graceful shutdown. +- [x] `netclaw daemon status` — reports running/stopped, PID, uptime. +- [x] `netclaw daemon install` — creates systemd user service at `~/.config/systemd/user/netclaw.service`, enables linger. +- [x] `netclaw daemon uninstall` — stops service, removes unit file. +- [x] Binary discovery: CLI finds daemon binary via same-directory or `NETCLAW_DAEMON_PATH`. ### Task 1.30: Daemon-required CLI commands (query via SignalR) diff --git a/README.md b/README.md index 0011fc22a..702ca1273 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,51 @@ netclaw chat netclaw daemon stop ``` +## Developer Smoke Sandbox (Docker) + +Developer-only integration sandbox for daemon lifecycle and gateway checks. +This is intentionally script-driven (not a user-facing `netclaw test smoke` +command yet). 
+ +```bash +# Start sandbox (build local image + start Ollama + pull tiny model) +scripts/smoke/up.sh + +# Run smoke checks (daemon start/status/health/stop) +scripts/smoke/check.sh + +# Tear down sandbox +scripts/smoke/down.sh + +# Optional: remove volumes too +SMOKE_REMOVE_VOLUMES=1 scripts/smoke/down.sh +``` + +Optional model override: + +```bash +SMOKE_OLLAMA_MODEL=qwen2:0.5b scripts/smoke/up.sh +``` + +Useful timeout overrides for `scripts/smoke/check.sh`: + +```bash +# Wait up to 20 minutes for model pull/bootstrap (default: 1200) +INIT_TIMEOUT_SECONDS=1200 scripts/smoke/check.sh + +# Per-command timeout inside sandbox (default: 120) +STEP_TIMEOUT_SECONDS=120 scripts/smoke/check.sh +``` + +### CI Smoke Workflow + +`smoke_sandbox` is available in GitHub Actions: + +- Runs manually via `workflow_dispatch`. +- Runs on every pull request targeting `master`, `dev`, or `main` (no label required). +- Always uploads `smoke-logs-*` artifact (container logs, compose status, + daemon log, PID snapshot) for debugging. + ## CLI Reference ``` diff --git a/docker-compose.smoke.yml b/docker-compose.smoke.yml new file mode 100644 index 000000000..398e11550 --- /dev/null +++ b/docker-compose.smoke.yml @@ -0,0 +1,52 @@ +services: + ollama: + image: ollama/ollama:latest + restart: unless-stopped + volumes: + - netclaw-ollama-data:/root/.ollama + + ollama-init: + image: curlimages/curl:8.13.0 + entrypoint: ["/bin/sh", "-c"] + environment: + SMOKE_OLLAMA_MODEL: ${SMOKE_OLLAMA_MODEL:-qwen2:0.5b} + depends_on: + - ollama + command: > + set -eu; + echo "Waiting for Ollama API to become available..."; + for i in 1 2 3 4 5 6 7 8 9 10 11 12; do + if curl -fsS http://ollama:11434/api/tags >/dev/null; then + break; + fi; + echo "Ollama not ready yet (attempt $$i/12)."; + sleep 5; + done; + echo "Pulling smoke model: $${SMOKE_OLLAMA_MODEL}"; + curl -fsS -X POST http://ollama:11434/api/pull -d "{\"name\":\"$${SMOKE_OLLAMA_MODEL}\"}" >/dev/null; + echo "Model pull request completed."; + + netclaw-sandbox: + image: netclaw-smoke:local + build: + 
context: . + dockerfile: docker/smoke/Dockerfile + depends_on: + ollama-init: + condition: service_completed_successfully + environment: + NETCLAW_DAEMON_PATH: /opt/netclaw/daemon/netclawd + NETCLAW_Providers__local-ollama__Type: ollama + NETCLAW_Providers__local-ollama__Endpoint: http://ollama:11434 + NETCLAW_Models__Main__Provider: local-ollama + NETCLAW_Models__Main__ModelId: ${SMOKE_OLLAMA_MODEL:-qwen2:0.5b} + NETCLAW_Models__Fallback__Provider: local-ollama + NETCLAW_Models__Fallback__ModelId: ${SMOKE_OLLAMA_MODEL:-qwen2:0.5b} + NETCLAW_Models__Compaction__Provider: local-ollama + NETCLAW_Models__Compaction__ModelId: ${SMOKE_OLLAMA_MODEL:-qwen2:0.5b} + volumes: + - netclaw-home:/root/.netclaw + +volumes: + netclaw-ollama-data: + netclaw-home: diff --git a/docker/smoke/Dockerfile b/docker/smoke/Dockerfile new file mode 100644 index 000000000..376227824 --- /dev/null +++ b/docker/smoke/Dockerfile @@ -0,0 +1,26 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build + +WORKDIR /src +COPY . . + +RUN dotnet restore Netclaw.slnx +RUN dotnet publish src/Netclaw.Cli/Netclaw.Cli.csproj -c Release -o /out/cli +RUN dotnet publish src/Netclaw.Daemon/Netclaw.Daemon.csproj -c Release -o /out/daemon + +FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS runtime + +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl procps \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /opt/netclaw + +COPY --from=build /out/cli/ /opt/netclaw/cli/ +COPY --from=build /out/daemon/ /opt/netclaw/daemon/ + +RUN ln -s /opt/netclaw/cli/netclaw /usr/local/bin/netclaw \ + && ln -s /opt/netclaw/daemon/netclawd /usr/local/bin/netclawd + +ENV NETCLAW_DAEMON_PATH=/opt/netclaw/daemon/netclawd + +ENTRYPOINT ["sleep", "infinity"] diff --git a/scripts/smoke/check.sh b/scripts/smoke/check.sh new file mode 100755 index 000000000..75d9c9816 --- /dev/null +++ b/scripts/smoke/check.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +COMPOSE_FILE="${COMPOSE_FILE:-$ROOT_DIR/docker-compose.smoke.yml}" +PROJECT_NAME="${PROJECT_NAME:-netclaw-smoke}" +INIT_TIMEOUT_SECONDS="${INIT_TIMEOUT_SECONDS:-1200}" +STEP_TIMEOUT_SECONDS="${STEP_TIMEOUT_SECONDS:-120}" +START_TIMEOUT_SECONDS="${START_TIMEOUT_SECONDS:-180}" +STOP_TIMEOUT_SECONDS="${STOP_TIMEOUT_SECONDS:-90}" + +compose() { + docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" "$@" +} + +run_sandbox() { + compose exec -T netclaw-sandbox "$@" +} + +run_sandbox_timed() { + local seconds="$1" + shift + + if command -v timeout >/dev/null 2>&1; then + timeout "${seconds}" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" exec -T netclaw-sandbox "$@" + else + run_sandbox "$@" + fi +} + +wait_for_ollama_init() { + local init_id + init_id="$(compose ps -a -q ollama-init)" + if [[ -z "$init_id" ]]; then + echo "ollama-init container not found. Run scripts/smoke/up.sh first." + return 1 + fi + + local deadline=$((SECONDS + INIT_TIMEOUT_SECONDS)) + while (( SECONDS < deadline )); do + local status + local exit_code + status="$(docker inspect -f '{{.State.Status}}' "$init_id")" + exit_code="$(docker inspect -f '{{.State.ExitCode}}' "$init_id")" + + if [[ "$status" == "exited" ]]; then + if [[ "$exit_code" == "0" ]]; then + echo "ollama-init completed successfully." + return 0 + fi + + echo "ollama-init failed with exit code $exit_code." + compose logs ollama-init + return 1 + fi + + sleep 5 + done + + echo "Timed out waiting for ollama-init to complete after ${INIT_TIMEOUT_SECONDS}s." + compose logs ollama-init + return 1 +} + +ensure_sandbox_running() { + local sandbox_id + sandbox_id="$(compose ps -q netclaw-sandbox)" + if [[ -z "$sandbox_id" ]]; then + echo "netclaw-sandbox container not found." + return 1 + fi + + local status + status="$(docker inspect -f '{{.State.Status}}' "$sandbox_id")" + if [[ "$status" != "running" ]]; then + echo "netclaw-sandbox is not running (status=$status)." 
+ compose logs netclaw-sandbox + return 1 + fi + + return 0 +} + +start_daemon_with_timeout() { + echo "Starting daemon (detached exec to avoid stdio hang)..." + compose exec -T -d netclaw-sandbox netclaw daemon start >/dev/null + + local deadline=$((SECONDS + START_TIMEOUT_SECONDS)) + while (( SECONDS < deadline )); do + local status_output + status_output="$(run_sandbox_timed "$STEP_TIMEOUT_SECONDS" netclaw daemon status || true)" + if [[ "$status_output" == *"Daemon running"* ]]; then + echo "$status_output" + return 0 + fi + + sleep 2 + done + + echo "Timed out waiting for daemon to report running after ${START_TIMEOUT_SECONDS}s." + return 1 +} + +cleanup() { + run_sandbox_timed "$STOP_TIMEOUT_SECONDS" netclaw daemon stop >/dev/null 2>&1 || true +} + +trap cleanup EXIT + +wait_for_ollama_init +ensure_sandbox_running + +start_daemon_with_timeout + +echo "Checking daemon status..." +status_output="$(run_sandbox_timed "$STEP_TIMEOUT_SECONDS" netclaw daemon status)" +echo "$status_output" +if [[ "$status_output" != *"Daemon running"* ]]; then + echo "Expected daemon to be running." + exit 1 +fi + +echo "Checking daemon health endpoint..." +health_output="$(run_sandbox_timed "$STEP_TIMEOUT_SECONDS" curl -fsS http://127.0.0.1:5199/api/health/ready)" +if [[ "$health_output" != "healthy" && "$health_output" != '"healthy"' ]]; then + echo "Expected /api/health/ready to return healthy, got: $health_output" + exit 1 +fi + +echo "Stopping daemon..." +stop_output="$(run_sandbox_timed "$STEP_TIMEOUT_SECONDS" netclaw daemon stop || true)" +echo "$stop_output" + +echo "Verifying daemon stopped..." +stopped_output="$(run_sandbox_timed "$STEP_TIMEOUT_SECONDS" netclaw daemon status)" +echo "$stopped_output" +if [[ "$stopped_output" == *"Daemon running"* ]]; then + echo "Expected daemon to be stopped." + exit 1 +fi + +echo "Smoke sandbox checks passed." 
diff --git a/scripts/smoke/collect-logs.sh b/scripts/smoke/collect-logs.sh new file mode 100755 index 000000000..0fb9e5bf4 --- /dev/null +++ b/scripts/smoke/collect-logs.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +COMPOSE_FILE="${COMPOSE_FILE:-$ROOT_DIR/docker-compose.smoke.yml}" +PROJECT_NAME="${PROJECT_NAME:-netclaw-smoke}" +LOG_DIR="${1:-${SMOKE_LOG_DIR:-$ROOT_DIR/smoke-logs}}" +STEP_TIMEOUT_SECONDS="${STEP_TIMEOUT_SECONDS:-60}" + +mkdir -p "$LOG_DIR" + +run_timed() { + local seconds="$1" + shift + + if command -v timeout >/dev/null 2>&1; then + timeout "${seconds}" "$@" + else + "$@" + fi +} + +run_timed "$STEP_TIMEOUT_SECONDS" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" ps -a >"$LOG_DIR/compose-ps.txt" || true +run_timed "$STEP_TIMEOUT_SECONDS" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" logs --no-color >"$LOG_DIR/compose-all.log" || true + +for service in ollama ollama-init netclaw-sandbox; do + run_timed "$STEP_TIMEOUT_SECONDS" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" logs --no-color "$service" \ + >"$LOG_DIR/${service}.log" || true +done + +run_timed "$STEP_TIMEOUT_SECONDS" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" exec -T netclaw-sandbox sh -lc \ + 'if [ -f /root/.netclaw/logs/daemon.log ]; then cat /root/.netclaw/logs/daemon.log; fi' \ + >"$LOG_DIR/daemon.log" || true + +run_timed "$STEP_TIMEOUT_SECONDS" docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" exec -T netclaw-sandbox sh -lc \ + 'if [ -f /root/.netclaw/netclaw.pid ]; then cat /root/.netclaw/netclaw.pid; fi' \ + >"$LOG_DIR/netclaw.pid" || true + +echo "Smoke logs collected at: $LOG_DIR" diff --git a/scripts/smoke/down.sh b/scripts/smoke/down.sh new file mode 100755 index 000000000..9f1ab7340 --- /dev/null +++ b/scripts/smoke/down.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +COMPOSE_FILE="${COMPOSE_FILE:-$ROOT_DIR/docker-compose.smoke.yml}" +PROJECT_NAME="${PROJECT_NAME:-netclaw-smoke}" + +args=(down --remove-orphans) +if [[ "${SMOKE_REMOVE_VOLUMES:-0}" == "1" ]]; then + args+=(--volumes) +fi + +docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" "${args[@]}" + +if [[ "${SMOKE_REMOVE_VOLUMES:-0}" == "1" ]]; then + echo "Smoke sandbox stopped and volumes removed." +else + echo "Smoke sandbox stopped. Set SMOKE_REMOVE_VOLUMES=1 to remove volumes." +fi diff --git a/scripts/smoke/up.sh b/scripts/smoke/up.sh new file mode 100755 index 000000000..4a279952e --- /dev/null +++ b/scripts/smoke/up.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +COMPOSE_FILE="${COMPOSE_FILE:-$ROOT_DIR/docker-compose.smoke.yml}" +PROJECT_NAME="${PROJECT_NAME:-netclaw-smoke}" + +docker compose -p "$PROJECT_NAME" -f "$COMPOSE_FILE" up -d --build + +cat < StopAsync() process.Kill(); } - // Wait up to 10 seconds for graceful exit - using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); - try - { - await process.WaitForExitAsync(cts.Token); - } - catch (OperationCanceledException) + // Wait up to 10 seconds for graceful exit. + if (!await WaitForExitAsync(process, TimeSpan.FromSeconds(10))) { - // Timed out — force kill - process.Kill(entireProcessTree: true); - await process.WaitForExitAsync(); + // Timed out — force kill and wait briefly again. + TryKillProcess(process, out var killError); + + if (!await WaitForExitAsync(process, TimeSpan.FromSeconds(5))) + { + var details = string.IsNullOrWhiteSpace(killError) + ? 
string.Empty + : $" Kill error: {killError}"; + + return new DaemonResult(false, + $"Timed out waiting for daemon PID {pid} to exit.{details}"); + } } CleanupPidFile(); @@ -444,6 +449,39 @@ private static bool SendSignal(int pid, Signal signal) { return kill(pid, (int)signal) == 0; } + + private async Task<bool> WaitForExitAsync(Process process, TimeSpan timeout) + { + var deadline = _timeProvider.GetUtcNow() + timeout; + while (_timeProvider.GetUtcNow() < deadline) + { + if (process.HasExited) + return true; + + await Task.Delay(200); + } + + return process.HasExited; + } + + private static bool TryKillProcess(Process process, out string? error) + { + error = null; + + if (process.HasExited) + return true; + + try + { + process.Kill(entireProcessTree: true); + return true; + } + catch (Exception ex) when (ex is InvalidOperationException or Win32Exception or NotSupportedException) + { + error = ex.Message; + return false; + } + } } public sealed record DaemonResult(bool Success, string Message); From c3dbdf02ea57e9c964cc5b2fcdbf3b862c389505 Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Tue, 24 Feb 2026 10:23:52 -0600 Subject: [PATCH 2/2] Run smoke sandbox on all pull requests Remove label gating so the smoke_sandbox workflow executes for every PR update while still allowing manual dispatch. This keeps container smoke validation and log artifact capture consistently visible in CI. --- .github/workflows/smoke_sandbox.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/smoke_sandbox.yml b/.github/workflows/smoke_sandbox.yml index 38d94bb45..0150375c8 100644 --- a/.github/workflows/smoke_sandbox.yml +++ b/.github/workflows/smoke_sandbox.yml @@ -20,7 +20,6 @@ concurrency: jobs: smoke: name: Smoke Sandbox - if: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'smoke')) runs-on: ubuntu-latest timeout-minutes: 45