UiPath · Chibionos · Jun 11, 2026 · chatgpt-codex-connector · Jun 11, 2026 · chatgpt-codex-connector
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -3,6 +3,8 @@ test:uipath-langchain:
     - any-glob-to-any-file: ['packages/uipath/src/**/*.py']
   - changed-files:
     - any-glob-to-any-file: ['packages/uipath-platform/src/**/*.py']
+  - changed-files:
+    - any-glob-to-any-file: ['packages/uipath-eval/src/**/*.py']
   - changed-files:
     - any-glob-to-any-file: ['packages/uipath-core/src/**/*.py']
 
@@ -11,6 +13,8 @@ test:uipath-integrations:
     - any-glob-to-any-file: ['packages/uipath/src/**/*.py']
   - changed-files:
     - any-glob-to-any-file: ['packages/uipath-platform/src/**/*.py']
+  - changed-files:
+    - any-glob-to-any-file: ['packages/uipath-eval/src/**/*.py']
   - changed-files:
     - any-glob-to-any-file: ['packages/uipath-core/src/**/*.py']
 

diff --git a/.github/scripts/detect_changed_packages.py b/.github/scripts/detect_changed_packages.py
@@ -17,8 +17,9 @@
 # External dependents (uipath-langchain, uipath-runtime, etc.) are
 # handled separately via labeler.yml auto-labels.
 DEPENDENTS: dict[str, list[str]] = {
-    "uipath-core": ["uipath-platform", "uipath"],
-    "uipath-platform": ["uipath"],
+    "uipath-core": ["uipath-platform", "uipath-eval", "uipath"],
+    "uipath-platform": ["uipath-eval", "uipath"],
+    "uipath-eval": ["uipath"],  # the main uipath package depends on uipath-eval
 }
 
 
@@ -117,7 +118,9 @@ def main():
     if base_sha and head_sha:
         packages = get_changed_packages(base_sha, head_sha)
         event_type = "pull request" if event_name == "pull_request" else "push"
-        print(f"{event_type.capitalize()} - detected {len(packages)} directly changed package(s):")
+        print(
+            f"{event_type.capitalize()} - detected {len(packages)} directly changed package(s):"
+        )
         for pkg in packages:
             print(f"  - {pkg}")
 

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -133,7 +133,8 @@ jobs:
     needs: [detect-publishable-packages, publish-uipath-platform]
     if: |
       always() &&
-      contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath')
+      (contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval') ||
+       contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath'))
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -154,9 +155,70 @@ jobs:
         if: "!contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-platform')"
         run: echo "uipath-platform not being published — skipping wait"
 
-  # --- Tier 2: uipath (depends on core + platform) ---
-  build-uipath:
+  # --- Tier 1.5: uipath-eval (depends on core + platform) ---
+  build-uipath-eval:  # builds uipath-eval through the reusable build-package workflow
     needs: [detect-publishable-packages, wait-for-uipath-platform]
+    if: |
+      always() &&
+      contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')
+    uses: ./.github/workflows/build-package.yml
+    with:
+      package: uipath-eval
+      needs-relock: true  # NOTE(review): semantics are defined by build-package.yml inputs — confirm there
+
+  publish-uipath-eval:  # uploads the uipath-eval distributions to PyPI
+    name: Publish uipath-eval
+    needs: [detect-publishable-packages, build-uipath-eval]
+    if: |
+      always() &&
+      contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      contents: read
+      id-token: write  # NOTE(review): presumably for PyPI trusted publishing (no token input is passed) — confirm
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists-uipath-eval  # NOTE(review): presumably uploaded by build-package.yml — confirm
+          path: dist/
+
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          verbose: true
+          skip-existing: true  # do not fail when a distribution file already exists on the index
+
+  wait-for-uipath-eval:  # gate that build-uipath depends on; runs wait_for_pypi.py only if uipath-eval is in this release
+    name: Wait for uipath-eval on PyPI
+    needs: [detect-publishable-packages, publish-uipath-eval]  # job-level if keys on 'uipath'; the steps key on 'uipath-eval'
+    if: |
+      always() &&
+      contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        if: contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        if: contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Wait for uipath-eval
+        if: contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')
+        run: python .github/scripts/wait_for_pypi.py uipath-eval
+
+      - name: Skip  # no-op branch: the job still completes when uipath-eval is not being published
+        if: "!contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath-eval')"
+        run: echo "uipath-eval not being published — skipping wait"
+
+  # --- Tier 2: uipath (depends on core + platform + eval) ---
+  build-uipath:
+    needs: [detect-publishable-packages, wait-for-uipath-eval]
     if: |
       always() &&
       contains(fromJson(needs.detect-publishable-packages.outputs.packages), 'uipath')

diff --git a/.github/workflows/lint-packages.yml b/.github/workflows/lint-packages.yml
@@ -139,6 +139,60 @@ jobs:
         working-directory: packages/uipath-platform
         run: uv run ruff format --check .
 
+  lint-uipath-eval:  # mypy, ruff lint and ruff format check for packages/uipath-eval
+    name: Lint uipath-eval
+    needs: detect-changed-packages
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check if package changed
+        id: check  # sets outputs.skip, which gates every later step in this job
+        run: |
+          if echo '${{ needs.detect-changed-packages.outputs.packages }}' | jq -e 'index("uipath-eval")' > /dev/null; then
+            echo "skip=false" >> $GITHUB_OUTPUT
+          else
+            echo "skip=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Skip  # only logs; the job still completes successfully when nothing changed
+        if: steps.check.outputs.skip == 'true'
+        run: echo "Skipping - no changes to uipath-eval"
+
+      - name: Checkout
+        if: steps.check.outputs.skip != 'true'
+        uses: actions/checkout@v4
+
+      - name: Setup uv
+        if: steps.check.outputs.skip != 'true'
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Setup Python
+        if: steps.check.outputs.skip != 'true'
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "packages/uipath-eval/.python-version"  # file added in this change; contains 3.11
+
+      - name: Install dependencies
+        if: steps.check.outputs.skip != 'true'
+        working-directory: packages/uipath-eval
+        run: uv sync --locked --all-extras  # --locked: fail rather than update uv.lock if it is out of date
+
+      - name: Check static types
+        if: steps.check.outputs.skip != 'true'
+        working-directory: packages/uipath-eval
+        run: uv run mypy --config-file pyproject.toml .
+
+      - name: Check linting
+        if: steps.check.outputs.skip != 'true'
+        working-directory: packages/uipath-eval
+        run: uv run ruff check .
+
+      - name: Check formatting
+        if: steps.check.outputs.skip != 'true'
+        working-directory: packages/uipath-eval
+        run: uv run ruff format --check .
+
   lint-uipath:
     name: Lint uipath
     needs: detect-changed-packages

diff --git a/.github/workflows/test-packages.yml b/.github/workflows/test-packages.yml
@@ -224,6 +224,76 @@ jobs:
           UIPATH_FOLDER_KEY: ${{ secrets.UIPATH_MEMORY_FOLDER }}
         run: uv run pytest tests/services/test_memory_service_e2e.py -m e2e -v --no-cov
 
+  test-uipath-eval:  # pytest matrix for packages/uipath-eval (3 Python versions x 2 operating systems)
+    name: Test (uipath-eval, ${{ matrix.python-version }}, ${{ matrix.os }})
+    needs: detect-changed-packages
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let the remaining matrix cells finish when one cell fails
+      matrix:
+        python-version: ["3.11", "3.12", "3.13"]
+        os: [ubuntu-latest, windows-latest]
+    steps:
+      - name: Check if package changed
+        id: check  # sets outputs.skip, which gates every later step in this job
+        shell: bash  # the script below is bash syntax and the matrix includes windows-latest
+        run: |
+          if echo '${{ needs.detect-changed-packages.outputs.packages }}' | jq -e 'index("uipath-eval")' > /dev/null; then
+            echo "skip=false" >> $GITHUB_OUTPUT
+          else
+            echo "skip=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Skip  # only logs; the job still completes successfully when nothing changed
+        if: steps.check.outputs.skip == 'true'
+        shell: bash
+        run: echo "Skipping - no changes to uipath-eval"
+
+      - name: Checkout
+        if: steps.check.outputs.skip != 'true'
+        uses: actions/checkout@v4
+
+      - name: Setup uv
+        if: steps.check.outputs.skip != 'true'
+        uses: astral-sh/setup-uv@v5
+
+      - name: Setup Python
+        if: steps.check.outputs.skip != 'true'
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        if: steps.check.outputs.skip != 'true'
+        working-directory: packages/uipath-eval
+        run: uv sync --all-extras --python ${{ matrix.python-version }}
+
+      - name: Run tests  # every matrix cell except ubuntu-latest / 3.13
+        if: steps.check.outputs.skip != 'true' && !(matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13')
+        working-directory: packages/uipath-eval
+        run: uv run pytest
+
+      - name: Run tests with coverage  # only the ubuntu-latest / 3.13 cell
+        if: steps.check.outputs.skip != 'true' && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
+        working-directory: packages/uipath-eval
+        run: uv run pytest --cov-report=xml --cov-report=html --tb=short
+
+      - name: Upload coverage HTML report  # always(): still uploads when the test step failed
+        if: steps.check.outputs.skip != 'true' && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' && always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-html-uipath-eval
+          path: packages/uipath-eval/htmlcov/
+          retention-days: 30
+
+      - name: Upload coverage XML report  # always(): still uploads when the test step failed
+        if: steps.check.outputs.skip != 'true' && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' && always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-xml-uipath-eval
+          path: packages/uipath-eval/coverage.xml
+          retention-days: 30
+
   test-uipath:
     name: Test (uipath, ${{ matrix.python-version }}, ${{ matrix.os }})
     needs: detect-changed-packages

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -4,17 +4,18 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## Repository Overview
 
-This is the **UiPath Python SDK** monorepo — a Python SDK and CLI for programmatic interaction with the UiPath Cloud Platform. It publishes three packages to PyPI: `uipath`, `uipath-core`, and `uipath-platform`.
+This is the **UiPath Python SDK** monorepo — a Python SDK and CLI for programmatic interaction with the UiPath Cloud Platform. It publishes four packages to PyPI: `uipath`, `uipath-core`, `uipath-platform`, and `uipath-eval`.
 
 ## Monorepo Structure
 
-The repo contains three packages under `packages/`, each with its own `pyproject.toml`, `src/`, and `tests/`:
+The repo contains four packages under `packages/`, each with its own `pyproject.toml`, `src/`, and `tests/`:
 
-- **`packages/uipath`** — The main SDK package. Contains the CLI (`uipath` command), agent framework, evaluation framework, tracing/telemetry, and function utilities. This is what users `uv pip install uipath`. Entry point: `src/uipath/_cli:cli`.
+- **`packages/uipath`** — The main SDK package. Contains the CLI (`uipath` command), agent framework, tracing/telemetry, and function utilities. This is what users `uv pip install uipath`. Entry point: `src/uipath/_cli:cli`.
 - **`packages/uipath-core`** — Core abstractions shared across packages: tracing, serialization, events, feature flags, error types, chat models, guardrails. Depends on OpenTelemetry and Pydantic.
+- **`packages/uipath-eval`** — The evaluation framework (`uipath.eval` namespace): evaluators (deterministic + LLM-based), the `@mockable` simulation/mocking system, and the eval runtime. Consumable standalone (e.g. by the Python eval worker in the agents backend). Depends on `uipath-core`, `uipath-platform`, and `uipath-runtime`.
 - **`packages/uipath-platform`** — HTTP client layer for UiPath Platform APIs. Contains service classes for orchestrator resources (assets, buckets, jobs, processes, queues), action center, context grounding, documents, connections, chat, and guardrails. Depends on `uipath-core`, httpx, and tenacity.
 
-Dependency chain: `uipath` → `uipath-platform` → `uipath-core`. Local editable links are configured via `[tool.uv.sources]`.
+Dependency chain: `uipath` → `uipath-eval` → `uipath-platform` → `uipath-core`. Local editable links are configured via `[tool.uv.sources]`.
 
 ## Build & Development Commands
 
@@ -96,7 +97,7 @@ The CLI uses **click** and is organized as `cli_<command>.py` files in `src/uipa
 
 ### Evaluation Framework
 
-`src/uipath/eval/` provides evaluators (ExactMatch, Contains, JsonSimilarity, LLMJudge, Trajectory, ToolCall, Classification) for testing agent quality.
+`packages/uipath-eval/src/uipath/eval/` provides evaluators (ExactMatch, Contains, JsonSimilarity, LLMJudge, Trajectory, ToolCall, Classification) for testing agent quality.
 
 ## Code Conventions
 

diff --git a/packages/uipath-eval/.python-version b/packages/uipath-eval/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/packages/uipath-eval/CLAUDE.md b/packages/uipath-eval/CLAUDE.md
@@ -0,0 +1,39 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with the `uipath-eval` package.
+
+## Package Purpose
+
+The evaluation framework for UiPath agents, providing the `uipath.eval` namespace package. Extracted from the main `uipath` SDK so consumers (e.g. the Python eval worker in the agents backend) can depend on evaluators without the CLI and the rest of the SDK. Depends on `uipath-core`, `uipath-platform`, and `uipath-runtime`.
+
+## Development Commands
+
+```bash
+cd packages/uipath-eval
+
+uv sync --all-extras          # Install dependencies
+pytest                        # Run all tests
+pytest tests/eval/mocks/      # Run a test subdirectory
+ruff check .                  # Lint
+ruff format --check .         # Format check
+mypy src                      # Type check
+```
+
+No justfile exists for this package — run commands directly.
+
+## Module Layout (`src/uipath/eval/`)
+
+| Module | Purpose |
+|--------|---------|
+| `evaluators/` | Deterministic evaluators (ExactMatch, Contains, JsonSimilarity, classification, tool-call order/args/count/output), LLM-judge evaluators, legacy evaluators, evaluator factory + registration |
+| `models/` | `EvaluationSet`, `EvaluationItem`, `EvaluationResult`, `AgentExecution`, score types |
+| `mocks/` | `@mockable` decorator, LLM tool/input mocking, mockito integration, response caching, `UiPathMockRuntime` |
+| `runtime/` | `UiPathEvalRuntime`, `UiPathEvalContext`, `evaluate()` entry point, eval events, parallelization, exporters |
+| `helpers.py` | `EvalHelpers` (eval set loading/migration), `get_agent_model()` |
+
+## Constraints
+
+- This is a **namespace package**: `src/uipath/` has no `__init__.py`; only `src/uipath/eval/` does. Import paths are unchanged from when the code lived in the main SDK (`from uipath.eval...`).
+- Do not import from the main `uipath` package internals (`uipath._cli`, `uipath._utils`, `uipath.agent`, ...) — only `uipath.core`, `uipath.platform`, and `uipath.runtime` are available here. The main `uipath` package depends on this one, not vice versa.
+- Structured output across model providers must use function calling, not `response_format`: on the normalized gateway, Claude returns prose and Gemini returns empty content for `response_format` (see `mocks/_structured_output.py`).
+- The CLI-facing progress reporters (`_progress_reporter.py`, `_console_progress_reporter.py`) intentionally stay in `packages/uipath/src/uipath/_cli/_evals/` — they are CLI infrastructure, not part of this package.
diff --git a/packages/uipath-eval/README.md b/packages/uipath-eval/README.md
@@ -0,0 +1,44 @@
+# uipath-eval
+
+Evaluation framework for UiPath agents, extracted from the main `uipath` SDK so
+it can be consumed standalone (for example by the Python eval worker in the
+agents backend) without pulling in the CLI and the rest of the SDK.
+
+Provides the `uipath.eval` namespace package:
+
+- **`uipath.eval.evaluators`** — deterministic evaluators (ExactMatch, Contains,
+  JsonSimilarity, classification, tool-call order/args/count/output) and
+  LLM-based evaluators (LLM-judge output/trajectory), plus their legacy
+  counterparts and the evaluator factory/registration system.
+- **`uipath.eval.models`** — evaluation sets, evaluation results, score types,
+  agent execution models.
+- **`uipath.eval.mocks`** — the `@mockable` decorator, LLM tool/input mocking,
+  mockito integration, and response caching used by simulation runs.
+- **`uipath.eval.runtime`** — `UiPathEvalRuntime`, `UiPathEvalContext`, the
+  `evaluate()` entry point, eval events, parallelization, and exporters.
+
+## Installation
+
+```bash
+uv pip install uipath-eval
+```
+
+Import paths are unchanged from when this code lived in the `uipath` package:
+
+```python
+from uipath.eval.evaluators import ExactMatchEvaluator
+from uipath.eval.models.evaluation_set import EvaluationSet
+from uipath.eval.runtime import UiPathEvalContext, evaluate
+```
+
+## Development
+
+```bash
+cd packages/uipath-eval
+
+uv sync --all-extras          # Install dependencies
+pytest                        # Run all tests
+ruff check .                  # Lint
+ruff format --check .         # Format check
+mypy src                      # Type check
+```